author     David Woodhouse <dwmw2@infradead.org>   2006-10-21 16:46:04 +0100
committer  David Woodhouse <dwmw2@infradead.org>   2006-10-21 16:46:04 +0100
commit     513b046c96cc2fbce730a3474f6f7ff0c4fdd05c (patch)
tree       e8006368b6f643067486f92405a404757807d6da /fs
parent     82810b7b6cc7a74c68881a13b0eb66c7a6370fcc (diff)
parent     c7a3bd177f248d01ee18a01d22048c80e071c331 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig | 131
-rw-r--r-- fs/Makefile | 5
-rw-r--r-- fs/afs/dir.c | 8
-rw-r--r-- fs/autofs/autofs_i.h | 1
-rw-r--r-- fs/autofs/dirhash.c | 1
-rw-r--r-- fs/autofs/init.c | 2
-rw-r--r-- fs/autofs/inode.c | 4
-rw-r--r-- fs/autofs4/autofs_i.h | 3
-rw-r--r-- fs/autofs4/init.c | 2
-rw-r--r-- fs/autofs4/inode.c | 22
-rw-r--r-- fs/autofs4/waitq.c | 1
-rw-r--r-- fs/befs/befs.h | 6
-rw-r--r-- fs/befs/befs_fs_types.h | 112
-rw-r--r-- fs/befs/btree.c | 29
-rw-r--r-- fs/befs/datastream.c | 11
-rw-r--r-- fs/befs/debug.c | 12
-rw-r--r-- fs/befs/endian.h | 57
-rw-r--r-- fs/befs/inode.c | 1
-rw-r--r-- fs/befs/linuxvfs.c | 1
-rw-r--r-- fs/befs/super.c | 1
-rw-r--r-- fs/binfmt_elf.c | 10
-rw-r--r-- fs/binfmt_som.c | 18
-rw-r--r-- fs/bio.c | 9
-rw-r--r-- fs/buffer.c | 39
-rw-r--r-- fs/cifs/cifsacl.h | 4
-rw-r--r-- fs/cifs/cifsencrypt.h | 2
-rw-r--r-- fs/cifs/cifsfs.c | 27
-rw-r--r-- fs/cifs/cifsfs.h | 2
-rw-r--r-- fs/cifs/cifsglob.h | 15
-rw-r--r-- fs/cifs/cifspdu.h | 12
-rw-r--r-- fs/cifs/cifsproto.h | 12
-rw-r--r-- fs/cifs/cifssmb.c | 102
-rw-r--r-- fs/cifs/connect.c | 35
-rw-r--r-- fs/cifs/inode.c | 12
-rw-r--r-- fs/cifs/link.c | 6
-rw-r--r-- fs/cifs/md5.c | 8
-rw-r--r-- fs/cifs/md5.h | 8
-rw-r--r-- fs/cifs/misc.c | 44
-rw-r--r-- fs/cifs/netmisc.c | 58
-rw-r--r-- fs/cifs/readdir.c | 27
-rw-r--r-- fs/cifs/sess.c | 23
-rw-r--r-- fs/cifs/smbdes.c | 6
-rw-r--r-- fs/cifs/smbencrypt.c | 11
-rw-r--r-- fs/compat.c | 2
-rw-r--r-- fs/compat_ioctl.c | 10
-rw-r--r-- fs/configfs/file.c | 14
-rw-r--r-- fs/configfs/item.c | 2
-rw-r--r-- fs/dcache.c | 139
-rw-r--r-- fs/dlm/Kconfig | 20
-rw-r--r-- fs/dlm/Makefile | 19
-rw-r--r-- fs/dlm/ast.c | 173
-rw-r--r-- fs/dlm/ast.h | 26
-rw-r--r-- fs/dlm/config.c | 789
-rw-r--r-- fs/dlm/config.h | 42
-rw-r--r-- fs/dlm/debug_fs.c | 387
-rw-r--r-- fs/dlm/dir.c | 423
-rw-r--r-- fs/dlm/dir.h | 30
-rw-r--r-- fs/dlm/dlm_internal.h | 543
-rw-r--r-- fs/dlm/lock.c | 3871
-rw-r--r-- fs/dlm/lock.h | 62
-rw-r--r-- fs/dlm/lockspace.c | 717
-rw-r--r-- fs/dlm/lockspace.h | 25
-rw-r--r-- fs/dlm/lowcomms.c | 1239
-rw-r--r-- fs/dlm/lowcomms.h | 26
-rw-r--r-- fs/dlm/lvb_table.h | 18
-rw-r--r-- fs/dlm/main.c | 97
-rw-r--r-- fs/dlm/member.c | 327
-rw-r--r-- fs/dlm/member.h | 24
-rw-r--r-- fs/dlm/memory.c | 116
-rw-r--r-- fs/dlm/memory.h | 29
-rw-r--r-- fs/dlm/midcomms.c | 140
-rw-r--r-- fs/dlm/midcomms.h | 21
-rw-r--r-- fs/dlm/rcom.c | 472
-rw-r--r-- fs/dlm/rcom.h | 24
-rw-r--r-- fs/dlm/recover.c | 765
-rw-r--r-- fs/dlm/recover.h | 34
-rw-r--r-- fs/dlm/recoverd.c | 290
-rw-r--r-- fs/dlm/recoverd.h | 24
-rw-r--r-- fs/dlm/requestqueue.c | 184
-rw-r--r-- fs/dlm/requestqueue.h | 22
-rw-r--r-- fs/dlm/user.c | 788
-rw-r--r-- fs/dlm/user.h | 16
-rw-r--r-- fs/dlm/util.c | 161
-rw-r--r-- fs/dlm/util.h | 22
-rw-r--r-- fs/ecryptfs/Makefile | 7
-rw-r--r-- fs/ecryptfs/crypto.c | 1659
-rw-r--r-- fs/ecryptfs/debug.c | 123
-rw-r--r-- fs/ecryptfs/dentry.c | 87
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 482
-rw-r--r-- fs/ecryptfs/file.c | 440
-rw-r--r-- fs/ecryptfs/inode.c | 1079
-rw-r--r-- fs/ecryptfs/keystore.c | 1061
-rw-r--r-- fs/ecryptfs/main.c | 828
-rw-r--r-- fs/ecryptfs/mmap.c | 788
-rw-r--r-- fs/ecryptfs/super.c | 198
-rw-r--r-- fs/eventpoll.c | 56
-rw-r--r-- fs/ext2/super.c | 16
-rw-r--r-- fs/ext3/super.c | 2
-rw-r--r-- fs/ext4/Makefile | 12
-rw-r--r-- fs/ext4/acl.c | 551
-rw-r--r-- fs/ext4/acl.h | 81
-rw-r--r-- fs/ext4/balloc.c | 1833
-rw-r--r-- fs/ext4/bitmap.c | 32
-rw-r--r-- fs/ext4/dir.c | 518
-rw-r--r-- fs/ext4/extents.c | 2152
-rw-r--r-- fs/ext4/file.c | 139
-rw-r--r-- fs/ext4/fsync.c | 88
-rw-r--r-- fs/ext4/hash.c | 152
-rw-r--r-- fs/ext4/ialloc.c | 772
-rw-r--r-- fs/ext4/inode.c | 3233
-rw-r--r-- fs/ext4/ioctl.c | 306
-rw-r--r-- fs/ext4/namei.c | 2395
-rw-r--r-- fs/ext4/namei.h | 8
-rw-r--r-- fs/ext4/resize.c | 1045
-rw-r--r-- fs/ext4/super.c | 2829
-rw-r--r-- fs/ext4/symlink.c | 54
-rw-r--r-- fs/ext4/xattr.c | 1317
-rw-r--r-- fs/ext4/xattr.h | 145
-rw-r--r-- fs/ext4/xattr_security.c | 77
-rw-r--r-- fs/ext4/xattr_trusted.c | 62
-rw-r--r-- fs/ext4/xattr_user.c | 64
-rw-r--r-- fs/fat/file.c | 3
-rw-r--r-- fs/fat/inode.c | 4
-rw-r--r-- fs/fuse/dir.c | 107
-rw-r--r-- fs/fuse/file.c | 12
-rw-r--r-- fs/fuse/fuse_i.h | 3
-rw-r--r-- fs/fuse/inode.c | 15
-rw-r--r-- fs/gfs2/Kconfig | 44
-rw-r--r-- fs/gfs2/Makefile | 10
-rw-r--r-- fs/gfs2/acl.c | 309
-rw-r--r-- fs/gfs2/acl.h | 39
-rw-r--r-- fs/gfs2/bmap.c | 1222
-rw-r--r-- fs/gfs2/bmap.h | 31
-rw-r--r-- fs/gfs2/daemon.c | 196
-rw-r--r-- fs/gfs2/daemon.h | 19
-rw-r--r-- fs/gfs2/dir.c | 1957
-rw-r--r-- fs/gfs2/dir.h | 79
-rw-r--r-- fs/gfs2/eaops.c | 230
-rw-r--r-- fs/gfs2/eaops.h | 30
-rw-r--r-- fs/gfs2/eattr.c | 1501
-rw-r--r-- fs/gfs2/eattr.h | 100
-rw-r--r-- fs/gfs2/gfs2.h | 31
-rw-r--r-- fs/gfs2/glock.c | 2231
-rw-r--r-- fs/gfs2/glock.h | 153
-rw-r--r-- fs/gfs2/glops.c | 615
-rw-r--r-- fs/gfs2/glops.h | 25
-rw-r--r-- fs/gfs2/incore.h | 634
-rw-r--r-- fs/gfs2/inode.c | 1379
-rw-r--r-- fs/gfs2/inode.h | 56
-rw-r--r-- fs/gfs2/lm.c | 217
-rw-r--r-- fs/gfs2/lm.h | 42
-rw-r--r-- fs/gfs2/locking.c | 184
-rw-r--r-- fs/gfs2/locking/dlm/Makefile | 3
-rw-r--r-- fs/gfs2/locking/dlm/lock.c | 524
-rw-r--r-- fs/gfs2/locking/dlm/lock_dlm.h | 187
-rw-r--r-- fs/gfs2/locking/dlm/main.c | 64
-rw-r--r-- fs/gfs2/locking/dlm/mount.c | 255
-rw-r--r-- fs/gfs2/locking/dlm/plock.c | 301
-rw-r--r-- fs/gfs2/locking/dlm/sysfs.c | 226
-rw-r--r-- fs/gfs2/locking/dlm/thread.c | 359
-rw-r--r-- fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r-- fs/gfs2/locking/nolock/main.c | 246
-rw-r--r-- fs/gfs2/log.c | 688
-rw-r--r-- fs/gfs2/log.h | 65
-rw-r--r-- fs/gfs2/lops.c | 809
-rw-r--r-- fs/gfs2/lops.h | 99
-rw-r--r-- fs/gfs2/main.c | 150
-rw-r--r-- fs/gfs2/meta_io.c | 590
-rw-r--r-- fs/gfs2/meta_io.h | 78
-rw-r--r-- fs/gfs2/mount.c | 214
-rw-r--r-- fs/gfs2/mount.h | 17
-rw-r--r-- fs/gfs2/ondisk.c | 308
-rw-r--r-- fs/gfs2/ops_address.c | 793
-rw-r--r-- fs/gfs2/ops_address.h | 22
-rw-r--r-- fs/gfs2/ops_dentry.c | 119
-rw-r--r-- fs/gfs2/ops_dentry.h | 17
-rw-r--r-- fs/gfs2/ops_export.c | 298
-rw-r--r-- fs/gfs2/ops_export.h | 22
-rw-r--r-- fs/gfs2/ops_file.c | 661
-rw-r--r-- fs/gfs2/ops_file.h | 24
-rw-r--r-- fs/gfs2/ops_fstype.c | 925
-rw-r--r-- fs/gfs2/ops_fstype.h | 18
-rw-r--r-- fs/gfs2/ops_inode.c | 1151
-rw-r--r-- fs/gfs2/ops_inode.h | 20
-rw-r--r-- fs/gfs2/ops_super.c | 468
-rw-r--r-- fs/gfs2/ops_super.h | 17
-rw-r--r-- fs/gfs2/ops_vm.c | 184
-rw-r--r-- fs/gfs2/ops_vm.h | 18
-rw-r--r-- fs/gfs2/quota.c | 1228
-rw-r--r-- fs/gfs2/quota.h | 35
-rw-r--r-- fs/gfs2/recovery.c | 571
-rw-r--r-- fs/gfs2/recovery.h | 34
-rw-r--r-- fs/gfs2/rgrp.c | 1513
-rw-r--r-- fs/gfs2/rgrp.h | 69
-rw-r--r-- fs/gfs2/super.c | 976
-rw-r--r-- fs/gfs2/super.h | 55
-rw-r--r-- fs/gfs2/sys.c | 583
-rw-r--r-- fs/gfs2/sys.h | 27
-rw-r--r-- fs/gfs2/trans.c | 184
-rw-r--r-- fs/gfs2/trans.h | 39
-rw-r--r-- fs/gfs2/util.c | 245
-rw-r--r-- fs/gfs2/util.h | 170
-rw-r--r-- fs/hpfs/inode.c | 11
-rw-r--r-- fs/hppfs/hppfs_kern.c | 2
-rw-r--r-- fs/hugetlbfs/inode.c | 2
-rw-r--r-- fs/inode.c | 2
-rw-r--r-- fs/ioprio.c | 5
-rw-r--r-- fs/isofs/joliet.c | 10
-rw-r--r-- fs/isofs/namei.c | 1
-rw-r--r-- fs/jbd/journal.c | 3
-rw-r--r-- fs/jbd/transaction.c | 5
-rw-r--r-- fs/jbd2/Makefile | 7
-rw-r--r-- fs/jbd2/checkpoint.c | 697
-rw-r--r-- fs/jbd2/commit.c | 920
-rw-r--r-- fs/jbd2/journal.c | 2084
-rw-r--r-- fs/jbd2/recovery.c | 609
-rw-r--r-- fs/jbd2/revoke.c | 712
-rw-r--r-- fs/jbd2/transaction.c | 2081
-rw-r--r-- fs/jffs2/super.c | 8
-rw-r--r-- fs/lockd/clntlock.c | 58
-rw-r--r-- fs/lockd/clntproc.c | 17
-rw-r--r-- fs/lockd/host.c | 325
-rw-r--r-- fs/lockd/mon.c | 77
-rw-r--r-- fs/lockd/svc.c | 19
-rw-r--r-- fs/lockd/svc4proc.c | 85
-rw-r--r-- fs/lockd/svclock.c | 207
-rw-r--r-- fs/lockd/svcproc.c | 89
-rw-r--r-- fs/lockd/svcshare.c | 24
-rw-r--r-- fs/lockd/svcsubs.c | 183
-rw-r--r-- fs/lockd/xdr.c | 76
-rw-r--r-- fs/lockd/xdr4.c | 80
-rw-r--r-- fs/minix/inode.c | 8
-rw-r--r-- fs/ncpfs/ioctl.c | 2
-rw-r--r-- fs/nfs/callback.h | 10
-rw-r--r-- fs/nfs/callback_proc.c | 6
-rw-r--r-- fs/nfs/callback_xdr.c | 106
-rw-r--r-- fs/nfs/client.c | 52
-rw-r--r-- fs/nfs/dir.c | 16
-rw-r--r-- fs/nfs/direct.c | 25
-rw-r--r-- fs/nfs/getroot.c | 1
-rw-r--r-- fs/nfs/inode.c | 30
-rw-r--r-- fs/nfs/internal.h | 6
-rw-r--r-- fs/nfs/mount_clnt.c | 6
-rw-r--r-- fs/nfs/namespace.c | 2
-rw-r--r-- fs/nfs/nfs2xdr.c | 78
-rw-r--r-- fs/nfs/nfs3proc.c | 2
-rw-r--r-- fs/nfs/nfs3xdr.c | 118
-rw-r--r-- fs/nfs/nfs4_fs.h | 2
-rw-r--r-- fs/nfs/nfs4namespace.c | 2
-rw-r--r-- fs/nfs/nfs4proc.c | 16
-rw-r--r-- fs/nfs/nfs4xdr.c | 360
-rw-r--r-- fs/nfs/nfsroot.c | 1
-rw-r--r-- fs/nfs/super.c | 3
-rw-r--r-- fs/nfs/write.c | 8
-rw-r--r-- fs/nfs_common/nfsacl.c | 4
-rw-r--r-- fs/nfsd/export.c | 153
-rw-r--r-- fs/nfsd/lockd.c | 16
-rw-r--r-- fs/nfsd/nfs2acl.c | 37
-rw-r--r-- fs/nfsd/nfs3acl.c | 23
-rw-r--r-- fs/nfsd/nfs3proc.c | 108
-rw-r--r-- fs/nfsd/nfs3xdr.c | 182
-rw-r--r-- fs/nfsd/nfs4acl.c | 711
-rw-r--r-- fs/nfsd/nfs4callback.c | 26
-rw-r--r-- fs/nfsd/nfs4proc.c | 140
-rw-r--r-- fs/nfsd/nfs4recover.c | 14
-rw-r--r-- fs/nfsd/nfs4state.c | 119
-rw-r--r-- fs/nfsd/nfs4xdr.c | 454
-rw-r--r-- fs/nfsd/nfscache.c | 8
-rw-r--r-- fs/nfsd/nfsctl.c | 49
-rw-r--r-- fs/nfsd/nfsfh.c | 10
-rw-r--r-- fs/nfsd/nfsproc.c | 91
-rw-r--r-- fs/nfsd/nfssvc.c | 29
-rw-r--r-- fs/nfsd/nfsxdr.c | 115
-rw-r--r-- fs/nfsd/vfs.c | 381
-rw-r--r-- fs/ocfs2/cluster/nodemanager.c | 10
-rw-r--r-- fs/ocfs2/file.c | 51
-rw-r--r-- fs/ocfs2/namei.c | 8
-rw-r--r-- fs/ocfs2/super.c | 2
-rw-r--r-- fs/partitions/check.c | 50
-rw-r--r-- fs/partitions/msdos.c | 6
-rw-r--r-- fs/proc/base.c | 6
-rw-r--r-- fs/proc/proc_misc.c | 2
-rw-r--r-- fs/reiserfs/bitmap.c | 4
-rw-r--r-- fs/reiserfs/file.c | 1
-rw-r--r-- fs/reiserfs/inode.c | 2
-rw-r--r-- fs/reiserfs/journal.c | 3
-rw-r--r-- fs/reiserfs/super.c | 31
-rw-r--r-- fs/splice.c | 6
-rw-r--r-- fs/super.c | 12
-rw-r--r-- fs/sysfs/file.c | 7
-rw-r--r-- fs/sysv/super.c | 15
-rw-r--r-- fs/udf/super.c | 3
-rw-r--r-- fs/ufs/util.c | 14
-rw-r--r-- fs/xattr.c | 33
-rw-r--r-- fs/xfs/linux-2.6/kmem.c | 5
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 3
296 files changed, 74480 insertions, 2597 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 68f4561423ff..fee318e6f4bb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -140,6 +140,73 @@ config EXT3_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
+config EXT4DEV_FS
+ tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ select JBD2
+ help
+ Ext4dev is a predecessor filesystem of the next generation
+ extended fs ext4, based on ext3 filesystem code. It will be
+ renamed ext4 fs later, once ext4dev is mature and stabilized.
+
+ Unlike the change from ext2 filesystem to ext3 filesystem,
+ the on-disk format of ext4dev is not the same as ext3 any more:
+ it is based on extent maps and it supports 48-bit physical block
+ numbers. These combined on-disk format changes will allow
+ ext4dev/ext4 to handle more than 16 TB filesystem volumes --
+ a hard limit that ext3 cannot overcome without changing the
+ on-disk format.
+
+ Other than extent maps and 48-bit block numbers, ext4dev also is
+ likely to have other new features such as persistent preallocation,
+ high resolution time stamps, and larger file support etc. These
+ features will be added to ext4dev gradually.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called ext4dev. Be aware, however, that the filesystem
+ of your root partition (the one containing the directory /) cannot
+ be compiled as a module, and so this could be dangerous.
+
+ If unsure, say N.
+
+config EXT4DEV_FS_XATTR
+ bool "Ext4dev extended attributes"
+ depends on EXT4DEV_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext4dev/ext4.
+
+config EXT4DEV_FS_POSIX_ACL
+ bool "Ext4dev POSIX Access Control Lists"
+ depends on EXT4DEV_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config EXT4DEV_FS_SECURITY
+ bool "Ext4dev Security Labels"
+ depends on EXT4DEV_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext4dev/ext4 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
config JBD
tristate
help
@@ -172,12 +239,44 @@ config JBD_DEBUG
generated. To turn debugging off again, do
"echo 0 > /proc/sys/fs/jbd-debug".
+config JBD2
+ tristate
+ help
+ This is a generic journaling layer for block devices that support
+ both 32-bit and 64-bit block numbers. It is currently used by
+ the ext4dev/ext4 filesystem, but it could also be used to add
+ journal support to other file systems or block devices such
+ as RAID or LVM.
+
+ If you are using ext4dev/ext4, you need to say Y here. If you are not
+ using ext4dev/ext4 then you will probably want to say N.
+
+ To compile this device as a module, choose M here. The module will be
+ called jbd2. If you are compiling ext4dev/ext4 into the kernel,
+ you cannot compile this code as a module.
+
+config JBD2_DEBUG
+ bool "JBD2 (ext4dev/ext4) debugging support"
+ depends on JBD2
+ help
+ If you are using the ext4dev/ext4 journaled file system (or
+ potentially any other filesystem/device using JBD2), this option
+ allows you to enable debugging output while the system is running,
+ in order to help track down any problems you are having.
+ By default, the debugging output will be turned off.
+
+ If you select Y here, then you will be able to turn on debugging
+ with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between
+ 1 and 5. The higher the number, the more debugging output is
+ generated. To turn debugging off again, do
+ "echo 0 > /proc/sys/fs/jbd2-debug".
+
config FS_MBCACHE
-# Meta block cache for Extended Attributes (ext2/ext3)
+# Meta block cache for Extended Attributes (ext2/ext3/ext4)
tristate
- depends on EXT2_FS_XATTR || EXT3_FS_XATTR
- default y if EXT2_FS=y || EXT3_FS=y
- default m if EXT2_FS=m || EXT3_FS=m
+ depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
+ default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
+ default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
config REISERFS_FS
tristate "Reiserfs support"
@@ -325,6 +424,7 @@ config FS_POSIX_ACL
default n
source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
config OCFS2_FS
tristate "OCFS2 file system support"
@@ -534,6 +634,10 @@ config FUSE_FS
If you want to develop a userspace FS, or if you want to use
a filesystem based on FUSE, answer Y or M.
+config GENERIC_ACL
+ bool
+ select FS_POSIX_ACL
+
if BLOCK
menu "CD-ROM/DVD Filesystems"
@@ -995,6 +1099,18 @@ config AFFS_FS
To compile this file system support as a module, choose M here: the
module will be called affs. If unsure, say N.
+config ECRYPT_FS
+ tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && KEYS && CRYPTO
+ help
+ Encrypted filesystem that operates on the VFS layer. See
+ <file:Documentation/ecryptfs.txt> to learn more about
+ eCryptfs. Userspace components are required and can be
+ obtained from <http://ecryptfs.sf.net>.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called ecryptfs.
+
config HFS_FS
tristate "Apple Macintosh file system support (EXPERIMENTAL)"
depends on BLOCK && EXPERIMENTAL
@@ -1874,7 +1990,7 @@ config CIFS_EXPERIMENTAL
config CIFS_UPCALL
bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
depends on CIFS_EXPERIMENTAL
- select CONNECTOR
+ depends on CONNECTOR
help
Enables an upcall mechanism for CIFS which will be used to contact
userspace helper utilities to provide SPNEGO packaged Kerberos
@@ -1968,10 +2084,6 @@ config 9P_FS
If unsure, say N.
-config GENERIC_ACL
- bool
- select FS_POSIX_ACL
-
endmenu
if BLOCK
@@ -1983,6 +2095,7 @@ endmenu
endif
source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 819b2a93bebe..9a5ce9323bfd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -57,11 +57,14 @@ obj-$(CONFIG_CONFIGFS_FS) += configfs/
obj-y += devpts/
obj-$(CONFIG_PROFILING) += dcookies.o
+obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
+obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
obj-$(CONFIG_JBD) += jbd/
+obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_EXT2_FS) += ext2/
obj-$(CONFIG_CRAMFS) += cramfs/
obj-$(CONFIG_RAMFS) += ramfs/
@@ -75,6 +78,7 @@ obj-$(CONFIG_BFS_FS) += bfs/
obj-$(CONFIG_ISO9660_FS) += isofs/
obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+
obj-$(CONFIG_HFS_FS) += hfs/
+obj-$(CONFIG_ECRYPT_FS) += ecryptfs/
obj-$(CONFIG_VXFS_FS) += freevxfs/
obj-$(CONFIG_NFS_FS) += nfs/
obj-$(CONFIG_EXPORTFS) += exportfs/
@@ -109,3 +113,4 @@ obj-$(CONFIG_HOSTFS) += hostfs/
obj-$(CONFIG_HPPFS) += hppfs/
obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
+obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index cf8a2cb28505..a6ec75c56fcf 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -211,8 +211,8 @@ static int afs_dir_open(struct inode *inode, struct file *file)
{
_enter("{%lu}", inode->i_ino);
- BUG_ON(sizeof(union afs_dir_block) != 2048);
- BUG_ON(sizeof(union afs_dirent) != 32);
+ BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+ BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
return -ENOENT;
@@ -446,8 +446,8 @@ static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
_enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
/* insanity checks first */
- BUG_ON(sizeof(union afs_dir_block) != 2048);
- BUG_ON(sizeof(union afs_dirent) != 32);
+ BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+ BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
if (dentry->d_name.len > 255) {
_leave(" = -ENAMETOOLONG");
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index c7700d9b3f96..906ba5ce2261 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -149,6 +149,7 @@ extern const struct file_operations autofs_root_operations;
/* Initializing function */
int autofs_fill_super(struct super_block *, void *, int);
+void autofs_kill_sb(struct super_block *sb);
/* Queue management functions */
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 3fded389d06b..bf8c8af98004 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -246,5 +246,4 @@ void autofs_hash_nuke(struct autofs_sb_info *sbi)
kfree(ent);
}
}
- shrink_dcache_sb(sbi->sb);
}
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index aca123752406..cea5219b4f37 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
.get_sb = autofs_get_sb,
- .kill_sb = kill_anon_super,
+ .kill_sb = autofs_kill_sb,
};
static int __init init_autofs_fs(void)
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 2c9759baad61..54c518c89e4c 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,7 +20,7 @@
#include "autofs_i.h"
#include <linux/module.h>
-static void autofs_put_super(struct super_block *sb)
+void autofs_kill_sb(struct super_block *sb)
{
struct autofs_sb_info *sbi = autofs_sbi(sb);
unsigned int n;
@@ -37,13 +37,13 @@ static void autofs_put_super(struct super_block *sb)
kfree(sb->s_fs_info);
DPRINTK(("autofs: shutting down\n"));
+ kill_anon_super(sb);
}
static void autofs_read_inode(struct inode *inode);
static struct super_operations autofs_sops = {
.read_inode = autofs_read_inode,
- .put_super = autofs_put_super,
.statfs = simple_statfs,
};
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 480ab178cba5..b13f32c8aeee 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -94,7 +94,6 @@ struct autofs_wait_queue {
struct autofs_sb_info {
u32 magic;
- struct dentry *root;
int pipefd;
struct file *pipe;
pid_t oz_pgrp;
@@ -229,4 +228,4 @@ out:
}
void autofs4_dentry_release(struct dentry *);
-
+extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 5d9193332bef..723a1c5e361b 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
.get_sb = autofs_get_sb,
- .kill_sb = kill_anon_super,
+ .kill_sb = autofs4_kill_sb,
};
static int __init init_autofs4_fs(void)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 800ce876caec..51fd8595bf85 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -96,7 +96,7 @@ void autofs4_free_ino(struct autofs_info *ino)
*/
static void autofs4_force_release(struct autofs_sb_info *sbi)
{
- struct dentry *this_parent = sbi->root;
+ struct dentry *this_parent = sbi->sb->s_root;
struct list_head *next;
spin_lock(&dcache_lock);
@@ -127,7 +127,7 @@ resume:
spin_lock(&dcache_lock);
}
- if (this_parent != sbi->root) {
+ if (this_parent != sbi->sb->s_root) {
struct dentry *dentry = this_parent;
next = this_parent->d_u.d_child.next;
@@ -140,15 +140,9 @@ resume:
goto resume;
}
spin_unlock(&dcache_lock);
-
- dput(sbi->root);
- sbi->root = NULL;
- shrink_dcache_sb(sbi->sb);
-
- return;
}
-static void autofs4_put_super(struct super_block *sb)
+void autofs4_kill_sb(struct super_block *sb)
{
struct autofs_sb_info *sbi = autofs4_sbi(sb);
@@ -163,6 +157,7 @@ static void autofs4_put_super(struct super_block *sb)
kfree(sbi);
DPRINTK("shutting down");
+ kill_anon_super(sb);
}
static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
@@ -189,7 +184,6 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
}
static struct super_operations autofs4_sops = {
- .put_super = autofs4_put_super,
.statfs = simple_statfs,
.show_options = autofs4_show_options,
};
@@ -315,7 +309,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
s->s_fs_info = sbi;
sbi->magic = AUTOFS_SBI_MAGIC;
- sbi->root = NULL;
sbi->pipefd = -1;
sbi->catatonic = 0;
sbi->exp_timeout = 0;
@@ -397,13 +390,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->pipefd = pipefd;
/*
- * Take a reference to the root dentry so we get a chance to
- * clean up the dentry tree on umount.
- * See autofs4_force_release.
- */
- sbi->root = dget(root);
-
- /*
* Success! Install the root dentry now to indicate completion.
*/
s->s_root = root;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index ce103e7b0bc3..c0a6c8d445c7 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -45,7 +45,6 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
fput(sbi->pipe); /* Close the pipe */
sbi->pipe = NULL;
}
- shrink_dcache_sb(sbi->sb);
}
static int autofs4_write(struct file *file, const void *addr, int bytes)
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 057a2c3d73b7..d9a40abda6b7 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -94,7 +94,7 @@ void befs_debug(const struct super_block *sb, const char *fmt, ...);
void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
void befs_dump_inode(const struct super_block *sb, befs_inode *);
-void befs_dump_index_entry(const struct super_block *sb, befs_btree_super *);
+void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *);
void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *);
/****************************/
@@ -136,7 +136,7 @@ blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
static inline unsigned int
befs_iaddrs_per_block(struct super_block *sb)
{
- return BEFS_SB(sb)->block_size / sizeof (befs_inode_addr);
+ return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
}
static inline int
@@ -151,4 +151,6 @@ befs_brun_size(struct super_block *sb, befs_block_run run)
return BEFS_SB(sb)->block_size * run.len;
}
+#include "endian.h"
+
#endif /* _LINUX_BEFS_H */
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 63ef1e18fb84..e2595c2c403a 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -79,17 +79,27 @@ enum inode_flags {
* On-Disk datastructures of BeFS
*/
+typedef u64 __bitwise fs64;
+typedef u32 __bitwise fs32;
+typedef u16 __bitwise fs16;
+
typedef u64 befs_off_t;
-typedef u64 befs_time_t;
-typedef void befs_binode_etc;
+typedef fs64 befs_time_t;
/* Block runs */
typedef struct {
+ fs32 allocation_group;
+ fs16 start;
+ fs16 len;
+} PACKED befs_disk_block_run;
+
+typedef struct {
u32 allocation_group;
u16 start;
u16 len;
} PACKED befs_block_run;
+typedef befs_disk_block_run befs_disk_inode_addr;
typedef befs_block_run befs_inode_addr;
/*
@@ -97,31 +107,31 @@ typedef befs_block_run befs_inode_addr;
*/
typedef struct {
char name[B_OS_NAME_LENGTH];
- u32 magic1;
- u32 fs_byte_order;
+ fs32 magic1;
+ fs32 fs_byte_order;
- u32 block_size;
- u32 block_shift;
+ fs32 block_size;
+ fs32 block_shift;
- befs_off_t num_blocks;
- befs_off_t used_blocks;
+ fs64 num_blocks;
+ fs64 used_blocks;
- u32 inode_size;
+ fs32 inode_size;
- u32 magic2;
- u32 blocks_per_ag;
- u32 ag_shift;
- u32 num_ags;
+ fs32 magic2;
+ fs32 blocks_per_ag;
+ fs32 ag_shift;
+ fs32 num_ags;
- u32 flags;
+ fs32 flags;
- befs_block_run log_blocks;
- befs_off_t log_start;
- befs_off_t log_end;
+ befs_disk_block_run log_blocks;
+ fs64 log_start;
+ fs64 log_end;
- u32 magic3;
- befs_inode_addr root_dir;
- befs_inode_addr indices;
+ fs32 magic3;
+ befs_disk_inode_addr root_dir;
+ befs_disk_inode_addr indices;
} PACKED befs_super_block;
@@ -130,6 +140,16 @@ typedef struct {
* be longer than one block!
*/
typedef struct {
+ befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
+ fs64 max_direct_range;
+ befs_disk_block_run indirect;
+ fs64 max_indirect_range;
+ befs_disk_block_run double_indirect;
+ fs64 max_double_indirect_range;
+ fs64 size;
+} PACKED befs_disk_data_stream;
+
+typedef struct {
befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
befs_off_t max_direct_range;
befs_block_run indirect;
@@ -141,35 +161,35 @@ typedef struct {
/* Attribute */
typedef struct {
- u32 type;
- u16 name_size;
- u16 data_size;
+ fs32 type;
+ fs16 name_size;
+ fs16 data_size;
char name[1];
} PACKED befs_small_data;
/* Inode structure */
typedef struct {
- u32 magic1;
- befs_inode_addr inode_num;
- u32 uid;
- u32 gid;
- u32 mode;
- u32 flags;
+ fs32 magic1;
+ befs_disk_inode_addr inode_num;
+ fs32 uid;
+ fs32 gid;
+ fs32 mode;
+ fs32 flags;
befs_time_t create_time;
befs_time_t last_modified_time;
- befs_inode_addr parent;
- befs_inode_addr attributes;
- u32 type;
+ befs_disk_inode_addr parent;
+ befs_disk_inode_addr attributes;
+ fs32 type;
- u32 inode_size;
- u32 etc; /* not use */
+ fs32 inode_size;
+ fs32 etc; /* not use */
union {
- befs_data_stream datastream;
+ befs_disk_data_stream datastream;
char symlink[BEFS_SYMLINK_LEN];
} data;
- u32 pad[4]; /* not use */
+ fs32 pad[4]; /* not use */
befs_small_data small_data[1];
} PACKED befs_inode;
@@ -190,6 +210,16 @@ enum btree_types {
};
typedef struct {
+ fs32 magic;
+ fs32 node_size;
+ fs32 max_depth;
+ fs32 data_type;
+ fs64 root_node_ptr;
+ fs64 free_node_ptr;
+ fs64 max_size;
+} PACKED befs_disk_btree_super;
+
+typedef struct {
u32 magic;
u32 node_size;
u32 max_depth;
@@ -203,11 +233,19 @@ typedef struct {
* Header stucture of each btree node
*/
typedef struct {
+ fs64 left;
+ fs64 right;
+ fs64 overflow;
+ fs16 all_key_count;
+ fs16 all_key_length;
+} PACKED befs_btree_nodehead;
+
+typedef struct {
befs_off_t left;
befs_off_t right;
befs_off_t overflow;
u16 all_key_count;
u16 all_key_length;
-} PACKED befs_btree_nodehead;
+} PACKED befs_host_btree_nodehead;
#endif /* _LINUX_BEFS_FS_TYPES */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 76e219799409..81b042ee24e6 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -30,7 +30,6 @@
#include "befs.h"
#include "btree.h"
#include "datastream.h"
-#include "endian.h"
/*
* The btree functions in this file are built on top of the
@@ -80,7 +79,7 @@
* In memory structure of each btree node
*/
typedef struct {
- befs_btree_nodehead head; /* head of node converted to cpu byteorder */
+ befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */
struct buffer_head *bh;
befs_btree_nodehead *od_node; /* on disk node */
} befs_btree_node;
@@ -102,9 +101,9 @@ static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
static int befs_leafnode(befs_btree_node * node);
-static u16 *befs_bt_keylen_index(befs_btree_node * node);
+static fs16 *befs_bt_keylen_index(befs_btree_node * node);
-static befs_off_t *befs_bt_valarray(befs_btree_node * node);
+static fs64 *befs_bt_valarray(befs_btree_node * node);
static char *befs_bt_keydata(befs_btree_node * node);
@@ -136,7 +135,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
befs_btree_super * sup)
{
struct buffer_head *bh = NULL;
- befs_btree_super *od_sup = NULL;
+ befs_disk_btree_super *od_sup = NULL;
befs_debug(sb, "---> befs_btree_read_super()");
@@ -146,7 +145,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
befs_error(sb, "Couldn't read index header.");
goto error;
}
- od_sup = (befs_btree_super *) bh->b_data;
+ od_sup = (befs_disk_btree_super *) bh->b_data;
befs_dump_index_entry(sb, od_sup);
sup->magic = fs32_to_cpu(sb, od_sup->magic);
@@ -342,7 +341,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
u16 keylen;
int findkey_len;
char *thiskey;
- befs_off_t *valarray;
+ fs64 *valarray;
befs_debug(sb, "---> befs_find_key() %s", findkey);
@@ -422,7 +421,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
befs_btree_super bt_super;
befs_off_t node_off = 0;
int cur_key;
- befs_off_t *valarray;
+ fs64 *valarray;
char *keystart;
u16 keylen;
int res;
@@ -572,7 +571,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
this_node->head.overflow);
*node_off = this_node->head.overflow;
} else {
- befs_off_t *valarray = befs_bt_valarray(this_node);
+ fs64 *valarray = befs_bt_valarray(this_node);
*node_off = fs64_to_cpu(sb, valarray[0]);
}
if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
@@ -622,7 +621,7 @@ befs_leafnode(befs_btree_node * node)
*
* Except that rounding up to 8 works, and rounding up to 4 doesn't.
*/
-static u16 *
+static fs16 *
befs_bt_keylen_index(befs_btree_node * node)
{
const int keylen_align = 8;
@@ -633,7 +632,7 @@ befs_bt_keylen_index(befs_btree_node * node)
if (tmp)
off += keylen_align - tmp;
- return (u16 *) ((void *) node->od_node + off);
+ return (fs16 *) ((void *) node->od_node + off);
}
/**
@@ -643,13 +642,13 @@ befs_bt_keylen_index(befs_btree_node * node)
* Returns a pointer to the start of the value array
* of the node pointed to by the node header
*/
-static befs_off_t *
+static fs64 *
befs_bt_valarray(befs_btree_node * node)
{
void *keylen_index_start = (void *) befs_bt_keylen_index(node);
- size_t keylen_index_size = node->head.all_key_count * sizeof (u16);
+ size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
- return (befs_off_t *) (keylen_index_start + keylen_index_size);
+ return (fs64 *) (keylen_index_start + keylen_index_size);
}
/**
@@ -681,7 +680,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
{
int prev_key_end;
char *keystart;
- u16 *keylen_index;
+ fs16 *keylen_index;
if (index < 0 || index > node->head.all_key_count) {
*keylen = 0;
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index b7d6b920f65f..aacb4da6298a 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -18,7 +18,6 @@
#include "befs.h"
#include "datastream.h"
#include "io.h"
-#include "endian.h"
const befs_inode_addr BAD_IADDR = { 0, 0, 0 };
@@ -312,7 +311,7 @@ befs_find_brun_indirect(struct super_block *sb,
befs_blocknr_t indir_start_blk;
befs_blocknr_t search_blk;
struct buffer_head *indirblock;
- befs_block_run *array;
+ befs_disk_block_run *array;
befs_block_run indirect = data->indirect;
befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
@@ -334,7 +333,7 @@ befs_find_brun_indirect(struct super_block *sb,
return BEFS_ERR;
}
- array = (befs_block_run *) indirblock->b_data;
+ array = (befs_disk_block_run *) indirblock->b_data;
for (j = 0; j < arraylen; ++j) {
int len = fs16_to_cpu(sb, array[j].len);
@@ -427,7 +426,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
struct buffer_head *dbl_indir_block;
struct buffer_head *indir_block;
befs_block_run indir_run;
- befs_inode_addr *iaddr_array = NULL;
+ befs_disk_inode_addr *iaddr_array = NULL;
befs_sb_info *befs_sb = BEFS_SB(sb);
befs_blocknr_t indir_start_blk =
@@ -482,7 +481,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
dbl_block_indx =
dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb));
- iaddr_array = (befs_inode_addr *) dbl_indir_block->b_data;
+ iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
brelse(dbl_indir_block);
iaddr_array = NULL;
@@ -507,7 +506,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
}
block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb));
- iaddr_array = (befs_inode_addr *) indir_block->b_data;
+ iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
brelse(indir_block);
iaddr_array = NULL;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 875cc0aa318c..e831a8f30849 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -21,7 +21,6 @@
#endif /* __KERNEL__ */
#include "befs.h"
-#include "endian.h"
#define ERRBUFSIZE 1024
@@ -125,7 +124,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
befs_debug(sb, " type %08x", fs32_to_cpu(sb, inode->type));
befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, inode->inode_size));
- if (S_ISLNK(inode->mode)) {
+ if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) {
befs_debug(sb, " Symbolic link [%s]", inode->data.symlink);
} else {
int i;
@@ -231,21 +230,20 @@ befs_dump_small_data(const struct super_block *sb, befs_small_data * sd)
/* unused */
void
-befs_dump_run(const struct super_block *sb, befs_block_run run)
+befs_dump_run(const struct super_block *sb, befs_disk_block_run run)
{
#ifdef CONFIG_BEFS_DEBUG
- run = fsrun_to_cpu(sb, run);
+ befs_block_run n = fsrun_to_cpu(sb, run);
- befs_debug(sb, "[%u, %hu, %hu]",
- run.allocation_group, run.start, run.len);
+ befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len);
#endif //CONFIG_BEFS_DEBUG
}
#endif /* 0 */
void
-befs_dump_index_entry(const struct super_block *sb, befs_btree_super * super)
+befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super)
{
#ifdef CONFIG_BEFS_DEBUG
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 9ecaea4e3325..e254a20869f4 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -10,85 +10,84 @@
#define LINUX_BEFS_ENDIAN
#include <linux/byteorder/generic.h>
-#include "befs.h"
static inline u64
-fs64_to_cpu(const struct super_block *sb, u64 n)
+fs64_to_cpu(const struct super_block *sb, fs64 n)
{
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
- return le64_to_cpu(n);
+ return le64_to_cpu((__force __le64)n);
else
- return be64_to_cpu(n);
+ return be64_to_cpu((__force __be64)n);
}
-static inline u64
+static inline fs64
cpu_to_fs64(const struct super_block *sb, u64 n)
{
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
- return cpu_to_le64(n);
+ return (__force fs64)cpu_to_le64(n);
else
- return cpu_to_be64(n);
+ return (__force fs64)cpu_to_be64(n);
}
static inline u32
-fs32_to_cpu(const struct super_block *sb, u32 n)
+fs32_to_cpu(const struct super_block *sb, fs32 n)
{
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
- return le32_to_cpu(n);
+ return le32_to_cpu((__force __le32)n);
else
- return be32_to_cpu(n);
+ return be32_to_cpu((__force __be32)n);
}
-static inline u32
+static inline fs32
cpu_to_fs32(const struct super_block *sb, u32 n)
{
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
- return cpu_to_le32(n);
+ return (__force fs32)cpu_to_le32(n);
else
- return cpu_to_be32(n);
+ return (__force fs32)cpu_to_be32(n);
}
static inline u16
-fs16_to_cpu(const struct super_block *sb, u16 n)
+fs16_to_cpu(const struct super_block *sb, fs16 n)
{
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
- return le16_to_cpu(n);
+ return le16_to_cpu((__force __le16)n);
else
- return be16_to_cpu(n);
+ return be16_to_cpu((__force __be16)n);
}
-static inline u16
+static inline fs16
cpu_to_fs16(const struct super_block *sb, u16 n)
{
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
- return cpu_to_le16(n);
+ return (__force fs16)cpu_to_le16(n);
else
- return cpu_to_be16(n);
+ return (__force fs16)cpu_to_be16(n);
}
/* Composite types below here */
static inline befs_block_run
-fsrun_to_cpu(const struct super_block *sb, befs_block_run n)
+fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n)
{
befs_block_run run;
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
- run.allocation_group = le32_to_cpu(n.allocation_group);
- run.start = le16_to_cpu(n.start);
- run.len = le16_to_cpu(n.len);
+ run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group);
+ run.start = le16_to_cpu((__force __le16)n.start);
+ run.len = le16_to_cpu((__force __le16)n.len);
} else {
- run.allocation_group = be32_to_cpu(n.allocation_group);
- run.start = be16_to_cpu(n.start);
- run.len = be16_to_cpu(n.len);
+ run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group);
+ run.start = be16_to_cpu((__force __be16)n.start);
+ run.len = be16_to_cpu((__force __be16)n.len);
}
return run;
}
-static inline befs_block_run
+static inline befs_disk_block_run
cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
{
- befs_block_run run;
+ befs_disk_block_run run;
if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
run.allocation_group = cpu_to_le32(n.allocation_group);
@@ -103,7 +102,7 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
}
static inline befs_data_stream
-fsds_to_cpu(const struct super_block *sb, befs_data_stream n)
+fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n)
{
befs_data_stream data;
int i;
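
(Note, not part of the patch: the befs conversion above is the usual sparse-checked endian pattern. On-disk fields get distinct __bitwise types (fs16/fs32/fs64) so that mixing raw disk-order values with host integers is flagged by sparse, and the only __force casts live inside the conversion helpers. A minimal self-contained sketch of the same pattern with hypothetical names, little-endian only for brevity:)

#include <stdint.h>
#include <stdio.h>

#ifdef __CHECKER__                      /* defined when running sparse */
#define __bitwise __attribute__((bitwise))
#define __force   __attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef uint32_t __bitwise demo_fs32;   /* disk byte order, opaque to C code */

static inline uint32_t demo_fs32_to_cpu(demo_fs32 n)
{
	/* Pretend the filesystem is little-endian; the real helpers pick
	 * LE or BE per superblock, as befs does. A byte swap would go
	 * here when the host byte order differs from the disk's. */
	return (__force uint32_t)n;
}

static inline demo_fs32 demo_cpu_to_fs32(uint32_t n)
{
	return (__force demo_fs32)n;
}

int main(void)
{
	demo_fs32 on_disk = demo_cpu_to_fs32(0x12345678);
	printf("%x\n", demo_fs32_to_cpu(on_disk));
	return 0;
}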
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index d41c9247ae8a..94c17f9a9576 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -8,7 +8,6 @@
#include "befs.h"
#include "inode.h"
-#include "endian.h"
/*
Validates the correctness of the befs inode
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 57020c7a7e65..07f7144f0e2e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -22,7 +22,6 @@
#include "datastream.h"
#include "super.h"
#include "io.h"
-#include "endian.h"
MODULE_DESCRIPTION("BeOS File System (BeFS) driver");
MODULE_AUTHOR("Will Dyson");
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 4557acbac528..8c3401ff6d6a 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -11,7 +11,6 @@
#include "befs.h"
#include "super.h"
-#include "endian.h"
/**
* load_befs_sb -- Read from disk and properly byteswap all the fields
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06435f3665f4..79b05a1a4365 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1152,7 +1152,7 @@ static int dump_write(struct file *file, const void *addr, int nr)
static int dump_seek(struct file *file, loff_t off)
{
if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
- if (file->f_op->llseek(file, off, 1) != off)
+ if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
return 0;
} else {
char *buf = (char *)get_zeroed_page(GFP_KERNEL);
@@ -1220,7 +1220,7 @@ static int notesize(struct memelfnote *en)
static int alignfile(struct file *file, loff_t *foffset)
{
- char buf[4] = { 0, };
+ static const char buf[4] = { 0, };
DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
return 1;
}
@@ -1569,7 +1569,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
DUMP_WRITE(elf, sizeof(*elf));
offset += sizeof(*elf); /* Elf header */
- offset += (segs+1) * sizeof(struct elf_phdr); /* Program headers */
+ offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */
+ foffset = offset;
/* Write notes phdr entry */
{
@@ -1586,8 +1587,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
DUMP_WRITE(&phdr, sizeof(phdr));
}
- foffset = offset;
-
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
/* Write program headers for segments dump */
@@ -1612,7 +1611,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
phdr.p_align = ELF_EXEC_PAGESIZE;
DUMP_WRITE(&phdr, sizeof(phdr));
- foffset += sizeof(phdr);
}
#ifdef ELF_CORE_WRITE_EXTRA_PHDRS
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 32b5d625ce9c..5bcdaaf4eae0 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -29,6 +29,7 @@
#include <linux/personality.h>
#include <linux/init.h>
+#include <asm/a.out.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -194,6 +195,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
unsigned long som_entry;
struct som_hdr *som_ex;
struct som_exec_auxhdr *hpuxhdr;
+ struct files_struct *files;
/* Get the exec-header */
som_ex = (struct som_hdr *) bprm->buf;
@@ -208,15 +210,27 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
size = som_ex->aux_header_size;
if (size > SOM_PAGESIZE)
goto out;
- hpuxhdr = (struct som_exec_auxhdr *) kmalloc(size, GFP_KERNEL);
+ hpuxhdr = kmalloc(size, GFP_KERNEL);
if (!hpuxhdr)
goto out;
retval = kernel_read(bprm->file, som_ex->aux_header_location,
(char *) hpuxhdr, size);
+ if (retval != size) {
+ if (retval >= 0)
+ retval = -EIO;
+ goto out_free;
+ }
+
+ files = current->files; /* Refcounted so ok */
+ retval = unshare_files();
if (retval < 0)
goto out_free;
-#error "Fix security hole before enabling me"
+ if (files == current->files) {
+ put_files_struct(files);
+ files = NULL;
+ }
+
retval = get_unused_fd();
if (retval < 0)
goto out_free;
diff --git a/fs/bio.c b/fs/bio.c
index 8f93e939f213..f95c8749499f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -79,7 +79,6 @@ static struct bio_set *fs_bio_set;
static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
struct bio_vec *bvl;
- struct biovec_slab *bp;
/*
* see comment near bvec_array define!
@@ -98,10 +97,12 @@ static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned lon
* idx now points to the pool we want to allocate from
*/
- bp = bvec_slabs + *idx;
bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
- if (bvl)
+ if (bvl) {
+ struct biovec_slab *bp = bvec_slabs + *idx;
+
memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
+ }
return bvl;
}
@@ -166,7 +167,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
bio_init(bio);
if (likely(nr_iovecs)) {
- unsigned long idx;
+ unsigned long idx = 0; /* shut up gcc */
bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
if (unlikely(!bvl)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 16cfbcd254f1..35527dca1dbc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -452,6 +452,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
bdevname(bh->b_bdev, b));
}
set_bit(AS_EIO, &page->mapping->flags);
+ set_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(page);
}
@@ -571,6 +572,10 @@ EXPORT_SYMBOL(mark_buffer_async_write);
static inline void __remove_assoc_queue(struct buffer_head *bh)
{
list_del_init(&bh->b_assoc_buffers);
+ WARN_ON(!bh->b_assoc_map);
+ if (buffer_write_io_error(bh))
+ set_bit(AS_EIO, &bh->b_assoc_map->flags);
+ bh->b_assoc_map = NULL;
}
int inode_has_buffers(struct inode *inode)
@@ -669,6 +674,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
spin_lock(&buffer_mapping->private_lock);
list_move_tail(&bh->b_assoc_buffers,
&mapping->private_list);
+ bh->b_assoc_map = mapping;
spin_unlock(&buffer_mapping->private_lock);
}
}
@@ -701,7 +707,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
*/
int __set_page_dirty_buffers(struct page *page)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = page_mapping(page);
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
spin_lock(&mapping->private_lock);
if (page_has_buffers(page)) {
@@ -762,7 +771,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
spin_lock(lock);
while (!list_empty(list)) {
bh = BH_ENTRY(list->next);
- list_del_init(&bh->b_assoc_buffers);
+ __remove_assoc_queue(bh);
if (buffer_dirty(bh) || buffer_locked(bh)) {
list_add(&bh->b_assoc_buffers, &tmp);
if (buffer_dirty(bh)) {
@@ -783,7 +792,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
while (!list_empty(&tmp)) {
bh = BH_ENTRY(tmp.prev);
- __remove_assoc_queue(bh);
+ list_del_init(&bh->b_assoc_buffers);
get_bh(bh);
spin_unlock(lock);
wait_on_buffer(bh);
@@ -1039,8 +1048,21 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
} while ((size << sizebits) < PAGE_SIZE);
index = block >> sizebits;
- block = index << sizebits;
+ /*
+ * Check for a block which wants to lie outside our maximum possible
+ * pagecache index. (this comparison is done using sector_t types).
+ */
+ if (unlikely(index != block >> sizebits)) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_ERR "%s: requested out-of-range block %llu for "
+ "device %s\n",
+ __FUNCTION__, (unsigned long long)block,
+ bdevname(bdev, b));
+ return -EIO;
+ }
+ block = index << sizebits;
/* Create a page with the proper size buffers.. */
page = grow_dev_page(bdev, block, index, size);
if (!page)
@@ -1067,12 +1089,16 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
for (;;) {
struct buffer_head * bh;
+ int ret;
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
- if (!grow_buffers(bdev, block, size))
+ ret = grow_buffers(bdev, block, size);
+ if (ret < 0)
+ return NULL;
+ if (ret == 0)
free_more_memory();
}
}
@@ -1147,6 +1173,7 @@ void __bforget(struct buffer_head *bh)
spin_lock(&buffer_mapping->private_lock);
list_del_init(&bh->b_assoc_buffers);
+ bh->b_assoc_map = NULL;
spin_unlock(&buffer_mapping->private_lock);
}
__brelse(bh);
@@ -1834,6 +1861,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
clear_buffer_new(bh);
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr+block_start, 0, bh->b_size);
+ flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
@@ -2340,6 +2368,7 @@ failed:
*/
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr, 0, PAGE_CACHE_SIZE);
+ flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
SetPageUptodate(page);
set_page_dirty(page);
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index d0776ac2b804..5eff35d6e564 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -31,8 +31,8 @@ struct cifs_sid {
} __attribute__((packed));
/* everyone */
-extern const struct cifs_sid sid_everyone;
+/* extern const struct cifs_sid sid_everyone;*/
/* group users */
-extern const struct cifs_sid sid_user;
+/* extern const struct cifs_sid sid_user;*/
#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
index 03e359b32861..152fa2dcfc6c 100644
--- a/fs/cifs/cifsencrypt.h
+++ b/fs/cifs/cifsencrypt.h
@@ -27,8 +27,6 @@ extern void mdfour(unsigned char *out, unsigned char *in, int n);
/* smbdes.c */
extern void E_P16(unsigned char *p14, unsigned char *p16);
extern void E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24);
-extern void D_P16(unsigned char *p14, unsigned char *in, unsigned char *out);
-extern void E_old_pw_hash(unsigned char *, unsigned char *, unsigned char *);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c00c654f2e11..84976cdbe713 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -63,6 +63,7 @@ extern struct task_struct * oplockThread; /* remove sparse warning */
struct task_struct * oplockThread = NULL;
extern struct task_struct * dnotifyThread; /* remove sparse warning */
struct task_struct * dnotifyThread = NULL;
+static struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, int, 0);
MODULE_PARM_DESC(CIFSMaxBufSize,"Network buffer size (not including header). Default: 16384 Range: 8192 to 130048");
@@ -198,10 +199,12 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
/* Only need to call the old QFSInfo if failed
on newer one */
if(rc)
- rc = CIFSSMBQFSInfo(xid, pTcon, buf);
+ if(pTcon->ses->capabilities & CAP_NT_SMBS)
+ rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
- /* Old Windows servers do not support level 103, retry with level
- one if old server failed the previous call */
+ /* Some old Windows servers also do not support level 103, retry with
+ older level one if old server failed the previous call or we
+ bypassed it because we detected that this was an older LANMAN sess */
if(rc)
rc = SMBOldQFSInfo(xid, pTcon, buf);
/*
@@ -435,13 +438,21 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
return;
}
+#ifdef CONFIG_CIFS_STATS2
+static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt)
+{
+ /* BB FIXME */
+ return 0;
+}
+#endif
+
static int cifs_remount(struct super_block *sb, int *flags, char *data)
{
*flags |= MS_NODIRATIME;
return 0;
}
-struct super_operations cifs_super_ops = {
+static struct super_operations cifs_super_ops = {
.read_inode = cifs_read_inode,
.put_super = cifs_put_super,
.statfs = cifs_statfs,
@@ -454,6 +465,9 @@ struct super_operations cifs_super_ops = {
.show_options = cifs_show_options,
.umount_begin = cifs_umount_begin,
.remount_fs = cifs_remount,
+#ifdef CONFIG_CIFS_STATS2
+ .show_stats = cifs_show_stats,
+#endif
};
static int
@@ -495,7 +509,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
{
/* origin == SEEK_END => we must revalidate the cached file length */
- if (origin == 2) {
+ if (origin == SEEK_END) {
int retval = cifs_revalidate(file->f_dentry);
if (retval < 0)
return (loff_t)retval;
@@ -903,7 +917,7 @@ init_cifs(void)
#ifdef CONFIG_PROC_FS
cifs_proc_init();
#endif
- INIT_LIST_HEAD(&GlobalServerList); /* BB not implemented yet */
+/* INIT_LIST_HEAD(&GlobalServerList);*/ /* BB not implemented yet */
INIT_LIST_HEAD(&GlobalSMBSessionList);
INIT_LIST_HEAD(&GlobalTreeConnectionList);
INIT_LIST_HEAD(&GlobalOplock_Q);
@@ -931,6 +945,7 @@ init_cifs(void)
GlobalCurrentXid = 0;
GlobalTotalActiveXid = 0;
GlobalMaxActiveXid = 0;
+ memset(Local_System_Name, 0, 15);
rwlock_init(&GlobalSMBSeslock);
spin_lock_init(&GlobalMid_Lock);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index bea875d9a46a..a243f779b363 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern const struct address_space_operations cifs_addr_ops;
extern const struct address_space_operations cifs_addr_ops_smallbuf;
/* Functions related to super block operations */
-extern struct super_operations cifs_super_ops;
+/* extern struct super_operations cifs_super_ops;*/
extern void cifs_read_inode(struct inode *);
extern void cifs_delete_inode(struct inode *);
/* extern void cifs_write_inode(struct inode *); *//* BB not needed yet */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b24006c47df1..74d3ccbb103b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -153,7 +153,7 @@ struct TCP_Server_Info {
char sessid[4]; /* unique token id for this session */
/* (returned on Negotiate */
int capabilities; /* allow selective disabling of caps by smb sess */
- __u16 timeZone;
+ int timeAdj; /* Adjust for difference in server time zone in sec */
__u16 CurrentMid; /* multiplex id - rotating counter */
char cryptKey[CIFS_CRYPTO_KEY_SIZE];
/* 16th byte of RFC1001 workstation name is always null */
@@ -203,9 +203,14 @@ struct cifsSesInfo {
char * domainName;
char * password;
};
-/* session flags */
+/* no more than one of the following three session flags may be set */
#define CIFS_SES_NT4 1
-
+#define CIFS_SES_OS2 2
+#define CIFS_SES_W9X 4
+/* following flag is set for old servers such as OS2 (and Win95?)
+ which do not negotiate NTLM or POSIX dialects, but instead
+ negotiate one of the older LANMAN dialects */
+#define CIFS_SES_LANMAN 8
/*
* there is one of these for each connection to a resource on a particular
* session
@@ -512,7 +517,8 @@ require use of the stronger protocol */
* This list helps improve performance and eliminate the messages indicating
* that we had a communications error talking to the server in this list.
*/
-GLOBAL_EXTERN struct servers_not_supported *NotSuppList; /*@z4a */
+/* Feature not supported */
+/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
/*
* The following is a hash table of all the users we know about.
@@ -568,7 +574,6 @@ GLOBAL_EXTERN unsigned int lookupCacheEnabled;
GLOBAL_EXTERN unsigned int extended_security; /* if on, session setup sent
with more secure ntlmssp2 challenge/resp */
GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */
-GLOBAL_EXTERN unsigned int secFlags;
GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */
GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 81df2bf8e75a..6df9dadba647 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -26,7 +26,8 @@
#ifdef CONFIG_CIFS_WEAK_PW_HASH
#define LANMAN_PROT 0
-#define CIFS_PROT 1
+#define LANMAN2_PROT 1
+#define CIFS_PROT 2
#else
#define CIFS_PROT 0
#endif
@@ -408,6 +409,8 @@ typedef struct negotiate_req {
/* Dialect index is 13 for LANMAN */
+#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
+
typedef struct lanman_neg_rsp {
struct smb_hdr hdr; /* wct = 13 */
__le16 DialectIndex;
@@ -417,7 +420,10 @@ typedef struct lanman_neg_rsp {
__le16 MaxNumberVcs;
__le16 RawMode;
__le32 SessionKey;
- __le32 ServerTime;
+ struct {
+ __le16 Time;
+ __le16 Date;
+ } __attribute__((packed)) SrvTime;
__le16 ServerTimeZone;
__le16 EncryptionKeyLength;
__le16 Reserved;
@@ -674,7 +680,7 @@ typedef union smb_com_tree_disconnect { /* as an altetnative can use flag on
typedef struct smb_com_close_req {
struct smb_hdr hdr; /* wct = 3 */
__u16 FileID;
- __u32 LastWriteTime; /* should be zero */
+ __u32 LastWriteTime; /* should be zero or -1 */
__u16 ByteCount; /* 0 */
} __attribute__((packed)) CLOSE_REQ;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b35c55c3c8bb..f1f8225102f0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -50,12 +50,12 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */ , const int long_op);
-extern int SendReceiveBlockingLock(const unsigned int /* xid */ , struct cifsTconInfo *,
+extern int SendReceiveBlockingLock(const unsigned int /* xid */ ,
+ struct cifsTconInfo *,
struct smb_hdr * /* input */ ,
struct smb_hdr * /* out */ ,
int * /* bytes returned */);
-extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
-extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
+extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
extern int is_size_safe_to_change(struct cifsInodeInfo *);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
@@ -80,6 +80,9 @@ extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16,
extern void DeleteOplockQEntry(struct oplock_q_entry *);
extern struct timespec cifs_NTtimeToUnix(u64 /* utc nanoseconds since 1601 */ );
extern u64 cifs_UnixTimeToNT(struct timespec);
+extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+
extern int cifs_get_inode_info(struct inode **pinode,
const unsigned char *search_path,
FILE_ALL_INFO * pfile_info,
@@ -116,6 +119,7 @@ extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
FILE_ALL_INFO * findData,
+ int legacy /* whether to use old info level */,
const struct nls_table *nls_codepage, int remap);
extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
@@ -279,8 +283,6 @@ extern void sesInfoFree(struct cifsSesInfo *);
extern struct cifsTconInfo *tconInfoAlloc(void);
extern void tconInfoFree(struct cifsTconInfo *);
-extern int cifs_reconnect(struct TCP_Server_Info *server);
-
extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *,__u32 *);
extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
__u32 *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 075d8fb3d376..098790eb2aa1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -46,6 +46,7 @@ static struct {
} protocols[] = {
#ifdef CONFIG_CIFS_WEAK_PW_HASH
{LANMAN_PROT, "\2LM1.2X002"},
+ {LANMAN2_PROT, "\2LANMAN2.1"},
#endif /* weak password hashing for legacy clients */
{CIFS_PROT, "\2NT LM 0.12"},
{POSIX_PROT, "\2POSIX 2"},
@@ -58,6 +59,7 @@ static struct {
} protocols[] = {
#ifdef CONFIG_CIFS_WEAK_PW_HASH
{LANMAN_PROT, "\2LM1.2X002"},
+ {LANMAN2_PROT, "\2LANMAN2.1"},
#endif /* weak password hashing for legacy clients */
{CIFS_PROT, "\2NT LM 0.12"},
{BAD_PROT, "\2"}
@@ -67,13 +69,13 @@ static struct {
/* define the number of elements in the cifs dialect array */
#ifdef CONFIG_CIFS_POSIX
#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 3
+#define CIFS_NUM_PROT 4
#else
#define CIFS_NUM_PROT 2
#endif /* CIFS_WEAK_PW_HASH */
#else /* not posix */
#ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 2
+#define CIFS_NUM_PROT 3
#else
#define CIFS_NUM_PROT 1
#endif /* CONFIG_CIFS_WEAK_PW_HASH */
@@ -397,6 +399,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
struct TCP_Server_Info * server;
u16 count;
unsigned int secFlags;
+ u16 dialect;
if(ses->server)
server = ses->server;
@@ -436,9 +439,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
if (rc != 0)
goto neg_err_exit;
- cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
+ dialect = le16_to_cpu(pSMBr->DialectIndex);
+ cFYI(1,("Dialect: %d", dialect));
/* Check wct = 1 error case */
- if((pSMBr->hdr.WordCount < 13) || (pSMBr->DialectIndex == BAD_PROT)) {
+ if((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
/* core returns wct = 1, but we do not ask for core - otherwise
small wct just comes when dialect index is -1 indicating we
could not negotiate a common dialect */
@@ -446,7 +450,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
goto neg_err_exit;
#ifdef CONFIG_CIFS_WEAK_PW_HASH
} else if((pSMBr->hdr.WordCount == 13)
- && (pSMBr->DialectIndex == LANMAN_PROT)) {
+ && ((dialect == LANMAN_PROT)
+ || (dialect == LANMAN2_PROT))) {
+ __s16 tmp;
struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr;
if((secFlags & CIFSSEC_MAY_LANMAN) ||
@@ -472,12 +478,44 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
server->maxRw = 0;/* we do not need to use raw anyway */
server->capabilities = CAP_MPX_MODE;
}
- server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
+ tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+ if (tmp == -1) {
+			/* OS/2 often does not set the timezone, so we
+			 * must use the server time to calculate it.
+			 * This could deviate slightly from the right zone.
+			 * The smallest defined timezone offset is 15 minutes
+			 * (e.g. Nepal), so round up or down to that
+			 * granularity.
+			 */
+ int val, seconds, remain, result;
+ struct timespec ts, utc;
+ utc = CURRENT_TIME;
+ ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
+ le16_to_cpu(rsp->SrvTime.Time));
+ cFYI(1,("SrvTime: %d sec since 1970 (utc: %d) diff: %d",
+ (int)ts.tv_sec, (int)utc.tv_sec,
+ (int)(utc.tv_sec - ts.tv_sec)));
+ val = (int)(utc.tv_sec - ts.tv_sec);
+ seconds = val < 0 ? -val : val;
+ result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+ remain = seconds % MIN_TZ_ADJ;
+ if(remain >= (MIN_TZ_ADJ / 2))
+ result += MIN_TZ_ADJ;
+ if(val < 0)
+ result = - result;
+ server->timeAdj = result;
+ } else {
+ server->timeAdj = (int)tmp;
+ server->timeAdj *= 60; /* also in seconds */
+ }
+ cFYI(1,("server->timeAdj: %d seconds", server->timeAdj));
+
/* BB get server time for time conversions and add
code to use it and timezone since this is not UTC */
- if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+ if (rsp->EncryptionKeyLength ==
+ cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
memcpy(server->cryptKey, rsp->EncryptionKey,
CIFS_CRYPTO_KEY_SIZE);
} else if (server->secMode & SECMODE_PW_ENCRYPT) {
@@ -531,7 +569,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
cFYI(0, ("Max buf = %d", ses->server->maxBuf));
GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
- server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);
+ server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
+ server->timeAdj *= 60;
if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
CIFS_CRYPTO_KEY_SIZE);
@@ -1617,7 +1656,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
pSMBr = (CLOSE_RSP *)pSMB; /* BB removeme BB */
pSMB->FileID = (__u16) smb_file_id;
- pSMB->LastWriteTime = 0;
+ pSMB->LastWriteTime = 0xFFFFFFFF;
pSMB->ByteCount = 0;
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2773,9 +2812,11 @@ GetExtAttrOut:
/* security id for everyone */
-const struct cifs_sid sid_everyone = {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
+static const struct cifs_sid sid_everyone =
+ {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
/* group users */
-const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
+static const struct cifs_sid sid_user =
+	{1, 2, {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
/* Convert CIFS ACL to POSIX form */
static int parse_sec_desc(struct cifs_sid * psec_desc, int acl_len)
@@ -2856,7 +2897,6 @@ qsec_out:
return rc;
}
-
/* Legacy Query Path Information call for lookup to old servers such
as Win9x/WinME */
int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
@@ -2898,7 +2938,16 @@ QInfRetry:
if (rc) {
cFYI(1, ("Send error in QueryInfo = %d", rc));
} else if (pFinfo) { /* decode response */
+ struct timespec ts;
+ __u32 time = le32_to_cpu(pSMBr->last_write_time);
+ /* BB FIXME - add time zone adjustment BB */
memset(pFinfo, 0, sizeof(FILE_ALL_INFO));
+ ts.tv_nsec = 0;
+ ts.tv_sec = time;
+ /* decode time fields */
+ pFinfo->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts));
+ pFinfo->LastWriteTime = pFinfo->ChangeTime;
+ pFinfo->LastAccessTime = 0;
pFinfo->AllocationSize =
cpu_to_le64(le32_to_cpu(pSMBr->size));
pFinfo->EndOfFile = pFinfo->AllocationSize;
@@ -2922,6 +2971,7 @@ int
CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
FILE_ALL_INFO * pFindData,
+ int legacy /* old style infolevel */,
const struct nls_table *nls_codepage, int remap)
{
/* level 263 SMB_QUERY_FILE_ALL_INFO */
@@ -2970,7 +3020,10 @@ QPathInfoRetry:
byte_count = params + 1 /* pad */ ;
pSMB->TotalParameterCount = cpu_to_le16(params);
pSMB->ParameterCount = pSMB->TotalParameterCount;
- pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+ if(legacy)
+ pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD);
+ else
+ pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
pSMB->Reserved4 = 0;
pSMB->hdr.smb_buf_length += byte_count;
pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -2982,13 +3035,24 @@ QPathInfoRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 40))
+ if (rc) /* BB add auto retry on EOPNOTSUPP? */
+ rc = -EIO;
+ else if (!legacy && (pSMBr->ByteCount < 40))
rc = -EIO; /* bad smb */
+ else if(legacy && (pSMBr->ByteCount < 24))
+ rc = -EIO; /* 24 or 26 expected but we do not read last field */
else if (pFindData){
+ int size;
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			if(legacy) /* we do not read the last field, EAsize, fortunately,
+				      since it varies by subdialect and on Set vs. Get
+				      (two or four bytes), and we do not care about it here */
+ size = sizeof(FILE_INFO_STANDARD);
+ else
+ size = sizeof(FILE_ALL_INFO);
memcpy((char *) pFindData,
(char *) &pSMBr->hdr.Protocol +
- data_offset, sizeof (FILE_ALL_INFO));
+ data_offset, size);
} else
rc = -ENOMEM;
}
@@ -3613,6 +3677,14 @@ getDFSRetry:
strncpy(pSMB->RequestFileName, searchName, name_len);
}
+ if(ses->server) {
+ if(ses->server->secMode &
+ (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+ pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+ }
+
+ pSMB->hdr.Uid = ses->Suid;
+
params = 2 /* level */ + name_len /*includes null */ ;
pSMB->TotalDataCount = 0;
pSMB->DataCount = 0;
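
A side note on the negotiate hunk above: when an OS/2-style server reports a timezone of -1, the code derives server->timeAdj by rounding the difference between local UTC and the server's DOS time to the nearest MIN_TZ_ADJ (15-minute) step. Below is a minimal userspace sketch of that rounding, assuming only the MIN_TZ_ADJ constant from cifspdu.h; round_tz_adjust is an editor's name, not a kernel function.

#include <stdio.h>
#include <stdlib.h>

#define MIN_TZ_ADJ (15 * 60)	/* smallest timezone granularity, seconds */

/* Round a signed clock difference (UTC minus server time) to the
 * nearest multiple of MIN_TZ_ADJ, mirroring the arithmetic in
 * CIFSSMBNegotiate() above. */
static int round_tz_adjust(int diff)
{
	int seconds = abs(diff);
	int result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;

	if (seconds % MIN_TZ_ADJ >= MIN_TZ_ADJ / 2)
		result += MIN_TZ_ADJ;
	return diff < 0 ? -result : result;
}

int main(void)
{
	printf("%d\n", round_tz_adjust(-17940));	/* -> -18000 (-5 hours) */
	printf("%d\n", round_tz_adjust(3500));		/* ->   3600 (+1 hour)  */
	return 0;
}
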
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c78762051da4..4093d5332930 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -109,7 +109,7 @@ static int ipv6_connect(struct sockaddr_in6 *psin_server,
* wake up waiters on reconnection? - (not needed currently)
*/
-int
+static int
cifs_reconnect(struct TCP_Server_Info *server)
{
int rc = 0;
@@ -771,13 +771,18 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
separator[0] = ',';
separator[1] = 0;
- memset(vol->source_rfc1001_name,0x20,15);
- for(i=0;i < strnlen(utsname()->nodename,15);i++) {
- /* does not have to be a perfect mapping since the field is
- informational, only used for servers that do not support
- port 445 and it can be overridden at mount time */
- vol->source_rfc1001_name[i] =
- toupper(utsname()->nodename[i]);
+ if (Local_System_Name[0] != 0)
+ memcpy(vol->source_rfc1001_name, Local_System_Name,15);
+ else {
+ char *nodename = utsname()->nodename;
+ int n = strnlen(nodename,15);
+ memset(vol->source_rfc1001_name,0x20,15);
+ for(i=0 ; i < n ; i++) {
+ /* does not have to be perfect mapping since field is
+ informational, only used for servers that do not support
+ port 445 and it can be overridden at mount time */
+ vol->source_rfc1001_name[i] = toupper(nodename[i]);
+ }
}
vol->source_rfc1001_name[15] = 0;
/* null target name indicates to use *SMBSERVR default called name
@@ -3215,7 +3220,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
}
/* else do not bother copying these informational fields */
}
- if(smb_buffer_response->WordCount == 3)
+ if((smb_buffer_response->WordCount == 3) ||
+ (smb_buffer_response->WordCount == 7))
+ /* field is in same location */
tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
else
tcon->Flags = 0;
@@ -3312,19 +3319,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
first_time = 1;
}
if (!rc) {
+ pSesInfo->flags = 0;
pSesInfo->capabilities = pSesInfo->server->capabilities;
if(linuxExtEnabled == 0)
pSesInfo->capabilities &= (~CAP_UNIX);
/* pSesInfo->sequence_number = 0;*/
- cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x Time Zone: %d",
+ cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
pSesInfo->server->secMode,
pSesInfo->server->capabilities,
- pSesInfo->server->timeZone));
+ pSesInfo->server->timeAdj));
if(experimEnabled < 2)
rc = CIFS_SessSetup(xid, pSesInfo,
first_time, nls_info);
else if (extended_security
- && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+ && (pSesInfo->capabilities
+ & CAP_EXTENDED_SECURITY)
&& (pSesInfo->server->secType == NTLMSSP)) {
rc = -EOPNOTSUPP;
} else if (extended_security
@@ -3338,7 +3347,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
if (!rc) {
if(ntlmv2_flag) {
char * v2_response;
- cFYI(1,("Can use more secure NTLM version 2 password hash"));
+ cFYI(1,("more secure NTLM ver2 hash"));
if(CalcNTLMv2_partial_mac_key(pSesInfo,
nls_info)) {
rc = -ENOMEM;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 6b90ef98e4cf..35d54bb0869a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -337,6 +337,7 @@ int cifs_get_inode_info(struct inode **pinode,
pfindData = (FILE_ALL_INFO *)buf;
/* could do find first instead but this returns more info */
rc = CIFSSMBQPathInfo(xid, pTcon, search_path, pfindData,
+ 0 /* not legacy */,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
/* BB optimize code so we do not make the above call
@@ -384,8 +385,10 @@ int cifs_get_inode_info(struct inode **pinode,
/* get new inode */
if (*pinode == NULL) {
*pinode = new_inode(sb);
- if (*pinode == NULL)
+ if (*pinode == NULL) {
+ kfree(buf);
return -ENOMEM;
+ }
/* Is an i_ino of zero legal? Can we use that to check
if the server supports returning inode numbers? Are
there other sanity checks we can use to ensure that
@@ -431,8 +434,11 @@ int cifs_get_inode_info(struct inode **pinode,
(pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
/* Linux can not store file creation time so ignore it */
- inode->i_atime =
- cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+ if(pfindData->LastAccessTime)
+ inode->i_atime = cifs_NTtimeToUnix
+ (le64_to_cpu(pfindData->LastAccessTime));
+ else /* do not need to use current_fs_time - time not stored */
+ inode->i_atime = CURRENT_TIME;
inode->i_mtime =
cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
inode->i_ctime =
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index a57f5d6e6213..0bee8b7e521a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -254,7 +254,11 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
tmpbuffer,
len - 1,
cifs_sb->local_nls);
- else {
+ else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ cERROR(1,("SFU style symlinks not implemented yet"));
+ /* add open and read as in fs/cifs/inode.c */
+
+ } else {
rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ,
OPEN_REPARSE_POINT,&fid, &oplock, NULL,
cifs_sb->local_nls,
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 7aa23490541f..ccebf9b7eb86 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -252,10 +252,11 @@ MD5Transform(__u32 buf[4], __u32 const in[16])
buf[3] += d;
}
+#if 0 /* currently unused */
/***********************************************************************
the rfc 2104 version of hmac_md5 initialisation.
***********************************************************************/
-void
+static void
hmac_md5_init_rfc2104(unsigned char *key, int key_len,
struct HMACMD5Context *ctx)
{
@@ -289,6 +290,7 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
MD5Init(&ctx->ctx);
MD5Update(&ctx->ctx, ctx->k_ipad, 64);
}
+#endif
/***********************************************************************
the microsoft version of hmac_md5 initialisation.
@@ -350,7 +352,8 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
single function to calculate an HMAC MD5 digest from data.
use the microsoft hmacmd5 init method because the key is 16 bytes.
************************************************************/
-void
+#if 0 /* currently unused */
+static void
hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
unsigned char *digest)
{
@@ -361,3 +364,4 @@ hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
}
hmac_md5_final(digest, &ctx);
}
+#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index 00e1c5394fe1..f7d4f4197bac 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -27,12 +27,12 @@ void MD5Final(unsigned char digest[16], struct MD5Context *context);
/* The following definitions come from lib/hmacmd5.c */
-void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
- struct HMACMD5Context *ctx);
+/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
+ struct HMACMD5Context *ctx);*/
void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
struct HMACMD5Context *ctx);
void hmac_md5_update(const unsigned char *text, int text_len,
struct HMACMD5Context *ctx);
void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
-void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
- unsigned char *digest);
+/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
+ unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 22c937e5884f..bbc9cd34b6ea 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -389,7 +389,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
return;
}
-int
+static int
checkSMBhdr(struct smb_hdr *smb, __u16 mid)
{
/* Make sure that this really is an SMB, that it is a response,
@@ -418,26 +418,42 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
}
int
-checkSMB(struct smb_hdr *smb, __u16 mid, int length)
+checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
{
__u32 len = smb->smb_buf_length;
__u32 clc_len; /* calculated length */
cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
- if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) ||
- (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) {
- if ((unsigned int)length < 2 + sizeof (struct smb_hdr)) {
- if (((unsigned int)length >=
- sizeof (struct smb_hdr) - 1)
+
+ if (length < 2 + sizeof (struct smb_hdr)) {
+ if ((length >= sizeof (struct smb_hdr) - 1)
&& (smb->Status.CifsError != 0)) {
- smb->WordCount = 0;
- /* some error cases do not return wct and bcc */
+ smb->WordCount = 0;
+ /* some error cases do not return wct and bcc */
+ return 0;
+ } else if ((length == sizeof(struct smb_hdr) + 1) &&
+ (smb->WordCount == 0)) {
+ char * tmp = (char *)smb;
+ /* Need to work around a bug in two servers here */
+ /* First, check if the part of bcc they sent was zero */
+ if (tmp[sizeof(struct smb_hdr)] == 0) {
+ /* some servers return only half of bcc
+ * on simple responses (wct, bcc both zero)
+ * in particular have seen this on
+ * ulogoffX and FindClose. This leaves
+			 * one byte of bcc potentially uninitialized
+ */
+ /* zero rest of bcc */
+ tmp[sizeof(struct smb_hdr)+1] = 0;
return 0;
- } else {
- cERROR(1, ("Length less than smb header size"));
}
+ cERROR(1,("rcvd invalid byte count (bcc)"));
+ } else {
+ cERROR(1, ("Length less than smb header size"));
}
- if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
- cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
+ return 1;
+ }
+ if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
smb->Mid));
return 1;
}
@@ -446,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
return 1;
clc_len = smbCalcSize_LE(smb);
- if(4 + len != (unsigned int)length) {
+ if(4 + len != length) {
cERROR(1, ("Length read does not match RFC1001 length %d",len));
return 1;
}
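
A side note on the checkSMB() rework above: the function now makes three independent sanity checks on a received buffer - a minimum size (header plus wct/bcc), a maximum size (the negotiated buffer limit), and agreement between the 4-byte RFC1001 length prefix and the number of bytes actually read. The standalone sketch below is an editor's simplification; HDR_SIZE and MAX_SMB_SIZE are illustrative stand-ins, not the real cifspdu.h/cifsglob.h values.

#include <stdint.h>
#include <stdio.h>

#define HDR_SIZE	32u		/* stand-in for sizeof(struct smb_hdr) */
#define MAX_SMB_SIZE	(16 * 1024)	/* stand-in for the negotiated maximum */

/* Return 0 if the lengths are consistent, 1 otherwise. */
static int check_lengths(uint32_t smb_buf_length, unsigned int rcvd_length)
{
	if (rcvd_length < 2 + HDR_SIZE)
		return 1;			/* too short for wct/bcc  */
	if (smb_buf_length > MAX_SMB_SIZE - 4)
		return 1;			/* larger than negotiated */
	if (4 + smb_buf_length != rcvd_length)
		return 1;			/* prefix/read mismatch   */
	return 0;
}

int main(void)
{
	printf("%d\n", check_lengths(96, 100));	/* consistent -> 0 */
	printf("%d\n", check_lengths(96, 90));	/* short read -> 1 */
	return 0;
}
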
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index ce87550e918f..992e80edc720 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -909,3 +909,61 @@ cifs_UnixTimeToNT(struct timespec t)
/* Convert to 100ns intervals and then add the NTFS time offset. */
return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
}
+
+static int total_days_of_prev_months[] =
+{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
+
+
+__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
+{
+ return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
+}
+
+struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
+{
+ struct timespec ts;
+ int sec, min, days, month, year;
+ SMB_TIME * st = (SMB_TIME *)&time;
+ SMB_DATE * sd = (SMB_DATE *)&date;
+
+ cFYI(1,("date %d time %d",date, time));
+
+ sec = 2 * st->TwoSeconds;
+ min = st->Minutes;
+ if((sec > 59) || (min > 59))
+ cERROR(1,("illegal time min %d sec %d", min, sec));
+ sec += (min * 60);
+ sec += 60 * 60 * st->Hours;
+ if(st->Hours > 24)
+ cERROR(1,("illegal hours %d",st->Hours));
+ days = sd->Day;
+ month = sd->Month;
+ if((days > 31) || (month > 12))
+ cERROR(1,("illegal date, month %d day: %d", month, days));
+ month -= 1;
+ days += total_days_of_prev_months[month];
+ days += 3652; /* account for difference in days between 1980 and 1970 */
+ year = sd->Year;
+ days += year * 365;
+ days += (year/4); /* leap year */
+ /* generalized leap year calculation is more complex, ie no leap year
+ for years/100 except for years/400, but since the maximum number for DOS
+ year is 2**7, the last year is 1980+127, which means we need only
+ consider 2 special case years, ie the years 2000 and 2100, and only
+ adjust for the lack of leap year for the year 2100, as 2000 was a
+	   leap year (divisible by 400) */
+ if(year >= 120) /* the year 2100 */
+ days = days - 1; /* do not count leap year for the year 2100 */
+
+ /* adjust for leap year where we are still before leap day */
+ if(year != 120)
+ days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
+ sec += 24 * 60 * 60 * days;
+
+ ts.tv_sec = sec;
+
+ /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
+
+ ts.tv_nsec = 0;
+ return ts;
+}
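
As a quick check of cnvrtDosUnixTm() above (the numbers here are the editor's, not part of the patch): a DOS date/time pair encoding 2000-01-01 12:00:00 decodes to Day=1, Month=1, Year=20 and Hours=12, Minutes=0, TwoSeconds=0. Then sec = 12 * 3600 = 43200, and days = 1 + 0 (prior months) + 3652 (1970 to 1980) + 20 * 365 + 20/4 = 10958, minus 1 because 2000 is a leap year and January precedes leap day, giving 10957. Finally 10957 * 86400 + 43200 = 946728000, which is indeed the Unix time for 2000-01-01 12:00:00 UTC.
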
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b27b34537bf2..b5b0a2a41bef 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -106,6 +106,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
return rc;
}
+static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
+{
+ if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+ inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
+ inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
+ inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
+ }
+ return;
+}
+
+
static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
char * buf, int *pobject_type, int isNewInode)
{
@@ -135,16 +146,23 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
tmp_inode->i_ctime =
cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
} else { /* legacy, OS2 and DOS style */
+/* struct timespec ts;*/
FIND_FILE_STANDARD_INFO * pfindData =
(FIND_FILE_STANDARD_INFO *)buf;
+ tmp_inode->i_mtime = cnvrtDosUnixTm(
+ le16_to_cpu(pfindData->LastWriteDate),
+ le16_to_cpu(pfindData->LastWriteTime));
+ tmp_inode->i_atime = cnvrtDosUnixTm(
+ le16_to_cpu(pfindData->LastAccessDate),
+ le16_to_cpu(pfindData->LastAccessTime));
+ tmp_inode->i_ctime = cnvrtDosUnixTm(
+ le16_to_cpu(pfindData->LastWriteDate),
+ le16_to_cpu(pfindData->LastWriteTime));
+ AdjustForTZ(cifs_sb->tcon, tmp_inode);
attr = le16_to_cpu(pfindData->Attributes);
allocation_size = le32_to_cpu(pfindData->AllocationSize);
end_of_file = le32_to_cpu(pfindData->DataSize);
- tmp_inode->i_atime = CURRENT_TIME;
- /* tmp_inode->i_mtime = BB FIXME - add dos time handling
- tmp_inode->i_ctime = 0; BB FIXME */
-
}
/* Linux can not store file creation time unfortunately so ignore it */
@@ -938,6 +956,7 @@ static int cifs_save_resume_key(const char *current_entry,
filename = &pFindData->FileName[0];
/* one byte length, no name conversion */
len = (unsigned int)pFindData->FileNameLength;
+ cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
} else {
cFYI(1,("Unknown findfirst level %d",level));
return -EINVAL;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 22b4c35dcfe3..a8a083543ba0 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -268,6 +268,10 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
if(ses->serverOS)
strncpy(ses->serverOS, bcc_ptr, len);
+ if(strncmp(ses->serverOS, "OS/2",4) == 0) {
+ cFYI(1,("OS/2 server"));
+ ses->flags |= CIFS_SES_OS2;
+ }
bcc_ptr += len + 1;
bleft -= len + 1;
@@ -290,16 +294,11 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
if(len > bleft)
return rc;
- if(ses->serverDomain)
- kfree(ses->serverDomain);
-
- ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
- if(ses->serverOS)
- strncpy(ses->serverOS, bcc_ptr, len);
-
- bcc_ptr += len + 1;
- bleft -= len + 1;
-
+ /* No domain field in LANMAN case. Domain is
+ returned by old servers in the SMB negprot response */
+ /* BB For newer servers which do not support Unicode,
+ but thus do return domain here we could add parsing
+ for it later, but it is not very important */
cFYI(1,("ascii: bytes left %d",bleft));
return rc;
@@ -366,6 +365,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
str_area = kmalloc(2000, GFP_KERNEL);
bcc_ptr = str_area;
+ ses->flags &= ~CIFS_SES_LANMAN;
+
if(type == LANMAN) {
#ifdef CONFIG_CIFS_WEAK_PW_HASH
char lnm_session_key[CIFS_SESS_KEY_SIZE];
@@ -377,7 +378,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
/* and copy into bcc */
calc_lanman_hash(ses, lnm_session_key);
-
+ ses->flags |= CIFS_SES_LANMAN;
/* #ifdef CONFIG_CIFS_DEBUG2
cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
CIFS_SESS_KEY_SIZE);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index efaa044523a7..7a1b2b961ec8 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -364,20 +364,20 @@ E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24)
smbhash(p24 + 16, c8, p21 + 14, 1);
}
-void
+#if 0 /* currently unused */
+static void
D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
{
smbhash(out, in, p14, 0);
smbhash(out + 8, in + 8, p14 + 7, 0);
}
-void
+static void
E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
{
smbhash(out, in, p14, 1);
smbhash(out + 8, in + 8, p14 + 7, 1);
}
-#if 0
/* these routines are currently unneeded, but may be
needed later */
void
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index f518c5e45035..4b25ba92180d 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -51,11 +51,8 @@
void SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-void nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]);
static void SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
unsigned char p24[24]);
-void NTLMSSPOWFencrypt(unsigned char passwd[8],
- unsigned char *ntlmchalresp, unsigned char p24[24]);
void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
/*
@@ -144,8 +141,9 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
memset(wpwd,0,129 * 2);
}
+#if 0 /* currently unused */
/* Does both the NT and LM owfs of a user's password */
-void
+static void
nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
{
char passwd[514];
@@ -171,6 +169,7 @@ nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
/* clear out local copy of user's password (just being paranoid). */
memset(passwd, '\0', sizeof (passwd));
}
+#endif
/* Does the NTLMv2 owfs of a user's password */
#if 0 /* function not needed yet - but will be soon */
@@ -223,7 +222,8 @@ SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
}
/* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-void
+#if 0 /* currently unused */
+static void
NTLMSSPOWFencrypt(unsigned char passwd[8],
unsigned char *ntlmchalresp, unsigned char p24[24])
{
@@ -235,6 +235,7 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
E_P24(p21, ntlmchalresp, p24);
}
+#endif
/* Does the NT MD4 hash then des encryption. */
diff --git a/fs/compat.c b/fs/compat.c
index 4d3fbcb2ddb1..50624d4a70c6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1316,7 +1316,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
unsigned int nr_segs, unsigned int flags)
{
unsigned i;
- struct iovec *iov;
+ struct iovec __user *iov;
if (nr_segs > UIO_MAXIOV)
return -EINVAL;
iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 27ca1aa30562..a91f2628c981 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2438,13 +2438,17 @@ HANDLE_IOCTL(0x1260, broken_blkgetsize)
HANDLE_IOCTL(BLKFRAGET, w_long)
HANDLE_IOCTL(BLKSECTGET, w_long)
HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_WCACHE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ACOUSTIC, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ADDRESS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_BUSSTATE, hdio_ioctl_trans)
HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index e6d5754a715e..cf33fac68c84 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -275,13 +275,14 @@ static int check_perm(struct inode * inode, struct file * file)
* it in file->private_data for easy access.
*/
buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
- if (buffer) {
- init_MUTEX(&buffer->sem);
- buffer->needs_read_fill = 1;
- buffer->ops = ops;
- file->private_data = buffer;
- } else
+ if (!buffer) {
error = -ENOMEM;
+ goto Enomem;
+ }
+ init_MUTEX(&buffer->sem);
+ buffer->needs_read_fill = 1;
+ buffer->ops = ops;
+ file->private_data = buffer;
goto Done;
Einval:
@@ -289,6 +290,7 @@ static int check_perm(struct inode * inode, struct file * file)
goto Done;
Eaccess:
error = -EACCES;
+ Enomem:
module_put(attr->ca_owner);
Done:
if (error && item)
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
EXPORT_SYMBOL(config_group_init);
EXPORT_SYMBOL(config_item_get);
EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/dcache.c b/fs/dcache.c
index fc2faa44f8d1..2bac4ba1d1d3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -291,9 +291,9 @@ struct dentry * dget_locked(struct dentry *dentry)
* it can be unhashed only if it has no children, or if it is the root
* of a filesystem.
*
- * If the inode has a DCACHE_DISCONNECTED alias, then prefer
+ * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
* any other hashed alias over that one unless @want_discon is set,
- * in which case only return a DCACHE_DISCONNECTED alias.
+ * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
*/
static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
@@ -309,7 +309,8 @@ static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
prefetch(next);
alias = list_entry(tmp, struct dentry, d_alias);
if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
- if (alias->d_flags & DCACHE_DISCONNECTED)
+ if (IS_ROOT(alias) &&
+ (alias->d_flags & DCACHE_DISCONNECTED))
discon_alias = alias;
else if (!want_discon) {
__dget_locked(alias);
@@ -548,6 +549,136 @@ repeat:
}
/*
+ * destroy a single subtree of dentries for unmount
+ * - see the comments on shrink_dcache_for_umount() for a description of the
+ * locking
+ */
+static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ BUG_ON(!IS_ROOT(dentry));
+
+ /* detach this root from the system */
+ spin_lock(&dcache_lock);
+ if (!list_empty(&dentry->d_lru)) {
+ dentry_stat.nr_unused--;
+ list_del_init(&dentry->d_lru);
+ }
+ __d_drop(dentry);
+ spin_unlock(&dcache_lock);
+
+ for (;;) {
+ /* descend to the first leaf in the current subtree */
+ while (!list_empty(&dentry->d_subdirs)) {
+ struct dentry *loop;
+
+ /* this is a branch with children - detach all of them
+ * from the system in one go */
+ spin_lock(&dcache_lock);
+ list_for_each_entry(loop, &dentry->d_subdirs,
+ d_u.d_child) {
+ if (!list_empty(&loop->d_lru)) {
+ dentry_stat.nr_unused--;
+ list_del_init(&loop->d_lru);
+ }
+
+ __d_drop(loop);
+ cond_resched_lock(&dcache_lock);
+ }
+ spin_unlock(&dcache_lock);
+
+ /* move to the first child */
+ dentry = list_entry(dentry->d_subdirs.next,
+ struct dentry, d_u.d_child);
+ }
+
+ /* consume the dentries from this leaf up through its parents
+ * until we find one with children or run out altogether */
+ do {
+ struct inode *inode;
+
+ if (atomic_read(&dentry->d_count) != 0) {
+ printk(KERN_ERR
+ "BUG: Dentry %p{i=%lx,n=%s}"
+ " still in use (%d)"
+ " [unmount of %s %s]\n",
+ dentry,
+ dentry->d_inode ?
+ dentry->d_inode->i_ino : 0UL,
+ dentry->d_name.name,
+ atomic_read(&dentry->d_count),
+ dentry->d_sb->s_type->name,
+ dentry->d_sb->s_id);
+ BUG();
+ }
+
+ parent = dentry->d_parent;
+ if (parent == dentry)
+ parent = NULL;
+ else
+ atomic_dec(&parent->d_count);
+
+ list_del(&dentry->d_u.d_child);
+ dentry_stat.nr_dentry--; /* For d_free, below */
+
+ inode = dentry->d_inode;
+ if (inode) {
+ dentry->d_inode = NULL;
+ list_del_init(&dentry->d_alias);
+ if (dentry->d_op && dentry->d_op->d_iput)
+ dentry->d_op->d_iput(dentry, inode);
+ else
+ iput(inode);
+ }
+
+ d_free(dentry);
+
+ /* finished when we fall off the top of the tree,
+ * otherwise we ascend to the parent and move to the
+ * next sibling if there is one */
+ if (!parent)
+ return;
+
+ dentry = parent;
+
+ } while (list_empty(&dentry->d_subdirs));
+
+ dentry = list_entry(dentry->d_subdirs.next,
+ struct dentry, d_u.d_child);
+ }
+}
+
+/*
+ * destroy the dentries attached to a superblock on unmounting
+ * - we don't need to use dentry->d_lock, and only need dcache_lock when
+ * removing the dentry from the system lists and hashes because:
+ * - the superblock is detached from all mountings and open files, so the
+ * dentry trees will not be rearranged by the VFS
+ * - s_umount is write-locked, so the memory pressure shrinker will ignore
+ * any dentries belonging to this superblock that it comes across
+ * - the filesystem itself is no longer permitted to rearrange the dentries
+ * in this superblock
+ */
+void shrink_dcache_for_umount(struct super_block *sb)
+{
+ struct dentry *dentry;
+
+ if (down_read_trylock(&sb->s_umount))
+ BUG();
+
+ dentry = sb->s_root;
+ sb->s_root = NULL;
+ atomic_dec(&dentry->d_count);
+ shrink_dcache_for_umount_subtree(dentry);
+
+ while (!hlist_empty(&sb->s_anon)) {
+ dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
+ shrink_dcache_for_umount_subtree(dentry);
+ }
+}
+
+/*
* Search for at least 1 mount point in the dentry's subdirs.
* We descend to the next level whenever the d_subdirs
* list is non-empty and continue searching.
@@ -1004,7 +1135,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
struct dentry *new = NULL;
- if (inode) {
+ if (inode && S_ISDIR(inode->i_mode)) {
spin_lock(&dcache_lock);
new = __d_find_alias(inode, 1);
if (new) {
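
A side note on shrink_dcache_for_umount_subtree() above: it tears down a whole dentry subtree without recursion by repeatedly descending to the first leaf, then consuming dentries back up toward the root until it reaches a node that still has children. The toy program below is an editor's userspace analogue of that traversal shape only (no locking, no LRU or alias handling; all names are hypothetical, not kernel APIs).

#include <stdio.h>
#include <stdlib.h>

struct tnode {
	const char *name;
	struct tnode *parent;
	struct tnode *first_child;
	struct tnode *next_sibling;
};

static struct tnode *mknode(const char *name, struct tnode *parent)
{
	struct tnode *n = calloc(1, sizeof(*n));

	n->name = name;
	n->parent = parent;
	if (parent) {
		n->next_sibling = parent->first_child;
		parent->first_child = n;
	}
	return n;
}

static void destroy_subtree(struct tnode *node)
{
	for (;;) {
		/* descend to the first leaf in the current subtree */
		while (node->first_child)
			node = node->first_child;

		/* consume this leaf and its ancestors until we reach a
		 * node that still has children, or fall off the top */
		for (;;) {
			struct tnode *parent = node->parent;

			if (parent)
				parent->first_child = node->next_sibling;
			printf("freeing %s\n", node->name);
			free(node);

			if (!parent)
				return;
			if (parent->first_child) {
				node = parent->first_child;
				break;
			}
			node = parent;
		}
	}
}

int main(void)
{
	struct tnode *root = mknode("/", NULL);
	struct tnode *a = mknode("a", root);

	mknode("a1", a);
	mknode("b", root);
	destroy_subtree(root);
	return 0;
}
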
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..81b2c6465eeb
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,20 @@
+menu "Distributed Lock Manager"
+ depends on INET && IP_SCTP && EXPERIMENTAL
+
+config DLM
+ tristate "Distributed Lock Manager (DLM)"
+ depends on IPV6 || IPV6=n
+ select CONFIGFS_FS
+ help
+ A general purpose distributed lock manager for kernel or userspace
+ applications.
+
+config DLM_DEBUG
+ bool "DLM debugging"
+ depends on DLM
+ help
+ Under the debugfs mount point, the name of each lockspace will
+ appear as a file in the "dlm" directory. The output is the
+	  list of resources and locks the local node knows about.
+
+endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_DLM) += dlm.o
+dlm-y := ast.o \
+ config.o \
+ dir.o \
+ lock.o \
+ lockspace.o \
+ lowcomms.o \
+ main.o \
+ member.o \
+ memory.o \
+ midcomms.o \
+ rcom.o \
+ recover.o \
+ recoverd.o \
+ requestqueue.o \
+ user.o \
+ util.o
+dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
+
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lock.h"
+#include "user.h"
+
+#define WAKE_ASTS 0
+
+static struct list_head ast_queue;
+static spinlock_t ast_queue_lock;
+static struct task_struct * astd_task;
+static unsigned long astd_wakeflags;
+static struct mutex astd_running;
+
+
+void dlm_del_ast(struct dlm_lkb *lkb)
+{
+ spin_lock(&ast_queue_lock);
+ if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
+ list_del(&lkb->lkb_astqueue);
+ spin_unlock(&ast_queue_lock);
+}
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type)
+{
+ if (lkb->lkb_flags & DLM_IFL_USER) {
+ dlm_user_add_ast(lkb, type);
+ return;
+ }
+ DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
+
+ spin_lock(&ast_queue_lock);
+ if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+ kref_get(&lkb->lkb_ref);
+ list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+ }
+ lkb->lkb_ast_type |= type;
+ spin_unlock(&ast_queue_lock);
+
+ set_bit(WAKE_ASTS, &astd_wakeflags);
+ wake_up_process(astd_task);
+}
+
+static void process_asts(void)
+{
+ struct dlm_ls *ls = NULL;
+ struct dlm_rsb *r = NULL;
+ struct dlm_lkb *lkb;
+ void (*cast) (long param);
+ void (*bast) (long param, int mode);
+ int type = 0, found, bmode;
+
+ for (;;) {
+ found = 0;
+ spin_lock(&ast_queue_lock);
+ list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+ r = lkb->lkb_resource;
+ ls = r->res_ls;
+
+ if (dlm_locking_stopped(ls))
+ continue;
+
+ list_del(&lkb->lkb_astqueue);
+ type = lkb->lkb_ast_type;
+ lkb->lkb_ast_type = 0;
+ found = 1;
+ break;
+ }
+ spin_unlock(&ast_queue_lock);
+
+ if (!found)
+ break;
+
+ cast = lkb->lkb_astaddr;
+ bast = lkb->lkb_bastaddr;
+ bmode = lkb->lkb_bastmode;
+
+ if ((type & AST_COMP) && cast)
+ cast(lkb->lkb_astparam);
+
+ /* FIXME: Is it safe to look at lkb_grmode here
+ without doing a lock_rsb() ?
+ Look at other checks in v1 to avoid basts. */
+
+ if ((type & AST_BAST) && bast)
+ if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
+ bast(lkb->lkb_astparam, bmode);
+
+ /* this removes the reference added by dlm_add_ast
+ and may result in the lkb being freed */
+ dlm_put_lkb(lkb);
+
+ schedule();
+ }
+}
+
+static inline int no_asts(void)
+{
+ int ret;
+
+ spin_lock(&ast_queue_lock);
+ ret = list_empty(&ast_queue);
+ spin_unlock(&ast_queue_lock);
+ return ret;
+}
+
+static int dlm_astd(void *data)
+{
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!test_bit(WAKE_ASTS, &astd_wakeflags))
+ schedule();
+ set_current_state(TASK_RUNNING);
+
+ mutex_lock(&astd_running);
+ if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+ process_asts();
+ mutex_unlock(&astd_running);
+ }
+ return 0;
+}
+
+void dlm_astd_wake(void)
+{
+ if (!no_asts()) {
+ set_bit(WAKE_ASTS, &astd_wakeflags);
+ wake_up_process(astd_task);
+ }
+}
+
+int dlm_astd_start(void)
+{
+ struct task_struct *p;
+ int error = 0;
+
+ INIT_LIST_HEAD(&ast_queue);
+ spin_lock_init(&ast_queue_lock);
+ mutex_init(&astd_running);
+
+ p = kthread_run(dlm_astd, NULL, "dlm_astd");
+ if (IS_ERR(p))
+ error = PTR_ERR(p);
+ else
+ astd_task = p;
+ return error;
+}
+
+void dlm_astd_stop(void)
+{
+ kthread_stop(astd_task);
+}
+
+void dlm_astd_suspend(void)
+{
+ mutex_lock(&astd_running);
+}
+
+void dlm_astd_resume(void)
+{
+ mutex_unlock(&astd_running);
+}
+
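
A side note on the ast.c addition above: completion and blocking ASTs are queued under a spinlock and delivered by a single dedicated kthread (dlm_astd) that is woken through a wake flag. The program below is an editor's userspace analogue of that producer/worker shape using pthreads; the names and the condition-variable wakeup are stand-ins for the kernel's kthread and wake_up_process() machinery, not DLM code.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work {
	struct work *next;
	int id;
};

static struct work *queue_head;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t queue_wake = PTHREAD_COND_INITIALIZER;
static int stopping;

static void add_work(int id)		/* analogue of dlm_add_ast() */
{
	struct work *w = malloc(sizeof(*w));

	w->id = id;
	pthread_mutex_lock(&queue_lock);
	w->next = queue_head;
	queue_head = w;
	pthread_cond_signal(&queue_wake);	/* "wake_up_process" */
	pthread_mutex_unlock(&queue_lock);
}

static void *worker(void *arg)		/* analogue of dlm_astd() */
{
	(void)arg;
	pthread_mutex_lock(&queue_lock);
	while (!stopping || queue_head) {
		while (!queue_head && !stopping)
			pthread_cond_wait(&queue_wake, &queue_lock);
		while (queue_head) {
			struct work *w = queue_head;

			queue_head = w->next;
			pthread_mutex_unlock(&queue_lock);
			printf("delivering ast %d\n", w->id);
			free(w);
			pthread_mutex_lock(&queue_lock);
		}
	}
	pthread_mutex_unlock(&queue_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	add_work(1);
	add_work(2);
	pthread_mutex_lock(&queue_lock);
	stopping = 1;
	pthread_cond_signal(&queue_wake);
	pthread_mutex_unlock(&queue_lock);
	pthread_join(t, NULL);
	return 0;
}
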
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_del_ast(struct dlm_lkb *lkb);
+
+void dlm_astd_wake(void);
+int dlm_astd_start(void);
+void dlm_astd_stop(void);
+void dlm_astd_suspend(void);
+void dlm_astd_resume(void);
+
+#endif
+
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <net/sock.h>
+
+#include "config.h"
+#include "lowcomms.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct comm *local_comm;
+
+struct clusters;
+struct cluster;
+struct spaces;
+struct space;
+struct comms;
+struct comm;
+struct nodes;
+struct node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+ char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+ const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+ char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+ const char *buf, size_t len);
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct comm *cm, char *buf);
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct node *nd, char *buf);
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_weight_read(struct node *nd, char *buf);
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+
+enum {
+ COMM_ATTR_NODEID = 0,
+ COMM_ATTR_LOCAL,
+ COMM_ATTR_ADDR,
+};
+
+struct comm_attribute {
+ struct configfs_attribute attr;
+ ssize_t (*show)(struct comm *, char *);
+ ssize_t (*store)(struct comm *, const char *, size_t);
+};
+
+static struct comm_attribute comm_attr_nodeid = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "nodeid",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = comm_nodeid_read,
+ .store = comm_nodeid_write,
+};
+
+static struct comm_attribute comm_attr_local = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "local",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = comm_local_read,
+ .store = comm_local_write,
+};
+
+static struct comm_attribute comm_attr_addr = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "addr",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .store = comm_addr_write,
+};
+
+static struct configfs_attribute *comm_attrs[] = {
+ [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+ [COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+ [COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+ NULL,
+};
+
+enum {
+ NODE_ATTR_NODEID = 0,
+ NODE_ATTR_WEIGHT,
+};
+
+struct node_attribute {
+ struct configfs_attribute attr;
+ ssize_t (*show)(struct node *, char *);
+ ssize_t (*store)(struct node *, const char *, size_t);
+};
+
+static struct node_attribute node_attr_nodeid = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "nodeid",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = node_nodeid_read,
+ .store = node_nodeid_write,
+};
+
+static struct node_attribute node_attr_weight = {
+ .attr = { .ca_owner = THIS_MODULE,
+ .ca_name = "weight",
+ .ca_mode = S_IRUGO | S_IWUSR },
+ .show = node_weight_read,
+ .store = node_weight_write,
+};
+
+static struct configfs_attribute *node_attrs[] = {
+ [NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+ [NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+ NULL,
+};
+
+struct clusters {
+ struct configfs_subsystem subsys;
+};
+
+struct cluster {
+ struct config_group group;
+};
+
+struct spaces {
+ struct config_group ss_group;
+};
+
+struct space {
+ struct config_group group;
+ struct list_head members;
+ struct mutex members_lock;
+ int members_count;
+};
+
+struct comms {
+ struct config_group cs_group;
+};
+
+struct comm {
+ struct config_item item;
+ int nodeid;
+ int local;
+ int addr_count;
+ struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+struct nodes {
+ struct config_group ns_group;
+};
+
+struct node {
+ struct config_item item;
+ struct list_head list; /* space->members */
+ int nodeid;
+ int weight;
+};
+
+static struct configfs_group_operations clusters_ops = {
+ .make_group = make_cluster,
+ .drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+ .release = release_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+ .make_group = make_space,
+ .drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+ .release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+ .make_item = make_comm,
+ .drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+ .release = release_comm,
+ .show_attribute = show_comm,
+ .store_attribute = store_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+ .make_item = make_node,
+ .drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+ .release = release_node,
+ .show_attribute = show_node,
+ .store_attribute = store_node,
+};
+
+static struct config_item_type clusters_type = {
+ .ct_group_ops = &clusters_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type cluster_type = {
+ .ct_item_ops = &cluster_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type spaces_type = {
+ .ct_group_ops = &spaces_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type space_type = {
+ .ct_item_ops = &space_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comms_type = {
+ .ct_group_ops = &comms_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comm_type = {
+ .ct_item_ops = &comm_ops,
+ .ct_attrs = comm_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type nodes_type = {
+ .ct_group_ops = &nodes_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type node_type = {
+ .ct_item_ops = &node_ops,
+ .ct_attrs = node_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct cluster *to_cluster(struct config_item *i)
+{
+ return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+}
+
+static struct space *to_space(struct config_item *i)
+{
+ return i ? container_of(to_config_group(i), struct space, group) : NULL;
+}
+
+static struct comm *to_comm(struct config_item *i)
+{
+ return i ? container_of(i, struct comm, item) : NULL;
+}
+
+static struct node *to_node(struct config_item *i)
+{
+ return i ? container_of(i, struct node, item) : NULL;
+}
+
+static struct config_group *make_cluster(struct config_group *g,
+ const char *name)
+{
+ struct cluster *cl = NULL;
+ struct spaces *sps = NULL;
+ struct comms *cms = NULL;
+ void *gps = NULL;
+
+ cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+ gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+ sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+ cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+
+ if (!cl || !gps || !sps || !cms)
+ goto fail;
+
+ config_group_init_type_name(&cl->group, name, &cluster_type);
+ config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
+ config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
+
+ cl->group.default_groups = gps;
+ cl->group.default_groups[0] = &sps->ss_group;
+ cl->group.default_groups[1] = &cms->cs_group;
+ cl->group.default_groups[2] = NULL;
+
+ space_list = &sps->ss_group;
+ comm_list = &cms->cs_group;
+ return &cl->group;
+
+ fail:
+ kfree(cl);
+ kfree(gps);
+ kfree(sps);
+ kfree(cms);
+ return NULL;
+}
+
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+ struct cluster *cl = to_cluster(i);
+ struct config_item *tmp;
+ int j;
+
+ for (j = 0; cl->group.default_groups[j]; j++) {
+ tmp = &cl->group.default_groups[j]->cg_item;
+ cl->group.default_groups[j] = NULL;
+ config_item_put(tmp);
+ }
+
+ space_list = NULL;
+ comm_list = NULL;
+
+ config_item_put(i);
+}
+
+static void release_cluster(struct config_item *i)
+{
+ struct cluster *cl = to_cluster(i);
+ kfree(cl->group.default_groups);
+ kfree(cl);
+}
+
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+ struct space *sp = NULL;
+ struct nodes *nds = NULL;
+ void *gps = NULL;
+
+ sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+ gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+ nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+
+ if (!sp || !gps || !nds)
+ goto fail;
+
+ config_group_init_type_name(&sp->group, name, &space_type);
+ config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+
+ sp->group.default_groups = gps;
+ sp->group.default_groups[0] = &nds->ns_group;
+ sp->group.default_groups[1] = NULL;
+
+ INIT_LIST_HEAD(&sp->members);
+ mutex_init(&sp->members_lock);
+ sp->members_count = 0;
+ return &sp->group;
+
+ fail:
+ kfree(sp);
+ kfree(gps);
+ kfree(nds);
+ return NULL;
+}
+
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+ struct space *sp = to_space(i);
+ struct config_item *tmp;
+ int j;
+
+ /* assert list_empty(&sp->members) */
+
+ for (j = 0; sp->group.default_groups[j]; j++) {
+ tmp = &sp->group.default_groups[j]->cg_item;
+ sp->group.default_groups[j] = NULL;
+ config_item_put(tmp);
+ }
+
+ config_item_put(i);
+}
+
+static void release_space(struct config_item *i)
+{
+ struct space *sp = to_space(i);
+ kfree(sp->group.default_groups);
+ kfree(sp);
+}
+
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+ struct comm *cm;
+
+ cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+ if (!cm)
+ return NULL;
+
+ config_item_init_type_name(&cm->item, name, &comm_type);
+ cm->nodeid = -1;
+ cm->local = 0;
+ cm->addr_count = 0;
+ return &cm->item;
+}
+
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+ struct comm *cm = to_comm(i);
+ if (local_comm == cm)
+ local_comm = NULL;
+ dlm_lowcomms_close(cm->nodeid);
+ while (cm->addr_count--)
+ kfree(cm->addr[cm->addr_count]);
+ config_item_put(i);
+}
+
+static void release_comm(struct config_item *i)
+{
+ struct comm *cm = to_comm(i);
+ kfree(cm);
+}
+
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+ struct space *sp = to_space(g->cg_item.ci_parent);
+ struct node *nd;
+
+ nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+ if (!nd)
+ return NULL;
+
+ config_item_init_type_name(&nd->item, name, &node_type);
+ nd->nodeid = -1;
+ nd->weight = 1; /* default weight of 1 if none is set */
+
+ mutex_lock(&sp->members_lock);
+ list_add(&nd->list, &sp->members);
+ sp->members_count++;
+ mutex_unlock(&sp->members_lock);
+
+ return &nd->item;
+}
+
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+ struct space *sp = to_space(g->cg_item.ci_parent);
+ struct node *nd = to_node(i);
+
+ mutex_lock(&sp->members_lock);
+ list_del(&nd->list);
+ sp->members_count--;
+ mutex_unlock(&sp->members_lock);
+
+ config_item_put(i);
+}
+
+static void release_node(struct config_item *i)
+{
+ struct node *nd = to_node(i);
+ kfree(nd);
+}
+
+static struct clusters clusters_root = {
+ .subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "dlm",
+ .ci_type = &clusters_type,
+ },
+ },
+ },
+};
+
+int dlm_config_init(void)
+{
+ config_group_init(&clusters_root.subsys.su_group);
+ init_MUTEX(&clusters_root.subsys.su_sem);
+ return configfs_register_subsystem(&clusters_root.subsys);
+}
+
+void dlm_config_exit(void)
+{
+ configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+ char *buf)
+{
+ struct comm *cm = to_comm(i);
+ struct comm_attribute *cma =
+ container_of(a, struct comm_attribute, attr);
+ return cma->show ? cma->show(cm, buf) : 0;
+}
+
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+ const char *buf, size_t len)
+{
+ struct comm *cm = to_comm(i);
+ struct comm_attribute *cma =
+ container_of(a, struct comm_attribute, attr);
+ return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+}
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+{
+ return sprintf(buf, "%d\n", cm->nodeid);
+}
+
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+{
+ cm->nodeid = simple_strtol(buf, NULL, 0);
+ return len;
+}
+
+static ssize_t comm_local_read(struct comm *cm, char *buf)
+{
+ return sprintf(buf, "%d\n", cm->local);
+}
+
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+{
+ cm->local= simple_strtol(buf, NULL, 0);
+ if (cm->local && !local_comm)
+ local_comm = cm;
+ return len;
+}
+
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+{
+ struct sockaddr_storage *addr;
+
+ if (len != sizeof(struct sockaddr_storage))
+ return -EINVAL;
+
+ if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+ return -ENOSPC;
+
+ addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+ if (!addr)
+ return -ENOMEM;
+
+ memcpy(addr, buf, len);
+ cm->addr[cm->addr_count++] = addr;
+ return len;
+}
+
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+ char *buf)
+{
+ struct node *nd = to_node(i);
+ struct node_attribute *nda =
+ container_of(a, struct node_attribute, attr);
+ return nda->show ? nda->show(nd, buf) : 0;
+}
+
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+ const char *buf, size_t len)
+{
+ struct node *nd = to_node(i);
+ struct node_attribute *nda =
+ container_of(a, struct node_attribute, attr);
+ return nda->store ? nda->store(nd, buf, len) : -EINVAL;
+}
+
+static ssize_t node_nodeid_read(struct node *nd, char *buf)
+{
+ return sprintf(buf, "%d\n", nd->nodeid);
+}
+
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+{
+ nd->nodeid = simple_strtol(buf, NULL, 0);
+ return len;
+}
+
+static ssize_t node_weight_read(struct node *nd, char *buf)
+{
+ return sprintf(buf, "%d\n", nd->weight);
+}
+
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+{
+ nd->weight = simple_strtol(buf, NULL, 0);
+ return len;
+}
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+static struct space *get_space(char *name)
+{
+ if (!space_list)
+ return NULL;
+ return to_space(config_group_find_obj(space_list, name));
+}
+
+static void put_space(struct space *sp)
+{
+ config_item_put(&sp->group.cg_item);
+}
+
+static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+{
+ struct config_item *i;
+ struct comm *cm = NULL;
+ int found = 0;
+
+ if (!comm_list)
+ return NULL;
+
+ down(&clusters_root.subsys.su_sem);
+
+ list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+ cm = to_comm(i);
+
+ if (nodeid) {
+ if (cm->nodeid != nodeid)
+ continue;
+ found = 1;
+ break;
+ } else {
+ if (!cm->addr_count ||
+ memcmp(cm->addr[0], addr, sizeof(*addr)))
+ continue;
+ found = 1;
+ break;
+ }
+ }
+ up(&clusters_root.subsys.su_sem);
+
+ if (found)
+ config_item_get(i);
+ else
+ cm = NULL;
+ return cm;
+}
+
+static void put_comm(struct comm *cm)
+{
+ config_item_put(&cm->item);
+}
+
+/* caller must free mem */
+int dlm_nodeid_list(char *lsname, int **ids_out)
+{
+ struct space *sp;
+ struct node *nd;
+ int i = 0, rv = 0;
+ int *ids;
+
+ sp = get_space(lsname);
+ if (!sp)
+ return -EEXIST;
+
+ mutex_lock(&sp->members_lock);
+ if (!sp->members_count) {
+ rv = 0;
+ goto out;
+ }
+
+ ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+ if (!ids) {
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ rv = sp->members_count;
+ list_for_each_entry(nd, &sp->members, list)
+ ids[i++] = nd->nodeid;
+
+ if (rv != i)
+ printk("bad nodeid count %d %d\n", rv, i);
+
+ *ids_out = ids;
+ out:
+ mutex_unlock(&sp->members_lock);
+ put_space(sp);
+ return rv;
+}
+
+int dlm_node_weight(char *lsname, int nodeid)
+{
+ struct space *sp;
+ struct node *nd;
+ int w = -EEXIST;
+
+ sp = get_space(lsname);
+ if (!sp)
+ goto out;
+
+ mutex_lock(&sp->members_lock);
+ list_for_each_entry(nd, &sp->members, list) {
+ if (nd->nodeid != nodeid)
+ continue;
+ w = nd->weight;
+ break;
+ }
+ mutex_unlock(&sp->members_lock);
+ put_space(sp);
+ out:
+ return w;
+}
+
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
+{
+ struct comm *cm = get_comm(nodeid, NULL);
+ if (!cm)
+ return -EEXIST;
+ if (!cm->addr_count)
+ return -ENOENT;
+ memcpy(addr, cm->addr[0], sizeof(*addr));
+ put_comm(cm);
+ return 0;
+}
+
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+ struct comm *cm = get_comm(0, addr);
+ if (!cm)
+ return -EEXIST;
+ *nodeid = cm->nodeid;
+ put_comm(cm);
+ return 0;
+}
+
+int dlm_our_nodeid(void)
+{
+ return local_comm ? local_comm->nodeid : 0;
+}
+
+/* num 0 is first addr, num 1 is second addr */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+ if (!local_comm)
+ return -1;
+ if (num + 1 > local_comm->addr_count)
+ return -1;
+ memcpy(addr, local_comm->addr[num], sizeof(*addr));
+ return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT 21064
+#define DEFAULT_BUFFER_SIZE 4096
+#define DEFAULT_RSBTBL_SIZE 256
+#define DEFAULT_LKBTBL_SIZE 1024
+#define DEFAULT_DIRTBL_SIZE 512
+#define DEFAULT_RECOVER_TIMER 5
+#define DEFAULT_TOSS_SECS 10
+#define DEFAULT_SCAN_SECS 5
+
+struct dlm_config_info dlm_config = {
+ .tcp_port = DEFAULT_TCP_PORT,
+ .buffer_size = DEFAULT_BUFFER_SIZE,
+ .rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+ .lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+ .dirtbl_size = DEFAULT_DIRTBL_SIZE,
+ .recover_timer = DEFAULT_RECOVER_TIMER,
+ .toss_secs = DEFAULT_TOSS_SECS,
+ .scan_secs = DEFAULT_SCAN_SECS
+};
+
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+#define DLM_MAX_ADDR_COUNT 3
+
+struct dlm_config_info {
+ int tcp_port;
+ int buffer_size;
+ int rsbtbl_size;
+ int lkbtbl_size;
+ int dirtbl_size;
+ int recover_timer;
+ int toss_secs;
+ int scan_secs;
+};
+
+extern struct dlm_config_info dlm_config;
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+int dlm_node_weight(char *lsname, int nodeid);
+int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
+int dlm_our_nodeid(void);
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif /* __CONFIG_DOT_H__ */
+
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+
+#include "dlm_internal.h"
+
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN];
+static struct mutex debug_buf_lock;
+
+static struct dentry *dlm_root;
+
+struct rsb_iter {
+ int entry;
+ struct dlm_ls *ls;
+ struct list_head *next;
+ struct dlm_rsb *rsb;
+};
+
+/*
+ * dump all rsb's in the lockspace hash table
+ */
+
+static char *print_lockmode(int mode)
+{
+ switch (mode) {
+ case DLM_LOCK_IV:
+ return "--";
+ case DLM_LOCK_NL:
+ return "NL";
+ case DLM_LOCK_CR:
+ return "CR";
+ case DLM_LOCK_CW:
+ return "CW";
+ case DLM_LOCK_PR:
+ return "PR";
+ case DLM_LOCK_PW:
+ return "PW";
+ case DLM_LOCK_EX:
+ return "EX";
+ default:
+ return "??";
+ }
+}
+
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+ struct dlm_rsb *res)
+{
+ seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+ if (lkb->lkb_status == DLM_LKSTS_CONVERT
+ || lkb->lkb_status == DLM_LKSTS_WAITING)
+ seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+ if (lkb->lkb_nodeid) {
+ if (lkb->lkb_nodeid != res->res_nodeid)
+ seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+ lkb->lkb_remid);
+ else
+ seq_printf(s, " Master: %08x", lkb->lkb_remid);
+ }
+
+ if (lkb->lkb_wait_type)
+ seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+ seq_printf(s, "\n");
+}
+
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+ struct dlm_lkb *lkb;
+ int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+
+ seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+ for (i = 0; i < res->res_length; i++) {
+ if (isprint(res->res_name[i]))
+ seq_printf(s, "%c", res->res_name[i]);
+ else
+ seq_printf(s, "%c", '.');
+ }
+ if (res->res_nodeid > 0)
+ seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
+ res->res_nodeid);
+ else if (res->res_nodeid == 0)
+ seq_printf(s, "\" \nMaster Copy\n");
+ else if (res->res_nodeid == -1)
+ seq_printf(s, "\" \nLooking up master (lkid %x)\n",
+ res->res_first_lkid);
+ else
+ seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid);
+
+ /* Print the LVB: */
+ if (res->res_lvbptr) {
+ seq_printf(s, "LVB: ");
+ for (i = 0; i < lvblen; i++) {
+ if (i == lvblen / 2)
+ seq_printf(s, "\n ");
+ seq_printf(s, "%02x ",
+ (unsigned char) res->res_lvbptr[i]);
+ }
+ if (rsb_flag(res, RSB_VALNOTVALID))
+ seq_printf(s, " (INVALID)");
+ seq_printf(s, "\n");
+ }
+
+ root_list = !list_empty(&res->res_root_list);
+ recover_list = !list_empty(&res->res_recover_list);
+
+ if (root_list || recover_list) {
+ seq_printf(s, "Recovery: root %d recover %d flags %lx "
+ "count %d\n", root_list, recover_list,
+ res->res_flags, res->res_recover_locks_count);
+ }
+
+ /* Print the locks attached to this resource */
+ seq_printf(s, "Granted Queue\n");
+ list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+ print_lock(s, lkb, res);
+
+ seq_printf(s, "Conversion Queue\n");
+ list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+ print_lock(s, lkb, res);
+
+ seq_printf(s, "Waiting Queue\n");
+ list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+ print_lock(s, lkb, res);
+
+ if (list_empty(&res->res_lookup))
+ goto out;
+
+ seq_printf(s, "Lookup Queue\n");
+ list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+ seq_printf(s, "%08x %s", lkb->lkb_id,
+ print_lockmode(lkb->lkb_rqmode));
+ if (lkb->lkb_wait_type)
+ seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+ seq_printf(s, "\n");
+ }
+ out:
+ return 0;
+}
+
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+ struct dlm_ls *ls = ri->ls;
+ int i;
+
+ if (!ri->next) {
+ top:
+ /* Find the next non-empty hash bucket */
+ for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+ read_lock(&ls->ls_rsbtbl[i].lock);
+ if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+ ri->next = ls->ls_rsbtbl[i].list.next;
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ break;
+ }
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+ ri->entry = i;
+
+ if (ri->entry >= ls->ls_rsbtbl_size)
+ return 1;
+ } else {
+ i = ri->entry;
+ read_lock(&ls->ls_rsbtbl[i].lock);
+ ri->next = ri->next->next;
+ if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+ /* End of list - move to next bucket */
+ ri->next = NULL;
+ ri->entry++;
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ goto top;
+ }
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+ ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+
+ return 0;
+}
+
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+ kfree(ri);
+}
+
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+ struct rsb_iter *ri;
+
+ ri = kmalloc(sizeof *ri, GFP_KERNEL);
+ if (!ri)
+ return NULL;
+
+ ri->ls = ls;
+ ri->entry = 0;
+ ri->next = NULL;
+
+ if (rsb_iter_next(ri)) {
+ rsb_iter_free(ri);
+ return NULL;
+ }
+
+ return ri;
+}
+
+static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
+{
+ struct rsb_iter *ri;
+ loff_t n = *pos;
+
+ ri = rsb_iter_init(file->private);
+ if (!ri)
+ return NULL;
+
+ while (n--) {
+ if (rsb_iter_next(ri)) {
+ rsb_iter_free(ri);
+ return NULL;
+ }
+ }
+
+ return ri;
+}
+
+static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+ struct rsb_iter *ri = iter_ptr;
+
+ (*pos)++;
+
+ if (rsb_iter_next(ri)) {
+ rsb_iter_free(ri);
+ return NULL;
+ }
+
+ return ri;
+}
+
+static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+ /* nothing for now */
+}
+
+static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+{
+ struct rsb_iter *ri = iter_ptr;
+
+ print_resource(ri->rsb, file);
+
+ return 0;
+}
+
+static struct seq_operations rsb_seq_ops = {
+ .start = rsb_seq_start,
+ .next = rsb_seq_next,
+ .stop = rsb_seq_stop,
+ .show = rsb_seq_show,
+};
+
+static int rsb_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &rsb_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private;
+
+ return 0;
+}
+
+static struct file_operations rsb_fops = {
+ .owner = THIS_MODULE,
+ .open = rsb_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+/*
+ * dump lkb's on the ls_waiters list
+ */
+
+static int waiters_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct dlm_ls *ls = file->private_data;
+ struct dlm_lkb *lkb;
+ size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
+
+ mutex_lock(&debug_buf_lock);
+ mutex_lock(&ls->ls_waiters_mutex);
+ memset(debug_buf, 0, sizeof(debug_buf));
+
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
+ lkb->lkb_id, lkb->lkb_wait_type,
+ lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+ if (ret >= len - pos)
+ break;
+ pos += ret;
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+
+ rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
+ mutex_unlock(&debug_buf_lock);
+ return rv;
+}
+
+static struct file_operations waiters_fops = {
+ .owner = THIS_MODULE,
+ .open = waiters_open,
+ .read = waiters_read
+};
+
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+ char name[DLM_LOCKSPACE_LEN+8];
+
+ ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+ S_IFREG | S_IRUGO,
+ dlm_root,
+ ls,
+ &rsb_fops);
+ if (!ls->ls_debug_rsb_dentry)
+ return -ENOMEM;
+
+ memset(name, 0, sizeof(name));
+ snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+ ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+ S_IFREG | S_IRUGO,
+ dlm_root,
+ ls,
+ &waiters_fops);
+ if (!ls->ls_debug_waiters_dentry) {
+ debugfs_remove(ls->ls_debug_rsb_dentry);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+ if (ls->ls_debug_rsb_dentry)
+ debugfs_remove(ls->ls_debug_rsb_dentry);
+ if (ls->ls_debug_waiters_dentry)
+ debugfs_remove(ls->ls_debug_waiters_dentry);
+}
+
+int dlm_register_debugfs(void)
+{
+ mutex_init(&debug_buf_lock);
+ dlm_root = debugfs_create_dir("dlm", NULL);
+ return dlm_root ? 0 : -ENOMEM;
+}
+
+void dlm_unregister_debugfs(void)
+{
+ debugfs_remove(dlm_root);
+}
+
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+
+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+ spin_lock(&ls->ls_recover_list_lock);
+ list_add(&de->list, &ls->ls_recover_list);
+ spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
+{
+ int found = 0;
+ struct dlm_direntry *de;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ list_for_each_entry(de, &ls->ls_recover_list, list) {
+ if (de->length == len) {
+ list_del(&de->list);
+ de->master_nodeid = 0;
+ memset(de->name, 0, len);
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock(&ls->ls_recover_list_lock);
+
+ if (!found)
+ de = allocate_direntry(ls, len);
+ return de;
+}
+
+void dlm_clear_free_entries(struct dlm_ls *ls)
+{
+ struct dlm_direntry *de;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ while (!list_empty(&ls->ls_recover_list)) {
+ de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
+ list);
+ list_del(&de->list);
+ free_direntry(de);
+ }
+ spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
+ * num_nodes to the hash value. This value in the desired range is used as an
+ * offset into the sorted list of nodeid's to give the particular nodeid.
+ */
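+
+/*
+ * Worked example (editor's illustration, not part of the original patch):
+ * with four nodes of equal weight the node array has four entries, so for
+ * hash 0x8a3f0000 the offset is (0x8a3f0000 >> 16) % 4 = 35391 % 4 = 3,
+ * and the fourth nodeid in the sorted node array is the directory node
+ * for that resource.
+ */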
+
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+ struct list_head *tmp;
+ struct dlm_member *memb = NULL;
+ uint32_t node, n = 0;
+ int nodeid;
+
+ if (ls->ls_num_nodes == 1) {
+ nodeid = dlm_our_nodeid();
+ goto out;
+ }
+
+ if (ls->ls_node_array) {
+ node = (hash >> 16) % ls->ls_total_weight;
+ nodeid = ls->ls_node_array[node];
+ goto out;
+ }
+
+ /* make_member_array() failed to kmalloc ls_node_array... */
+
+ node = (hash >> 16) % ls->ls_num_nodes;
+
+ list_for_each(tmp, &ls->ls_nodes) {
+ if (n++ != node)
+ continue;
+ memb = list_entry(tmp, struct dlm_member, list);
+ break;
+ }
+
+ DLM_ASSERT(memb, printk("num_nodes=%u n=%u node=%u\n",
+ ls->ls_num_nodes, n, node););
+ nodeid = memb->nodeid;
+ out:
+ return nodeid;
+}
+
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+ return dlm_hash2nodeid(r->res_ls, r->res_hash);
+}
+
+static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
+{
+ uint32_t val;
+
+ val = jhash(name, len, 0);
+ val &= (ls->ls_dirtbl_size - 1);
+
+ return val;
+}
+
+static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+ uint32_t bucket;
+
+ bucket = dir_hash(ls, de->name, de->length);
+ list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+}
+
+static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
+ int namelen, uint32_t bucket)
+{
+ struct dlm_direntry *de;
+
+ list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
+ if (de->length == namelen && !memcmp(name, de->name, namelen))
+ goto out;
+ }
+ de = NULL;
+ out:
+ return de;
+}
+
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
+{
+ struct dlm_direntry *de;
+ uint32_t bucket;
+
+ bucket = dir_hash(ls, name, namelen);
+
+ write_lock(&ls->ls_dirtbl[bucket].lock);
+
+ de = search_bucket(ls, name, namelen, bucket);
+
+ if (!de) {
+ log_error(ls, "remove fr %u none", nodeid);
+ goto out;
+ }
+
+ if (de->master_nodeid != nodeid) {
+ log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
+ goto out;
+ }
+
+ list_del(&de->list);
+ free_direntry(de);
+ out:
+ write_unlock(&ls->ls_dirtbl[bucket].lock);
+}
+
+void dlm_dir_clear(struct dlm_ls *ls)
+{
+ struct list_head *head;
+ struct dlm_direntry *de;
+ int i;
+
+ DLM_ASSERT(list_empty(&ls->ls_recover_list), );
+
+ for (i = 0; i < ls->ls_dirtbl_size; i++) {
+ write_lock(&ls->ls_dirtbl[i].lock);
+ head = &ls->ls_dirtbl[i].list;
+ while (!list_empty(head)) {
+ de = list_entry(head->next, struct dlm_direntry, list);
+ list_del(&de->list);
+ put_free_de(ls, de);
+ }
+ write_unlock(&ls->ls_dirtbl[i].lock);
+ }
+}
+
+int dlm_recover_directory(struct dlm_ls *ls)
+{
+ struct dlm_member *memb;
+ struct dlm_direntry *de;
+ char *b, *last_name = NULL;
+ int error = -ENOMEM, last_len, count = 0;
+ uint16_t namelen;
+
+ log_debug(ls, "dlm_recover_directory");
+
+ if (dlm_no_directory(ls))
+ goto out_status;
+
+ dlm_dir_clear(ls);
+
+ last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
+ if (!last_name)
+ goto out;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ memset(last_name, 0, DLM_RESNAME_MAXLEN);
+ last_len = 0;
+
+ for (;;) {
+ error = dlm_recovery_stopped(ls);
+ if (error)
+ goto out_free;
+
+ error = dlm_rcom_names(ls, memb->nodeid,
+ last_name, last_len);
+ if (error)
+ goto out_free;
+
+ schedule();
+
+ /*
+ * pick namelen/name pairs out of received buffer
+ */
+
+ b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
+
+ for (;;) {
+ memcpy(&namelen, b, sizeof(uint16_t));
+ namelen = be16_to_cpu(namelen);
+ b += sizeof(uint16_t);
+
+ /* namelen of 0xFFFF marks end of names for
+ this node; namelen of 0 marks end of the
+ buffer */
+
+ if (namelen == 0xFFFF)
+ goto done;
+ if (!namelen)
+ break;
+
+ error = -ENOMEM;
+ de = get_free_de(ls, namelen);
+ if (!de)
+ goto out_free;
+
+ de->master_nodeid = memb->nodeid;
+ de->length = namelen;
+ last_len = namelen;
+ memcpy(de->name, b, namelen);
+ memcpy(last_name, b, namelen);
+ b += namelen;
+
+ add_entry_to_hash(ls, de);
+ count++;
+ }
+ }
+ done:
+ ;
+ }
+
+ out_status:
+ error = 0;
+ dlm_set_recover_status(ls, DLM_RS_DIR);
+ log_debug(ls, "dlm_recover_directory %d entries", count);
+ out_free:
+ kfree(last_name);
+ out:
+ dlm_clear_free_entries(ls);
+ return error;
+}
+
+static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
+ int namelen, int *r_nodeid)
+{
+ struct dlm_direntry *de, *tmp;
+ uint32_t bucket;
+
+ bucket = dir_hash(ls, name, namelen);
+
+ write_lock(&ls->ls_dirtbl[bucket].lock);
+ de = search_bucket(ls, name, namelen, bucket);
+ if (de) {
+ *r_nodeid = de->master_nodeid;
+ write_unlock(&ls->ls_dirtbl[bucket].lock);
+ if (*r_nodeid == nodeid)
+ return -EEXIST;
+ return 0;
+ }
+
+ write_unlock(&ls->ls_dirtbl[bucket].lock);
+
+ de = allocate_direntry(ls, namelen);
+ if (!de)
+ return -ENOMEM;
+
+ de->master_nodeid = nodeid;
+ de->length = namelen;
+ memcpy(de->name, name, namelen);
+
+ write_lock(&ls->ls_dirtbl[bucket].lock);
+ tmp = search_bucket(ls, name, namelen, bucket);
+ if (tmp) {
+ free_direntry(de);
+ de = tmp;
+ } else {
+ list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+ }
+ *r_nodeid = de->master_nodeid;
+ write_unlock(&ls->ls_dirtbl[bucket].lock);
+ return 0;
+}
+
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+ int *r_nodeid)
+{
+ return get_entry(ls, nodeid, name, namelen, r_nodeid);
+}
+
+/* Copy the names of master rsb's into the buffer provided.
+ Only select names whose dir node is the given nodeid. */
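+
+/* Illustrative layout of the outbuf block built below (editor's sketch):
+ a sequence of big-endian uint16_t name lengths, each followed by the name
+ bytes, terminated by 0x0000 when the buffer fills (more names follow in
+ the next request) or by 0xFFFF when all names for this node are done:
+ [be16 len][name][be16 len][name] ... [be16 0x0000 or 0xFFFF] */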
+
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+ char *outbuf, int outlen, int nodeid)
+{
+ struct list_head *list;
+ struct dlm_rsb *start_r = NULL, *r = NULL;
+ int offset = 0, start_namelen, error, dir_nodeid;
+ char *start_name;
+ uint16_t be_namelen;
+
+ /*
+ * Find the rsb where we left off (or start again)
+ */
+
+ start_namelen = inlen;
+ start_name = inbuf;
+
+ if (start_namelen > 1) {
+ /*
+ * We could also use a find_rsb_root() function here that
+ * searched the ls_root_list.
+ */
+ error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
+ &start_r);
+ DLM_ASSERT(!error && start_r,
+ printk("error %d\n", error););
+ DLM_ASSERT(!list_empty(&start_r->res_root_list),
+ dlm_print_rsb(start_r););
+ dlm_put_rsb(start_r);
+ }
+
+ /*
+ * Send rsb names for rsb's we're master of and whose directory node
+ * matches the requesting node.
+ */
+
+ down_read(&ls->ls_root_sem);
+ if (start_r)
+ list = start_r->res_root_list.next;
+ else
+ list = ls->ls_root_list.next;
+
+ for (offset = 0; list != &ls->ls_root_list; list = list->next) {
+ r = list_entry(list, struct dlm_rsb, res_root_list);
+ if (r->res_nodeid)
+ continue;
+
+ dir_nodeid = dlm_dir_nodeid(r);
+ if (dir_nodeid != nodeid)
+ continue;
+
+ /*
+ * The block ends when we can't fit the following in the
+ * remaining buffer space:
+ * namelen (uint16_t) +
+ * name (r->res_length) +
+ * end-of-block record 0x0000 (uint16_t)
+ */
+
+ if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
+ /* Write end-of-block record */
+ be_namelen = 0;
+ memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+ offset += sizeof(uint16_t);
+ goto out;
+ }
+
+ be_namelen = cpu_to_be16(r->res_length);
+ memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+ offset += sizeof(uint16_t);
+ memcpy(outbuf + offset, r->res_name, r->res_length);
+ offset += r->res_length;
+ }
+
+ /*
+ * If we've reached the end of the list (and there's room) write a
+ * terminating record.
+ */
+
+ if ((list == &ls->ls_root_list) &&
+ (offset + sizeof(uint16_t) <= outlen)) {
+ be_namelen = 0xFFFF;
+ memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+ offset += sizeof(uint16_t);
+ }
+
+ out:
+ up_read(&ls->ls_root_sem);
+}
+
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+
+int dlm_dir_nodeid(struct dlm_rsb *rsb);
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
+void dlm_dir_clear(struct dlm_ls *ls);
+void dlm_clear_free_entries(struct dlm_ls *ls);
+int dlm_recover_directory(struct dlm_ls *ls);
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+ int *r_nodeid);
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+ char *outbuf, int outlen, int nodeid);
+
+#endif /* __DIR_DOT_H__ */
+
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DLM_INTERNAL_DOT_H__
+#define __DLM_INTERNAL_DOT_H__
+
+/*
+ * This is the main header file to be included in each DLM source file.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+#include <linux/dlm.h>
+
+#define DLM_LOCKSPACE_LEN 64
+
+/* Size of the temp buffer midcomms allocates on the stack.
+ We try to make this large enough so most messages fit.
+ FIXME: should sctp make this unnecessary? */
+
+#define DLM_INBUF_LEN 148
+
+struct dlm_ls;
+struct dlm_lkb;
+struct dlm_rsb;
+struct dlm_member;
+struct dlm_lkbtable;
+struct dlm_rsbtable;
+struct dlm_dirtable;
+struct dlm_direntry;
+struct dlm_recover;
+struct dlm_header;
+struct dlm_message;
+struct dlm_rcom;
+struct dlm_mhandle;
+
+#define log_print(fmt, args...) \
+ printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_error(ls, fmt, args...) \
+ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+
+#define DLM_LOG_DEBUG
+#ifdef DLM_LOG_DEBUG
+#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
+#else
+#define log_debug(ls, fmt, args...)
+#endif
+
+#define DLM_ASSERT(x, do) \
+{ \
+ if (!(x)) \
+ { \
+ printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \
+ "DLM: assertion: \"%s\"\n" \
+ "DLM: time = %lu\n", \
+ __LINE__, __FILE__, #x, jiffies); \
+ {do} \
+ printk("\n"); \
+ BUG(); \
+ panic("DLM: Record message above and reboot.\n"); \
+ } \
+}
+
+#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
+
+
+struct dlm_direntry {
+ struct list_head list;
+ uint32_t master_nodeid;
+ uint16_t length;
+ char name[1];
+};
+
+struct dlm_dirtable {
+ struct list_head list;
+ rwlock_t lock;
+};
+
+struct dlm_rsbtable {
+ struct list_head list;
+ struct list_head toss;
+ rwlock_t lock;
+};
+
+struct dlm_lkbtable {
+ struct list_head list;
+ rwlock_t lock;
+ uint16_t counter;
+};
+
+/*
+ * Lockspace member (per node in a ls)
+ */
+
+struct dlm_member {
+ struct list_head list;
+ int nodeid;
+ int weight;
+};
+
+/*
+ * Save and manage recovery state for a lockspace.
+ */
+
+struct dlm_recover {
+ struct list_head list;
+ int *nodeids;
+ int node_count;
+ uint64_t seq;
+};
+
+/*
+ * Pass input args to second stage locking function.
+ */
+
+struct dlm_args {
+ uint32_t flags;
+ void *astaddr;
+ long astparam;
+ void *bastaddr;
+ int mode;
+ struct dlm_lksb *lksb;
+};
+
+
+/*
+ * Lock block
+ *
+ * A lock can be one of three types:
+ *
+ * local copy lock is mastered locally
+ * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
+ * process copy lock is mastered on a remote node
+ * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
+ * master copy master node's copy of a lock owned by remote node
+ * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
+ *
+ * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
+ * dlm_unlock. The dlm does not modify these or use any private flags in
+ * this field; it only contains DLM_LKF_ flags from dlm.h. These flags
+ * are sent as-is to the remote master when the lock is remote.
+ *
+ * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
+ * Some internal flags are shared between the master and process nodes;
+ * these shared flags are kept in the lower two bytes. One of these
+ * flags set on the master copy will be propagated to the process copy
+ * and vice versa. Other internal flags are private to the master or process
+ * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes.
+ *
+ * lkb_sbflags: status block flags. These flags are copied directly into
+ * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
+ * ast. All defined in dlm.h with DLM_SBF_ prefix.
+ *
+ * lkb_status: the lock status indicates which rsb queue the lock is
+ * on, grant, convert, or wait. DLM_LKSTS_ WAITING/GRANTED/CONVERT
+ *
+ * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
+ * reply is needed. Only set when the lkb is on the lockspace waiters
+ * list awaiting a reply from a remote node.
+ *
+ * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
+ * is a master copy, nodeid specifies the remote lock holder; when the
+ * lkb is a process copy, the nodeid specifies the lock master.
+ */
+
+/* lkb_ast_type */
+
+#define AST_COMP 1
+#define AST_BAST 2
+
+/* lkb_status */
+
+#define DLM_LKSTS_WAITING 1
+#define DLM_LKSTS_GRANTED 2
+#define DLM_LKSTS_CONVERT 3
+
+/* lkb_flags */
+
+#define DLM_IFL_MSTCPY 0x00010000
+#define DLM_IFL_RESEND 0x00020000
+#define DLM_IFL_DEAD 0x00040000
+#define DLM_IFL_USER 0x00000001
+#define DLM_IFL_ORPHAN 0x00000002
+
+struct dlm_lkb {
+ struct dlm_rsb *lkb_resource; /* the rsb */
+ struct kref lkb_ref;
+ int lkb_nodeid; /* copied from rsb */
+ int lkb_ownpid; /* pid of lock owner */
+ uint32_t lkb_id; /* our lock ID */
+ uint32_t lkb_remid; /* lock ID on remote partner */
+ uint32_t lkb_exflags; /* external flags from caller */
+ uint32_t lkb_sbflags; /* lksb flags */
+ uint32_t lkb_flags; /* internal flags */
+ uint32_t lkb_lvbseq; /* lvb sequence number */
+
+ int8_t lkb_status; /* granted, waiting, convert */
+ int8_t lkb_rqmode; /* requested lock mode */
+ int8_t lkb_grmode; /* granted lock mode */
+ int8_t lkb_bastmode; /* requested mode */
+ int8_t lkb_highbast; /* highest mode bast sent for */
+
+ int8_t lkb_wait_type; /* type of reply waiting for */
+ int8_t lkb_ast_type; /* type of ast queued for */
+
+ struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
+ struct list_head lkb_statequeue; /* rsb g/c/w list */
+ struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
+ struct list_head lkb_wait_reply; /* waiting for remote reply */
+ struct list_head lkb_astqueue; /* need ast to be sent */
+ struct list_head lkb_ownqueue; /* list of locks for a process */
+
+ char *lkb_lvbptr;
+ struct dlm_lksb *lkb_lksb; /* caller's status block */
+ void *lkb_astaddr; /* caller's ast function */
+ void *lkb_bastaddr; /* caller's bast function */
+ long lkb_astparam; /* caller's ast arg */
+};
+
+
+struct dlm_rsb {
+ struct dlm_ls *res_ls; /* the lockspace */
+ struct kref res_ref;
+ struct mutex res_mutex;
+ unsigned long res_flags;
+ int res_length; /* length of rsb name */
+ int res_nodeid;
+ uint32_t res_lvbseq;
+ uint32_t res_hash;
+ uint32_t res_bucket; /* rsbtbl */
+ unsigned long res_toss_time;
+ uint32_t res_first_lkid;
+ struct list_head res_lookup; /* lkbs waiting on first */
+ struct list_head res_hashchain; /* rsbtbl */
+ struct list_head res_grantqueue;
+ struct list_head res_convertqueue;
+ struct list_head res_waitqueue;
+
+ struct list_head res_root_list; /* used for recovery */
+ struct list_head res_recover_list; /* used for recovery */
+ int res_recover_locks_count;
+
+ char *res_lvbptr;
+ char res_name[1];
+};
+
+/* find_rsb() flags */
+
+#define R_MASTER 1 /* only return rsb if it's a master */
+#define R_CREATE 2 /* create/add rsb if not found */
+
+/* rsb_flags */
+
+enum rsb_flags {
+ RSB_MASTER_UNCERTAIN,
+ RSB_VALNOTVALID,
+ RSB_VALNOTVALID_PREV,
+ RSB_NEW_MASTER,
+ RSB_NEW_MASTER2,
+ RSB_RECOVER_CONVERT,
+ RSB_LOCKS_PURGED,
+};
+
+static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+ __set_bit(flag, &r->res_flags);
+}
+
+static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+ __clear_bit(flag, &r->res_flags);
+}
+
+static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+ return test_bit(flag, &r->res_flags);
+}
+
+
+/* dlm_header is first element of all structs sent between nodes */
+
+#define DLM_HEADER_MAJOR 0x00020000
+#define DLM_HEADER_MINOR 0x00000001
+
+#define DLM_MSG 1
+#define DLM_RCOM 2
+
+struct dlm_header {
+ uint32_t h_version;
+ uint32_t h_lockspace;
+ uint32_t h_nodeid; /* nodeid of sender */
+ uint16_t h_length;
+ uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */
+ uint8_t h_pad;
+};
+
+
+#define DLM_MSG_REQUEST 1
+#define DLM_MSG_CONVERT 2
+#define DLM_MSG_UNLOCK 3
+#define DLM_MSG_CANCEL 4
+#define DLM_MSG_REQUEST_REPLY 5
+#define DLM_MSG_CONVERT_REPLY 6
+#define DLM_MSG_UNLOCK_REPLY 7
+#define DLM_MSG_CANCEL_REPLY 8
+#define DLM_MSG_GRANT 9
+#define DLM_MSG_BAST 10
+#define DLM_MSG_LOOKUP 11
+#define DLM_MSG_REMOVE 12
+#define DLM_MSG_LOOKUP_REPLY 13
+
+struct dlm_message {
+ struct dlm_header m_header;
+ uint32_t m_type; /* DLM_MSG_ */
+ uint32_t m_nodeid;
+ uint32_t m_pid;
+ uint32_t m_lkid; /* lkid on sender */
+ uint32_t m_remid; /* lkid on receiver */
+ uint32_t m_parent_lkid;
+ uint32_t m_parent_remid;
+ uint32_t m_exflags;
+ uint32_t m_sbflags;
+ uint32_t m_flags;
+ uint32_t m_lvbseq;
+ uint32_t m_hash;
+ int m_status;
+ int m_grmode;
+ int m_rqmode;
+ int m_bastmode;
+ int m_asts;
+ int m_result; /* 0 or -EXXX */
+ char m_extra[0]; /* name or lvb */
+};
+
+
+#define DLM_RS_NODES 0x00000001
+#define DLM_RS_NODES_ALL 0x00000002
+#define DLM_RS_DIR 0x00000004
+#define DLM_RS_DIR_ALL 0x00000008
+#define DLM_RS_LOCKS 0x00000010
+#define DLM_RS_LOCKS_ALL 0x00000020
+#define DLM_RS_DONE 0x00000040
+#define DLM_RS_DONE_ALL 0x00000080
+
+#define DLM_RCOM_STATUS 1
+#define DLM_RCOM_NAMES 2
+#define DLM_RCOM_LOOKUP 3
+#define DLM_RCOM_LOCK 4
+#define DLM_RCOM_STATUS_REPLY 5
+#define DLM_RCOM_NAMES_REPLY 6
+#define DLM_RCOM_LOOKUP_REPLY 7
+#define DLM_RCOM_LOCK_REPLY 8
+
+struct dlm_rcom {
+ struct dlm_header rc_header;
+ uint32_t rc_type; /* DLM_RCOM_ */
+ int rc_result; /* multi-purpose */
+ uint64_t rc_id; /* match reply with request */
+ char rc_buf[0];
+};
+
+struct rcom_config {
+ uint32_t rf_lvblen;
+ uint32_t rf_lsflags;
+ uint64_t rf_unused;
+};
+
+struct rcom_lock {
+ uint32_t rl_ownpid;
+ uint32_t rl_lkid;
+ uint32_t rl_remid;
+ uint32_t rl_parent_lkid;
+ uint32_t rl_parent_remid;
+ uint32_t rl_exflags;
+ uint32_t rl_flags;
+ uint32_t rl_lvbseq;
+ int rl_result;
+ int8_t rl_rqmode;
+ int8_t rl_grmode;
+ int8_t rl_status;
+ int8_t rl_asts;
+ uint16_t rl_wait_type;
+ uint16_t rl_namelen;
+ char rl_name[DLM_RESNAME_MAXLEN];
+ char rl_lvb[0];
+};
+
+struct dlm_ls {
+ struct list_head ls_list; /* list of lockspaces */
+ dlm_lockspace_t *ls_local_handle;
+ uint32_t ls_global_id; /* global unique lockspace ID */
+ uint32_t ls_exflags;
+ int ls_lvblen;
+ int ls_count; /* reference count */
+ unsigned long ls_flags; /* LSFL_ */
+ struct kobject ls_kobj;
+
+ struct dlm_rsbtable *ls_rsbtbl;
+ uint32_t ls_rsbtbl_size;
+
+ struct dlm_lkbtable *ls_lkbtbl;
+ uint32_t ls_lkbtbl_size;
+
+ struct dlm_dirtable *ls_dirtbl;
+ uint32_t ls_dirtbl_size;
+
+ struct mutex ls_waiters_mutex;
+ struct list_head ls_waiters; /* lkbs needing a reply */
+
+ struct list_head ls_nodes; /* current nodes in ls */
+ struct list_head ls_nodes_gone; /* dead node list, recovery */
+ int ls_num_nodes; /* number of nodes in ls */
+ int ls_low_nodeid;
+ int ls_total_weight;
+ int *ls_node_array;
+
+ struct dlm_rsb ls_stub_rsb; /* for returning errors */
+ struct dlm_lkb ls_stub_lkb; /* for returning errors */
+ struct dlm_message ls_stub_ms; /* for faking a reply */
+
+ struct dentry *ls_debug_rsb_dentry; /* debugfs */
+ struct dentry *ls_debug_waiters_dentry; /* debugfs */
+
+ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
+ int ls_uevent_result;
+
+ struct miscdevice ls_device;
+
+ /* recovery related */
+
+ struct timer_list ls_timer;
+ struct task_struct *ls_recoverd_task;
+ struct mutex ls_recoverd_active;
+ spinlock_t ls_recover_lock;
+ uint32_t ls_recover_status; /* DLM_RS_ */
+ uint64_t ls_recover_seq;
+ struct dlm_recover *ls_recover_args;
+ struct rw_semaphore ls_in_recovery; /* block local requests */
+ struct list_head ls_requestqueue;/* queue remote requests */
+ struct mutex ls_requestqueue_mutex;
+ char *ls_recover_buf;
+ int ls_recover_nodeid; /* for debugging */
+ uint64_t ls_rcom_seq;
+ struct list_head ls_recover_list;
+ spinlock_t ls_recover_list_lock;
+ int ls_recover_list_count;
+ wait_queue_head_t ls_wait_general;
+ struct mutex ls_clear_proc_locks;
+
+ struct list_head ls_root_list; /* root resources */
+ struct rw_semaphore ls_root_sem; /* protect root_list */
+
+ int ls_namelen;
+ char ls_name[1];
+};
+
+#define LSFL_WORK 0
+#define LSFL_RUNNING 1
+#define LSFL_RECOVERY_STOP 2
+#define LSFL_RCOM_READY 3
+#define LSFL_UEVENT_WAIT 4
+
+/* much of this is just saving user space pointers associated with the
+ lock that we pass back to the user lib with an ast */
+
+struct dlm_user_args {
+ struct dlm_user_proc *proc; /* each process that opens the lockspace
+ device has private data
+ (dlm_user_proc) on the struct file,
+ the process's locks point back to it*/
+ struct dlm_lksb lksb;
+ int old_mode;
+ int update_user_lvb;
+ struct dlm_lksb __user *user_lksb;
+ void __user *castparam;
+ void __user *castaddr;
+ void __user *bastparam;
+ void __user *bastaddr;
+};
+
+#define DLM_PROC_FLAGS_CLOSING 1
+#define DLM_PROC_FLAGS_COMPAT 2
+
+/* locks list is kept so we can remove all a process's locks when it
+ exits (or orphan those that are persistent) */
+
+struct dlm_user_proc {
+ dlm_lockspace_t *lockspace;
+ unsigned long flags; /* DLM_PROC_FLAGS */
+ struct list_head asts;
+ spinlock_t asts_spin;
+ struct list_head locks;
+ spinlock_t locks_spin;
+ wait_queue_head_t wait;
+};
+
+static inline int dlm_locking_stopped(struct dlm_ls *ls)
+{
+ return !test_bit(LSFL_RUNNING, &ls->ls_flags);
+}
+
+static inline int dlm_recovery_stopped(struct dlm_ls *ls)
+{
+ return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+}
+
+static inline int dlm_no_directory(struct dlm_ls *ls)
+{
+ return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
+}
+
+#endif /* __DLM_INTERNAL_DOT_H__ */
+
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* Central locking logic has four stages:
+
+ dlm_lock()
+ dlm_unlock()
+
+ request_lock(ls, lkb)
+ convert_lock(ls, lkb)
+ unlock_lock(ls, lkb)
+ cancel_lock(ls, lkb)
+
+ _request_lock(r, lkb)
+ _convert_lock(r, lkb)
+ _unlock_lock(r, lkb)
+ _cancel_lock(r, lkb)
+
+ do_request(r, lkb)
+ do_convert(r, lkb)
+ do_unlock(r, lkb)
+ do_cancel(r, lkb)
+
+ Stage 1 (lock, unlock) is mainly about checking input args and
+ splitting into one of the four main operations:
+
+ dlm_lock = request_lock
+ dlm_lock+CONVERT = convert_lock
+ dlm_unlock = unlock_lock
+ dlm_unlock+CANCEL = cancel_lock
+
+ Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+ provided to the next stage.
+
+ Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+ When remote, it calls send_xxxx(), when local it calls do_xxxx().
+
+ Stage 4, do_xxxx(), is the guts of the operation. It manipulates the
+ given rsb and lkb and queues callbacks.
+
+ For remote operations, send_xxxx() results in the corresponding do_xxxx()
+ function being executed on the remote node. The connecting send/receive
+ calls on local (L) and remote (R) nodes:
+
+ L: send_xxxx() -> R: receive_xxxx()
+ R: do_xxxx()
+ L: receive_xxxx_reply() <- R: send_xxxx_reply()
+*/
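+
+/*
+ * Example flow (editor's illustration): for xxxx = request, a request for
+ * a locally mastered rsb runs dlm_lock() -> request_lock() ->
+ * _request_lock() -> do_request() entirely on this node.  For a remotely
+ * mastered rsb, _request_lock() calls send_request() here, the master runs
+ * receive_request() -> do_request() -> send_request_reply(), and
+ * receive_request_reply() completes the operation back on this node.
+ */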
+#include <linux/types.h>
+#include "dlm_internal.h"
+#include <linux/dlm_device.h>
+#include "memory.h"
+#include "lowcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "config.h"
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms);
+static int receive_extralen(struct dlm_message *ms);
+
+/*
+ * Lock compatibility matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same)
+ */
+
+static const int __dlm_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */
+ {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */
+ {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */
+ {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */
+ {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */
+ {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */
+ {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
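+
+/*
+ * Example (editor's illustration): a granted PR lock is compatible with
+ * another PR request but not with a CW request:
+ *   __dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_PR + 1] == 1
+ *   __dlm_compat_matrix[DLM_LOCK_PR + 1][DLM_LOCK_CW + 1] == 0
+ */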
+
+/*
+ * This defines the direction of transfer of LVB data.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ * 1 = LVB is returned to the caller
+ * 0 = LVB is written to the resource
+ * -1 = nothing happens to the LVB
+ */
+
+const int dlm_lvb_operations[8][8] = {
+ /* UN NL CR CW PR PW EX PD*/
+ { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */
+ { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */
+ { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */
+ { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */
+ { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */
+ { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */
+ { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */
+ { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */
+};
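+
+/*
+ * Example (editor's illustration): converting down from EX to NL writes
+ * the caller's LVB to the resource (matrix entry [EX+1][NL+1] == 0), while
+ * converting up from NL to EX returns the resource's LVB to the caller
+ * (matrix entry [NL+1][EX+1] == 1).
+ */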
+
+#define modes_compat(gr, rq) \
+ __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+
+int dlm_modes_compat(int mode1, int mode2)
+{
+ return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
+}
+
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ */
+
+static const int __quecvt_compat_matrix[8][8] = {
+ /* UN NL CR CW PR PW EX PD */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */
+ {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */
+ {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */
+ {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */
+ {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */
+ {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */
+ {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */
+ {0, 0, 0, 0, 0, 0, 0, 0} /* PD */
+};
+
+void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+ printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
+ " status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
+ lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+ lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
+ lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
+}
+
+void dlm_print_rsb(struct dlm_rsb *r)
+{
+ printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
+ r->res_nodeid, r->res_flags, r->res_first_lkid,
+ r->res_recover_locks_count, r->res_name);
+}
+
+void dlm_dump_rsb(struct dlm_rsb *r)
+{
+ struct dlm_lkb *lkb;
+
+ dlm_print_rsb(r);
+
+ printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
+ list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
+ printk(KERN_ERR "rsb lookup list\n");
+ list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+ dlm_print_lkb(lkb);
+ printk(KERN_ERR "rsb grant queue:\n");
+ list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+ dlm_print_lkb(lkb);
+ printk(KERN_ERR "rsb convert queue:\n");
+ list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+ dlm_print_lkb(lkb);
+ printk(KERN_ERR "rsb wait queue:\n");
+ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+ dlm_print_lkb(lkb);
+}
+
+/* Threads cannot use the lockspace while it's being recovered */
+
+static inline void lock_recovery(struct dlm_ls *ls)
+{
+ down_read(&ls->ls_in_recovery);
+}
+
+static inline void unlock_recovery(struct dlm_ls *ls)
+{
+ up_read(&ls->ls_in_recovery);
+}
+
+static inline int lock_recovery_try(struct dlm_ls *ls)
+{
+ return down_read_trylock(&ls->ls_in_recovery);
+}
+
+static inline int can_be_queued(struct dlm_lkb *lkb)
+{
+ return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
+}
+
+static inline int force_blocking_asts(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
+}
+
+static inline int is_demoted(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
+}
+
+static inline int is_remote(struct dlm_rsb *r)
+{
+ DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+ return !!r->res_nodeid;
+}
+
+static inline int is_process_copy(struct dlm_lkb *lkb)
+{
+ return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
+}
+
+static inline int is_master_copy(struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
+ return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
+}
+
+static inline int middle_conversion(struct dlm_lkb *lkb)
+{
+ if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
+ (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
+ return 1;
+ return 0;
+}
+
+static inline int down_conversion(struct dlm_lkb *lkb)
+{
+ return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
+}
+
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ if (is_master_copy(lkb))
+ return;
+
+ DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+
+ lkb->lkb_lksb->sb_status = rv;
+ lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
+
+ dlm_add_ast(lkb, AST_COMP);
+}
+
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+ if (is_master_copy(lkb))
+ send_bast(r, lkb, rqmode);
+ else {
+ lkb->lkb_bastmode = rqmode;
+ dlm_add_ast(lkb, AST_BAST);
+ }
+}
+
+/*
+ * Basic operations on rsb's and lkb's
+ */
+
+static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
+{
+ struct dlm_rsb *r;
+
+ r = allocate_rsb(ls, len);
+ if (!r)
+ return NULL;
+
+ r->res_ls = ls;
+ r->res_length = len;
+ memcpy(r->res_name, name, len);
+ mutex_init(&r->res_mutex);
+
+ INIT_LIST_HEAD(&r->res_lookup);
+ INIT_LIST_HEAD(&r->res_grantqueue);
+ INIT_LIST_HEAD(&r->res_convertqueue);
+ INIT_LIST_HEAD(&r->res_waitqueue);
+ INIT_LIST_HEAD(&r->res_root_list);
+ INIT_LIST_HEAD(&r->res_recover_list);
+
+ return r;
+}
+
+static int search_rsb_list(struct list_head *head, char *name, int len,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r;
+ int error = 0;
+
+ list_for_each_entry(r, head, res_hashchain) {
+ if (len == r->res_length && !memcmp(name, r->res_name, len))
+ goto found;
+ }
+ return -EBADR;
+
+ found:
+ if (r->res_nodeid && (flags & R_MASTER))
+ error = -ENOTBLK;
+ *r_ret = r;
+ return error;
+}
+
+static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+ if (!error) {
+ kref_get(&r->res_ref);
+ goto out;
+ }
+ error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+ if (error)
+ goto out;
+
+ list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+
+ if (dlm_no_directory(ls))
+ goto out;
+
+ if (r->res_nodeid == -1) {
+ rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+ r->res_first_lkid = 0;
+ } else if (r->res_nodeid > 0) {
+ rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
+ r->res_first_lkid = 0;
+ } else {
+ DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
+ DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
+ }
+ out:
+ *r_ret = r;
+ return error;
+}
+
+static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ int error;
+ write_lock(&ls->ls_rsbtbl[b].lock);
+ error = _search_rsb(ls, name, len, b, flags, r_ret);
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ return error;
+}
+
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Delaying the release of rsb's has a similar benefit to applications keeping
+ * NL locks on an rsb, but without the guarantee that the cached master value
+ * will still be valid when the rsb is reused. Apps aren't always smart enough
+ * to keep NL locks on an rsb that they may lock again shortly; this can lead
+ * to excessive master lookups and removals if we don't delay the release.
+ *
+ * Searching for an rsb means looking through both the normal list and toss
+ * list. When found on the toss list the rsb is moved to the normal list with
+ * ref count of 1; when found on normal list the ref count is incremented.
+ */
+
+static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ struct dlm_rsb *r, *tmp;
+ uint32_t hash, bucket;
+ int error = 0;
+
+ if (dlm_no_directory(ls))
+ flags |= R_CREATE;
+
+ hash = jhash(name, namelen, 0);
+ bucket = hash & (ls->ls_rsbtbl_size - 1);
+
+ error = search_rsb(ls, name, namelen, bucket, flags, &r);
+ if (!error)
+ goto out;
+
+ if (error == -EBADR && !(flags & R_CREATE))
+ goto out;
+
+ /* the rsb was found but wasn't a master copy */
+ if (error == -ENOTBLK)
+ goto out;
+
+ error = -ENOMEM;
+ r = create_rsb(ls, name, namelen);
+ if (!r)
+ goto out;
+
+ r->res_hash = hash;
+ r->res_bucket = bucket;
+ r->res_nodeid = -1;
+ kref_init(&r->res_ref);
+
+ /* With no directory, the master can be set immediately */
+ if (dlm_no_directory(ls)) {
+ int nodeid = dlm_dir_nodeid(r);
+ if (nodeid == dlm_our_nodeid())
+ nodeid = 0;
+ r->res_nodeid = nodeid;
+ }
+
+ write_lock(&ls->ls_rsbtbl[bucket].lock);
+ error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
+ if (!error) {
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+ free_rsb(r);
+ r = tmp;
+ goto out;
+ }
+ list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+ error = 0;
+ out:
+ *r_ret = r;
+ return error;
+}
+
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret)
+{
+ return find_rsb(ls, name, namelen, flags, r_ret);
+}
+
+/* This is only called to add a reference when the code already holds
+ a valid reference to the rsb, so there's no need for locking. */
+
+static inline void hold_rsb(struct dlm_rsb *r)
+{
+ kref_get(&r->res_ref);
+}
+
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+ hold_rsb(r);
+}
+
+static void toss_rsb(struct kref *kref)
+{
+ struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+ struct dlm_ls *ls = r->res_ls;
+
+ DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+ kref_init(&r->res_ref);
+ list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
+ r->res_toss_time = jiffies;
+ if (r->res_lvbptr) {
+ free_lvb(r->res_lvbptr);
+ r->res_lvbptr = NULL;
+ }
+}
+
+/* When all references to the rsb are gone it's transferred to
+ the tossed list for later disposal. */
+
+static void put_rsb(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+ uint32_t bucket = r->res_bucket;
+
+ write_lock(&ls->ls_rsbtbl[bucket].lock);
+ kref_put(&r->res_ref, toss_rsb);
+ write_unlock(&ls->ls_rsbtbl[bucket].lock);
+}
+
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+ put_rsb(r);
+}
+
+/* See comment for unhold_lkb */
+
+static void unhold_rsb(struct dlm_rsb *r)
+{
+ int rv;
+ rv = kref_put(&r->res_ref, toss_rsb);
+ DLM_ASSERT(!rv, dlm_dump_rsb(r););
+}
+
+static void kill_rsb(struct kref *kref)
+{
+ struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+
+ /* All work is done after the return from kref_put() so we
+ can release the write_lock before the remove and free. */
+
+ DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
+ DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
+}
+
+/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
+ The rsb must exist as long as any lkb's for it do. */
+
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ hold_rsb(r);
+ lkb->lkb_resource = r;
+}
+
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_resource) {
+ put_rsb(lkb->lkb_resource);
+ lkb->lkb_resource = NULL;
+ }
+}
+
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb, *tmp;
+ uint32_t lkid = 0;
+ uint16_t bucket;
+
+ lkb = allocate_lkb(ls);
+ if (!lkb)
+ return -ENOMEM;
+
+ lkb->lkb_nodeid = -1;
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ kref_init(&lkb->lkb_ref);
+ INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+
+ get_random_bytes(&bucket, sizeof(bucket));
+ bucket &= (ls->ls_lkbtbl_size - 1);
+
+ write_lock(&ls->ls_lkbtbl[bucket].lock);
+
+ /* counter can roll over so we must verify lkid is not in use */
+
+ while (lkid == 0) {
+ lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+
+ list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
+ lkb_idtbl_list) {
+ if (tmp->lkb_id != lkid)
+ continue;
+ lkid = 0;
+ break;
+ }
+ }
+
+ lkb->lkb_id = lkid;
+ list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+ *lkb_ret = lkb;
+ return 0;
+}
+
+static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
+{
+ uint16_t bucket = lkid & 0xFFFF;
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
+ if (lkb->lkb_id == lkid)
+ return lkb;
+ }
+ return NULL;
+}
+
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb;
+ uint16_t bucket = lkid & 0xFFFF;
+
+ if (bucket >= ls->ls_lkbtbl_size)
+ return -EBADSLT;
+
+ read_lock(&ls->ls_lkbtbl[bucket].lock);
+ lkb = __find_lkb(ls, lkid);
+ if (lkb)
+ kref_get(&lkb->lkb_ref);
+ read_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+ *lkb_ret = lkb;
+ return lkb ? 0 : -ENOENT;
+}
+
+static void kill_lkb(struct kref *kref)
+{
+ struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+ /* All work is done after the return from kref_put() so we
+ can release the write_lock before the detach_lkb */
+
+ DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+
+/* __put_lkb() is used when an lkb may not have an rsb attached to
+ it so we need to provide the lockspace explicitly */
+
+static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ uint16_t bucket = lkb->lkb_id & 0xFFFF;
+
+ write_lock(&ls->ls_lkbtbl[bucket].lock);
+ if (kref_put(&lkb->lkb_ref, kill_lkb)) {
+ list_del(&lkb->lkb_idtbl_list);
+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+ detach_lkb(lkb);
+
+ /* for local/process lkbs, lvbptr points to caller's lksb */
+ if (lkb->lkb_lvbptr && is_master_copy(lkb))
+ free_lvb(lkb->lkb_lvbptr);
+ free_lkb(lkb);
+ return 1;
+ } else {
+ write_unlock(&ls->ls_lkbtbl[bucket].lock);
+ return 0;
+ }
+}
+
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls;
+
+ DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
+ DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
+
+ ls = lkb->lkb_resource->res_ls;
+ return __put_lkb(ls, lkb);
+}
+
+/* This is only called to add a reference when the code already holds
+ a valid reference to the lkb, so there's no need for locking. */
+
+static inline void hold_lkb(struct dlm_lkb *lkb)
+{
+ kref_get(&lkb->lkb_ref);
+}
+
+/* This is called when we need to remove a reference and are certain
+ it's not the last ref. e.g. del_lkb is always called between a
+ find_lkb/put_lkb and is always the inverse of a previous add_lkb.
+ put_lkb would work fine, but would involve unnecessary locking */
+
+static inline void unhold_lkb(struct dlm_lkb *lkb)
+{
+ int rv;
+ rv = kref_put(&lkb->lkb_ref, kill_lkb);
+ DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+}
+
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+ int mode)
+{
+ struct dlm_lkb *lkb = NULL;
+
+ list_for_each_entry(lkb, head, lkb_statequeue)
+ if (lkb->lkb_rqmode < mode)
+ break;
+
+ if (!lkb)
+ list_add_tail(new, head);
+ else
+ __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+}
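+
+/* Illustrative note: DLM mode values ascend DLM_LOCK_NL(0) < CR(1) < CW(2)
+ < PR(3) < PW(4) < EX(5), so inserting in front of the first entry with a
+ lower mode keeps a queue sorted from the most to the least restrictive
+ lock, which is the "order of grmode" convention add_lkb() relies on below. */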
+
+/* add/remove lkb to rsb's grant/convert/wait queue */
+
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+ kref_get(&lkb->lkb_ref);
+
+ DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+
+ lkb->lkb_status = status;
+
+ switch (status) {
+ case DLM_LKSTS_WAITING:
+ if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+ list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
+ else
+ list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
+ break;
+ case DLM_LKSTS_GRANTED:
+ /* convention says granted locks kept in order of grmode */
+ lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+ lkb->lkb_grmode);
+ break;
+ case DLM_LKSTS_CONVERT:
+ if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+ list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
+ else
+ list_add_tail(&lkb->lkb_statequeue,
+ &r->res_convertqueue);
+ break;
+ default:
+ DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+ }
+}
+
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ lkb->lkb_status = 0;
+ list_del(&lkb->lkb_statequeue);
+ unhold_lkb(lkb);
+}
+
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+ hold_lkb(lkb);
+ del_lkb(r, lkb);
+ add_lkb(r, lkb, sts);
+ unhold_lkb(lkb);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+ a reply from a remote node */
+
+static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ if (lkb->lkb_wait_type) {
+ log_print("add_to_waiters error %d", lkb->lkb_wait_type);
+ goto out;
+ }
+ lkb->lkb_wait_type = mstype;
+ kref_get(&lkb->lkb_ref);
+ list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+ mutex_unlock(&ls->ls_waiters_mutex);
+}
+
+static int _remove_from_waiters(struct dlm_lkb *lkb)
+{
+ int error = 0;
+
+ if (!lkb->lkb_wait_type) {
+ log_print("remove_from_waiters error");
+ error = -EINVAL;
+ goto out;
+ }
+ lkb->lkb_wait_type = 0;
+ list_del(&lkb->lkb_wait_reply);
+ unhold_lkb(lkb);
+ out:
+ return error;
+}
+
+static int remove_from_waiters(struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+ int error;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ error = _remove_from_waiters(lkb);
+ mutex_unlock(&ls->ls_waiters_mutex);
+ return error;
+}
+
+static void dir_remove(struct dlm_rsb *r)
+{
+ int to_nodeid;
+
+ if (dlm_no_directory(r->res_ls))
+ return;
+
+ to_nodeid = dlm_dir_nodeid(r);
+ if (to_nodeid != dlm_our_nodeid())
+ send_remove(r);
+ else
+ dlm_dir_remove_entry(r->res_ls, to_nodeid,
+ r->res_name, r->res_length);
+}
+
+/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
+ found since they are in order of newest to oldest? */
+
+static int shrink_bucket(struct dlm_ls *ls, int b)
+{
+ struct dlm_rsb *r;
+ int count = 0, found;
+
+ for (;;) {
+ found = 0;
+ write_lock(&ls->ls_rsbtbl[b].lock);
+ list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
+ res_hashchain) {
+ if (!time_after_eq(jiffies, r->res_toss_time +
+ dlm_config.toss_secs * HZ))
+ continue;
+ found = 1;
+ break;
+ }
+
+ if (!found) {
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ break;
+ }
+
+ if (kref_put(&r->res_ref, kill_rsb)) {
+ list_del(&r->res_hashchain);
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+
+ if (is_master(r))
+ dir_remove(r);
+ free_rsb(r);
+ count++;
+ } else {
+ write_unlock(&ls->ls_rsbtbl[b].lock);
+ log_error(ls, "tossed rsb in use %s", r->res_name);
+ }
+ }
+
+ return count;
+}
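+
+/* Illustration: an rsb parked on the toss list is only freed once
+
+ time_after_eq(jiffies, res_toss_time + dlm_config.toss_secs * HZ)
+
+ i.e. after it has sat unused for toss_secs seconds; dlm_scan_rsbs() below
+ ages every bucket this way on each scan. */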
+
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+ int i;
+
+ if (dlm_locking_stopped(ls))
+ return;
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ shrink_bucket(ls, i);
+ cond_resched();
+ }
+}
+
+/* lkb is master or local copy */
+
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int b, len = r->res_ls->ls_lvblen;
+
+ /* b=1 lvb returned to caller
+ b=0 lvb written to rsb or invalidated
+ b=-1 do nothing */
+
+ b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+
+ if (b == 1) {
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
+ lkb->lkb_lvbseq = r->res_lvbseq;
+
+ } else if (b == 0) {
+ if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+ rsb_set_flag(r, RSB_VALNOTVALID);
+ return;
+ }
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ if (!r->res_lvbptr)
+ r->res_lvbptr = allocate_lvb(r->res_ls);
+
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
+ r->res_lvbseq++;
+ lkb->lkb_lvbseq = r->res_lvbseq;
+ rsb_clear_flag(r, RSB_VALNOTVALID);
+ }
+
+ if (rsb_flag(r, RSB_VALNOTVALID))
+ lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
+}
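+
+/* Illustration: the +1 offsets map DLM_LOCK_IV (-1) onto row/column 0 of
+ the dlm_lvb_operations table (lvb_table.h), e.g.
+
+ b = dlm_lvb_operations[DLM_LOCK_PW + 1][DLM_LOCK_NL + 1];
+
+ looks up what to do with the lvb for a PW -> NL conversion. */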
+
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_grmode < DLM_LOCK_PW)
+ return;
+
+ if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+ rsb_set_flag(r, RSB_VALNOTVALID);
+ return;
+ }
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ if (!r->res_lvbptr)
+ r->res_lvbptr = allocate_lvb(r->res_ls);
+
+ if (!r->res_lvbptr)
+ return;
+
+ memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+ r->res_lvbseq++;
+ rsb_clear_flag(r, RSB_VALNOTVALID);
+}
+
+/* lkb is process copy (pc) */
+
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ int b;
+
+ if (!lkb->lkb_lvbptr)
+ return;
+
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ return;
+
+ b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+ if (b == 1) {
+ int len = receive_extralen(ms);
+ memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+ lkb->lkb_lvbseq = ms->m_lvbseq;
+ }
+}
+
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+ remove_lock -- used for unlock, removes lkb from granted
+ revert_lock -- used for cancel, moves lkb from convert to granted
+ grant_lock -- used for request and convert, adds lkb to granted or
+ moves lkb from convert or waiting to granted
+
+ Each of these is used for master or local copy lkb's. There is
+ also a _pc() variation used to make the corresponding change on
+ a process copy (pc) lkb. */
+
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ del_lkb(r, lkb);
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ /* this unhold undoes the original ref from create_lkb()
+ so this leads to the lkb being freed */
+ unhold_lkb(lkb);
+}
+
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_unlock(r, lkb);
+ _remove_lock(r, lkb);
+}
+
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ _remove_lock(r, lkb);
+}
+
+static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ lkb->lkb_rqmode = DLM_LOCK_IV;
+
+ switch (lkb->lkb_status) {
+ case DLM_LKSTS_GRANTED:
+ break;
+ case DLM_LKSTS_CONVERT:
+ move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ break;
+ case DLM_LKSTS_WAITING:
+ del_lkb(r, lkb);
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ /* this unhold undoes the original ref from create_lkb()
+ so this leads to the lkb being freed */
+ unhold_lkb(lkb);
+ break;
+ default:
+ log_print("invalid status for revert %d", lkb->lkb_status);
+ }
+}
+
+static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ revert_lock(r, lkb);
+}
+
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_grmode != lkb->lkb_rqmode) {
+ lkb->lkb_grmode = lkb->lkb_rqmode;
+ if (lkb->lkb_status)
+ move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ else
+ add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+ }
+
+ lkb->lkb_rqmode = DLM_LOCK_IV;
+}
+
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ set_lvb_lock(r, lkb);
+ _grant_lock(r, lkb);
+ lkb->lkb_highbast = 0;
+}
+
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ set_lvb_lock_pc(r, lkb, ms);
+ _grant_lock(r, lkb);
+}
+
+/* called by grant_pending_locks() which means an async grant message must
+ be sent to the requesting node in addition to granting the lock if the
+ lkb belongs to a remote node. */
+
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ grant_lock(r, lkb);
+ if (is_master_copy(lkb))
+ send_grant(r, lkb);
+ else
+ queue_cast(r, lkb, 0);
+}
+
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+ struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
+ lkb_statequeue);
+ if (lkb->lkb_id == first->lkb_id)
+ return 1;
+
+ return 0;
+}
+
+/* Check if the given lkb conflicts with another lkb on the queue. */
+
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+ struct dlm_lkb *this;
+
+ list_for_each_entry(this, head, lkb_statequeue) {
+ if (this == lkb)
+ continue;
+ if (!modes_compat(this, lkb))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource. The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing the first lkb in the
+ * convert queue from being granted, then demote lkb (set grmode to NL).
+ * This second form requires that we check for conv-deadlk even when
+ * now == 0 in _can_be_granted().
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ * PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list. We demote the granted mode of the second lock (the lkb passed to this
+ * function).
+ *
+ * After the resolution, the "grant pending" function needs to go back and try
+ * to grant locks on the convert queue again since the first lock can now be
+ * granted.
+ */
+
+static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
+{
+ struct dlm_lkb *this, *first = NULL, *self = NULL;
+
+ list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
+ if (!first)
+ first = this;
+ if (this == lkb) {
+ self = lkb;
+ continue;
+ }
+
+ if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
+ return 1;
+ }
+
+ /* if lkb is on the convert queue and is preventing the first
+ from being granted, then there's deadlock and we demote lkb.
+ multiple converting locks may need to do this before the first
+ converting lock can be granted. */
+
+ if (self && self != first) {
+ if (!modes_compat(lkb, first) &&
+ !queue_conflict(&rsb->res_grantqueue, first))
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+ int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+ /*
+ * 6-10: Version 5.4 introduced an option to address the phenomenon of
+ * a new request for a NL mode lock being blocked.
+ *
+ * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+ * request, then it would be granted. In essence, the use of this flag
+ * tells the Lock Manager to expedite this request by not considering
+ * what may be in the CONVERTING or WAITING queues... As of this
+ * writing, the EXPEDITE flag can be used only with new requests for NL
+ * mode locks. This flag is not valid for conversion requests.
+ *
+ * A shortcut. Earlier checks return an error if EXPEDITE is used in a
+ * conversion or used with a non-NL requested mode. We also know an
+ * EXPEDITE request is always granted immediately, so now must always
+ * be 1. The full condition to grant an expedite request: (now &&
+ * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+ * therefore be shortened to just checking the flag.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+ return 1;
+
+ /*
+ * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+ * added to the remaining conditions.
+ */
+
+ if (queue_conflict(&r->res_grantqueue, lkb))
+ goto out;
+
+ /*
+ * 6-3: By default, a conversion request is immediately granted if the
+ * requested mode is compatible with the modes of all other granted
+ * locks
+ */
+
+ if (queue_conflict(&r->res_convertqueue, lkb))
+ goto out;
+
+ /*
+ * 6-5: But the default algorithm for deciding whether to grant or
+ * queue conversion requests does not by itself guarantee that such
+ * requests are serviced on a "first come first serve" basis. This, in
+ * turn, can lead to a phenomenon known as "indefinite postponement".
+ *
+ * 6-7: This issue is dealt with by using the optional QUECVT flag with
+ * the system service employed to request a lock conversion. This flag
+ * forces certain conversion requests to be queued, even if they are
+ * compatible with the granted modes of other locks on the same
+ * resource. Thus, the use of this flag results in conversion requests
+ * being ordered on a "first come first serve" basis.
+ *
+ * DCT: This condition is all about new conversions being able to occur
+ * "in place" while the lock remains on the granted queue (assuming
+ * nothing else conflicts.) IOW if QUECVT isn't set, a conversion
+ * doesn't _have_ to go onto the convert queue where it's processed in
+ * order. The "now" variable is necessary to distinguish converts
+ * being received and processed for the first time now, because once a
+ * convert is moved to the conversion queue the condition below applies
+ * requiring fifo granting.
+ */
+
+ if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+ return 1;
+
+ /*
+ * The NOORDER flag is set to avoid the standard vms rules on grant
+ * order.
+ */
+
+ if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+ return 1;
+
+ /*
+ * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+ * granted until all other conversion requests ahead of it are granted
+ * and/or canceled.
+ */
+
+ if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+ return 1;
+
+ /*
+ * 6-4: By default, a new request is immediately granted only if all
+ * three of the following conditions are satisfied when the request is
+ * issued:
+ * - The queue of ungranted conversion requests for the resource is
+ * empty.
+ * - The queue of ungranted new requests for the resource is empty.
+ * - The mode of the new request is compatible with the most
+ * restrictive mode of all granted locks on the resource.
+ */
+
+ if (now && !conv && list_empty(&r->res_convertqueue) &&
+ list_empty(&r->res_waitqueue))
+ return 1;
+
+ /*
+ * 6-4: Once a lock request is in the queue of ungranted new requests,
+ * it cannot be granted until the queue of ungranted conversion
+ * requests is empty, all ungranted new requests ahead of it are
+ * granted and/or canceled, and it is compatible with the granted mode
+ * of the most restrictive lock granted on the resource.
+ */
+
+ if (!now && !conv && list_empty(&r->res_convertqueue) &&
+ first_in_list(lkb, &r->res_waitqueue))
+ return 1;
+
+ out:
+ /*
+ * The following, enabled by CONVDEADLK, departs from VMS.
+ */
+
+ if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
+ conversion_deadlock_detect(r, lkb)) {
+ lkb->lkb_grmode = DLM_LOCK_NL;
+ lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
+ }
+
+ return 0;
+}
+
+/*
+ * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
+ * simple way to provide a big optimization to applications that can use them.
+ */
+
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+ uint32_t flags = lkb->lkb_exflags;
+ int rv;
+ int8_t alt = 0, rqmode = lkb->lkb_rqmode;
+
+ rv = _can_be_granted(r, lkb, now);
+ if (rv)
+ goto out;
+
+ if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
+ goto out;
+
+ if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
+ alt = DLM_LOCK_PR;
+ else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
+ alt = DLM_LOCK_CW;
+
+ if (alt) {
+ lkb->lkb_rqmode = alt;
+ rv = _can_be_granted(r, lkb, now);
+ if (rv)
+ lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
+ else
+ lkb->lkb_rqmode = rqmode;
+ }
+ out:
+ return rv;
+}
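+
+/* Example (illustrative, not from the patch): a caller that wants CW but can
+ make do with PR might request
+
+ dlm_lock(ls, DLM_LOCK_CW, &lksb, DLM_LKF_ALTPR, name, len, 0, ast, arg, bast);
+
+ If CW is blocked but PR can be granted, the lock is granted in PR and the
+ caller sees DLM_SBF_ALTMODE in lksb.sb_flags. */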
+
+static int grant_pending_convert(struct dlm_rsb *r, int high)
+{
+ struct dlm_lkb *lkb, *s;
+ int hi, demoted, quit, grant_restart, demote_restart;
+
+ quit = 0;
+ restart:
+ grant_restart = 0;
+ demote_restart = 0;
+ hi = DLM_LOCK_IV;
+
+ list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
+ demoted = is_demoted(lkb);
+ if (can_be_granted(r, lkb, 0)) {
+ grant_lock_pending(r, lkb);
+ grant_restart = 1;
+ } else {
+ hi = max_t(int, lkb->lkb_rqmode, hi);
+ if (!demoted && is_demoted(lkb))
+ demote_restart = 1;
+ }
+ }
+
+ if (grant_restart)
+ goto restart;
+ if (demote_restart && !quit) {
+ quit = 1;
+ goto restart;
+ }
+
+ return max_t(int, high, hi);
+}
+
+static int grant_pending_wait(struct dlm_rsb *r, int high)
+{
+ struct dlm_lkb *lkb, *s;
+
+ list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
+ if (can_be_granted(r, lkb, 0))
+ grant_lock_pending(r, lkb);
+ else
+ high = max_t(int, lkb->lkb_rqmode, high);
+ }
+
+ return high;
+}
+
+static void grant_pending_locks(struct dlm_rsb *r)
+{
+ struct dlm_lkb *lkb, *s;
+ int high = DLM_LOCK_IV;
+
+ DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
+
+ high = grant_pending_convert(r, high);
+ high = grant_pending_wait(r, high);
+
+ if (high == DLM_LOCK_IV)
+ return;
+
+ /*
+ * If there are locks left on the wait/convert queue then send blocking
+ * ASTs to granted locks based on the largest requested mode (high)
+ * found above. FIXME: highbast < high comparison not valid for PR/CW.
+ */
+
+ list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
+ if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
+ !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
+ queue_bast(r, lkb, high);
+ lkb->lkb_highbast = high;
+ }
+ }
+}
+
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+ struct dlm_lkb *lkb)
+{
+ struct dlm_lkb *gr;
+
+ list_for_each_entry(gr, head, lkb_statequeue) {
+ if (gr->lkb_bastaddr &&
+ gr->lkb_highbast < lkb->lkb_rqmode &&
+ !modes_compat(gr, lkb)) {
+ queue_bast(r, gr, lkb->lkb_rqmode);
+ gr->lkb_highbast = lkb->lkb_rqmode;
+ }
+ }
+}
+
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ send_bast_queue(r, &r->res_grantqueue, lkb);
+}
+
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ send_bast_queue(r, &r->res_grantqueue, lkb);
+ send_bast_queue(r, &r->res_convertqueue, lkb);
+}
+
+/* set_master(r, lkb) -- set the master nodeid of a resource
+
+ The purpose of this function is to set the nodeid field in the given
+ lkb using the nodeid field in the given rsb. If the rsb's nodeid is
+ known, it can just be copied to the lkb and the function will return
+ 0. If the rsb's nodeid is _not_ known, it needs to be looked up
+ before it can be copied to the lkb.
+
+ When the rsb nodeid is being looked up remotely, the initial lkb
+ causing the lookup is kept on the ls_waiters list waiting for the
+ lookup reply. Other lkb's waiting for the same rsb lookup are kept
+ on the rsb's res_lookup list until the master is verified.
+
+ Return values:
+ 0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+ 1: the rsb master is not available and the lkb has been placed on
+ a wait queue
+*/
+
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = r->res_ls;
+ int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+ if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
+ rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+ r->res_first_lkid = lkb->lkb_id;
+ lkb->lkb_nodeid = r->res_nodeid;
+ return 0;
+ }
+
+ if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
+ list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+ return 1;
+ }
+
+ if (r->res_nodeid == 0) {
+ lkb->lkb_nodeid = 0;
+ return 0;
+ }
+
+ if (r->res_nodeid > 0) {
+ lkb->lkb_nodeid = r->res_nodeid;
+ return 0;
+ }
+
+ DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
+
+ dir_nodeid = dlm_dir_nodeid(r);
+
+ if (dir_nodeid != our_nodeid) {
+ r->res_first_lkid = lkb->lkb_id;
+ send_lookup(r, lkb);
+ return 1;
+ }
+
+ for (;;) {
+ /* It's possible for dlm_scand to remove an old rsb for
+ this same resource from the toss list, for us to create
+ a new one, look up the master locally, and find that it
+ already exists just before dlm_scand does the
+ dir_remove() on the previous rsb. */
+
+ error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+ r->res_length, &ret_nodeid);
+ if (!error)
+ break;
+ log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
+ schedule();
+ }
+
+ if (ret_nodeid == our_nodeid) {
+ r->res_first_lkid = 0;
+ r->res_nodeid = 0;
+ lkb->lkb_nodeid = 0;
+ } else {
+ r->res_first_lkid = lkb->lkb_id;
+ r->res_nodeid = ret_nodeid;
+ lkb->lkb_nodeid = ret_nodeid;
+ }
+ return 0;
+}
+
+static void process_lookup_list(struct dlm_rsb *r)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
+ list_del(&lkb->lkb_rsb_lookup);
+ _request_lock(r, lkb);
+ schedule();
+ }
+}
+
+/* confirm_master -- confirm (or deny) an rsb's master nodeid */
+
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+ struct dlm_lkb *lkb;
+
+ if (!r->res_first_lkid)
+ return;
+
+ switch (error) {
+ case 0:
+ case -EINPROGRESS:
+ r->res_first_lkid = 0;
+ process_lookup_list(r);
+ break;
+
+ case -EAGAIN:
+ /* the remote master didn't queue our NOQUEUE request;
+ make a waiting lkb the first_lkid */
+
+ r->res_first_lkid = 0;
+
+ if (!list_empty(&r->res_lookup)) {
+ lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
+ lkb_rsb_lookup);
+ list_del(&lkb->lkb_rsb_lookup);
+ r->res_first_lkid = lkb->lkb_id;
+ _request_lock(r, lkb);
+ } else
+ r->res_nodeid = -1;
+ break;
+
+ default:
+ log_error(r->res_ls, "confirm_master unknown error %d", error);
+ }
+}
+
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+ int namelen, uint32_t parent_lkid, void *ast,
+ void *astarg, void *bast, struct dlm_args *args)
+{
+ int rv = -EINVAL;
+
+ /* check for invalid arg usage */
+
+ if (mode < 0 || mode > DLM_LOCK_EX)
+ goto out;
+
+ if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
+ goto out;
+
+ if (flags & DLM_LKF_CANCEL)
+ goto out;
+
+ if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
+ goto out;
+
+ if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
+ goto out;
+
+ if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+ goto out;
+
+ if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
+ goto out;
+
+ if (!ast || !lksb)
+ goto out;
+
+ if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
+ goto out;
+
+ /* parent/child locks not yet supported */
+ if (parent_lkid)
+ goto out;
+
+ if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
+ goto out;
+
+ /* these args will be copied to the lkb in validate_lock_args,
+ it cannot be done now because when converting locks, fields in
+ an active lkb cannot be modified before locking the rsb */
+
+ args->flags = flags;
+ args->astaddr = ast;
+ args->astparam = (long) astarg;
+ args->bastaddr = bast;
+ args->mode = mode;
+ args->lksb = lksb;
+ rv = 0;
+ out:
+ return rv;
+}
+
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+ if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
+ DLM_LKF_FORCEUNLOCK))
+ return -EINVAL;
+
+ args->flags = flags;
+ args->astparam = (long) astarg;
+ return 0;
+}
+
+static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ int rv = -EINVAL;
+
+ if (args->flags & DLM_LKF_CONVERT) {
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ goto out;
+
+ if (args->flags & DLM_LKF_QUECVT &&
+ !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+ goto out;
+
+ rv = -EBUSY;
+ if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+ goto out;
+
+ if (lkb->lkb_wait_type)
+ goto out;
+ }
+
+ lkb->lkb_exflags = args->flags;
+ lkb->lkb_sbflags = 0;
+ lkb->lkb_astaddr = args->astaddr;
+ lkb->lkb_astparam = args->astparam;
+ lkb->lkb_bastaddr = args->bastaddr;
+ lkb->lkb_rqmode = args->mode;
+ lkb->lkb_lksb = args->lksb;
+ lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+ lkb->lkb_ownpid = (int) current->pid;
+ rv = 0;
+ out:
+ return rv;
+}
+
+static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+ int rv = -EINVAL;
+
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ goto out;
+
+ if (args->flags & DLM_LKF_FORCEUNLOCK)
+ goto out_ok;
+
+ if (args->flags & DLM_LKF_CANCEL &&
+ lkb->lkb_status == DLM_LKSTS_GRANTED)
+ goto out;
+
+ if (!(args->flags & DLM_LKF_CANCEL) &&
+ lkb->lkb_status != DLM_LKSTS_GRANTED)
+ goto out;
+
+ rv = -EBUSY;
+ if (lkb->lkb_wait_type)
+ goto out;
+
+ out_ok:
+ lkb->lkb_exflags = args->flags;
+ lkb->lkb_sbflags = 0;
+ lkb->lkb_astparam = args->astparam;
+
+ rv = 0;
+ out:
+ return rv;
+}
+
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error = 0;
+
+ if (can_be_granted(r, lkb, 1)) {
+ grant_lock(r, lkb);
+ queue_cast(r, lkb, 0);
+ goto out;
+ }
+
+ if (can_be_queued(lkb)) {
+ error = -EINPROGRESS;
+ add_lkb(r, lkb, DLM_LKSTS_WAITING);
+ send_blocking_asts(r, lkb);
+ goto out;
+ }
+
+ error = -EAGAIN;
+ if (force_blocking_asts(lkb))
+ send_blocking_asts_all(r, lkb);
+ queue_cast(r, lkb, -EAGAIN);
+
+ out:
+ return error;
+}
+
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error = 0;
+
+ /* changing an existing lock may allow others to be granted */
+
+ if (can_be_granted(r, lkb, 1)) {
+ grant_lock(r, lkb);
+ queue_cast(r, lkb, 0);
+ grant_pending_locks(r);
+ goto out;
+ }
+
+ if (can_be_queued(lkb)) {
+ if (is_demoted(lkb))
+ grant_pending_locks(r);
+ error = -EINPROGRESS;
+ del_lkb(r, lkb);
+ add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+ send_blocking_asts(r, lkb);
+ goto out;
+ }
+
+ error = -EAGAIN;
+ if (force_blocking_asts(lkb))
+ send_blocking_asts_all(r, lkb);
+ queue_cast(r, lkb, -EAGAIN);
+
+ out:
+ return error;
+}
+
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ remove_lock(r, lkb);
+ queue_cast(r, lkb, -DLM_EUNLOCK);
+ grant_pending_locks(r);
+ return -DLM_EUNLOCK;
+}
+
+/* FIXME: if revert_lock() finds that the lkb is granted, we should
+ skip the queue_cast(ECANCEL). It indicates that the request/convert
+ completed (and queued a normal ast) just before the cancel; we don't
+ want to clobber the sb_result for the normal ast with ECANCEL. */
+
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ revert_lock(r, lkb);
+ queue_cast(r, lkb, -DLM_ECANCEL);
+ grant_pending_locks(r);
+ return -DLM_ECANCEL;
+}
+
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+
+/* add a new lkb to a possibly new rsb, called by requesting process */
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ /* set_master: sets lkb nodeid from r */
+
+ error = set_master(r, lkb);
+ if (error < 0)
+ goto out;
+ if (error) {
+ error = 0;
+ goto out;
+ }
+
+ if (is_remote(r))
+ /* receive_request() calls do_request() on remote node */
+ error = send_request(r, lkb);
+ else
+ error = do_request(r, lkb);
+ out:
+ return error;
+}
+
+/* change some property of an existing lkb, e.g. mode */
+
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ if (is_remote(r))
+ /* receive_convert() calls do_convert() on remote node */
+ error = send_convert(r, lkb);
+ else
+ error = do_convert(r, lkb);
+
+ return error;
+}
+
+/* remove an existing lkb from the granted queue */
+
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ if (is_remote(r))
+ /* receive_unlock() calls do_unlock() on remote node */
+ error = send_unlock(r, lkb);
+ else
+ error = do_unlock(r, lkb);
+
+ return error;
+}
+
+/* remove an existing lkb from the convert or wait queue */
+
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ if (is_remote(r))
+ /* receive_cancel() calls do_cancel() on remote node */
+ error = send_cancel(r, lkb);
+ else
+ error = do_cancel(r, lkb);
+
+ return error;
+}
+
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+ int len, struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ error = validate_lock_args(ls, lkb, args);
+ if (error)
+ goto out;
+
+ error = find_rsb(ls, name, len, R_CREATE, &r);
+ if (error)
+ goto out;
+
+ lock_rsb(r);
+
+ attach_lkb(r, lkb);
+ lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+
+ error = _request_lock(r, lkb);
+
+ unlock_rsb(r);
+ put_rsb(r);
+
+ out:
+ return error;
+}
+
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_lock_args(ls, lkb, args);
+ if (error)
+ goto out;
+
+ error = _convert_lock(r, lkb);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ return error;
+}
+
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_unlock_args(lkb, args);
+ if (error)
+ goto out;
+
+ error = _unlock_lock(r, lkb);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ return error;
+}
+
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_args *args)
+{
+ struct dlm_rsb *r;
+ int error;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = validate_unlock_args(lkb, args);
+ if (error)
+ goto out;
+
+ error = _cancel_lock(r, lkb);
+ out:
+ unlock_rsb(r);
+ put_rsb(r);
+ return error;
+}
+
+/*
+ * Two stage 1 varieties: dlm_lock() and dlm_unlock()
+ */
+
+int dlm_lock(dlm_lockspace_t *lockspace,
+ int mode,
+ struct dlm_lksb *lksb,
+ uint32_t flags,
+ void *name,
+ unsigned int namelen,
+ uint32_t parent_lkid,
+ void (*ast) (void *astarg),
+ void *astarg,
+ void (*bast) (void *astarg, int mode))
+{
+ struct dlm_ls *ls;
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ int error, convert = flags & DLM_LKF_CONVERT;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ lock_recovery(ls);
+
+ if (convert)
+ error = find_lkb(ls, lksb->sb_lkid, &lkb);
+ else
+ error = create_lkb(ls, &lkb);
+
+ if (error)
+ goto out;
+
+ error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
+ astarg, bast, &args);
+ if (error)
+ goto out_put;
+
+ if (convert)
+ error = convert_lock(ls, lkb, &args);
+ else
+ error = request_lock(ls, lkb, name, namelen, &args);
+
+ if (error == -EINPROGRESS)
+ error = 0;
+ out_put:
+ if (convert || error)
+ __put_lkb(ls, lkb);
+ if (error == -EAGAIN)
+ error = 0;
+ out:
+ unlock_recovery(ls);
+ dlm_put_lockspace(ls);
+ return error;
+}
+
+int dlm_unlock(dlm_lockspace_t *lockspace,
+ uint32_t lkid,
+ uint32_t flags,
+ struct dlm_lksb *lksb,
+ void *astarg)
+{
+ struct dlm_ls *ls;
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ int error;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ error = set_unlock_args(flags, astarg, &args);
+ if (error)
+ goto out_put;
+
+ if (flags & DLM_LKF_CANCEL)
+ error = cancel_lock(ls, lkb, &args);
+ else
+ error = unlock_lock(ls, lkb, &args);
+
+ if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
+ error = 0;
+ out_put:
+ dlm_put_lkb(lkb);
+ out:
+ unlock_recovery(ls);
+ dlm_put_lockspace(ls);
+ return error;
+}
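+
+/* Illustrative usage sketch (assumes a lockspace handle ls obtained from
+ dlm_new_lockspace() and caller-defined my_ast/my_bast callbacks; error
+ handling omitted):
+
+ struct dlm_lksb lksb;
+
+ memset(&lksb, 0, sizeof(lksb));
+ error = dlm_lock(ls, DLM_LOCK_EX, &lksb, 0, "myres", 5, 0,
+ my_ast, &lksb, my_bast);
+ ... wait for the completion ast; the final status lands in lksb.sb_status ...
+ error = dlm_unlock(ls, lksb.sb_lkid, 0, &lksb, &lksb);
+ */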
+
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request receive_request
+ * send_convert receive_convert
+ * send_unlock receive_unlock
+ * send_cancel receive_cancel
+ * send_grant receive_grant
+ * send_bast receive_bast
+ * send_lookup receive_lookup
+ * send_remove receive_remove
+ *
+ * send_common_reply
+ * receive_request_reply send_request_reply
+ * receive_convert_reply send_convert_reply
+ * receive_unlock_reply send_unlock_reply
+ * receive_cancel_reply send_cancel_reply
+ * receive_lookup_reply send_lookup_reply
+ */
+
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int to_nodeid, int mstype,
+ struct dlm_message **ms_ret,
+ struct dlm_mhandle **mh_ret)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ char *mb;
+ int mb_len = sizeof(struct dlm_message);
+
+ switch (mstype) {
+ case DLM_MSG_REQUEST:
+ case DLM_MSG_LOOKUP:
+ case DLM_MSG_REMOVE:
+ mb_len += r->res_length;
+ break;
+ case DLM_MSG_CONVERT:
+ case DLM_MSG_UNLOCK:
+ case DLM_MSG_REQUEST_REPLY:
+ case DLM_MSG_CONVERT_REPLY:
+ case DLM_MSG_GRANT:
+ if (lkb && lkb->lkb_lvbptr)
+ mb_len += r->res_ls->ls_lvblen;
+ break;
+ }
+
+ /* get_buffer gives us a message handle (mh) that we need to
+ pass into lowcomms_commit and a message buffer (mb) that we
+ write our data into */
+
+ mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
+ if (!mh)
+ return -ENOBUFS;
+
+ memset(mb, 0, mb_len);
+
+ ms = (struct dlm_message *) mb;
+
+ ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ ms->m_header.h_lockspace = r->res_ls->ls_global_id;
+ ms->m_header.h_nodeid = dlm_our_nodeid();
+ ms->m_header.h_length = mb_len;
+ ms->m_header.h_cmd = DLM_MSG;
+
+ ms->m_type = mstype;
+
+ *mh_ret = mh;
+ *ms_ret = ms;
+ return 0;
+}
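+
+/* Note (illustrative): the variable-length payload (resource name or lvb)
+ rides in ms->m_extra at the end of the message, so the receive side can
+ recover its length as
+
+ len = ms->m_header.h_length - sizeof(struct dlm_message);
+
+ which is what receive_extralen() does further down. */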
+
+/* further lowcomms enhancements or alternate implementations may make
+ the return value from this function useful at some point */
+
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+{
+ dlm_message_out(ms);
+ dlm_lowcomms_commit_buffer(mh);
+ return 0;
+}
+
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ ms->m_nodeid = lkb->lkb_nodeid;
+ ms->m_pid = lkb->lkb_ownpid;
+ ms->m_lkid = lkb->lkb_id;
+ ms->m_remid = lkb->lkb_remid;
+ ms->m_exflags = lkb->lkb_exflags;
+ ms->m_sbflags = lkb->lkb_sbflags;
+ ms->m_flags = lkb->lkb_flags;
+ ms->m_lvbseq = lkb->lkb_lvbseq;
+ ms->m_status = lkb->lkb_status;
+ ms->m_grmode = lkb->lkb_grmode;
+ ms->m_rqmode = lkb->lkb_rqmode;
+ ms->m_hash = r->res_hash;
+
+ /* m_result and m_bastmode are set from function args,
+ not from lkb fields */
+
+ if (lkb->lkb_bastaddr)
+ ms->m_asts |= AST_BAST;
+ if (lkb->lkb_astaddr)
+ ms->m_asts |= AST_COMP;
+
+ if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
+ memcpy(ms->m_extra, r->res_name, r->res_length);
+
+ else if (lkb->lkb_lvbptr)
+ memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+
+}
+
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ add_to_waiters(lkb, mstype);
+
+ to_nodeid = r->res_nodeid;
+
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ if (error)
+ goto fail;
+
+ send_args(r, lkb, ms);
+
+ error = send_message(mh, ms);
+ if (error)
+ goto fail;
+ return 0;
+
+ fail:
+ remove_from_waiters(lkb);
+ return error;
+}
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_REQUEST);
+}
+
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ int error;
+
+ error = send_common(r, lkb, DLM_MSG_CONVERT);
+
+ /* down conversions go without a reply from the master */
+ if (!error && down_conversion(lkb)) {
+ remove_from_waiters(lkb);
+ r->res_ls->ls_stub_ms.m_result = 0;
+ r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+ __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
+ }
+
+ return error;
+}
+
+/* FIXME: if this lkb is the only lock we hold on the rsb, then set
+ MASTER_UNCERTAIN to force the next request on the rsb to confirm
+ that the master is still correct. */
+
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_UNLOCK);
+}
+
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ return send_common(r, lkb, DLM_MSG_CANCEL);
+}
+
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = lkb->lkb_nodeid;
+
+ error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+ if (error)
+ goto out;
+
+ send_args(r, lkb, ms);
+
+ ms->m_result = 0;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = lkb->lkb_nodeid;
+
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+ if (error)
+ goto out;
+
+ send_args(r, lkb, ms);
+
+ ms->m_bastmode = mode;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ add_to_waiters(lkb, DLM_MSG_LOOKUP);
+
+ to_nodeid = dlm_dir_nodeid(r);
+
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+ if (error)
+ goto fail;
+
+ send_args(r, lkb, ms);
+
+ error = send_message(mh, ms);
+ if (error)
+ goto fail;
+ return 0;
+
+ fail:
+ remove_from_waiters(lkb);
+ return error;
+}
+
+static int send_remove(struct dlm_rsb *r)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = dlm_dir_nodeid(r);
+
+ error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+ if (error)
+ goto out;
+
+ memcpy(ms->m_extra, r->res_name, r->res_length);
+ ms->m_hash = r->res_hash;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ int mstype, int rv)
+{
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int to_nodeid, error;
+
+ to_nodeid = lkb->lkb_nodeid;
+
+ error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+ if (error)
+ goto out;
+
+ send_args(r, lkb, ms);
+
+ ms->m_result = rv;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
+}
+
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
+}
+
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
+}
+
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+ return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
+}
+
+static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
+ int ret_nodeid, int rv)
+{
+ struct dlm_rsb *r = &ls->ls_stub_rsb;
+ struct dlm_message *ms;
+ struct dlm_mhandle *mh;
+ int error, nodeid = ms_in->m_header.h_nodeid;
+
+ error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+ if (error)
+ goto out;
+
+ ms->m_lkid = ms_in->m_lkid;
+ ms->m_result = rv;
+ ms->m_nodeid = ret_nodeid;
+
+ error = send_message(mh, ms);
+ out:
+ return error;
+}
+
+/* which args we save from a received message depends heavily on the type
+ of message, unlike the send side where we can safely send everything about
+ the lkb for any type of message */
+
+static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ lkb->lkb_exflags = ms->m_exflags;
+ lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+ (ms->m_flags & 0x0000FFFF);
+}
+
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ lkb->lkb_sbflags = ms->m_sbflags;
+ lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+ (ms->m_flags & 0x0000FFFF);
+}
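+
+/* Illustration: only the low 16 bits of lkb_flags travel in a message; the
+ high 16 bits (e.g. DLM_IFL_MSTCPY) are node-local state, so both helpers
+ above merge wire and local flags as
+
+ flags = (local & 0xFFFF0000) | (wire & 0x0000FFFF);
+ */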
+
+static int receive_extralen(struct dlm_message *ms)
+{
+ return (ms->m_header.h_length - sizeof(struct dlm_message));
+}
+
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ int len;
+
+ if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+ if (!lkb->lkb_lvbptr)
+ lkb->lkb_lvbptr = allocate_lvb(ls);
+ if (!lkb->lkb_lvbptr)
+ return -ENOMEM;
+ len = receive_extralen(ms);
+ memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+ }
+ return 0;
+}
+
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ lkb->lkb_nodeid = ms->m_header.h_nodeid;
+ lkb->lkb_ownpid = ms->m_pid;
+ lkb->lkb_remid = ms->m_lkid;
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ lkb->lkb_rqmode = ms->m_rqmode;
+ lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
+ lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
+
+ DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
+
+ if (receive_lvb(ls, lkb, ms))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
+ log_error(ls, "convert_args nodeid %d %d lkid %x %x",
+ lkb->lkb_nodeid, ms->m_header.h_nodeid,
+ lkb->lkb_id, lkb->lkb_remid);
+ return -EINVAL;
+ }
+
+ if (!is_master_copy(lkb))
+ return -EINVAL;
+
+ if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+ return -EBUSY;
+
+ if (receive_lvb(ls, lkb, ms))
+ return -ENOMEM;
+
+ lkb->lkb_rqmode = ms->m_rqmode;
+ lkb->lkb_lvbseq = ms->m_lvbseq;
+
+ return 0;
+}
+
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ if (!is_master_copy(lkb))
+ return -EINVAL;
+ if (receive_lvb(ls, lkb, ms))
+ return -ENOMEM;
+ return 0;
+}
+
+/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
+ uses to send a reply and that the remote end uses to process the reply. */
+
+static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb = &ls->ls_stub_lkb;
+ lkb->lkb_nodeid = ms->m_header.h_nodeid;
+ lkb->lkb_remid = ms->m_lkid;
+}
+
+static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, namelen;
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto fail;
+
+ receive_flags(lkb, ms);
+ lkb->lkb_flags |= DLM_IFL_MSTCPY;
+ error = receive_request_args(ls, lkb, ms);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto fail;
+ }
+
+ namelen = receive_extralen(ms);
+
+ error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto fail;
+ }
+
+ lock_rsb(r);
+
+ attach_lkb(r, lkb);
+ error = do_request(r, lkb);
+ send_request_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+
+ if (error == -EINPROGRESS)
+ error = 0;
+ if (error)
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, reply = 1;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags(lkb, ms);
+ error = receive_convert_args(ls, lkb, ms);
+ if (error)
+ goto out;
+ reply = !down_conversion(lkb);
+
+ error = do_convert(r, lkb);
+ out:
+ if (reply)
+ send_convert_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags(lkb, ms);
+ error = receive_unlock_args(ls, lkb, ms);
+ if (error)
+ goto out;
+
+ error = do_unlock(r, lkb);
+ out:
+ send_unlock_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error)
+ goto fail;
+
+ receive_flags(lkb, ms);
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ error = do_cancel(r, lkb);
+ send_cancel_reply(r, lkb, error);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+ return;
+
+ fail:
+ setup_stub_lkb(ls, ms);
+ send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_grant no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ receive_flags_reply(lkb, ms);
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+}
+
+static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_bast no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ queue_bast(r, lkb, ms->m_bastmode);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+}
+
+static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
+
+ from_nodeid = ms->m_header.h_nodeid;
+ our_nodeid = dlm_our_nodeid();
+
+ len = receive_extralen(ms);
+
+ dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+ if (dir_nodeid != our_nodeid) {
+ log_error(ls, "lookup dir_nodeid %d from %d",
+ dir_nodeid, from_nodeid);
+ error = -EINVAL;
+ ret_nodeid = -1;
+ goto out;
+ }
+
+ error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
+
+ /* Optimization: we're master so treat lookup as a request */
+ if (!error && ret_nodeid == our_nodeid) {
+ receive_request(ls, ms);
+ return;
+ }
+ out:
+ send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+
+static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ int len, dir_nodeid, from_nodeid;
+
+ from_nodeid = ms->m_header.h_nodeid;
+
+ len = receive_extralen(ms);
+
+ dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+ if (dir_nodeid != dlm_our_nodeid()) {
+ log_error(ls, "remove dir entry dir_nodeid %d from %d",
+ dir_nodeid, from_nodeid);
+ return;
+ }
+
+ dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
+}
+
+static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, mstype;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_request_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ mstype = lkb->lkb_wait_type;
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_request_reply not on waiters");
+ goto out;
+ }
+
+ /* this is the value returned from do_request() on the master */
+ error = ms->m_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ /* Optimization: the dir node was also the master, so it took our
+ lookup as a request and sent request reply instead of lookup reply */
+ if (mstype == DLM_MSG_LOOKUP) {
+ r->res_nodeid = ms->m_header.h_nodeid;
+ lkb->lkb_nodeid = r->res_nodeid;
+ }
+
+ switch (error) {
+ case -EAGAIN:
+ /* request would block (be queued) on remote master;
+ the unhold undoes the original ref from create_lkb()
+ so it leads to the lkb being freed */
+ queue_cast(r, lkb, -EAGAIN);
+ confirm_master(r, -EAGAIN);
+ unhold_lkb(lkb);
+ break;
+
+ case -EINPROGRESS:
+ case 0:
+ /* request was queued or granted on remote master */
+ receive_flags_reply(lkb, ms);
+ lkb->lkb_remid = ms->m_lkid;
+ if (error)
+ add_lkb(r, lkb, DLM_LKSTS_WAITING);
+ else {
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+ }
+ confirm_master(r, error);
+ break;
+
+ case -EBADR:
+ case -ENOTBLK:
+ /* find_rsb failed to find rsb or rsb wasn't master */
+ r->res_nodeid = -1;
+ lkb->lkb_nodeid = -1;
+ _request_lock(r, lkb);
+ break;
+
+ default:
+ log_error(ls, "receive_request_reply error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ dlm_put_lkb(lkb);
+}
+
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct dlm_message *ms)
+{
+ int error = ms->m_result;
+
+ /* this is the value returned from do_convert() on the master */
+
+ switch (error) {
+ case -EAGAIN:
+ /* convert would block (be queued) on remote master */
+ queue_cast(r, lkb, -EAGAIN);
+ break;
+
+ case -EINPROGRESS:
+ /* convert was queued on remote master */
+ del_lkb(r, lkb);
+ add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+ break;
+
+ case 0:
+ /* convert was granted on remote master */
+ receive_flags_reply(lkb, ms);
+ grant_lock_pc(r, lkb, ms);
+ queue_cast(r, lkb, 0);
+ break;
+
+ default:
+ log_error(r->res_ls, "receive_convert_reply error %d", error);
+ }
+}
+
+static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ struct dlm_rsb *r = lkb->lkb_resource;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ __receive_convert_reply(r, lkb, ms);
+
+ unlock_rsb(r);
+ put_rsb(r);
+}
+
+static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_convert_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_convert_reply not on waiters");
+ goto out;
+ }
+
+ _receive_convert_reply(lkb, ms);
+ out:
+ dlm_put_lkb(lkb);
+}
+
+static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ struct dlm_rsb *r = lkb->lkb_resource;
+ int error = ms->m_result;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ /* this is the value returned from do_unlock() on the master */
+
+ switch (error) {
+ case -DLM_EUNLOCK:
+ receive_flags_reply(lkb, ms);
+ remove_lock_pc(r, lkb);
+ queue_cast(r, lkb, -DLM_EUNLOCK);
+ break;
+ default:
+ log_error(r->res_ls, "receive_unlock_reply error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+}
+
+static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_unlock_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_unlock_reply not on waiters");
+ goto out;
+ }
+
+ _receive_unlock_reply(lkb, ms);
+ out:
+ dlm_put_lkb(lkb);
+}
+
+static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+ struct dlm_rsb *r = lkb->lkb_resource;
+ int error = ms->m_result;
+
+ hold_rsb(r);
+ lock_rsb(r);
+
+ /* this is the value returned from do_cancel() on the master */
+
+ switch (error) {
+ case -DLM_ECANCEL:
+ receive_flags_reply(lkb, ms);
+ revert_lock_pc(r, lkb);
+ queue_cast(r, lkb, -DLM_ECANCEL);
+ break;
+ default:
+ log_error(r->res_ls, "receive_cancel_reply error %d", error);
+ }
+
+ unlock_rsb(r);
+ put_rsb(r);
+}
+
+static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, ms->m_remid, &lkb);
+ if (error) {
+ log_error(ls, "receive_cancel_reply no lkb");
+ return;
+ }
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_cancel_reply not on waiters");
+ goto out;
+ }
+
+ _receive_cancel_reply(lkb, ms);
+ out:
+ dlm_put_lkb(lkb);
+}
+
+static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error, ret_nodeid;
+
+ error = find_lkb(ls, ms->m_lkid, &lkb);
+ if (error) {
+ log_error(ls, "receive_lookup_reply no lkb");
+ return;
+ }
+
+ error = remove_from_waiters(lkb);
+ if (error) {
+ log_error(ls, "receive_lookup_reply not on waiters");
+ goto out;
+ }
+
+ /* this is the value returned by dlm_dir_lookup on dir node
+ FIXME: will a non-zero error ever be returned? */
+ error = ms->m_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ ret_nodeid = ms->m_nodeid;
+ if (ret_nodeid == dlm_our_nodeid()) {
+ r->res_nodeid = 0;
+ ret_nodeid = 0;
+ r->res_first_lkid = 0;
+ } else {
+ /* set_master() will copy res_nodeid to lkb_nodeid */
+ r->res_nodeid = ret_nodeid;
+ }
+
+ _request_lock(r, lkb);
+
+ if (!ret_nodeid)
+ process_lookup_list(r);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ dlm_put_lkb(lkb);
+}
+
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+{
+ struct dlm_message *ms = (struct dlm_message *) hd;
+ struct dlm_ls *ls;
+ int error;
+
+ if (!recovery)
+ dlm_message_in(ms);
+
+ ls = dlm_find_lockspace_global(hd->h_lockspace);
+ if (!ls) {
+ log_print("drop message %d from %d for unknown lockspace %d",
+ ms->m_type, nodeid, hd->h_lockspace);
+ return -EINVAL;
+ }
+
+ /* recovery may have just ended leaving a bunch of backed-up requests
+ in the requestqueue; wait while dlm_recoverd clears them */
+
+ if (!recovery)
+ dlm_wait_requestqueue(ls);
+
+ /* recovery may have just started while there were a bunch of
+ in-flight requests -- save them in requestqueue to be processed
+ after recovery. we can't let dlm_recvd block on the recovery
+ lock. if dlm_recoverd is calling this function to clear the
+ requestqueue, it needs to be interrupted (-EINTR) if another
+ recovery operation is starting. */
+
+ while (1) {
+ if (dlm_locking_stopped(ls)) {
+ if (!recovery)
+ dlm_add_requestqueue(ls, nodeid, hd);
+ error = -EINTR;
+ goto out;
+ }
+
+ if (lock_recovery_try(ls))
+ break;
+ schedule();
+ }
+
+ switch (ms->m_type) {
+
+ /* messages sent to a master node */
+
+ case DLM_MSG_REQUEST:
+ receive_request(ls, ms);
+ break;
+
+ case DLM_MSG_CONVERT:
+ receive_convert(ls, ms);
+ break;
+
+ case DLM_MSG_UNLOCK:
+ receive_unlock(ls, ms);
+ break;
+
+ case DLM_MSG_CANCEL:
+ receive_cancel(ls, ms);
+ break;
+
+ /* messages sent from a master node (replies to above) */
+
+ case DLM_MSG_REQUEST_REPLY:
+ receive_request_reply(ls, ms);
+ break;
+
+ case DLM_MSG_CONVERT_REPLY:
+ receive_convert_reply(ls, ms);
+ break;
+
+ case DLM_MSG_UNLOCK_REPLY:
+ receive_unlock_reply(ls, ms);
+ break;
+
+ case DLM_MSG_CANCEL_REPLY:
+ receive_cancel_reply(ls, ms);
+ break;
+
+ /* messages sent from a master node (only two types of async msg) */
+
+ case DLM_MSG_GRANT:
+ receive_grant(ls, ms);
+ break;
+
+ case DLM_MSG_BAST:
+ receive_bast(ls, ms);
+ break;
+
+ /* messages sent to a dir node */
+
+ case DLM_MSG_LOOKUP:
+ receive_lookup(ls, ms);
+ break;
+
+ case DLM_MSG_REMOVE:
+ receive_remove(ls, ms);
+ break;
+
+ /* messages sent from a dir node (remove has no reply) */
+
+ case DLM_MSG_LOOKUP_REPLY:
+ receive_lookup_reply(ls, ms);
+ break;
+
+ default:
+ log_error(ls, "unknown message type %d", ms->m_type);
+ }
+
+ unlock_recovery(ls);
+ out:
+ dlm_put_lockspace(ls);
+ dlm_astd_wake();
+ return 0;
+}
+
+
+/*
+ * Recovery related
+ */
+
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ if (middle_conversion(lkb)) {
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -EINPROGRESS;
+ _remove_from_waiters(lkb);
+ _receive_convert_reply(lkb, &ls->ls_stub_ms);
+
+ /* Same special case as in receive_rcom_lock_args() */
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
+ unhold_lkb(lkb);
+
+ } else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+ }
+
+ /* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
+ conversions are async; there's no reply from the remote master */
+}
+
+/* A waiting lkb needs recovery if the master node has failed, or
+ the master node is changing (only when no directory is used) */
+
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ if (dlm_is_removed(ls, lkb->lkb_nodeid))
+ return 1;
+
+ if (!dlm_no_directory(ls))
+ return 0;
+
+ if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
+ return 1;
+
+ return 0;
+}
+
+/* Recovery for locks that are waiting for replies from nodes that are now
+ gone. We can just complete unlocks and cancels by faking a reply from the
+ dead node. Requests and up-conversions we flag to be resent after
+ recovery. Down-conversions can just be completed with a fake reply like
+ unlocks. Conversions between PR and CW need special attention. */
+
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+
+ list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+ log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
+ lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+ /* all outstanding lookups, regardless of destination, will be
+ resent after recovery is done */
+
+ if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+ continue;
+ }
+
+ if (!waiter_needs_recovery(ls, lkb))
+ continue;
+
+ switch (lkb->lkb_wait_type) {
+
+ case DLM_MSG_REQUEST:
+ lkb->lkb_flags |= DLM_IFL_RESEND;
+ break;
+
+ case DLM_MSG_CONVERT:
+ recover_convert_waiter(ls, lkb);
+ break;
+
+ case DLM_MSG_UNLOCK:
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+ _remove_from_waiters(lkb);
+ _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+ dlm_put_lkb(lkb);
+ break;
+
+ case DLM_MSG_CANCEL:
+ hold_lkb(lkb);
+ ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+ _remove_from_waiters(lkb);
+ _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+ dlm_put_lkb(lkb);
+ break;
+
+ default:
+ log_error(ls, "invalid lkb wait_type %d",
+ lkb->lkb_wait_type);
+ }
+ schedule();
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+}
+
+static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+ struct dlm_lkb *lkb;
+ int rv = 0;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ if (lkb->lkb_flags & DLM_IFL_RESEND) {
+ rv = lkb->lkb_wait_type;
+ _remove_from_waiters(lkb);
+ lkb->lkb_flags &= ~DLM_IFL_RESEND;
+ break;
+ }
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+
+ if (!rv)
+ lkb = NULL;
+ *lkb_ret = lkb;
+ return rv;
+}
+
+/* Deal with lookups and lkb's marked RESEND from _pre. We may now be the
+ master or dir-node for r. Processing the lkb may result in it being placed
+ back on waiters. */
+
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *r;
+ int error = 0, mstype;
+
+ while (1) {
+ if (dlm_locking_stopped(ls)) {
+ log_debug(ls, "recover_waiters_post aborted");
+ error = -EINTR;
+ break;
+ }
+
+ mstype = remove_resend_waiter(ls, &lkb);
+ if (!mstype)
+ break;
+
+ r = lkb->lkb_resource;
+
+ log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
+ lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+
+ switch (mstype) {
+
+ case DLM_MSG_LOOKUP:
+ hold_rsb(r);
+ lock_rsb(r);
+ _request_lock(r, lkb);
+ if (is_master(r))
+ confirm_master(r, 0);
+ unlock_rsb(r);
+ put_rsb(r);
+ break;
+
+ case DLM_MSG_REQUEST:
+ hold_rsb(r);
+ lock_rsb(r);
+ _request_lock(r, lkb);
+ if (is_master(r))
+ confirm_master(r, 0);
+ unlock_rsb(r);
+ put_rsb(r);
+ break;
+
+ case DLM_MSG_CONVERT:
+ hold_rsb(r);
+ lock_rsb(r);
+ _convert_lock(r, lkb);
+ unlock_rsb(r);
+ put_rsb(r);
+ break;
+
+ default:
+ log_error(ls, "recover_waiters_post type %d", mstype);
+ }
+ }
+
+ return error;
+}
+
+static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
+ int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
+{
+ struct dlm_ls *ls = r->res_ls;
+ struct dlm_lkb *lkb, *safe;
+
+ list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
+ if (test(ls, lkb)) {
+ rsb_set_flag(r, RSB_LOCKS_PURGED);
+ del_lkb(r, lkb);
+ /* this put should free the lkb */
+ if (!dlm_put_lkb(lkb))
+ log_error(ls, "purged lkb not released");
+ }
+ }
+}
+
+static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
+}
+
+static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ return is_master_copy(lkb);
+}
+
+static void purge_dead_locks(struct dlm_rsb *r)
+{
+ purge_queue(r, &r->res_grantqueue, &purge_dead_test);
+ purge_queue(r, &r->res_convertqueue, &purge_dead_test);
+ purge_queue(r, &r->res_waitqueue, &purge_dead_test);
+}
+
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
+{
+ purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
+ purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
+ purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
+}
+
+/* Get rid of locks held by nodes that are gone. */
+
+int dlm_purge_locks(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+
+ log_debug(ls, "dlm_purge_locks");
+
+ down_write(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ hold_rsb(r);
+ lock_rsb(r);
+ if (is_master(r))
+ purge_dead_locks(r);
+ unlock_rsb(r);
+ unhold_rsb(r);
+
+ schedule();
+ }
+ up_write(&ls->ls_root_sem);
+
+ return 0;
+}
+
+static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
+{
+ struct dlm_rsb *r, *r_ret = NULL;
+
+ read_lock(&ls->ls_rsbtbl[bucket].lock);
+ list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
+ if (!rsb_flag(r, RSB_LOCKS_PURGED))
+ continue;
+ hold_rsb(r);
+ rsb_clear_flag(r, RSB_LOCKS_PURGED);
+ r_ret = r;
+ break;
+ }
+ read_unlock(&ls->ls_rsbtbl[bucket].lock);
+ return r_ret;
+}
+
+void dlm_grant_after_purge(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int bucket = 0;
+
+ while (1) {
+ r = find_purged_rsb(ls, bucket);
+ if (!r) {
+ if (bucket == ls->ls_rsbtbl_size - 1)
+ break;
+ bucket++;
+ continue;
+ }
+ lock_rsb(r);
+ if (is_master(r)) {
+ grant_pending_locks(r);
+ confirm_master(r, 0);
+ }
+ unlock_rsb(r);
+ put_rsb(r);
+ schedule();
+ }
+}
+
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+ uint32_t remid)
+{
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, head, lkb_statequeue) {
+ if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
+ return lkb;
+ }
+ return NULL;
+}
+
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+ uint32_t remid)
+{
+ struct dlm_lkb *lkb;
+
+ lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
+ if (lkb)
+ return lkb;
+ return NULL;
+}
+
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_rsb *r, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ int lvblen;
+
+ lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+ lkb->lkb_ownpid = rl->rl_ownpid;
+ lkb->lkb_remid = rl->rl_lkid;
+ lkb->lkb_exflags = rl->rl_exflags;
+ lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
+ lkb->lkb_flags |= DLM_IFL_MSTCPY;
+ lkb->lkb_lvbseq = rl->rl_lvbseq;
+ lkb->lkb_rqmode = rl->rl_rqmode;
+ lkb->lkb_grmode = rl->rl_grmode;
+ /* don't set lkb_status because add_lkb wants to do that itself */
+
+ lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
+ lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
+
+ if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+ lkb->lkb_lvbptr = allocate_lvb(ls);
+ if (!lkb->lkb_lvbptr)
+ return -ENOMEM;
+ lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
+ sizeof(struct rcom_lock);
+ memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
+ }
+
+ /* Conversions between PR and CW (middle modes) need special handling.
+ The real granted mode of these converting locks cannot be determined
+ until all locks have been rebuilt on the rsb (recover_conversion) */
+
+ if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
+ rl->rl_status = DLM_LKSTS_CONVERT;
+ lkb->lkb_grmode = DLM_LOCK_IV;
+ rsb_set_flag(r, RSB_RECOVER_CONVERT);
+ }
+
+ return 0;
+}
+
+/* This lkb may have been recovered in a previous aborted recovery so we need
+ to check if the rsb already has an lkb with the given remote nodeid/lkid.
+ If so we just send back a standard reply. If not, we create a new lkb with
+ the given values and send back our lkid. We send back our lkid by sending
+ back the rcom_lock struct we got but with the remid field filled in. */
+
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ int error;
+
+ if (rl->rl_parent_lkid) {
+ error = -EOPNOTSUPP;
+ goto out;
+ }
+
+ error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
+ if (error)
+ goto out;
+
+ lock_rsb(r);
+
+ lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
+ if (lkb) {
+ error = -EEXIST;
+ goto out_remid;
+ }
+
+ error = create_lkb(ls, &lkb);
+ if (error)
+ goto out_unlock;
+
+ error = receive_rcom_lock_args(ls, lkb, r, rc);
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto out_unlock;
+ }
+
+ attach_lkb(r, lkb);
+ add_lkb(r, lkb, rl->rl_status);
+ error = 0;
+
+ out_remid:
+ /* this is the new value returned to the lock holder for
+ saving in its process-copy lkb */
+ rl->rl_remid = lkb->lkb_id;
+
+ out_unlock:
+ unlock_rsb(r);
+ put_rsb(r);
+ out:
+ if (error)
+ log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+ rl->rl_result = error;
+ return error;
+}
+
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb;
+ int error;
+
+ error = find_lkb(ls, rl->rl_lkid, &lkb);
+ if (error) {
+ log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
+ return error;
+ }
+
+ DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+ error = rl->rl_result;
+
+ r = lkb->lkb_resource;
+ hold_rsb(r);
+ lock_rsb(r);
+
+ switch (error) {
+ case -EEXIST:
+ log_debug(ls, "master copy exists %x", lkb->lkb_id);
+ /* fall through */
+ case 0:
+ lkb->lkb_remid = rl->rl_remid;
+ break;
+ default:
+ log_error(ls, "dlm_recover_process_copy unknown error %d %x",
+ error, lkb->lkb_id);
+ }
+
+ /* an ack for dlm_recover_locks() which waits for replies from
+ all the locks it sends to new masters */
+ dlm_recovered_lock(r);
+
+ unlock_rsb(r);
+ put_rsb(r);
+ dlm_put_lkb(lkb);
+
+ return 0;
+}
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+ int mode, uint32_t flags, void *name, unsigned int namelen,
+ uint32_t parent_lkid)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ int error;
+
+ lock_recovery(ls);
+
+ error = create_lkb(ls, &lkb);
+ if (error) {
+ kfree(ua);
+ goto out;
+ }
+
+ if (flags & DLM_LKF_VALBLK) {
+ ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+ if (!ua->lksb.sb_lvbptr) {
+ kfree(ua);
+ __put_lkb(ls, lkb);
+ error = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* After ua is attached to lkb it will be freed by free_lkb().
+ When DLM_IFL_USER is set, the dlm knows that this is a userspace
+ lock and that lkb_astparam is the dlm_user_args structure. */
+
+ error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
+ DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
+ lkb->lkb_flags |= DLM_IFL_USER;
+ ua->old_mode = DLM_LOCK_IV;
+
+ if (error) {
+ __put_lkb(ls, lkb);
+ goto out;
+ }
+
+ error = request_lock(ls, lkb, name, namelen, &args);
+
+ switch (error) {
+ case 0:
+ break;
+ case -EINPROGRESS:
+ error = 0;
+ break;
+ case -EAGAIN:
+ error = 0;
+ /* fall through */
+ default:
+ __put_lkb(ls, lkb);
+ goto out;
+ }
+
+ /* add this new lkb to the per-process list of locks */
+ spin_lock(&ua->proc->locks_spin);
+ kref_get(&lkb->lkb_ref);
+ list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+ spin_unlock(&ua->proc->locks_spin);
+ out:
+ unlock_recovery(ls);
+ return error;
+}
+
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ struct dlm_user_args *ua;
+ int error;
+
+ lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ /* user can change the params on its lock when it converts it, or
+ add an lvb that didn't exist before */
+
+ ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+ if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
+ ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+ if (!ua->lksb.sb_lvbptr) {
+ error = -ENOMEM;
+ goto out_put;
+ }
+ }
+ if (lvb_in && ua->lksb.sb_lvbptr)
+ memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+
+ ua->castparam = ua_tmp->castparam;
+ ua->castaddr = ua_tmp->castaddr;
+ ua->bastparam = ua_tmp->bastparam;
+ ua->bastaddr = ua_tmp->bastaddr;
+ ua->user_lksb = ua_tmp->user_lksb;
+ ua->old_mode = lkb->lkb_grmode;
+
+ error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
+ ua, DLM_FAKE_USER_AST, &args);
+ if (error)
+ goto out_put;
+
+ error = convert_lock(ls, lkb, &args);
+
+ if (error == -EINPROGRESS || error == -EAGAIN)
+ error = 0;
+ out_put:
+ dlm_put_lkb(lkb);
+ out:
+ unlock_recovery(ls);
+ kfree(ua_tmp);
+ return error;
+}
+
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ struct dlm_user_args *ua;
+ int error;
+
+ lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+ if (lvb_in && ua->lksb.sb_lvbptr)
+ memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+ ua->castparam = ua_tmp->castparam;
+ ua->user_lksb = ua_tmp->user_lksb;
+
+ error = set_unlock_args(flags, ua, &args);
+ if (error)
+ goto out_put;
+
+ error = unlock_lock(ls, lkb, &args);
+
+ if (error == -DLM_EUNLOCK)
+ error = 0;
+ if (error)
+ goto out_put;
+
+ spin_lock(&ua->proc->locks_spin);
+ list_del_init(&lkb->lkb_ownqueue);
+ spin_unlock(&ua->proc->locks_spin);
+
+ /* this removes the reference for the proc->locks list added by
+ dlm_user_request */
+ unhold_lkb(lkb);
+ out_put:
+ dlm_put_lkb(lkb);
+ out:
+ unlock_recovery(ls);
+ return error;
+}
+
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_args args;
+ struct dlm_user_args *ua;
+ int error;
+
+ lock_recovery(ls);
+
+ error = find_lkb(ls, lkid, &lkb);
+ if (error)
+ goto out;
+
+ ua = (struct dlm_user_args *)lkb->lkb_astparam;
+ ua->castparam = ua_tmp->castparam;
+ ua->user_lksb = ua_tmp->user_lksb;
+
+ error = set_unlock_args(flags, ua, &args);
+ if (error)
+ goto out_put;
+
+ error = cancel_lock(ls, lkb, &args);
+
+ if (error == -DLM_ECANCEL)
+ error = 0;
+ if (error)
+ goto out_put;
+
+ /* this lkb was removed from the WAITING queue */
+ if (lkb->lkb_grmode == DLM_LOCK_IV) {
+ spin_lock(&ua->proc->locks_spin);
+ list_del_init(&lkb->lkb_ownqueue);
+ spin_unlock(&ua->proc->locks_spin);
+ unhold_lkb(lkb);
+ }
+ out_put:
+ dlm_put_lkb(lkb);
+ out:
+ unlock_recovery(ls);
+ return error;
+}
+
+static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+ if (ua->lksb.sb_lvbptr)
+ kfree(ua->lksb.sb_lvbptr);
+ kfree(ua);
+ lkb->lkb_astparam = (long)NULL;
+
+ /* TODO: propagate to master if needed */
+ return 0;
+}
+
+/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
+ Regardless of what rsb queue the lock is on, it's removed and freed. */
+
+static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+ struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+ struct dlm_args args;
+ int error;
+
+ /* FIXME: we need to handle the case where the lkb is in limbo
+ while the rsb is being looked up; currently we assert in
+ _unlock_lock/is_remote because rsb nodeid is -1. */
+
+ set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
+
+ error = unlock_lock(ls, lkb, &args);
+ if (error == -DLM_EUNLOCK)
+ error = 0;
+ return error;
+}
+
+/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
+ 1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
+ which we clear here. */
+
+/* proc CLOSING flag is set so no more device_reads should look at proc->asts
+ list, and no more device_writes should add lkb's to proc->locks list; so we
+ shouldn't need to take asts_spin or locks_spin here. This assumes that
+ device reads/writes/closes are serialized -- FIXME: we may need to serialize
+ them ourselves. */
+
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+ struct dlm_lkb *lkb, *safe;
+
+ lock_recovery(ls);
+ mutex_lock(&ls->ls_clear_proc_locks);
+
+ list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
+ if (lkb->lkb_ast_type) {
+ list_del(&lkb->lkb_astqueue);
+ unhold_lkb(lkb);
+ }
+
+ list_del_init(&lkb->lkb_ownqueue);
+
+ if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
+ lkb->lkb_flags |= DLM_IFL_ORPHAN;
+ orphan_proc_lock(ls, lkb);
+ } else {
+ lkb->lkb_flags |= DLM_IFL_DEAD;
+ unlock_proc_lock(ls, lkb);
+ }
+
+ /* this removes the reference for the proc->locks list
+ added by dlm_user_request; it may result in the lkb
+ being freed */
+
+ dlm_put_lkb(lkb);
+ }
+ mutex_unlock(&ls->ls_clear_proc_locks);
+ unlock_recovery(ls);
+}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCK_DOT_H__
+#define __LOCK_DOT_H__
+
+void dlm_print_rsb(struct dlm_rsb *r);
+void dlm_dump_rsb(struct dlm_rsb *r);
+void dlm_print_lkb(struct dlm_lkb *lkb);
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
+int dlm_modes_compat(int mode1, int mode2);
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+ unsigned int flags, struct dlm_rsb **r_ret);
+void dlm_put_rsb(struct dlm_rsb *r);
+void dlm_hold_rsb(struct dlm_rsb *r);
+int dlm_put_lkb(struct dlm_lkb *lkb);
+void dlm_scan_rsbs(struct dlm_ls *ls);
+
+int dlm_purge_locks(struct dlm_ls *ls);
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+void dlm_grant_after_purge(struct dlm_ls *ls);
+int dlm_recover_waiters_post(struct dlm_ls *ls);
+void dlm_recover_waiters_pre(struct dlm_ls *ls);
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+ uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ uint32_t flags, uint32_t lkid);
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+
+static inline int is_master(struct dlm_rsb *r)
+{
+ return !r->res_nodeid;
+}
+
+static inline void lock_rsb(struct dlm_rsb *r)
+{
+ mutex_lock(&r->res_mutex);
+}
+
+static inline void unlock_rsb(struct dlm_rsb *r)
+{
+ mutex_unlock(&r->res_mutex);
+}
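+
+/* Illustrative calling pattern, a sketch of how lock.c uses these helpers
+ * rather than an API contract: the rsb is pinned with a reference and its
+ * mutex is held around any manipulation of its lock queues, e.g.
+ *
+ *   hold_rsb(r);
+ *   lock_rsb(r);
+ *   ... examine or modify the rsb's grant/convert/wait queues ...
+ *   unlock_rsb(r);
+ *   put_rsb(r);
+ */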
+
+#endif
+
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "ast.h"
+#include "dir.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "recover.h"
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
+static int ls_count;
+static struct mutex ls_lock;
+static struct list_head lslist;
+static spinlock_t lslist_lock;
+static struct task_struct * scand_task;
+
+
+static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+ ssize_t ret = len;
+ int n = simple_strtol(buf, NULL, 0);
+
+ switch (n) {
+ case 0:
+ dlm_ls_stop(ls);
+ break;
+ case 1:
+ dlm_ls_start(ls);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+ ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
+ set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
+ wake_up(&ls->ls_uevent_wait);
+ return len;
+}
+
+static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
+}
+
+static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+ ls->ls_global_id = simple_strtoul(buf, NULL, 0);
+ return len;
+}
+
+static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
+{
+ uint32_t status = dlm_recover_status(ls);
+ return snprintf(buf, PAGE_SIZE, "%x\n", status);
+}
+
+static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
+}
+
+struct dlm_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct dlm_ls *, char *);
+ ssize_t (*store)(struct dlm_ls *, const char *, size_t);
+};
+
+static struct dlm_attr dlm_attr_control = {
+ .attr = {.name = "control", .mode = S_IWUSR},
+ .store = dlm_control_store
+};
+
+static struct dlm_attr dlm_attr_event = {
+ .attr = {.name = "event_done", .mode = S_IWUSR},
+ .store = dlm_event_store
+};
+
+static struct dlm_attr dlm_attr_id = {
+ .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR},
+ .show = dlm_id_show,
+ .store = dlm_id_store
+};
+
+static struct dlm_attr dlm_attr_recover_status = {
+ .attr = {.name = "recover_status", .mode = S_IRUGO},
+ .show = dlm_recover_status_show
+};
+
+static struct dlm_attr dlm_attr_recover_nodeid = {
+ .attr = {.name = "recover_nodeid", .mode = S_IRUGO},
+ .show = dlm_recover_nodeid_show
+};
+
+static struct attribute *dlm_attrs[] = {
+ &dlm_attr_control.attr,
+ &dlm_attr_event.attr,
+ &dlm_attr_id.attr,
+ &dlm_attr_recover_status.attr,
+ &dlm_attr_recover_nodeid.attr,
+ NULL,
+};
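+
+/* Summary of the lockspace sysfs attributes defined above (a sketch of the
+ * userspace interface; the exact sysfs location depends on where the "dlm"
+ * kset below is registered, here under the kernel subsystem):
+ *
+ *   control         write 0 to stop the lockspace, 1 to start it
+ *   event_done      write the result of processing the online/offline uevent
+ *   id              read or write the global lockspace id
+ *   recover_status  read the current recovery status bits
+ *   recover_nodeid  read the nodeid currently being recovered
+ */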
+
+static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
+ struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+ return a->show ? a->show(ls, buf) : 0;
+}
+
+static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj);
+ struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+ return a->store ? a->store(ls, buf, len) : len;
+}
+
+static struct sysfs_ops dlm_attr_ops = {
+ .show = dlm_attr_show,
+ .store = dlm_attr_store,
+};
+
+static struct kobj_type dlm_ktype = {
+ .default_attrs = dlm_attrs,
+ .sysfs_ops = &dlm_attr_ops,
+};
+
+static struct kset dlm_kset = {
+ .subsys = &kernel_subsys,
+ .kobj = {.name = "dlm",},
+ .ktype = &dlm_ktype,
+};
+
+static int kobject_setup(struct dlm_ls *ls)
+{
+ char lsname[DLM_LOCKSPACE_LEN];
+ int error;
+
+ memset(lsname, 0, DLM_LOCKSPACE_LEN);
+ snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
+
+ error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
+ if (error)
+ return error;
+
+ ls->ls_kobj.kset = &dlm_kset;
+ ls->ls_kobj.ktype = &dlm_ktype;
+ return 0;
+}
+
+static int do_uevent(struct dlm_ls *ls, int in)
+{
+ int error;
+
+ if (in)
+ kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
+ else
+ kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
+
+ error = wait_event_interruptible(ls->ls_uevent_wait,
+ test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
+ if (error)
+ goto out;
+
+ error = ls->ls_uevent_result;
+ out:
+ return error;
+}
+
+
+int dlm_lockspace_init(void)
+{
+ int error;
+
+ ls_count = 0;
+ mutex_init(&ls_lock);
+ INIT_LIST_HEAD(&lslist);
+ spin_lock_init(&lslist_lock);
+
+ error = kset_register(&dlm_kset);
+ if (error)
+ printk("dlm_lockspace_init: cannot register kset %d\n", error);
+ return error;
+}
+
+void dlm_lockspace_exit(void)
+{
+ kset_unregister(&dlm_kset);
+}
+
+static int dlm_scand(void *data)
+{
+ struct dlm_ls *ls;
+
+ while (!kthread_should_stop()) {
+ list_for_each_entry(ls, &lslist, ls_list)
+ dlm_scan_rsbs(ls);
+ schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
+ }
+ return 0;
+}
+
+static int dlm_scand_start(void)
+{
+ struct task_struct *p;
+ int error = 0;
+
+ p = kthread_run(dlm_scand, NULL, "dlm_scand");
+ if (IS_ERR(p))
+ error = PTR_ERR(p);
+ else
+ scand_task = p;
+ return error;
+}
+
+static void dlm_scand_stop(void)
+{
+ kthread_stop(scand_task);
+}
+
+static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
+{
+ struct dlm_ls *ls;
+
+ spin_lock(&lslist_lock);
+
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (ls->ls_namelen == namelen &&
+ memcmp(ls->ls_name, name, namelen) == 0)
+ goto out;
+ }
+ ls = NULL;
+ out:
+ spin_unlock(&lslist_lock);
+ return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
+{
+ struct dlm_ls *ls;
+
+ spin_lock(&lslist_lock);
+
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (ls->ls_global_id == id) {
+ ls->ls_count++;
+ goto out;
+ }
+ }
+ ls = NULL;
+ out:
+ spin_unlock(&lslist_lock);
+ return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
+{
+ struct dlm_ls *ls;
+
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (ls->ls_local_handle == lockspace) {
+ ls->ls_count++;
+ goto out;
+ }
+ }
+ ls = NULL;
+ out:
+ spin_unlock(&lslist_lock);
+ return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_device(int minor)
+{
+ struct dlm_ls *ls;
+
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (ls->ls_device.minor == minor) {
+ ls->ls_count++;
+ goto out;
+ }
+ }
+ ls = NULL;
+ out:
+ spin_unlock(&lslist_lock);
+ return ls;
+}
+
+void dlm_put_lockspace(struct dlm_ls *ls)
+{
+ spin_lock(&lslist_lock);
+ ls->ls_count--;
+ spin_unlock(&lslist_lock);
+}
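+
+/* Illustrative pairing, a sketch of how callers use these lookups (compare
+ * dlm_receive_message in lock.c): a successful dlm_find_lockspace_*() call
+ * takes a reference on the lockspace which must be dropped again, e.g.
+ *
+ *   struct dlm_ls *ls = dlm_find_lockspace_global(id);
+ *   if (!ls)
+ *           return -EINVAL;
+ *   ... use the lockspace ...
+ *   dlm_put_lockspace(ls);
+ */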
+
+static void remove_lockspace(struct dlm_ls *ls)
+{
+ for (;;) {
+ spin_lock(&lslist_lock);
+ if (ls->ls_count == 0) {
+ list_del(&ls->ls_list);
+ spin_unlock(&lslist_lock);
+ return;
+ }
+ spin_unlock(&lslist_lock);
+ ssleep(1);
+ }
+}
+
+static int threads_start(void)
+{
+ int error;
+
+ /* Thread which processes lock requests for all lockspaces */
+ error = dlm_astd_start();
+ if (error) {
+ log_print("cannot start dlm_astd thread %d", error);
+ goto fail;
+ }
+
+ error = dlm_scand_start();
+ if (error) {
+ log_print("cannot start dlm_scand thread %d", error);
+ goto astd_fail;
+ }
+
+ /* Thread for sending/receiving messages for all lockspaces */
+ error = dlm_lowcomms_start();
+ if (error) {
+ log_print("cannot start dlm lowcomms %d", error);
+ goto scand_fail;
+ }
+
+ return 0;
+
+ scand_fail:
+ dlm_scand_stop();
+ astd_fail:
+ dlm_astd_stop();
+ fail:
+ return error;
+}
+
+static void threads_stop(void)
+{
+ dlm_scand_stop();
+ dlm_lowcomms_stop();
+ dlm_astd_stop();
+}
+
+static int new_lockspace(char *name, int namelen, void **lockspace,
+ uint32_t flags, int lvblen)
+{
+ struct dlm_ls *ls;
+ int i, size, error = -ENOMEM;
+
+ if (namelen > DLM_LOCKSPACE_LEN)
+ return -EINVAL;
+
+ if (!lvblen || (lvblen % 8))
+ return -EINVAL;
+
+ if (!try_module_get(THIS_MODULE))
+ return -EINVAL;
+
+ ls = dlm_find_lockspace_name(name, namelen);
+ if (ls) {
+ *lockspace = ls;
+ module_put(THIS_MODULE);
+ return -EEXIST;
+ }
+
+ ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+ if (!ls)
+ goto out;
+ memcpy(ls->ls_name, name, namelen);
+ ls->ls_namelen = namelen;
+ ls->ls_exflags = flags;
+ ls->ls_lvblen = lvblen;
+ ls->ls_count = 0;
+ ls->ls_flags = 0;
+
+ size = dlm_config.rsbtbl_size;
+ ls->ls_rsbtbl_size = size;
+
+ ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+ if (!ls->ls_rsbtbl)
+ goto out_lsfree;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
+ INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
+ rwlock_init(&ls->ls_rsbtbl[i].lock);
+ }
+
+ size = dlm_config.lkbtbl_size;
+ ls->ls_lkbtbl_size = size;
+
+ ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+ if (!ls->ls_lkbtbl)
+ goto out_rsbfree;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
+ rwlock_init(&ls->ls_lkbtbl[i].lock);
+ ls->ls_lkbtbl[i].counter = 1;
+ }
+
+ size = dlm_config.dirtbl_size;
+ ls->ls_dirtbl_size = size;
+
+ ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+ if (!ls->ls_dirtbl)
+ goto out_lkbfree;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
+ rwlock_init(&ls->ls_dirtbl[i].lock);
+ }
+
+ INIT_LIST_HEAD(&ls->ls_waiters);
+ mutex_init(&ls->ls_waiters_mutex);
+
+ INIT_LIST_HEAD(&ls->ls_nodes);
+ INIT_LIST_HEAD(&ls->ls_nodes_gone);
+ ls->ls_num_nodes = 0;
+ ls->ls_low_nodeid = 0;
+ ls->ls_total_weight = 0;
+ ls->ls_node_array = NULL;
+
+ memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
+ ls->ls_stub_rsb.res_ls = ls;
+
+ ls->ls_debug_rsb_dentry = NULL;
+ ls->ls_debug_waiters_dentry = NULL;
+
+ init_waitqueue_head(&ls->ls_uevent_wait);
+ ls->ls_uevent_result = 0;
+
+ ls->ls_recoverd_task = NULL;
+ mutex_init(&ls->ls_recoverd_active);
+ spin_lock_init(&ls->ls_recover_lock);
+ ls->ls_recover_status = 0;
+ ls->ls_recover_seq = 0;
+ ls->ls_recover_args = NULL;
+ init_rwsem(&ls->ls_in_recovery);
+ INIT_LIST_HEAD(&ls->ls_requestqueue);
+ mutex_init(&ls->ls_requestqueue_mutex);
+ mutex_init(&ls->ls_clear_proc_locks);
+
+ ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+ if (!ls->ls_recover_buf)
+ goto out_dirfree;
+
+ INIT_LIST_HEAD(&ls->ls_recover_list);
+ spin_lock_init(&ls->ls_recover_list_lock);
+ ls->ls_recover_list_count = 0;
+ ls->ls_local_handle = ls;
+ init_waitqueue_head(&ls->ls_wait_general);
+ INIT_LIST_HEAD(&ls->ls_root_list);
+ init_rwsem(&ls->ls_root_sem);
+
+ down_write(&ls->ls_in_recovery);
+
+ spin_lock(&lslist_lock);
+ list_add(&ls->ls_list, &lslist);
+ spin_unlock(&lslist_lock);
+
+ /* needs to find ls in lslist */
+ error = dlm_recoverd_start(ls);
+ if (error) {
+ log_error(ls, "can't start dlm_recoverd %d", error);
+ goto out_rcomfree;
+ }
+
+ dlm_create_debug_file(ls);
+
+ error = kobject_setup(ls);
+ if (error)
+ goto out_del;
+
+ error = kobject_register(&ls->ls_kobj);
+ if (error)
+ goto out_del;
+
+ error = do_uevent(ls, 1);
+ if (error)
+ goto out_unreg;
+
+ *lockspace = ls;
+ return 0;
+
+ out_unreg:
+ kobject_unregister(&ls->ls_kobj);
+ out_del:
+ dlm_delete_debug_file(ls);
+ dlm_recoverd_stop(ls);
+ out_rcomfree:
+ spin_lock(&lslist_lock);
+ list_del(&ls->ls_list);
+ spin_unlock(&lslist_lock);
+ kfree(ls->ls_recover_buf);
+ out_dirfree:
+ kfree(ls->ls_dirtbl);
+ out_lkbfree:
+ kfree(ls->ls_lkbtbl);
+ out_rsbfree:
+ kfree(ls->ls_rsbtbl);
+ out_lsfree:
+ kfree(ls);
+ out:
+ module_put(THIS_MODULE);
+ return error;
+}
+
+int dlm_new_lockspace(char *name, int namelen, void **lockspace,
+ uint32_t flags, int lvblen)
+{
+ int error = 0;
+
+ mutex_lock(&ls_lock);
+ if (!ls_count)
+ error = threads_start();
+ if (error)
+ goto out;
+
+ error = new_lockspace(name, namelen, lockspace, flags, lvblen);
+ if (!error)
+ ls_count++;
+ out:
+ mutex_unlock(&ls_lock);
+ return error;
+}
+
+/* Return 1 if the lockspace still has active remote locks,
+ * 2 if the lockspace still has active local locks.
+ */
+static int lockspace_busy(struct dlm_ls *ls)
+{
+ int i, lkb_found = 0;
+ struct dlm_lkb *lkb;
+
+ /* NOTE: We check the lockidtbl here rather than the resource table.
+ This is because there may be LKBs queued as ASTs that have been
+ unlinked from their RSBs and are pending deletion once the AST has
+ been delivered */
+
+ for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+ read_lock(&ls->ls_lkbtbl[i].lock);
+ if (!list_empty(&ls->ls_lkbtbl[i].list)) {
+ lkb_found = 1;
+ list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
+ lkb_idtbl_list) {
+ if (!lkb->lkb_nodeid) {
+ read_unlock(&ls->ls_lkbtbl[i].lock);
+ return 2;
+ }
+ }
+ }
+ read_unlock(&ls->ls_lkbtbl[i].lock);
+ }
+ return lkb_found;
+}
+
+static int release_lockspace(struct dlm_ls *ls, int force)
+{
+ struct dlm_lkb *lkb;
+ struct dlm_rsb *rsb;
+ struct list_head *head;
+ int i;
+ int busy = lockspace_busy(ls);
+
+ if (busy > force)
+ return -EBUSY;
+
+ if (force < 3)
+ do_uevent(ls, 0);
+
+ dlm_recoverd_stop(ls);
+
+ remove_lockspace(ls);
+
+ dlm_delete_debug_file(ls);
+
+ dlm_astd_suspend();
+
+ kfree(ls->ls_recover_buf);
+
+ /*
+ * Free direntry structs.
+ */
+
+ dlm_dir_clear(ls);
+ kfree(ls->ls_dirtbl);
+
+ /*
+ * Free all lkb's on lkbtbl[] lists.
+ */
+
+ for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+ head = &ls->ls_lkbtbl[i].list;
+ while (!list_empty(head)) {
+ lkb = list_entry(head->next, struct dlm_lkb,
+ lkb_idtbl_list);
+
+ list_del(&lkb->lkb_idtbl_list);
+
+ dlm_del_ast(lkb);
+
+ if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
+ free_lvb(lkb->lkb_lvbptr);
+
+ free_lkb(lkb);
+ }
+ }
+ dlm_astd_resume();
+
+ kfree(ls->ls_lkbtbl);
+
+ /*
+ * Free all rsb's on rsbtbl[] lists
+ */
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ head = &ls->ls_rsbtbl[i].list;
+ while (!list_empty(head)) {
+ rsb = list_entry(head->next, struct dlm_rsb,
+ res_hashchain);
+
+ list_del(&rsb->res_hashchain);
+ free_rsb(rsb);
+ }
+
+ head = &ls->ls_rsbtbl[i].toss;
+ while (!list_empty(head)) {
+ rsb = list_entry(head->next, struct dlm_rsb,
+ res_hashchain);
+ list_del(&rsb->res_hashchain);
+ free_rsb(rsb);
+ }
+ }
+
+ kfree(ls->ls_rsbtbl);
+
+ /*
+ * Free structures on any other lists
+ */
+
+ kfree(ls->ls_recover_args);
+ dlm_clear_free_entries(ls);
+ dlm_clear_members(ls);
+ dlm_clear_members_gone(ls);
+ kfree(ls->ls_node_array);
+ kobject_unregister(&ls->ls_kobj);
+ kfree(ls);
+
+ mutex_lock(&ls_lock);
+ ls_count--;
+ if (!ls_count)
+ threads_stop();
+ mutex_unlock(&ls_lock);
+
+ module_put(THIS_MODULE);
+ return 0;
+}
+
+/*
+ * Called when a system has released all its locks and is not going to use the
+ * lockspace any longer. We free everything we're managing for this lockspace.
+ * Remaining nodes will go through the recovery process as if we'd died. The
+ * lockspace must continue to function as usual, participating in recoveries,
+ * until this returns.
+ *
+ * Force has 4 possible values:
+ * 0 - don't destroy lockspace if it has any LKBs
+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
+ * 2 - destroy lockspace regardless of LKBs
+ * 3 - destroy lockspace as part of a forced shutdown
+ */
+
+int dlm_release_lockspace(void *lockspace, int force)
+{
+ struct dlm_ls *ls;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+ dlm_put_lockspace(ls);
+ return release_lockspace(ls, force);
+}
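+
+/* Illustrative sketch of creating and tearing down a lockspace with the
+ * functions above; the name "example", the lvblen of 32 and the error
+ * handling are hypothetical:
+ *
+ *   void *ls;
+ *   int error;
+ *
+ *   error = dlm_new_lockspace("example", strlen("example"), &ls, 0, 32);
+ *   if (error)
+ *           return error;
+ *   ... use the lockspace ...
+ *   dlm_release_lockspace(ls, 0);   force=0: fails with -EBUSY if LKBs remain
+ */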
+
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCKSPACE_DOT_H__
+#define __LOCKSPACE_DOT_H__
+
+int dlm_lockspace_init(void);
+void dlm_lockspace_exit(void);
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
+struct dlm_ls *dlm_find_lockspace_local(void *id);
+struct dlm_ls *dlm_find_lockspace_device(int minor);
+void dlm_put_lockspace(struct dlm_ls *ls);
+
+#endif /* __LOCKSPACE_DOT_H__ */
+
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..6da6b14d5a61
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1239 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is its
+ * responsibility. It is this layer's responsibility to resolve
+ * these into IP addresses or whatever else it needs for
+ * inter-node communication.
+ *
+ * The comms level consists of two kernel threads: a receive thread
+ * that deals mainly with receiving messages from other nodes and
+ * passing them up to the mid-level comms layer (which understands
+ * the message format) for execution by the locking core, and a
+ * send thread which does all the setting up of connections to
+ * remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * I don't see any problem with the recv thread executing the locking
+ * code on behalf of remote processes as the locking code is
+ * short, efficient and never (well, hardly ever) waits.
+ *
+ */
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/sctp/user.h>
+#include <linux/pagemap.h>
+#include <linux/socket.h>
+#include <linux/idr.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "midcomms.h"
+
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int dlm_local_count;
+static int dlm_local_nodeid;
+
+/* One of these per connected node */
+
+#define NI_INIT_PENDING 1
+#define NI_WRITE_PENDING 2
+
+struct nodeinfo {
+ spinlock_t lock;
+ sctp_assoc_t assoc_id;
+ unsigned long flags;
+ struct list_head write_list; /* nodes with pending writes */
+ struct list_head writequeue; /* outgoing writequeue_entries */
+ spinlock_t writequeue_lock;
+ int nodeid;
+};
+
+static DEFINE_IDR(nodeinfo_idr);
+static struct rw_semaphore nodeinfo_lock;
+static int max_nodeid;
+
+struct cbuf {
+ unsigned base;
+ unsigned len;
+ unsigned mask;
+};
+
+/* There is just one of these now, but this struct keeps
+ the connection-specific variables together */
+
+#define CF_READ_PENDING 1
+
+struct connection {
+ struct socket *sock;
+ unsigned long flags;
+ struct page *rx_page;
+ atomic_t waiting_requests;
+ struct cbuf cb;
+ int eagain_flag;
+};
+
+/* An entry waiting to be sent */
+
+struct writequeue_entry {
+ struct list_head list;
+ struct page *page;
+ int offset;
+ int len;
+ int end;
+ int users;
+ struct nodeinfo *ni;
+};
+
+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
+#define CBUF_EMPTY(cb) ((cb)->len == 0)
+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+
+#define CBUF_INIT(cb, size) \
+do { \
+ (cb)->base = (cb)->len = 0; \
+ (cb)->mask = ((size)-1); \
+} while(0)
+
+#define CBUF_EAT(cb, n) \
+do { \
+ (cb)->len -= (n); \
+ (cb)->base += (n); \
+ (cb)->base &= (cb)->mask; \
+} while(0)
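+
+/* Sketch of how these macros are used in receive_from_sock() below: the
+ * buffer wraps within a single page, bytes are accounted in after a
+ * recvmsg and accounted out once the mid-level layer has consumed them.
+ *
+ *   CBUF_INIT(&cb, PAGE_CACHE_SIZE);  empty buffer, mask = PAGE_CACHE_SIZE - 1
+ *   CBUF_DATA(&cb)                    offset where the next received bytes land
+ *   CBUF_ADD(&cb, ret);               account for ret bytes just written there
+ *   CBUF_EAT(&cb, n);                 drop n processed bytes from the front
+ */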
+
+
+/* List of nodes which have writes pending */
+static struct list_head write_nodes;
+static spinlock_t write_nodes_lock;
+
+/* Maximum number of incoming messages to process before
+ * doing a schedule()
+ */
+#define MAX_RX_MSG_COUNT 25
+
+/* Manage daemons */
+static struct task_struct *recv_task;
+static struct task_struct *send_task;
+static wait_queue_head_t lowcomms_recv_wait;
+static atomic_t accepting;
+
+/* The SCTP connection */
+static struct connection sctp_con;
+
+
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+ struct sockaddr_storage addr;
+ int error;
+
+ if (!dlm_local_count)
+ return -1;
+
+ error = dlm_nodeid_to_addr(nodeid, &addr);
+ if (error)
+ return error;
+
+ if (dlm_local_addr[0]->ss_family == AF_INET) {
+ struct sockaddr_in *in4 = (struct sockaddr_in *) &addr;
+ struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
+ ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+ } else {
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr;
+ struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
+ memcpy(&ret6->sin6_addr, &in6->sin6_addr,
+ sizeof(in6->sin6_addr));
+ }
+
+ return 0;
+}
+
+static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
+{
+ struct nodeinfo *ni;
+ int r;
+ int n;
+
+ down_read(&nodeinfo_lock);
+ ni = idr_find(&nodeinfo_idr, nodeid);
+ up_read(&nodeinfo_lock);
+
+ if (!ni && alloc) {
+ down_write(&nodeinfo_lock);
+
+ ni = idr_find(&nodeinfo_idr, nodeid);
+ if (ni)
+ goto out_up;
+
+ r = idr_pre_get(&nodeinfo_idr, alloc);
+ if (!r)
+ goto out_up;
+
+ ni = kmalloc(sizeof(struct nodeinfo), alloc);
+ if (!ni)
+ goto out_up;
+
+ r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
+ if (r) {
+ kfree(ni);
+ ni = NULL;
+ goto out_up;
+ }
+ if (n != nodeid) {
+ idr_remove(&nodeinfo_idr, n);
+ kfree(ni);
+ ni = NULL;
+ goto out_up;
+ }
+ memset(ni, 0, sizeof(struct nodeinfo));
+ spin_lock_init(&ni->lock);
+ INIT_LIST_HEAD(&ni->writequeue);
+ spin_lock_init(&ni->writequeue_lock);
+ ni->nodeid = nodeid;
+
+ if (nodeid > max_nodeid)
+ max_nodeid = nodeid;
+ out_up:
+ up_write(&nodeinfo_lock);
+ }
+
+ return ni;
+}
+
+/* Don't call this too often... */
+static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
+{
+ int i;
+ struct nodeinfo *ni;
+
+ for (i=1; i<=max_nodeid; i++) {
+ ni = nodeid2nodeinfo(i, 0);
+ if (ni && ni->assoc_id == assoc)
+ return ni;
+ }
+ return NULL;
+}
+
+/* Data or notification available on socket */
+static void lowcomms_data_ready(struct sock *sk, int count_unused)
+{
+ atomic_inc(&sctp_con.waiting_requests);
+ if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
+ return;
+
+ wake_up_interruptible(&lowcomms_recv_wait);
+}
+
+
+/* Add the port number to an IPv6 or IPv4 sockaddr and return the address length.
+ Also pad out the struct with zeros to make comparisons meaningful */
+
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+ int *addr_len)
+{
+ struct sockaddr_in *local4_addr;
+ struct sockaddr_in6 *local6_addr;
+
+ if (!dlm_local_count)
+ return;
+
+ if (!port) {
+ if (dlm_local_addr[0]->ss_family == AF_INET) {
+ local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
+ port = be16_to_cpu(local4_addr->sin_port);
+ } else {
+ local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
+ port = be16_to_cpu(local6_addr->sin6_port);
+ }
+ }
+
+ saddr->ss_family = dlm_local_addr[0]->ss_family;
+ if (dlm_local_addr[0]->ss_family == AF_INET) {
+ struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
+ in4_addr->sin_port = cpu_to_be16(port);
+ memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
+ memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
+ sizeof(struct sockaddr_in));
+ *addr_len = sizeof(struct sockaddr_in);
+ } else {
+ struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
+ in6_addr->sin6_port = cpu_to_be16(port);
+ memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
+ sizeof(struct sockaddr_in6));
+ *addr_len = sizeof(struct sockaddr_in6);
+ }
+}
+
+/* Close the connection and tidy up */
+static void close_connection(void)
+{
+ if (sctp_con.sock) {
+ sock_release(sctp_con.sock);
+ sctp_con.sock = NULL;
+ }
+
+ if (sctp_con.rx_page) {
+ __free_page(sctp_con.rx_page);
+ sctp_con.rx_page = NULL;
+ }
+}
+
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void send_shutdown(sctp_assoc_t associd)
+{
+ static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct msghdr outmessage;
+ struct cmsghdr *cmsg;
+ struct sctp_sndrcvinfo *sinfo;
+ int ret;
+
+ outmessage.msg_name = NULL;
+ outmessage.msg_namelen = 0;
+ outmessage.msg_control = outcmsg;
+ outmessage.msg_controllen = sizeof(outcmsg);
+ outmessage.msg_flags = MSG_EOR;
+
+ cmsg = CMSG_FIRSTHDR(&outmessage);
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ outmessage.msg_controllen = cmsg->cmsg_len;
+ sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+ sinfo->sinfo_flags |= MSG_EOF;
+ sinfo->sinfo_assoc_id = associd;
+
+ ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
+
+ if (ret != 0)
+ log_print("send EOF to node failed: %d", ret);
+}
+
+
+/* INIT failed but we don't know which node...
+ restart INIT on all pending nodes */
+static void init_failed(void)
+{
+ int i;
+ struct nodeinfo *ni;
+
+ for (i=1; i<=max_nodeid; i++) {
+ ni = nodeid2nodeinfo(i, 0);
+ if (!ni)
+ continue;
+
+ if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+ ni->assoc_id = 0;
+ if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+ spin_lock_bh(&write_nodes_lock);
+ list_add_tail(&ni->write_list, &write_nodes);
+ spin_unlock_bh(&write_nodes_lock);
+ }
+ }
+ }
+ wake_up_process(send_task);
+}
+
+/* Something happened to an association */
+static void process_sctp_notification(struct msghdr *msg, char *buf)
+{
+ union sctp_notification *sn = (union sctp_notification *)buf;
+
+ if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+ switch (sn->sn_assoc_change.sac_state) {
+
+ case SCTP_COMM_UP:
+ case SCTP_RESTART:
+ {
+ /* Check that the new node is in the lockspace */
+ struct sctp_prim prim;
+ mm_segment_t fs;
+ int nodeid;
+ int prim_len, ret;
+ int addr_len;
+ struct nodeinfo *ni;
+
+ /* This seems to happen when we receive a connection
+ * too early... or something... anyway, it happens but
+ * we always seem to get a real message too, see
+ * receive_from_sock */
+
+ if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+ log_print("COMM_UP for invalid assoc ID %d",
+ (int)sn->sn_assoc_change.sac_assoc_id);
+ init_failed();
+ return;
+ }
+ memset(&prim, 0, sizeof(struct sctp_prim));
+ prim_len = sizeof(struct sctp_prim);
+ prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+ fs = get_fs();
+ set_fs(get_ds());
+ ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
+ IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
+ (char*)&prim, &prim_len);
+ set_fs(fs);
+ if (ret < 0) {
+ struct nodeinfo *ni;
+
+ log_print("getsockopt/sctp_primary_addr on "
+ "new assoc %d failed : %d",
+ (int)sn->sn_assoc_change.sac_assoc_id, ret);
+
+ /* Retry INIT later */
+ ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+ if (ni)
+ clear_bit(NI_INIT_PENDING, &ni->flags);
+ return;
+ }
+ make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+ if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+ log_print("reject connect from unknown addr");
+ send_shutdown(prim.ssp_assoc_id);
+ return;
+ }
+
+ ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+ if (!ni)
+ return;
+
+ /* Save the assoc ID */
+ spin_lock(&ni->lock);
+ ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
+ spin_unlock(&ni->lock);
+
+ log_print("got new/restarted association %d nodeid %d",
+ (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+
+ /* Send any pending writes */
+ clear_bit(NI_INIT_PENDING, &ni->flags);
+ if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+ spin_lock_bh(&write_nodes_lock);
+ list_add_tail(&ni->write_list, &write_nodes);
+ spin_unlock_bh(&write_nodes_lock);
+ }
+ wake_up_process(send_task);
+ }
+ break;
+
+ case SCTP_COMM_LOST:
+ case SCTP_SHUTDOWN_COMP:
+ {
+ struct nodeinfo *ni;
+
+ ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+ if (ni) {
+ spin_lock(&ni->lock);
+ ni->assoc_id = 0;
+ spin_unlock(&ni->lock);
+ }
+ }
+ break;
+
+ /* We don't know which INIT failed, so clear the PENDING flags
+ * on them all; if assoc_id is zero then the INIT will be
+ * retried */
+
+ case SCTP_CANT_STR_ASSOC:
+ {
+ log_print("Can't start SCTP association - retrying");
+ init_failed();
+ }
+ break;
+
+ default:
+ log_print("unexpected SCTP assoc change id=%d state=%d",
+ (int)sn->sn_assoc_change.sac_assoc_id,
+ sn->sn_assoc_change.sac_state);
+ }
+ }
+}
+
+/* Data received from remote end */
+static int receive_from_sock(void)
+{
+ int ret = 0;
+ struct msghdr msg;
+ struct kvec iov[2];
+ unsigned len;
+ int r;
+ struct sctp_sndrcvinfo *sinfo;
+ struct cmsghdr *cmsg;
+ struct nodeinfo *ni;
+
+ /* These two are marginally too big for stack allocation, but this
+ * function is (currently) only called by dlm_recvd so static should be
+ * OK.
+ */
+ static struct sockaddr_storage msgname;
+ static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+
+ if (sctp_con.sock == NULL)
+ goto out;
+
+ if (sctp_con.rx_page == NULL) {
+ /*
+ * This doesn't need to be atomic, but I think it should
+ * improve performance if it is.
+ */
+ sctp_con.rx_page = alloc_page(GFP_ATOMIC);
+ if (sctp_con.rx_page == NULL)
+ goto out_resched;
+ CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
+ }
+
+ memset(&incmsg, 0, sizeof(incmsg));
+ memset(&msgname, 0, sizeof(msgname));
+
+ msg.msg_name = &msgname;
+ msg.msg_namelen = sizeof(msgname);
+ msg.msg_flags = 0;
+ msg.msg_control = incmsg;
+ msg.msg_controllen = sizeof(incmsg);
+ msg.msg_iovlen = 1;
+
+ /* I don't see why this circular buffer stuff is necessary for SCTP,
+ * which is a packet-based protocol, but the whole thing breaks under
+ * load without it! The overhead is minimal (and is in the TCP lowcomms
+ * anyway, of course) so I'll leave it in until I can figure out what's
+ * really happening.
+ */
+
+ /*
+ * iov[0] is the bit of the circular buffer between the current end
+ * point (cb.base + cb.len) and the end of the buffer.
+ */
+ iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
+ iov[0].iov_base = page_address(sctp_con.rx_page) +
+ CBUF_DATA(&sctp_con.cb);
+ iov[1].iov_len = 0;
+
+ /*
+ * iov[1] is the bit of the circular buffer between the start of the
+ * buffer and the start of the currently used section (cb.base)
+ */
+ if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
+ iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
+ iov[1].iov_len = sctp_con.cb.base;
+ iov[1].iov_base = page_address(sctp_con.rx_page);
+ msg.msg_iovlen = 2;
+ }
+ len = iov[0].iov_len + iov[1].iov_len;
+
+ r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
+ MSG_NOSIGNAL | MSG_DONTWAIT);
+ if (ret <= 0)
+ goto out_close;
+
+ msg.msg_control = incmsg;
+ msg.msg_controllen = sizeof(incmsg);
+ cmsg = CMSG_FIRSTHDR(&msg);
+ sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+
+ if (msg.msg_flags & MSG_NOTIFICATION) {
+ process_sctp_notification(&msg, page_address(sctp_con.rx_page));
+ return 0;
+ }
+
+ /* Is this a new association ? */
+ ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
+ if (ni) {
+ ni->assoc_id = sinfo->sinfo_assoc_id;
+ if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+
+ if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+ spin_lock_bh(&write_nodes_lock);
+ list_add_tail(&ni->write_list, &write_nodes);
+ spin_unlock_bh(&write_nodes_lock);
+ }
+ wake_up_process(send_task);
+ }
+ }
+
+ /* INIT sends a message with length of 1 - ignore it */
+ if (r == 1)
+ return 0;
+
+ CBUF_ADD(&sctp_con.cb, ret);
+ ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
+ page_address(sctp_con.rx_page),
+ sctp_con.cb.base, sctp_con.cb.len,
+ PAGE_CACHE_SIZE);
+ if (ret < 0)
+ goto out_close;
+ CBUF_EAT(&sctp_con.cb, ret);
+
+ out:
+ ret = 0;
+ goto out_ret;
+
+ out_resched:
+ lowcomms_data_ready(sctp_con.sock->sk, 0);
+ ret = 0;
+ schedule();
+ goto out_ret;
+
+ out_close:
+ if (ret != -EAGAIN)
+ log_print("error reading from sctp socket: %d", ret);
+ out_ret:
+ return ret;
+}
+
+/* Bind to an IP address. SCTP allows multiple addresses so it can do multi-homing */
+static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
+{
+ mm_segment_t fs;
+ int result = 0;
+
+ fs = get_fs();
+ set_fs(get_ds());
+ if (num == 1)
+ result = sctp_con.sock->ops->bind(sctp_con.sock,
+ (struct sockaddr *) addr, addr_len);
+ else
+ result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
+ SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
+ set_fs(fs);
+
+ if (result < 0)
+ log_print("Can't bind to port %d addr number %d",
+ dlm_config.tcp_port, num);
+
+ return result;
+}
+
+static void init_local(void)
+{
+ struct sockaddr_storage sas, *addr;
+ int i;
+
+ dlm_local_nodeid = dlm_our_nodeid();
+
+ for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+ if (dlm_our_addr(&sas, i))
+ break;
+
+ addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+ if (!addr)
+ break;
+ memcpy(addr, &sas, sizeof(*addr));
+ dlm_local_addr[dlm_local_count++] = addr;
+ }
+}
+
+/* Initialise SCTP socket and bind to all interfaces */
+static int init_sock(void)
+{
+ mm_segment_t fs;
+ struct socket *sock = NULL;
+ struct sockaddr_storage localaddr;
+ struct sctp_event_subscribe subscribe;
+ int result = -EINVAL, num = 1, i, addr_len;
+
+ if (!dlm_local_count) {
+ init_local();
+ if (!dlm_local_count) {
+ log_print("no local IP address has been set");
+ goto out;
+ }
+ }
+
+ result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+ IPPROTO_SCTP, &sock);
+ if (result < 0) {
+ log_print("Can't create comms socket, check SCTP is loaded");
+ goto out;
+ }
+
+ /* Listen for events */
+ memset(&subscribe, 0, sizeof(subscribe));
+ subscribe.sctp_data_io_event = 1;
+ subscribe.sctp_association_event = 1;
+ subscribe.sctp_send_failure_event = 1;
+ subscribe.sctp_shutdown_event = 1;
+ subscribe.sctp_partial_delivery_event = 1;
+
+ fs = get_fs();
+ set_fs(get_ds());
+ result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+ (char *)&subscribe, sizeof(subscribe));
+ set_fs(fs);
+
+ if (result < 0) {
+ log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+ result);
+ goto create_delsock;
+ }
+
+ /* Init con struct */
+ sock->sk->sk_user_data = &sctp_con;
+ sctp_con.sock = sock;
+ sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
+
+ /* Bind to all interfaces. */
+ for (i = 0; i < dlm_local_count; i++) {
+ memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+ make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
+
+ result = add_bind_addr(&localaddr, addr_len, num);
+ if (result)
+ goto create_delsock;
+ ++num;
+ }
+
+ result = sock->ops->listen(sock, 5);
+ if (result < 0) {
+ log_print("Can't set socket listening");
+ goto create_delsock;
+ }
+
+ return 0;
+
+ create_delsock:
+ sock_release(sock);
+ sctp_con.sock = NULL;
+ out:
+ return result;
+}
+
+
+static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
+{
+ struct writequeue_entry *entry;
+
+ entry = kmalloc(sizeof(struct writequeue_entry), allocation);
+ if (!entry)
+ return NULL;
+
+ entry->page = alloc_page(allocation);
+ if (!entry->page) {
+ kfree(entry);
+ return NULL;
+ }
+
+ entry->offset = 0;
+ entry->len = 0;
+ entry->end = 0;
+ entry->users = 0;
+
+ return entry;
+}
+
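+/* Reserve len bytes in the last page of the node's write queue, falling back
+   to a freshly allocated writequeue_entry if the current page can't hold the
+   message. 'users' counts callers that have reserved space but not yet
+   committed it: the page is kmapped for the first user here and kunmapped
+   again in dlm_lowcomms_commit_buffer() once the last user commits. */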
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+{
+ struct writequeue_entry *e;
+ int offset = 0;
+ int users = 0;
+ struct nodeinfo *ni;
+
+ if (!atomic_read(&accepting))
+ return NULL;
+
+ ni = nodeid2nodeinfo(nodeid, allocation);
+ if (!ni)
+ return NULL;
+
+ spin_lock(&ni->writequeue_lock);
+ e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
+ if (((struct list_head *) e == &ni->writequeue) ||
+ (PAGE_CACHE_SIZE - e->end < len)) {
+ e = NULL;
+ } else {
+ offset = e->end;
+ e->end += len;
+ users = e->users++;
+ }
+ spin_unlock(&ni->writequeue_lock);
+
+ if (e) {
+ got_one:
+ if (users == 0)
+ kmap(e->page);
+ *ppc = page_address(e->page) + offset;
+ return e;
+ }
+
+ e = new_writequeue_entry(allocation);
+ if (e) {
+ spin_lock(&ni->writequeue_lock);
+ offset = e->end;
+ e->end += len;
+ e->ni = ni;
+ users = e->users++;
+ list_add_tail(&e->list, &ni->writequeue);
+ spin_unlock(&ni->writequeue_lock);
+ goto got_one;
+ }
+ return NULL;
+}
+
+void dlm_lowcomms_commit_buffer(void *arg)
+{
+ struct writequeue_entry *e = (struct writequeue_entry *) arg;
+ int users;
+ struct nodeinfo *ni = e->ni;
+
+ if (!atomic_read(&accepting))
+ return;
+
+ spin_lock(&ni->writequeue_lock);
+ users = --e->users;
+ if (users)
+ goto out;
+ e->len = e->end - e->offset;
+ kunmap(e->page);
+ spin_unlock(&ni->writequeue_lock);
+
+ if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+ spin_lock_bh(&write_nodes_lock);
+ list_add_tail(&ni->write_list, &write_nodes);
+ spin_unlock_bh(&write_nodes_lock);
+ wake_up_process(send_task);
+ }
+ return;
+
+ out:
+ spin_unlock(&ni->writequeue_lock);
+ return;
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+ __free_page(e->page);
+ kfree(e);
+}
+
+/* Initiate an SCTP association. In theory we could just use sendmsg() on
+ the first IP address and it should work, but this allows us to set up the
+ association before sending any valuable data that we can't afford to lose.
+ It also keeps the send path clean as it can now always use the association ID */
+static void initiate_association(int nodeid)
+{
+ struct sockaddr_storage rem_addr;
+ static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct msghdr outmessage;
+ struct cmsghdr *cmsg;
+ struct sctp_sndrcvinfo *sinfo;
+ int ret;
+ int addrlen;
+ char buf[1];
+ struct kvec iov[1];
+ struct nodeinfo *ni;
+
+ log_print("Initiating association with node %d", nodeid);
+
+ ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+ if (!ni)
+ return;
+
+ if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
+ log_print("no address for nodeid %d", nodeid);
+ return;
+ }
+
+ make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+
+ outmessage.msg_name = &rem_addr;
+ outmessage.msg_namelen = addrlen;
+ outmessage.msg_control = outcmsg;
+ outmessage.msg_controllen = sizeof(outcmsg);
+ outmessage.msg_flags = MSG_EOR;
+
+ iov[0].iov_base = buf;
+ iov[0].iov_len = 1;
+
+ /* Real INIT messages seem to cause trouble. Just send a 1 byte message
+ we can afford to lose */
+ cmsg = CMSG_FIRSTHDR(&outmessage);
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+ sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+
+ outmessage.msg_controllen = cmsg->cmsg_len;
+ ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
+ if (ret < 0) {
+ log_print("send INIT to node failed: %d", ret);
+ /* Try again later */
+ clear_bit(NI_INIT_PENDING, &ni->flags);
+ }
+}
+
+/* Send a message */
+static int send_to_sock(struct nodeinfo *ni)
+{
+ int ret = 0;
+ struct writequeue_entry *e;
+ int len, offset;
+ struct msghdr outmsg;
+ static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+ struct cmsghdr *cmsg;
+ struct sctp_sndrcvinfo *sinfo;
+ struct kvec iov;
+
+ /* See if we need to init an association before we start
+ sending precious messages */
+ spin_lock(&ni->lock);
+ if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+ spin_unlock(&ni->lock);
+ initiate_association(ni->nodeid);
+ return 0;
+ }
+ spin_unlock(&ni->lock);
+
+ outmsg.msg_name = NULL; /* We use assoc_id */
+ outmsg.msg_namelen = 0;
+ outmsg.msg_control = outcmsg;
+ outmsg.msg_controllen = sizeof(outcmsg);
+ outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
+
+ cmsg = CMSG_FIRSTHDR(&outmsg);
+ cmsg->cmsg_level = IPPROTO_SCTP;
+ cmsg->cmsg_type = SCTP_SNDRCV;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+ sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+ memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+ sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+ sinfo->sinfo_assoc_id = ni->assoc_id;
+ outmsg.msg_controllen = cmsg->cmsg_len;
+
+ spin_lock(&ni->writequeue_lock);
+ for (;;) {
+ if (list_empty(&ni->writequeue))
+ break;
+ e = list_entry(ni->writequeue.next, struct writequeue_entry,
+ list);
+ len = e->len;
+ offset = e->offset;
+ BUG_ON(len == 0 && e->users == 0);
+ spin_unlock(&ni->writequeue_lock);
+ kmap(e->page);
+
+ ret = 0;
+ if (len) {
+ iov.iov_base = page_address(e->page)+offset;
+ iov.iov_len = len;
+
+ ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
+ len);
+ if (ret == -EAGAIN) {
+ sctp_con.eagain_flag = 1;
+ goto out;
+ } else if (ret < 0)
+ goto send_error;
+ } else {
+ /* Don't starve people filling buffers */
+ schedule();
+ }
+
+ spin_lock(&ni->writequeue_lock);
+ e->offset += ret;
+ e->len -= ret;
+
+ if (e->len == 0 && e->users == 0) {
+ list_del(&e->list);
+ free_entry(e);
+ continue;
+ }
+ }
+ spin_unlock(&ni->writequeue_lock);
+ out:
+ return ret;
+
+ send_error:
+ log_print("Error sending to node %d %d", ni->nodeid, ret);
+ spin_lock(&ni->lock);
+ if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+ ni->assoc_id = 0;
+ spin_unlock(&ni->lock);
+ initiate_association(ni->nodeid);
+ } else
+ spin_unlock(&ni->lock);
+
+ return ret;
+}
+
+/* Try to send any messages that are pending */
+static void process_output_queue(void)
+{
+ struct list_head *list;
+ struct list_head *temp;
+
+ spin_lock_bh(&write_nodes_lock);
+ list_for_each_safe(list, temp, &write_nodes) {
+ struct nodeinfo *ni =
+ list_entry(list, struct nodeinfo, write_list);
+ clear_bit(NI_WRITE_PENDING, &ni->flags);
+ list_del(&ni->write_list);
+
+ spin_unlock_bh(&write_nodes_lock);
+
+ send_to_sock(ni);
+ spin_lock_bh(&write_nodes_lock);
+ }
+ spin_unlock_bh(&write_nodes_lock);
+}
+
+/* Called after we've had -EAGAIN and been woken up */
+static void refill_write_queue(void)
+{
+ int i;
+
+ for (i=1; i<=max_nodeid; i++) {
+ struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+
+ if (ni) {
+ if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+ spin_lock_bh(&write_nodes_lock);
+ list_add_tail(&ni->write_list, &write_nodes);
+ spin_unlock_bh(&write_nodes_lock);
+ }
+ }
+ }
+}
+
+static void clean_one_writequeue(struct nodeinfo *ni)
+{
+ struct list_head *list;
+ struct list_head *temp;
+
+ spin_lock(&ni->writequeue_lock);
+ list_for_each_safe(list, temp, &ni->writequeue) {
+ struct writequeue_entry *e =
+ list_entry(list, struct writequeue_entry, list);
+ list_del(&e->list);
+ free_entry(e);
+ }
+ spin_unlock(&ni->writequeue_lock);
+}
+
+static void clean_writequeues(void)
+{
+ int i;
+
+ for (i=1; i<=max_nodeid; i++) {
+ struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+ if (ni)
+ clean_one_writequeue(ni);
+ }
+}
+
+
+static void dealloc_nodeinfo(void)
+{
+ int i;
+
+ for (i=1; i<=max_nodeid; i++) {
+ struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+ if (ni) {
+ idr_remove(&nodeinfo_idr, i);
+ kfree(ni);
+ }
+ }
+}
+
+int dlm_lowcomms_close(int nodeid)
+{
+ struct nodeinfo *ni;
+
+ ni = nodeid2nodeinfo(nodeid, 0);
+ if (!ni)
+ return -1;
+
+ spin_lock(&ni->lock);
+ if (ni->assoc_id) {
+ ni->assoc_id = 0;
+ /* Don't send shutdown here, sctp will just queue it
+ till the node comes back up! */
+ }
+ spin_unlock(&ni->lock);
+
+ clean_one_writequeue(ni);
+ clear_bit(NI_INIT_PENDING, &ni->flags);
+ return 0;
+}
+
+static int write_list_empty(void)
+{
+ int status;
+
+ spin_lock_bh(&write_nodes_lock);
+ status = list_empty(&write_nodes);
+ spin_unlock_bh(&write_nodes_lock);
+
+ return status;
+}
+
+static int dlm_recvd(void *data)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ while (!kthread_should_stop()) {
+ int count = 0;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&lowcomms_recv_wait, &wait);
+ if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
+ schedule();
+ remove_wait_queue(&lowcomms_recv_wait, &wait);
+ set_current_state(TASK_RUNNING);
+
+ if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
+ int ret;
+
+ do {
+ ret = receive_from_sock();
+
+ /* Don't starve out everyone else */
+ if (++count >= MAX_RX_MSG_COUNT) {
+ schedule();
+ count = 0;
+ }
+ } while (!kthread_should_stop() && ret >=0);
+ }
+ schedule();
+ }
+
+ return 0;
+}
+
+static int dlm_sendd(void *data)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (write_list_empty())
+ schedule();
+ set_current_state(TASK_RUNNING);
+
+ if (sctp_con.eagain_flag) {
+ sctp_con.eagain_flag = 0;
+ refill_write_queue();
+ }
+ process_output_queue();
+ }
+
+ remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+ return 0;
+}
+
+static void daemons_stop(void)
+{
+ kthread_stop(recv_task);
+ kthread_stop(send_task);
+}
+
+static int daemons_start(void)
+{
+ struct task_struct *p;
+ int error;
+
+ p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+ p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+ if (IS_ERR(p)) {
+  error = PTR_ERR(p);
+  log_print("can't start dlm_recvd %d", error);
+  return error;
+ }
+ recv_task = p;
+
+ p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
+ if (IS_ERR(p)) {
+  error = PTR_ERR(p);
+  log_print("can't start dlm_sendd %d", error);
+  kthread_stop(recv_task);
+  return error;
+ }
+ send_task = p;
+
+ return 0;
+}
+
+/*
+ * This is quite likely to sleep...
+ */
+int dlm_lowcomms_start(void)
+{
+ int error;
+
+ error = init_sock();
+ if (error)
+ goto fail_sock;
+ error = daemons_start();
+ if (error)
+ goto fail_sock;
+ atomic_set(&accepting, 1);
+ return 0;
+
+ fail_sock:
+ close_connection();
+ return error;
+}
+
+/* Set all the activity flags to prevent any socket activity. */
+
+void dlm_lowcomms_stop(void)
+{
+ atomic_set(&accepting, 0);
+ sctp_con.flags = 0x7;
+ daemons_stop();
+ clean_writequeues();
+ close_connection();
+ dealloc_nodeinfo();
+ max_nodeid = 0;
+}
+
+int dlm_lowcomms_init(void)
+{
+ init_waitqueue_head(&lowcomms_recv_wait);
+ spin_lock_init(&write_nodes_lock);
+ INIT_LIST_HEAD(&write_nodes);
+ init_rwsem(&nodeinfo_lock);
+ return 0;
+}
+
+void dlm_lowcomms_exit(void)
+{
+ int i;
+
+ for (i = 0; i < dlm_local_count; i++)
+ kfree(dlm_local_addr[i]);
+ dlm_local_count = 0;
+ dlm_local_nodeid = 0;
+}
+
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..2d045e0daae1
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOWCOMMS_DOT_H__
+#define __LOWCOMMS_DOT_H__
+
+int dlm_lowcomms_init(void);
+void dlm_lowcomms_exit(void);
+int dlm_lowcomms_start(void);
+void dlm_lowcomms_stop(void);
+int dlm_lowcomms_close(int nodeid);
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
+void dlm_lowcomms_commit_buffer(void *mh);
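+
+/*
+ * Typical send path (an illustrative sketch only, mirroring how create_rcom()
+ * in rcom.c uses this interface; mb_len and the message layout are up to the
+ * caller):
+ *
+ *	char *mb;
+ *	void *mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
+ *	if (!mh)
+ *		return -ENOBUFS;
+ *	memset(mb, 0, mb_len);
+ *	... fill in a struct dlm_header / message at mb ...
+ *	dlm_lowcomms_commit_buffer(mh);
+ */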
+
+#endif /* __LOWCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LVB_TABLE_DOT_H__
+#define __LVB_TABLE_DOT_H__
+
+extern const int dlm_lvb_operations[8][8];
+
+#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "user.h"
+#include "memory.h"
+#include "lowcomms.h"
+#include "config.h"
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+#endif
+
+static int __init init_dlm(void)
+{
+ int error;
+
+ error = dlm_memory_init();
+ if (error)
+ goto out;
+
+ error = dlm_lockspace_init();
+ if (error)
+ goto out_mem;
+
+ error = dlm_config_init();
+ if (error)
+ goto out_lockspace;
+
+ error = dlm_register_debugfs();
+ if (error)
+ goto out_config;
+
+ error = dlm_lowcomms_init();
+ if (error)
+ goto out_debug;
+
+ error = dlm_user_init();
+ if (error)
+ goto out_lowcomms;
+
+ printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
+
+ return 0;
+
+ out_lowcomms:
+ dlm_lowcomms_exit();
+ out_debug:
+ dlm_unregister_debugfs();
+ out_config:
+ dlm_config_exit();
+ out_lockspace:
+ dlm_lockspace_exit();
+ out_mem:
+ dlm_memory_exit();
+ out:
+ return error;
+}
+
+static void __exit exit_dlm(void)
+{
+ dlm_user_exit();
+ dlm_lowcomms_exit();
+ dlm_config_exit();
+ dlm_memory_exit();
+ dlm_lockspace_exit();
+ dlm_unregister_debugfs();
+}
+
+module_init(init_dlm);
+module_exit(exit_dlm);
+
+MODULE_DESCRIPTION("Distributed Lock Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL_GPL(dlm_new_lockspace);
+EXPORT_SYMBOL_GPL(dlm_release_lockspace);
+EXPORT_SYMBOL_GPL(dlm_lock);
+EXPORT_SYMBOL_GPL(dlm_unlock);
+
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "recover.h"
+#include "rcom.h"
+#include "config.h"
+
+/*
+ * Following called by dlm_recoverd thread
+ */
+
+static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
+{
+ struct dlm_member *memb = NULL;
+ struct list_head *tmp;
+ struct list_head *newlist = &new->list;
+ struct list_head *head = &ls->ls_nodes;
+
+ list_for_each(tmp, head) {
+ memb = list_entry(tmp, struct dlm_member, list);
+ if (new->nodeid < memb->nodeid)
+ break;
+ }
+
+ if (!memb)
+ list_add_tail(newlist, head);
+ else {
+ /* FIXME: can use list macro here */
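+ /* the open-coded insertion below is equivalent to
+    list_add_tail(newlist, tmp): add newlist just before tmp, which is
+    either the first member with a larger nodeid or the list head if
+    there is none */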
+ newlist->prev = tmp->prev;
+ newlist->next = tmp;
+ tmp->prev->next = newlist;
+ tmp->prev = newlist;
+ }
+}
+
+static int dlm_add_member(struct dlm_ls *ls, int nodeid)
+{
+ struct dlm_member *memb;
+ int w;
+
+ memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+ if (!memb)
+ return -ENOMEM;
+
+ w = dlm_node_weight(ls->ls_name, nodeid);
+ if (w < 0) {
+  /* don't leak memb if the weight lookup fails */
+  kfree(memb);
+  return w;
+ }
+
+ memb->nodeid = nodeid;
+ memb->weight = w;
+ add_ordered_member(ls, memb);
+ ls->ls_num_nodes++;
+ return 0;
+}
+
+static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
+{
+ list_move(&memb->list, &ls->ls_nodes_gone);
+ ls->ls_num_nodes--;
+}
+
+static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+ struct dlm_member *memb;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->nodeid == nodeid)
+ return 1;
+ }
+ return 0;
+}
+
+int dlm_is_removed(struct dlm_ls *ls, int nodeid)
+{
+ struct dlm_member *memb;
+
+ list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+ if (memb->nodeid == nodeid)
+ return 1;
+ }
+ return 0;
+}
+
+static void clear_memb_list(struct list_head *head)
+{
+ struct dlm_member *memb;
+
+ while (!list_empty(head)) {
+ memb = list_entry(head->next, struct dlm_member, list);
+ list_del(&memb->list);
+ kfree(memb);
+ }
+}
+
+void dlm_clear_members(struct dlm_ls *ls)
+{
+ clear_memb_list(&ls->ls_nodes);
+ ls->ls_num_nodes = 0;
+}
+
+void dlm_clear_members_gone(struct dlm_ls *ls)
+{
+ clear_memb_list(&ls->ls_nodes_gone);
+}
+
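+/* Build ls_node_array with each nodeid repeated 'weight' times (or once per
+   node if every weight is 0), presumably so an index into the array picks
+   nodes in proportion to their weight. Illustrative example: nodes 1, 2, 3
+   with weights 2, 1, 1 give total 4 and array [1, 1, 2, 3]. */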
+static void make_member_array(struct dlm_ls *ls)
+{
+ struct dlm_member *memb;
+ int i, w, x = 0, total = 0, all_zero = 0, *array;
+
+ kfree(ls->ls_node_array);
+ ls->ls_node_array = NULL;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (memb->weight)
+ total += memb->weight;
+ }
+
+ /* all nodes revert to weight of 1 if all have weight 0 */
+
+ if (!total) {
+ total = ls->ls_num_nodes;
+ all_zero = 1;
+ }
+
+ ls->ls_total_weight = total;
+
+ array = kmalloc(sizeof(int) * total, GFP_KERNEL);
+ if (!array)
+ return;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (!all_zero && !memb->weight)
+ continue;
+
+ if (all_zero)
+ w = 1;
+ else
+ w = memb->weight;
+
+ DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
+
+ for (i = 0; i < w; i++)
+ array[x++] = memb->nodeid;
+ }
+
+ ls->ls_node_array = array;
+}
+
+/* send a status request to all members just to establish comms connections */
+
+static int ping_members(struct dlm_ls *ls)
+{
+ struct dlm_member *memb;
+ int error = 0;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ error = dlm_recovery_stopped(ls);
+ if (error)
+ break;
+ error = dlm_rcom_status(ls, memb->nodeid);
+ if (error)
+ break;
+ }
+ if (error)
+ log_debug(ls, "ping_members aborted %d last nodeid %d",
+ error, ls->ls_recover_nodeid);
+ return error;
+}
+
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
+{
+ struct dlm_member *memb, *safe;
+ int i, error, found, pos = 0, neg = 0, low = -1;
+
+ /* move departed members from ls_nodes to ls_nodes_gone */
+
+ list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
+ found = 0;
+ for (i = 0; i < rv->node_count; i++) {
+ if (memb->nodeid == rv->nodeids[i]) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ neg++;
+ dlm_remove_member(ls, memb);
+ log_debug(ls, "remove member %d", memb->nodeid);
+ }
+ }
+
+ /* add new members to ls_nodes */
+
+ for (i = 0; i < rv->node_count; i++) {
+ if (dlm_is_member(ls, rv->nodeids[i]))
+ continue;
+ dlm_add_member(ls, rv->nodeids[i]);
+ pos++;
+ log_debug(ls, "add member %d", rv->nodeids[i]);
+ }
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ if (low == -1 || memb->nodeid < low)
+ low = memb->nodeid;
+ }
+ ls->ls_low_nodeid = low;
+
+ make_member_array(ls);
+ dlm_set_recover_status(ls, DLM_RS_NODES);
+ *neg_out = neg;
+
+ error = ping_members(ls);
+ if (error)
+ goto out;
+
+ error = dlm_recover_members_wait(ls);
+ out:
+ log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
+ return error;
+}
+
+/*
+ * Following called from lockspace.c
+ */
+
+int dlm_ls_stop(struct dlm_ls *ls)
+{
+ int new;
+
+ /*
+ * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
+ * dlm_recovery_stopped()) and prevents any new locks from being
+ * processed (see RUNNING, dlm_locking_stopped()).
+ */
+
+ spin_lock(&ls->ls_recover_lock);
+ set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+ new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
+ ls->ls_recover_seq++;
+ spin_unlock(&ls->ls_recover_lock);
+
+ /*
+ * This in_recovery lock does two things:
+ *
+ * 1) Keeps this function from returning until all threads are out
+ * of locking routines and locking is truly stopped.
+ * 2) Keeps any new requests from being processed until it's unlocked
+ * when recovery is complete.
+ */
+
+ if (new)
+ down_write(&ls->ls_in_recovery);
+
+ /*
+ * The recoverd suspend/resume makes sure that dlm_recoverd (if
+ * running) has noticed the clearing of RUNNING above and quit
+ * processing the previous recovery. This will be true for all nodes
+ * before any nodes start the new recovery.
+ */
+
+ dlm_recoverd_suspend(ls);
+ ls->ls_recover_status = 0;
+ dlm_recoverd_resume(ls);
+ return 0;
+}
+
+int dlm_ls_start(struct dlm_ls *ls)
+{
+ struct dlm_recover *rv = NULL, *rv_old;
+ int *ids = NULL;
+ int error, count;
+
+ rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
+ if (!rv)
+ return -ENOMEM;
+
+ error = count = dlm_nodeid_list(ls->ls_name, &ids);
+ if (error <= 0)
+ goto fail;
+
+ spin_lock(&ls->ls_recover_lock);
+
+ /* the lockspace needs to be stopped before it can be started */
+
+ if (!dlm_locking_stopped(ls)) {
+ spin_unlock(&ls->ls_recover_lock);
+ log_error(ls, "start ignored: lockspace running");
+ error = -EINVAL;
+ goto fail;
+ }
+
+ rv->nodeids = ids;
+ rv->node_count = count;
+ rv->seq = ++ls->ls_recover_seq;
+ rv_old = ls->ls_recover_args;
+ ls->ls_recover_args = rv;
+ spin_unlock(&ls->ls_recover_lock);
+
+ if (rv_old) {
+ kfree(rv_old->nodeids);
+ kfree(rv_old);
+ }
+
+ dlm_recoverd_kick(ls);
+ return 0;
+
+ fail:
+ kfree(rv);
+ kfree(ids);
+ return error;
+}
+
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMBER_DOT_H__
+#define __MEMBER_DOT_H__
+
+int dlm_ls_stop(struct dlm_ls *ls);
+int dlm_ls_start(struct dlm_ls *ls);
+void dlm_clear_members(struct dlm_ls *ls);
+void dlm_clear_members_gone(struct dlm_ls *ls);
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
+int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+
+#endif /* __MEMBER_DOT_H__ */
+
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "config.h"
+#include "memory.h"
+
+static kmem_cache_t *lkb_cache;
+
+
+int dlm_memory_init(void)
+{
+ int ret = 0;
+
+ lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
+ __alignof__(struct dlm_lkb), 0, NULL, NULL);
+ if (!lkb_cache)
+ ret = -ENOMEM;
+ return ret;
+}
+
+void dlm_memory_exit(void)
+{
+ if (lkb_cache)
+ kmem_cache_destroy(lkb_cache);
+}
+
+char *allocate_lvb(struct dlm_ls *ls)
+{
+ char *p;
+
+ p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
+ if (p)
+ memset(p, 0, ls->ls_lvblen);
+ return p;
+}
+
+void free_lvb(char *p)
+{
+ kfree(p);
+}
+
+/* FIXME: have some minimal space built-in to rsb for the name and
+ kmalloc a separate name if needed, like dentries are done */
+
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+{
+ struct dlm_rsb *r;
+
+ DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
+
+ r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
+ if (r)
+ memset(r, 0, sizeof(*r) + namelen);
+ return r;
+}
+
+void free_rsb(struct dlm_rsb *r)
+{
+ if (r->res_lvbptr)
+ free_lvb(r->res_lvbptr);
+ kfree(r);
+}
+
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb;
+
+ lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
+ if (lkb)
+ memset(lkb, 0, sizeof(*lkb));
+ return lkb;
+}
+
+void free_lkb(struct dlm_lkb *lkb)
+{
+ if (lkb->lkb_flags & DLM_IFL_USER) {
+ struct dlm_user_args *ua;
+ ua = (struct dlm_user_args *)lkb->lkb_astparam;
+ if (ua) {
+ if (ua->lksb.sb_lvbptr)
+ kfree(ua->lksb.sb_lvbptr);
+ kfree(ua);
+ }
+ }
+ kmem_cache_free(lkb_cache, lkb);
+}
+
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
+{
+ struct dlm_direntry *de;
+
+ DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
+ printk("namelen = %d\n", namelen););
+
+ de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
+ if (de)
+ memset(de, 0, sizeof(*de) + namelen);
+ return de;
+}
+
+void free_direntry(struct dlm_direntry *de)
+{
+ kfree(de);
+}
+
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMORY_DOT_H__
+#define __MEMORY_DOT_H__
+
+int dlm_memory_init(void);
+void dlm_memory_exit(void);
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
+void free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
+void free_lkb(struct dlm_lkb *l);
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
+void free_direntry(struct dlm_direntry *de);
+char *allocate_lvb(struct dlm_ls *ls);
+void free_lvb(char *l);
+
+#endif /* __MEMORY_DOT_H__ */
+
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer.
+ *
+ * Its purpose is to take buffers of data from the "real" comms layer,
+ * split them up into complete messages and pass them to the interested
+ * part of the locking mechanism.
+ *
+ * It also takes messages from the locking layer, formats them
+ * into packets and sends them to the comms layer.
+ */
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "rcom.h"
+#include "lock.h"
+#include "midcomms.h"
+
+
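+/* Copy len bytes out of the receive page treated as a circular buffer of
+   size 'limit', starting at 'offset' and wrapping back to the start of the
+   page if the data crosses the end. */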
+static void copy_from_cb(void *dst, const void *base, unsigned offset,
+ unsigned len, unsigned limit)
+{
+ unsigned copy = len;
+
+ if ((copy + offset) > limit)
+ copy = limit - offset;
+ memcpy(dst, base + offset, copy);
+ len -= copy;
+ if (len)
+ memcpy(dst + copy, base, len);
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ *
+ * Only complete messages are processed here, any "spare" bytes from
+ * the end of a buffer are saved and tacked onto the front of the next
+ * message that comes in. I doubt this will happen very often but we
+ * need to be able to cope with it and I don't want the task to be waiting
+ * for packets to come in when there is useful work to be done.
+ */
+
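+/* Returns the number of bytes consumed from the circular buffer (the caller
+   advances its read position by this amount), or a negative errno for a
+   malformed message. */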
+int dlm_process_incoming_buffer(int nodeid, const void *base,
+ unsigned offset, unsigned len, unsigned limit)
+{
+ unsigned char __tmp[DLM_INBUF_LEN];
+ struct dlm_header *msg = (struct dlm_header *) __tmp;
+ int ret = 0;
+ int err = 0;
+ uint16_t msglen;
+ uint32_t lockspace;
+
+ while (len > sizeof(struct dlm_header)) {
+
+ /* Copy just the header to check the total length. The
+ message may wrap around the end of the buffer back to the
+ start, so we need to use a temp buffer and copy_from_cb. */
+
+ copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
+ limit);
+
+ msglen = le16_to_cpu(msg->h_length);
+ lockspace = msg->h_lockspace;
+
+ err = -EINVAL;
+ if (msglen < sizeof(struct dlm_header))
+ break;
+ err = -E2BIG;
+ if (msglen > dlm_config.buffer_size) {
+ log_print("message size %d from %d too big, buf len %d",
+ msglen, nodeid, len);
+ break;
+ }
+ err = 0;
+
+ /* If only part of the full message is contained in this
+ buffer, then do nothing and wait for lowcomms to call
+ us again later with more data. We return 0 meaning
+ we've consumed none of the input buffer. */
+
+ if (msglen > len)
+ break;
+
+ /* Allocate a larger temp buffer if the full message won't fit
+ in the buffer on the stack (which should work for most
+ ordinary messages). */
+
+ if (msglen > sizeof(__tmp) &&
+ msg == (struct dlm_header *) __tmp) {
+ msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+ if (msg == NULL)
+ return ret;
+ }
+
+ copy_from_cb(msg, base, offset, msglen, limit);
+
+ BUG_ON(lockspace != msg->h_lockspace);
+
+ ret += msglen;
+ offset += msglen;
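+  /* limit is the size of the receive buffer (a power of two:
+     PAGE_CACHE_SIZE in the lowcomms caller), so masking wraps offset
+     back to the start of the circular buffer */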
+ offset &= (limit - 1);
+ len -= msglen;
+
+ switch (msg->h_cmd) {
+ case DLM_MSG:
+ dlm_receive_message(msg, nodeid, 0);
+ break;
+
+ case DLM_RCOM:
+ dlm_receive_rcom(msg, nodeid);
+ break;
+
+ default:
+ log_print("unknown msg type %x from %u: %u %u %u %u",
+ msg->h_cmd, nodeid, msglen, len, offset, ret);
+ }
+ }
+
+ if (msg != (struct dlm_header *) __tmp)
+ kfree(msg);
+
+ return err ? err : ret;
+}
+
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MIDCOMMS_DOT_H__
+#define __MIDCOMMS_DOT_H__
+
+int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
+ unsigned len, unsigned limit);
+
+#endif /* __MIDCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "rcom.h"
+#include "recover.h"
+#include "dir.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+
+
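+/* create_rcom()/send_rcom() below wrap dlm_lowcomms_get_buffer() and
+   dlm_lowcomms_commit_buffer(): a recovery message is built directly in the
+   lowcomms write queue page and committed once it has been filled in. */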
+static int rcom_response(struct dlm_ls *ls)
+{
+ return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
+}
+
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+ struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ char *mb;
+ int mb_len = sizeof(struct dlm_rcom) + len;
+
+ mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
+ if (!mh) {
+ log_print("create_rcom to %d type %d len %d ENOBUFS",
+ to_nodeid, type, len);
+ return -ENOBUFS;
+ }
+ memset(mb, 0, mb_len);
+
+ rc = (struct dlm_rcom *) mb;
+
+ rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.h_lockspace = ls->ls_global_id;
+ rc->rc_header.h_nodeid = dlm_our_nodeid();
+ rc->rc_header.h_length = mb_len;
+ rc->rc_header.h_cmd = DLM_RCOM;
+
+ rc->rc_type = type;
+
+ *mh_ret = mh;
+ *rc_ret = rc;
+ return 0;
+}
+
+static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
+ struct dlm_rcom *rc)
+{
+ dlm_rcom_out(rc);
+ dlm_lowcomms_commit_buffer(mh);
+}
+
+/* When replying to a status request, a node also sends back its
+ configuration values. The requesting node then checks that the remote
+ node is configured the same way as itself. */
+
+static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
+{
+ rf->rf_lvblen = ls->ls_lvblen;
+ rf->rf_lsflags = ls->ls_exflags;
+}
+
+static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+{
+ if (rf->rf_lvblen != ls->ls_lvblen ||
+ rf->rf_lsflags != ls->ls_exflags) {
+ log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
+ ls->ls_lvblen, ls->ls_exflags,
+ nodeid, rf->rf_lvblen, rf->rf_lsflags);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error = 0;
+
+ memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+ ls->ls_recover_nodeid = nodeid;
+
+ if (nodeid == dlm_our_nodeid()) {
+ rc = (struct dlm_rcom *) ls->ls_recover_buf;
+ rc->rc_result = dlm_recover_status(ls);
+ goto out;
+ }
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
+ if (error)
+ goto out;
+ rc->rc_id = ++ls->ls_rcom_seq;
+
+ send_rcom(ls, mh, rc);
+
+ error = dlm_wait_function(ls, &rcom_response);
+ clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+ if (error)
+ goto out;
+
+ rc = (struct dlm_rcom *) ls->ls_recover_buf;
+
+ if (rc->rc_result == -ESRCH) {
+ /* we pretend the remote lockspace exists with 0 status */
+ log_debug(ls, "remote node %d not ready", nodeid);
+ rc->rc_result = 0;
+ } else
+ error = check_config(ls, (struct rcom_config *) rc->rc_buf,
+ nodeid);
+ /* the caller looks at rc_result for the remote recovery status */
+ out:
+ return error;
+}
+
+static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error, nodeid = rc_in->rc_header.h_nodeid;
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
+ sizeof(struct rcom_config), &rc, &mh);
+ if (error)
+ return;
+ rc->rc_id = rc_in->rc_id;
+ rc->rc_result = dlm_recover_status(ls);
+ make_config(ls, (struct rcom_config *) rc->rc_buf);
+
+ send_rcom(ls, mh, rc);
+}
+
+static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ if (rc_in->rc_id != ls->ls_rcom_seq) {
+ log_debug(ls, "reject old reply %d got %llx wanted %llx",
+ rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq);
+ return;
+ }
+ memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+ set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+ wake_up(&ls->ls_wait_general);
+}
+
+static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ receive_sync_reply(ls, rc_in);
+}
+
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error = 0, len = sizeof(struct dlm_rcom);
+
+ memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+ ls->ls_recover_nodeid = nodeid;
+
+ if (nodeid == dlm_our_nodeid()) {
+ dlm_copy_master_names(ls, last_name, last_len,
+ ls->ls_recover_buf + len,
+ dlm_config.buffer_size - len, nodeid);
+ goto out;
+ }
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
+ if (error)
+ goto out;
+ memcpy(rc->rc_buf, last_name, last_len);
+ rc->rc_id = ++ls->ls_rcom_seq;
+
+ send_rcom(ls, mh, rc);
+
+ error = dlm_wait_function(ls, &rcom_response);
+ clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+ out:
+ return error;
+}
+
+static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error, inlen, outlen;
+ int nodeid = rc_in->rc_header.h_nodeid;
+ uint32_t status = dlm_recover_status(ls);
+
+ /*
+ * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
+ * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
+ * It could only happen in rare cases where we get a late NAMES
+ * message from a previous instance of recovery.
+ */
+
+ if (!(status & DLM_RS_NODES)) {
+ log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
+ return;
+ }
+
+ nodeid = rc_in->rc_header.h_nodeid;
+ inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+ outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
+ if (error)
+ return;
+ rc->rc_id = rc_in->rc_id;
+
+ dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
+ nodeid);
+ send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ receive_sync_reply(ls, rc_in);
+}
+
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ struct dlm_ls *ls = r->res_ls;
+ int error;
+
+ error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+ &rc, &mh);
+ if (error)
+ goto out;
+ memcpy(rc->rc_buf, r->res_name, r->res_length);
+ rc->rc_id = (unsigned long) r;
+
+ send_rcom(ls, mh, rc);
+ out:
+ return error;
+}
+
+static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
+ int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
+ if (error)
+ return;
+
+ error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
+ if (error)
+ ret_nodeid = error;
+ rc->rc_result = ret_nodeid;
+ rc->rc_id = rc_in->rc_id;
+
+ send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ dlm_recover_master_reply(ls, rc_in);
+}
+
+static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
+ struct rcom_lock *rl)
+{
+ memset(rl, 0, sizeof(*rl));
+
+ rl->rl_ownpid = lkb->lkb_ownpid;
+ rl->rl_lkid = lkb->lkb_id;
+ rl->rl_exflags = lkb->lkb_exflags;
+ rl->rl_flags = lkb->lkb_flags;
+ rl->rl_lvbseq = lkb->lkb_lvbseq;
+ rl->rl_rqmode = lkb->lkb_rqmode;
+ rl->rl_grmode = lkb->lkb_grmode;
+ rl->rl_status = lkb->lkb_status;
+ rl->rl_wait_type = lkb->lkb_wait_type;
+
+ if (lkb->lkb_bastaddr)
+ rl->rl_asts |= AST_BAST;
+ if (lkb->lkb_astaddr)
+ rl->rl_asts |= AST_COMP;
+
+ rl->rl_namelen = r->res_length;
+ memcpy(rl->rl_name, r->res_name, r->res_length);
+
+ /* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
+ If so, receive_rcom_lock_args() won't take this copy. */
+
+ if (lkb->lkb_lvbptr)
+ memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+}
+
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+ struct dlm_ls *ls = r->res_ls;
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ struct rcom_lock *rl;
+ int error, len = sizeof(struct rcom_lock);
+
+ if (lkb->lkb_lvbptr)
+ len += ls->ls_lvblen;
+
+ error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
+ if (error)
+ goto out;
+
+ rl = (struct rcom_lock *) rc->rc_buf;
+ pack_rcom_lock(r, lkb, rl);
+ rc->rc_id = (unsigned long) r;
+
+ send_rcom(ls, mh, rc);
+ out:
+ return error;
+}
+
+static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ int error, nodeid = rc_in->rc_header.h_nodeid;
+
+ dlm_recover_master_copy(ls, rc_in);
+
+ error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
+ sizeof(struct rcom_lock), &rc, &mh);
+ if (error)
+ return;
+
+ /* We send back the same rcom_lock struct we received, but
+ dlm_recover_master_copy() has filled in rl_remid and rl_result */
+
+ memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+ rc->rc_id = rc_in->rc_id;
+
+ send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+ uint32_t status = dlm_recover_status(ls);
+
+ if (!(status & DLM_RS_DIR)) {
+ log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
+ rc_in->rc_header.h_nodeid);
+ return;
+ }
+
+ dlm_recover_process_copy(ls, rc_in);
+}
+
+static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+{
+ struct dlm_rcom *rc;
+ struct dlm_mhandle *mh;
+ char *mb;
+ int mb_len = sizeof(struct dlm_rcom);
+
+ mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
+ if (!mh)
+ return -ENOBUFS;
+ memset(mb, 0, mb_len);
+
+ rc = (struct dlm_rcom *) mb;
+
+ rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+ rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+ rc->rc_header.h_nodeid = dlm_our_nodeid();
+ rc->rc_header.h_length = mb_len;
+ rc->rc_header.h_cmd = DLM_RCOM;
+
+ rc->rc_type = DLM_RCOM_STATUS_REPLY;
+ rc->rc_id = rc_in->rc_id;
+ rc->rc_result = -ESRCH;
+
+ dlm_rcom_out(rc);
+ dlm_lowcomms_commit_buffer(mh);
+
+ return 0;
+}
+
+/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
+   recovery-only comms are received through here. */
+
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
+{
+ struct dlm_rcom *rc = (struct dlm_rcom *) hd;
+ struct dlm_ls *ls;
+
+ dlm_rcom_in(rc);
+
+ /* If the lockspace doesn't exist then still send a status message
+ back; it's possible that it just doesn't have its global_id yet. */
+
+ ls = dlm_find_lockspace_global(hd->h_lockspace);
+ if (!ls) {
+ log_print("lockspace %x from %d not found",
+ hd->h_lockspace, nodeid);
+ send_ls_not_ready(nodeid, rc);
+ return;
+ }
+
+ if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
+ log_error(ls, "ignoring recovery message %x from %d",
+ rc->rc_type, nodeid);
+ goto out;
+ }
+
+ if (nodeid != rc->rc_header.h_nodeid) {
+ log_error(ls, "bad rcom nodeid %d from %d",
+ rc->rc_header.h_nodeid, nodeid);
+ goto out;
+ }
+
+ switch (rc->rc_type) {
+ case DLM_RCOM_STATUS:
+ receive_rcom_status(ls, rc);
+ break;
+
+ case DLM_RCOM_NAMES:
+ receive_rcom_names(ls, rc);
+ break;
+
+ case DLM_RCOM_LOOKUP:
+ receive_rcom_lookup(ls, rc);
+ break;
+
+ case DLM_RCOM_LOCK:
+ receive_rcom_lock(ls, rc);
+ break;
+
+ case DLM_RCOM_STATUS_REPLY:
+ receive_rcom_status_reply(ls, rc);
+ break;
+
+ case DLM_RCOM_NAMES_REPLY:
+ receive_rcom_names_reply(ls, rc);
+ break;
+
+ case DLM_RCOM_LOOKUP_REPLY:
+ receive_rcom_lookup_reply(ls, rc);
+ break;
+
+ case DLM_RCOM_LOCK_REPLY:
+ receive_rcom_lock_reply(ls, rc);
+ break;
+
+ default:
+ DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+ }
+ out:
+ dlm_put_lockspace(ls);
+}
+
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RCOM_DOT_H__
+#define __RCOM_DOT_H__
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
+
+#endif
+
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "dir.h"
+#include "config.h"
+#include "ast.h"
+#include "memory.h"
+#include "rcom.h"
+#include "lock.h"
+#include "lowcomms.h"
+#include "member.h"
+#include "recover.h"
+
+
+/*
+ * Recovery waiting routines: these functions wait for a particular reply from
+ * a remote node, or for the remote node to report a certain status. They need
+ * to abort if the lockspace is stopped, indicating a node has failed (perhaps
+ * the one being waited for).
+ */
+
+/*
+ * Wait until the given function returns non-zero or the lockspace is stopped
+ * (LSFL_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
+ * function thinks it could have completed the waited-on task, it should
+ * up ls_wait_general to get an immediate response rather than waiting for the
+ * timer to detect the result. A timer wakes us up periodically while waiting
+ * to see if we should abort due to a node failure. This should only be called
+ * by the dlm_recoverd thread.
+ */
+
+static void dlm_wait_timer_fn(unsigned long data)
+{
+ struct dlm_ls *ls = (struct dlm_ls *) data;
+ mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
+ wake_up(&ls->ls_wait_general);
+}
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
+{
+ int error = 0;
+
+ init_timer(&ls->ls_timer);
+ ls->ls_timer.function = dlm_wait_timer_fn;
+ ls->ls_timer.data = (long) ls;
+ ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
+ add_timer(&ls->ls_timer);
+
+ wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
+ del_timer_sync(&ls->ls_timer);
+
+ if (dlm_recovery_stopped(ls)) {
+ log_debug(ls, "dlm_wait_function aborted");
+ error = -EINTR;
+ }
+ return error;
+}
+
+/*
+ * An efficient way for all nodes to wait for all others to have a certain
+ * status. The node with the lowest nodeid polls all the others for their
+ * status (wait_status_all) and all the others poll the node with the low id
+ * for its accumulated result (wait_status_low). When all nodes have set
+ * status flag X, then status flag X_ALL will be set on the low nodeid.
+ */
+
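+/* Illustrative example: with nodes 1, 2 and 3 recovering, node 1 (the lowest
+   nodeid) polls every member with dlm_rcom_status() until each reports the
+   status bit (e.g. DLM_RS_NODES), then sets the corresponding _ALL bit
+   (status << 1); nodes 2 and 3 poll node 1 until they see that _ALL bit. */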
+uint32_t dlm_recover_status(struct dlm_ls *ls)
+{
+ uint32_t status;
+ spin_lock(&ls->ls_recover_lock);
+ status = ls->ls_recover_status;
+ spin_unlock(&ls->ls_recover_lock);
+ return status;
+}
+
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+ spin_lock(&ls->ls_recover_lock);
+ ls->ls_recover_status |= status;
+ spin_unlock(&ls->ls_recover_lock);
+}
+
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
+{
+ struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+ struct dlm_member *memb;
+ int error = 0, delay;
+
+ list_for_each_entry(memb, &ls->ls_nodes, list) {
+ delay = 0;
+ for (;;) {
+ if (dlm_recovery_stopped(ls)) {
+ error = -EINTR;
+ goto out;
+ }
+
+ error = dlm_rcom_status(ls, memb->nodeid);
+ if (error)
+ goto out;
+
+ if (rc->rc_result & wait_status)
+ break;
+ if (delay < 1000)
+ delay += 20;
+ msleep(delay);
+ }
+ }
+ out:
+ return error;
+}
+
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
+{
+ struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+ int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
+
+ for (;;) {
+ if (dlm_recovery_stopped(ls)) {
+ error = -EINTR;
+ goto out;
+ }
+
+ error = dlm_rcom_status(ls, nodeid);
+ if (error)
+ break;
+
+ if (rc->rc_result & wait_status)
+ break;
+ if (delay < 1000)
+ delay += 20;
+ msleep(delay);
+ }
+ out:
+ return error;
+}
+
+static int wait_status(struct dlm_ls *ls, uint32_t status)
+{
+ uint32_t status_all = status << 1;
+ int error;
+
+ if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+ error = wait_status_all(ls, status);
+ if (!error)
+ dlm_set_recover_status(ls, status_all);
+ } else
+ error = wait_status_low(ls, status_all);
+
+ return error;
+}
+
+int dlm_recover_members_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_NODES);
+}
+
+int dlm_recover_directory_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_DIR);
+}
+
+int dlm_recover_locks_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_LOCKS);
+}
+
+int dlm_recover_done_wait(struct dlm_ls *ls)
+{
+ return wait_status(ls, DLM_RS_DONE);
+}
+
+/*
+ * The recover_list contains all the rsb's for which we've requested the new
+ * master nodeid. As replies are returned from the resource directories the
+ * rsb's are removed from the list. When the list is empty we're done.
+ *
+ * The recover_list is later similarly used for all rsb's for which we've sent
+ * new lkb's and need to receive new corresponding lkid's.
+ *
+ * We use the address of the rsb struct as a simple local identifier for the
+ * rsb so we can match an rcom reply with the rsb it was sent for.
+ */
+
+static int recover_list_empty(struct dlm_ls *ls)
+{
+ int empty;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ empty = list_empty(&ls->ls_recover_list);
+ spin_unlock(&ls->ls_recover_list_lock);
+
+ return empty;
+}
+
+static void recover_list_add(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ if (list_empty(&r->res_recover_list)) {
+ list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
+ ls->ls_recover_list_count++;
+ dlm_hold_rsb(r);
+ }
+ spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static void recover_list_del(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ list_del_init(&r->res_recover_list);
+ ls->ls_recover_list_count--;
+ spin_unlock(&ls->ls_recover_list_lock);
+
+ dlm_put_rsb(r);
+}
+
+static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
+{
+ struct dlm_rsb *r = NULL;
+
+ spin_lock(&ls->ls_recover_list_lock);
+
+ list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
+ if (id == (unsigned long) r)
+ goto out;
+ }
+ r = NULL;
+ out:
+ spin_unlock(&ls->ls_recover_list_lock);
+ return r;
+}
+
+static void recover_list_clear(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r, *s;
+
+ spin_lock(&ls->ls_recover_list_lock);
+ list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
+ list_del_init(&r->res_recover_list);
+ dlm_put_rsb(r);
+ ls->ls_recover_list_count--;
+ }
+
+ if (ls->ls_recover_list_count != 0) {
+ log_error(ls, "warning: recover_list_count %d",
+ ls->ls_recover_list_count);
+ ls->ls_recover_list_count = 0;
+ }
+ spin_unlock(&ls->ls_recover_list_lock);
+}
+
+
+/* Master recovery: find new master node for rsb's that were
+ mastered on nodes that have been removed.
+
+ dlm_recover_masters
+ recover_master
+ dlm_send_rcom_lookup -> receive_rcom_lookup
+ dlm_dir_lookup
+ receive_rcom_lookup_reply <-
+ dlm_recover_master_reply
+ set_new_master
+ set_master_lkbs
+ set_lock_master
+*/
+
+/*
+ * Set the lock master for all LKBs in a lock queue
+ * If we are the new master of the rsb, we may have received new
+ * MSTCPY locks from other nodes already which we need to ignore
+ * when setting the new nodeid.
+ */
+
+static void set_lock_master(struct list_head *queue, int nodeid)
+{
+ struct dlm_lkb *lkb;
+
+ list_for_each_entry(lkb, queue, lkb_statequeue)
+ if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
+ lkb->lkb_nodeid = nodeid;
+}
+
+static void set_master_lkbs(struct dlm_rsb *r)
+{
+ set_lock_master(&r->res_grantqueue, r->res_nodeid);
+ set_lock_master(&r->res_convertqueue, r->res_nodeid);
+ set_lock_master(&r->res_waitqueue, r->res_nodeid);
+}
+
+/*
+ * Propagate the new master nodeid to locks
+ * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
+ * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
+ * rsb's to consider.
+ */
+
+static void set_new_master(struct dlm_rsb *r, int nodeid)
+{
+ lock_rsb(r);
+ r->res_nodeid = nodeid;
+ set_master_lkbs(r);
+ rsb_set_flag(r, RSB_NEW_MASTER);
+ rsb_set_flag(r, RSB_NEW_MASTER2);
+ unlock_rsb(r);
+}
+
+/*
+ * We do async lookups on rsb's that need new masters. The rsb's
+ * waiting for a lookup reply are kept on the recover_list.
+ */
+
+static int recover_master(struct dlm_rsb *r)
+{
+ struct dlm_ls *ls = r->res_ls;
+ int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+ dir_nodeid = dlm_dir_nodeid(r);
+
+ if (dir_nodeid == our_nodeid) {
+ error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+ r->res_length, &ret_nodeid);
+ if (error)
+ log_error(ls, "recover dir lookup error %d", error);
+
+ if (ret_nodeid == our_nodeid)
+ ret_nodeid = 0;
+ set_new_master(r, ret_nodeid);
+ } else {
+ recover_list_add(r);
+ error = dlm_send_rcom_lookup(r, dir_nodeid);
+ }
+
+ return error;
+}
+
+/*
+ * When not using a directory, most resource names will hash to a new static
+ * master nodeid and the resource will need to be remastered.
+ */
+
+static int recover_master_static(struct dlm_rsb *r)
+{
+ int master = dlm_dir_nodeid(r);
+
+ if (master == dlm_our_nodeid())
+ master = 0;
+
+ if (r->res_nodeid != master) {
+ if (is_master(r))
+ dlm_purge_mstcpy_locks(r);
+ set_new_master(r, master);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Go through local root resources and for each rsb whose master has
+ * departed, get the new master nodeid from the directory. The dir will
+ * assign mastery to the first node to look up the new master. That means
+ * we'll discover in this lookup if we're the new master of any rsb's.
+ *
+ * We fire off all the dir lookup requests individually and asynchronously to
+ * the correct dir node.
+ */
+
+int dlm_recover_masters(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int error = 0, count = 0;
+
+ log_debug(ls, "dlm_recover_masters");
+
+ down_read(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ if (dlm_recovery_stopped(ls)) {
+ up_read(&ls->ls_root_sem);
+ error = -EINTR;
+ goto out;
+ }
+
+ if (dlm_no_directory(ls))
+ count += recover_master_static(r);
+ else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
+ recover_master(r);
+ count++;
+ }
+
+ schedule();
+ }
+ up_read(&ls->ls_root_sem);
+
+ log_debug(ls, "dlm_recover_masters %d resources", count);
+
+ error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+ if (error)
+ recover_list_clear(ls);
+ return error;
+}
+
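+/* Handle the reply to dlm_send_rcom_lookup(): the directory node tells us
+   the new master nodeid for the rsb identified by rc_id. */
+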
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+ struct dlm_rsb *r;
+ int nodeid;
+
+ r = recover_list_find(ls, rc->rc_id);
+ if (!r) {
+ log_error(ls, "dlm_recover_master_reply no id %llx",
+ (unsigned long long)rc->rc_id);
+ goto out;
+ }
+
+ nodeid = rc->rc_result;
+ if (nodeid == dlm_our_nodeid())
+ nodeid = 0;
+
+ set_new_master(r, nodeid);
+ recover_list_del(r);
+
+ if (recover_list_empty(ls))
+ wake_up(&ls->ls_wait_general);
+ out:
+ return 0;
+}
+
+
+/* Lock recovery: rebuild the process-copy locks we hold on a
+ remastered rsb on the new rsb master.
+
+ dlm_recover_locks
+ recover_locks
+ recover_locks_queue
+ dlm_send_rcom_lock -> receive_rcom_lock
+ dlm_recover_master_copy
+ receive_rcom_lock_reply <-
+ dlm_recover_process_copy
+*/
+
+
+/*
+ * keep a count of the number of lkb's we send to the new master; when we get
+ * an equal number of replies then recovery for the rsb is done
+ */
+
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
+{
+ struct dlm_lkb *lkb;
+ int error = 0;
+
+ list_for_each_entry(lkb, head, lkb_statequeue) {
+ error = dlm_send_rcom_lock(r, lkb);
+ if (error)
+ break;
+ r->res_recover_locks_count++;
+ }
+
+ return error;
+}
+
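+/* Send all our lkb's on this rsb to the new master. The rsb stays on the
+   recover_list until replies for all of them have been received. */
+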
+static int recover_locks(struct dlm_rsb *r)
+{
+ int error = 0;
+
+ lock_rsb(r);
+
+ DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
+
+ error = recover_locks_queue(r, &r->res_grantqueue);
+ if (error)
+ goto out;
+ error = recover_locks_queue(r, &r->res_convertqueue);
+ if (error)
+ goto out;
+ error = recover_locks_queue(r, &r->res_waitqueue);
+ if (error)
+ goto out;
+
+ if (r->res_recover_locks_count)
+ recover_list_add(r);
+ else
+ rsb_clear_flag(r, RSB_NEW_MASTER);
+ out:
+ unlock_rsb(r);
+ return error;
+}
+
+int dlm_recover_locks(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int error, count = 0;
+
+ log_debug(ls, "dlm_recover_locks");
+
+ down_read(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ if (is_master(r)) {
+ rsb_clear_flag(r, RSB_NEW_MASTER);
+ continue;
+ }
+
+ if (!rsb_flag(r, RSB_NEW_MASTER))
+ continue;
+
+ if (dlm_recovery_stopped(ls)) {
+ error = -EINTR;
+ up_read(&ls->ls_root_sem);
+ goto out;
+ }
+
+ error = recover_locks(r);
+ if (error) {
+ up_read(&ls->ls_root_sem);
+ goto out;
+ }
+
+ count += r->res_recover_locks_count;
+ }
+ up_read(&ls->ls_root_sem);
+
+ log_debug(ls, "dlm_recover_locks %d locks", count);
+
+ error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+ if (error)
+ recover_list_clear(ls);
+ else
+ dlm_set_recover_status(ls, DLM_RS_LOCKS);
+ return error;
+}
+
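+/* Called for each reply to a lock sent above; when the last reply for the
+   rsb arrives, remastering of that rsb is complete. */
+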
+void dlm_recovered_lock(struct dlm_rsb *r)
+{
+ DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
+
+ r->res_recover_locks_count--;
+ if (!r->res_recover_locks_count) {
+ rsb_clear_flag(r, RSB_NEW_MASTER);
+ recover_list_del(r);
+ }
+
+ if (recover_list_empty(r->res_ls))
+ wake_up(&r->res_ls->ls_wait_general);
+}
+
+/*
+ * The lvb needs to be recovered on all master rsb's. This includes setting
+ * the VALNOTVALID flag if necessary, and determining the correct lvb contents
+ * based on the lvb's of the locks held on the rsb.
+ *
+ * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
+ * was already set prior to recovery, it's not cleared, regardless of locks.
+ *
+ * The LVB contents are only considered for changing when this is a new master
+ * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
+ * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
+ * from the lkb with the largest lvb sequence number.
+ */
+
+static void recover_lvb(struct dlm_rsb *r)
+{
+ struct dlm_lkb *lkb, *high_lkb = NULL;
+ uint32_t high_seq = 0;
+ int lock_lvb_exists = 0;
+ int big_lock_exists = 0;
+ int lvblen = r->res_ls->ls_lvblen;
+
+ list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ continue;
+
+ lock_lvb_exists = 1;
+
+ if (lkb->lkb_grmode > DLM_LOCK_CR) {
+ big_lock_exists = 1;
+ goto setflag;
+ }
+
+ if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = lkb;
+ high_seq = lkb->lkb_lvbseq;
+ }
+ }
+
+ list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+ if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ continue;
+
+ lock_lvb_exists = 1;
+
+ if (lkb->lkb_grmode > DLM_LOCK_CR) {
+ big_lock_exists = 1;
+ goto setflag;
+ }
+
+ if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = lkb;
+ high_seq = lkb->lkb_lvbseq;
+ }
+ }
+
+ setflag:
+ if (!lock_lvb_exists)
+ goto out;
+
+ if (!big_lock_exists)
+ rsb_set_flag(r, RSB_VALNOTVALID);
+
+ /* don't mess with the lvb unless we're the new master */
+ if (!rsb_flag(r, RSB_NEW_MASTER2))
+ goto out;
+
+ if (!r->res_lvbptr) {
+ r->res_lvbptr = allocate_lvb(r->res_ls);
+ if (!r->res_lvbptr)
+ goto out;
+ }
+
+ if (big_lock_exists) {
+ r->res_lvbseq = lkb->lkb_lvbseq;
+ memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+ } else if (high_lkb) {
+ r->res_lvbseq = high_lkb->lkb_lvbseq;
+ memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
+ } else {
+ r->res_lvbseq = 0;
+ memset(r->res_lvbptr, 0, lvblen);
+ }
+ out:
+ return;
+}
+
+/* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
+ converting PR->CW or CW->PR need to have their lkb_grmode set. */
+
+static void recover_conversion(struct dlm_rsb *r)
+{
+ struct dlm_lkb *lkb;
+ int grmode = -1;
+
+ list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+ if (lkb->lkb_grmode == DLM_LOCK_PR ||
+ lkb->lkb_grmode == DLM_LOCK_CW) {
+ grmode = lkb->lkb_grmode;
+ break;
+ }
+ }
+
+ list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+ if (lkb->lkb_grmode != DLM_LOCK_IV)
+ continue;
+ if (grmode == -1)
+ lkb->lkb_grmode = lkb->lkb_rqmode;
+ else
+ lkb->lkb_grmode = grmode;
+ }
+}
+
+/* We've become the new master for this rsb and waiting/converting locks may
+ need to be granted in dlm_grant_after_purge() due to locks that may have
+ existed from a removed node. */
+
+static void set_locks_purged(struct dlm_rsb *r)
+{
+ if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
+ rsb_set_flag(r, RSB_LOCKS_PURGED);
+}
+
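+/* Final pass over the root rsb's we master: resolve in-progress conversions,
+   flag purged locks for regranting and recover lvb contents. */
+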
+void dlm_recover_rsbs(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int count = 0;
+
+ log_debug(ls, "dlm_recover_rsbs");
+
+ down_read(&ls->ls_root_sem);
+ list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+ lock_rsb(r);
+ if (is_master(r)) {
+ if (rsb_flag(r, RSB_RECOVER_CONVERT))
+ recover_conversion(r);
+ if (rsb_flag(r, RSB_NEW_MASTER2))
+ set_locks_purged(r);
+ recover_lvb(r);
+ count++;
+ }
+ rsb_clear_flag(r, RSB_RECOVER_CONVERT);
+ rsb_clear_flag(r, RSB_NEW_MASTER2);
+ unlock_rsb(r);
+ }
+ up_read(&ls->ls_root_sem);
+
+ log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
+}
+
+/* Create a single list of all root rsb's to be used during recovery */
+
+int dlm_create_root_list(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r;
+ int i, error = 0;
+
+ down_write(&ls->ls_root_sem);
+ if (!list_empty(&ls->ls_root_list)) {
+ log_error(ls, "root list not empty");
+ error = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ read_lock(&ls->ls_rsbtbl[i].lock);
+ list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
+ list_add(&r->res_root_list, &ls->ls_root_list);
+ dlm_hold_rsb(r);
+ }
+ read_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+ out:
+ up_write(&ls->ls_root_sem);
+ return error;
+}
+
+void dlm_release_root_list(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r, *safe;
+
+ down_write(&ls->ls_root_sem);
+ list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
+ list_del_init(&r->res_root_list);
+ dlm_put_rsb(r);
+ }
+ up_write(&ls->ls_root_sem);
+}
+
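+/* Free all rsb's on the toss lists; unused rsb's don't need to be
+   recovered. */
+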
+void dlm_clear_toss_list(struct dlm_ls *ls)
+{
+ struct dlm_rsb *r, *safe;
+ int i;
+
+ for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+ write_lock(&ls->ls_rsbtbl[i].lock);
+ list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
+ res_hashchain) {
+ list_del(&r->res_hashchain);
+ free_rsb(r);
+ }
+ write_unlock(&ls->ls_rsbtbl[i].lock);
+ }
+}
+
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVER_DOT_H__
+#define __RECOVER_DOT_H__
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
+uint32_t dlm_recover_status(struct dlm_ls *ls);
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
+int dlm_recover_members_wait(struct dlm_ls *ls);
+int dlm_recover_directory_wait(struct dlm_ls *ls);
+int dlm_recover_locks_wait(struct dlm_ls *ls);
+int dlm_recover_done_wait(struct dlm_ls *ls);
+int dlm_recover_masters(struct dlm_ls *ls);
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_locks(struct dlm_ls *ls);
+void dlm_recovered_lock(struct dlm_rsb *r);
+int dlm_create_root_list(struct dlm_ls *ls);
+void dlm_release_root_list(struct dlm_ls *ls);
+void dlm_clear_toss_list(struct dlm_ls *ls);
+void dlm_recover_rsbs(struct dlm_ls *ls);
+
+#endif /* __RECOVER_DOT_H__ */
+
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "dir.h"
+#include "ast.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "lock.h"
+#include "requestqueue.h"
+#include "recoverd.h"
+
+
+/* If the start for which we're re-enabling locking (seq) has been superseded
+ by a newer stop (ls_recover_seq), we need to leave locking disabled. */
+
+static int enable_locking(struct dlm_ls *ls, uint64_t seq)
+{
+ int error = -EINTR;
+
+ spin_lock(&ls->ls_recover_lock);
+ if (ls->ls_recover_seq == seq) {
+ set_bit(LSFL_RUNNING, &ls->ls_flags);
+ up_write(&ls->ls_in_recovery);
+ error = 0;
+ }
+ spin_unlock(&ls->ls_recover_lock);
+ return error;
+}
+
+static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
+{
+ unsigned long start;
+ int error, neg = 0;
+
+ log_debug(ls, "recover %llx", rv->seq);
+
+ mutex_lock(&ls->ls_recoverd_active);
+
+ /*
+ * Suspending and resuming dlm_astd ensures that no lkb's from this ls
+ * will be processed by dlm_astd during recovery.
+ */
+
+ dlm_astd_suspend();
+ dlm_astd_resume();
+
+ /*
+ * This list of root rsb's will be the basis of most of the recovery
+ * routines.
+ */
+
+ dlm_create_root_list(ls);
+
+ /*
+ * Free all the tossed rsb's so we don't have to recover them.
+ */
+
+ dlm_clear_toss_list(ls);
+
+ /*
+ * Add or remove nodes from the lockspace's ls_nodes list.
+ * Also waits for all nodes to complete dlm_recover_members.
+ */
+
+ error = dlm_recover_members(ls, rv, &neg);
+ if (error) {
+ log_error(ls, "recover_members failed %d", error);
+ goto fail;
+ }
+ start = jiffies;
+
+ /*
+ * Rebuild our own share of the directory by collecting from all other
+ * nodes their master rsb names that hash to us.
+ */
+
+ error = dlm_recover_directory(ls);
+ if (error) {
+ log_error(ls, "recover_directory failed %d", error);
+ goto fail;
+ }
+
+ /*
+ * Purge directory-related requests that are saved in requestqueue.
+ * All dir requests from before recovery are invalid now due to the dir
+ * rebuild and will be resent by the requesting nodes.
+ */
+
+ dlm_purge_requestqueue(ls);
+
+ /*
+ * Wait for all nodes to complete directory rebuild.
+ */
+
+ error = dlm_recover_directory_wait(ls);
+ if (error) {
+ log_error(ls, "recover_directory_wait failed %d", error);
+ goto fail;
+ }
+
+ /*
+ * We may have outstanding operations that are waiting for a reply from
+ * a failed node. Mark these to be resent after recovery. Unlock and
+ * cancel ops can just be completed.
+ */
+
+ dlm_recover_waiters_pre(ls);
+
+ error = dlm_recovery_stopped(ls);
+ if (error)
+ goto fail;
+
+ if (neg || dlm_no_directory(ls)) {
+ /*
+ * Clear lkb's for departed nodes.
+ */
+
+ dlm_purge_locks(ls);
+
+ /*
+ * Get new master nodeid's for rsb's that were mastered on
+ * departed nodes.
+ */
+
+ error = dlm_recover_masters(ls);
+ if (error) {
+ log_error(ls, "recover_masters failed %d", error);
+ goto fail;
+ }
+
+ /*
+ * Send our locks on remastered rsb's to the new masters.
+ */
+
+ error = dlm_recover_locks(ls);
+ if (error) {
+ log_error(ls, "recover_locks failed %d", error);
+ goto fail;
+ }
+
+ error = dlm_recover_locks_wait(ls);
+ if (error) {
+ log_error(ls, "recover_locks_wait failed %d", error);
+ goto fail;
+ }
+
+ /*
+ * Finalize state in master rsb's now that all locks can be
+ * checked. This includes conversion resolution and lvb
+ * settings.
+ */
+
+ dlm_recover_rsbs(ls);
+ }
+
+ dlm_release_root_list(ls);
+
+ dlm_set_recover_status(ls, DLM_RS_DONE);
+ error = dlm_recover_done_wait(ls);
+ if (error) {
+ log_error(ls, "recover_done_wait failed %d", error);
+ goto fail;
+ }
+
+ dlm_clear_members_gone(ls);
+
+ error = enable_locking(ls, rv->seq);
+ if (error) {
+ log_error(ls, "enable_locking failed %d", error);
+ goto fail;
+ }
+
+ error = dlm_process_requestqueue(ls);
+ if (error) {
+ log_error(ls, "process_requestqueue failed %d", error);
+ goto fail;
+ }
+
+ error = dlm_recover_waiters_post(ls);
+ if (error) {
+ log_error(ls, "recover_waiters_post failed %d", error);
+ goto fail;
+ }
+
+ dlm_grant_after_purge(ls);
+
+ dlm_astd_wake();
+
+ log_debug(ls, "recover %llx done: %u ms", rv->seq,
+ jiffies_to_msecs(jiffies - start));
+ mutex_unlock(&ls->ls_recoverd_active);
+
+ return 0;
+
+ fail:
+ dlm_release_root_list(ls);
+ log_debug(ls, "recover %llx error %d", rv->seq, error);
+ mutex_unlock(&ls->ls_recoverd_active);
+ return error;
+}
+
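+/* Take the recovery event saved in ls_recover_args and run a full recovery
+   pass for it. */
+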
+static void do_ls_recovery(struct dlm_ls *ls)
+{
+ struct dlm_recover *rv = NULL;
+
+ spin_lock(&ls->ls_recover_lock);
+ rv = ls->ls_recover_args;
+ ls->ls_recover_args = NULL;
+ clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+ spin_unlock(&ls->ls_recover_lock);
+
+ if (rv) {
+ ls_recover(ls, rv);
+ kfree(rv->nodeids);
+ kfree(rv);
+ }
+}
+
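+/* Per-lockspace recovery thread: sleep until dlm_recoverd_kick() sets
+   LSFL_WORK, then run do_ls_recovery(). */
+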
+static int dlm_recoverd(void *arg)
+{
+ struct dlm_ls *ls;
+
+ ls = dlm_find_lockspace_local(arg);
+ if (!ls) {
+ log_print("dlm_recoverd: no lockspace %p", arg);
+ return -1;
+ }
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!test_bit(LSFL_WORK, &ls->ls_flags))
+ schedule();
+ set_current_state(TASK_RUNNING);
+
+ if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
+ do_ls_recovery(ls);
+ }
+
+ dlm_put_lockspace(ls);
+ return 0;
+}
+
+void dlm_recoverd_kick(struct dlm_ls *ls)
+{
+ set_bit(LSFL_WORK, &ls->ls_flags);
+ wake_up_process(ls->ls_recoverd_task);
+}
+
+int dlm_recoverd_start(struct dlm_ls *ls)
+{
+ struct task_struct *p;
+ int error = 0;
+
+ p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
+ if (IS_ERR(p))
+ error = PTR_ERR(p);
+ else
+ ls->ls_recoverd_task = p;
+ return error;
+}
+
+void dlm_recoverd_stop(struct dlm_ls *ls)
+{
+ kthread_stop(ls->ls_recoverd_task);
+}
+
+void dlm_recoverd_suspend(struct dlm_ls *ls)
+{
+ wake_up(&ls->ls_wait_general);
+ mutex_lock(&ls->ls_recoverd_active);
+}
+
+void dlm_recoverd_resume(struct dlm_ls *ls)
+{
+ mutex_unlock(&ls->ls_recoverd_active);
+}
+
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVERD_DOT_H__
+#define __RECOVERD_DOT_H__
+
+void dlm_recoverd_kick(struct dlm_ls *ls);
+void dlm_recoverd_stop(struct dlm_ls *ls);
+int dlm_recoverd_start(struct dlm_ls *ls);
+void dlm_recoverd_suspend(struct dlm_ls *ls);
+void dlm_recoverd_resume(struct dlm_ls *ls);
+
+#endif /* __RECOVERD_DOT_H__ */
+
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "member.h"
+#include "lock.h"
+#include "dir.h"
+#include "config.h"
+#include "requestqueue.h"
+
+struct rq_entry {
+ struct list_head list;
+ int nodeid;
+ char request[1];
+};
+
+/*
+ * Requests received while the lockspace is in recovery get added to the
+ * request queue and processed when recovery is complete. This happens when
+ * the lockspace is suspended on some nodes before it is on others, or the
+ * lockspace is enabled on some while still suspended on others.
+ */
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+{
+ struct rq_entry *e;
+ int length = hd->h_length;
+
+ if (dlm_is_removed(ls, nodeid))
+ return;
+
+ e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+ if (!e) {
+ log_print("dlm_add_requestqueue: out of memory\n");
+ return;
+ }
+
+ e->nodeid = nodeid;
+ memcpy(e->request, hd, length);
+
+ mutex_lock(&ls->ls_requestqueue_mutex);
+ list_add_tail(&e->list, &ls->ls_requestqueue);
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
+int dlm_process_requestqueue(struct dlm_ls *ls)
+{
+ struct rq_entry *e;
+ struct dlm_header *hd;
+ int error = 0;
+
+ mutex_lock(&ls->ls_requestqueue_mutex);
+
+ for (;;) {
+ if (list_empty(&ls->ls_requestqueue)) {
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+ error = 0;
+ break;
+ }
+ e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+
+ hd = (struct dlm_header *) e->request;
+ error = dlm_receive_message(hd, e->nodeid, 1);
+
+ if (error == -EINTR) {
+ /* entry is left on requestqueue */
+ log_debug(ls, "process_requestqueue abort eintr");
+ break;
+ }
+
+ mutex_lock(&ls->ls_requestqueue_mutex);
+ list_del(&e->list);
+ kfree(e);
+
+ if (dlm_locking_stopped(ls)) {
+ log_debug(ls, "process_requestqueue abort running");
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+ error = -EINTR;
+ break;
+ }
+ schedule();
+ }
+
+ return error;
+}
+
+/*
+ * After recovery is done, locking is resumed and dlm_recoverd takes all the
+ * saved requests and processes them as they would have been by dlm_recvd. At
+ * the same time, dlm_recvd will start receiving new requests from remote
+ * nodes. We want to delay dlm_recvd processing new requests until
+ * dlm_recoverd has finished processing the old saved requests.
+ */
+
+void dlm_wait_requestqueue(struct dlm_ls *ls)
+{
+ for (;;) {
+ mutex_lock(&ls->ls_requestqueue_mutex);
+ if (list_empty(&ls->ls_requestqueue))
+ break;
+ if (dlm_locking_stopped(ls))
+ break;
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+ schedule();
+ }
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
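+/* Decide whether a saved request can be dropped rather than replayed after
+   recovery. */
+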
+static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
+{
+ uint32_t type = ms->m_type;
+
+ if (dlm_is_removed(ls, nodeid))
+ return 1;
+
+ /* directory operations are always purged because the directory is
+ always rebuilt during recovery and the lookups resent */
+
+ if (type == DLM_MSG_REMOVE ||
+ type == DLM_MSG_LOOKUP ||
+ type == DLM_MSG_LOOKUP_REPLY)
+ return 1;
+
+ if (!dlm_no_directory(ls))
+ return 0;
+
+ /* with no directory, the master is likely to change as a part of
+ recovery; requests to/from the defunct master need to be purged */
+
+ switch (type) {
+ case DLM_MSG_REQUEST:
+ case DLM_MSG_CONVERT:
+ case DLM_MSG_UNLOCK:
+ case DLM_MSG_CANCEL:
+ /* we're no longer the master of this resource, the sender
+ will resend to the new master (see waiter_needs_recovery) */
+
+ if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
+ return 1;
+ break;
+
+ case DLM_MSG_REQUEST_REPLY:
+ case DLM_MSG_CONVERT_REPLY:
+ case DLM_MSG_UNLOCK_REPLY:
+ case DLM_MSG_CANCEL_REPLY:
+ case DLM_MSG_GRANT:
+ /* this reply is from the former master of the resource,
+ we'll resend to the new master if needed */
+
+ if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
+ return 1;
+ break;
+ }
+
+ return 0;
+}
+
+void dlm_purge_requestqueue(struct dlm_ls *ls)
+{
+ struct dlm_message *ms;
+ struct rq_entry *e, *safe;
+
+ mutex_lock(&ls->ls_requestqueue_mutex);
+ list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
+ ms = (struct dlm_message *) e->request;
+
+ if (purge_request(ls, ms, e->nodeid)) {
+ list_del(&e->list);
+ kfree(e);
+ }
+ }
+ mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __REQUESTQUEUE_DOT_H__
+#define __REQUESTQUEUE_DOT_H__
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_process_requestqueue(struct dlm_ls *ls);
+void dlm_wait_requestqueue(struct dlm_ls *ls);
+void dlm_purge_requestqueue(struct dlm_ls *ls);
+
+#endif
+
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/dlm.h>
+#include <linux/dlm_device.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "lvb_table.h"
+
+static const char *name_prefix = "dlm";
+static struct miscdevice ctl_device;
+static struct file_operations device_fops;
+
+#ifdef CONFIG_COMPAT
+
+struct dlm_lock_params32 {
+ __u8 mode;
+ __u8 namelen;
+ __u16 flags;
+ __u32 lkid;
+ __u32 parent;
+
+ __u32 castparam;
+ __u32 castaddr;
+ __u32 bastparam;
+ __u32 bastaddr;
+ __u32 lksb;
+
+ char lvb[DLM_USER_LVB_LEN];
+ char name[0];
+};
+
+struct dlm_write_request32 {
+ __u32 version[3];
+ __u8 cmd;
+ __u8 is64bit;
+ __u8 unused[2];
+
+ union {
+ struct dlm_lock_params32 lock;
+ struct dlm_lspace_params lspace;
+ } i;
+};
+
+struct dlm_lksb32 {
+ __u32 sb_status;
+ __u32 sb_lkid;
+ __u8 sb_flags;
+ __u32 sb_lvbptr;
+};
+
+struct dlm_lock_result32 {
+ __u32 length;
+ __u32 user_astaddr;
+ __u32 user_astparam;
+ __u32 user_lksb;
+ struct dlm_lksb32 lksb;
+ __u8 bast_mode;
+ __u8 unused[3];
+ /* Offsets may be zero if no data is present */
+ __u32 lvb_offset;
+};
+
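+/* Translate a 32-bit write request from a compat task into the native
+   64-bit layout, widening the user pointers. */
+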
+static void compat_input(struct dlm_write_request *kb,
+ struct dlm_write_request32 *kb32)
+{
+ kb->version[0] = kb32->version[0];
+ kb->version[1] = kb32->version[1];
+ kb->version[2] = kb32->version[2];
+
+ kb->cmd = kb32->cmd;
+ kb->is64bit = kb32->is64bit;
+ if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
+ kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
+ kb->i.lspace.flags = kb32->i.lspace.flags;
+ kb->i.lspace.minor = kb32->i.lspace.minor;
+ strcpy(kb->i.lspace.name, kb32->i.lspace.name);
+ } else {
+ kb->i.lock.mode = kb32->i.lock.mode;
+ kb->i.lock.namelen = kb32->i.lock.namelen;
+ kb->i.lock.flags = kb32->i.lock.flags;
+ kb->i.lock.lkid = kb32->i.lock.lkid;
+ kb->i.lock.parent = kb32->i.lock.parent;
+ kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
+ kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
+ kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
+ kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
+ kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
+ memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
+ memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+ }
+}
+
+static void compat_output(struct dlm_lock_result *res,
+ struct dlm_lock_result32 *res32)
+{
+ res32->length = res->length - (sizeof(struct dlm_lock_result) -
+ sizeof(struct dlm_lock_result32));
+ res32->user_astaddr = (__u32)(long)res->user_astaddr;
+ res32->user_astparam = (__u32)(long)res->user_astparam;
+ res32->user_lksb = (__u32)(long)res->user_lksb;
+ res32->bast_mode = res->bast_mode;
+
+ res32->lvb_offset = res->lvb_offset;
+ res32->length = res->length;
+
+ res32->lksb.sb_status = res->lksb.sb_status;
+ res32->lksb.sb_flags = res->lksb.sb_flags;
+ res32->lksb.sb_lkid = res->lksb.sb_lkid;
+ res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
+}
+#endif
+
+
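+/* Queue a completion or blocking ast on the owning process's asts list and
+   wake any reader waiting in device_read(). */
+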
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+{
+ struct dlm_ls *ls;
+ struct dlm_user_args *ua;
+ struct dlm_user_proc *proc;
+ int remove_ownqueue = 0;
+
+ /* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
+ lkb before dealing with it. We need to check this
+ flag before taking ls_clear_proc_locks mutex because if
+ it's set, dlm_clear_proc_locks() holds the mutex. */
+
+ if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+ /* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
+ return;
+ }
+
+ ls = lkb->lkb_resource->res_ls;
+ mutex_lock(&ls->ls_clear_proc_locks);
+
+ /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
+ can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed
+ lkb->ua so we can't try to use it. */
+
+ if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+ /* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
+ goto out;
+ }
+
+ DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
+ ua = (struct dlm_user_args *)lkb->lkb_astparam;
+ proc = ua->proc;
+
+ if (type == AST_BAST && ua->bastaddr == NULL)
+ goto out;
+
+ spin_lock(&proc->asts_spin);
+ if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+ kref_get(&lkb->lkb_ref);
+ list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+ lkb->lkb_ast_type |= type;
+ wake_up_interruptible(&proc->wait);
+ }
+
+ /* noqueue requests that fail may need to be removed from the
+ proc's locks list, there should be a better way of detecting
+ this situation than checking all these things... */
+
+ if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
+ ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
+ remove_ownqueue = 1;
+
+ /* We want to copy the lvb to userspace when the completion
+ ast is read if the status is 0, the lock has an lvb and
+ lvb_ops says we should. We could probably have set_lvb_lock()
+ set update_user_lvb instead and not need old_mode */
+
+ if ((lkb->lkb_ast_type & AST_COMP) &&
+ (lkb->lkb_lksb->sb_status == 0) &&
+ lkb->lkb_lksb->sb_lvbptr &&
+ dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
+ ua->update_user_lvb = 1;
+ else
+ ua->update_user_lvb = 0;
+
+ spin_unlock(&proc->asts_spin);
+
+ if (remove_ownqueue) {
+ spin_lock(&ua->proc->locks_spin);
+ list_del_init(&lkb->lkb_ownqueue);
+ spin_unlock(&ua->proc->locks_spin);
+ dlm_put_lkb(lkb);
+ }
+ out:
+ mutex_unlock(&ls->ls_clear_proc_locks);
+}
+
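+/* A lock or convert request written to a lockspace device. For a new
+   request, the assigned lkid is returned as a positive "error" value. */
+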
+static int device_user_lock(struct dlm_user_proc *proc,
+ struct dlm_lock_params *params)
+{
+ struct dlm_ls *ls;
+ struct dlm_user_args *ua;
+ int error = -ENOMEM;
+
+ ls = dlm_find_lockspace_local(proc->lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ if (!params->castaddr || !params->lksb) {
+ error = -EINVAL;
+ goto out;
+ }
+
+ ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+ if (!ua)
+ goto out;
+ ua->proc = proc;
+ ua->user_lksb = params->lksb;
+ ua->castparam = params->castparam;
+ ua->castaddr = params->castaddr;
+ ua->bastparam = params->bastparam;
+ ua->bastaddr = params->bastaddr;
+
+ if (params->flags & DLM_LKF_CONVERT)
+ error = dlm_user_convert(ls, ua,
+ params->mode, params->flags,
+ params->lkid, params->lvb);
+ else {
+ error = dlm_user_request(ls, ua,
+ params->mode, params->flags,
+ params->name, params->namelen,
+ params->parent);
+ if (!error)
+ error = ua->lksb.sb_lkid;
+ }
+ out:
+ dlm_put_lockspace(ls);
+ return error;
+}
+
+static int device_user_unlock(struct dlm_user_proc *proc,
+ struct dlm_lock_params *params)
+{
+ struct dlm_ls *ls;
+ struct dlm_user_args *ua;
+ int error = -ENOMEM;
+
+ ls = dlm_find_lockspace_local(proc->lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+ if (!ua)
+ goto out;
+ ua->proc = proc;
+ ua->user_lksb = params->lksb;
+ ua->castparam = params->castparam;
+ ua->castaddr = params->castaddr;
+
+ if (params->flags & DLM_LKF_CANCEL)
+ error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
+ else
+ error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
+ params->lvb);
+ out:
+ dlm_put_lockspace(ls);
+ return error;
+}
+
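+/* Create a new lockspace and register a misc device for it; the device's
+   minor number is returned so userspace can find it. */
+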
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+ dlm_lockspace_t *lockspace;
+ struct dlm_ls *ls;
+ int error, len;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = dlm_new_lockspace(params->name, strlen(params->name),
+ &lockspace, 0, DLM_USER_LVB_LEN);
+ if (error)
+ return error;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ error = -ENOMEM;
+ len = strlen(params->name) + strlen(name_prefix) + 2;
+ ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+ if (!ls->ls_device.name)
+ goto fail;
+ snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
+ params->name);
+ ls->ls_device.fops = &device_fops;
+ ls->ls_device.minor = MISC_DYNAMIC_MINOR;
+
+ error = misc_register(&ls->ls_device);
+ if (error) {
+ kfree(ls->ls_device.name);
+ goto fail;
+ }
+
+ error = ls->ls_device.minor;
+ dlm_put_lockspace(ls);
+ return error;
+
+ fail:
+ dlm_put_lockspace(ls);
+ dlm_release_lockspace(lockspace, 0);
+ return error;
+}
+
+static int device_remove_lockspace(struct dlm_lspace_params *params)
+{
+ dlm_lockspace_t *lockspace;
+ struct dlm_ls *ls;
+ int error, force = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ls = dlm_find_lockspace_device(params->minor);
+ if (!ls)
+ return -ENOENT;
+
+ error = misc_deregister(&ls->ls_device);
+ if (error) {
+ dlm_put_lockspace(ls);
+ goto out;
+ }
+ kfree(ls->ls_device.name);
+
+ if (params->flags & DLM_USER_LSFLG_FORCEFREE)
+ force = 2;
+
+ lockspace = ls->ls_local_handle;
+
+ /* dlm_release_lockspace waits for references to go to zero,
+ so all processes will need to close their device for the ls
+ before the release will proceed */
+
+ dlm_put_lockspace(ls);
+ error = dlm_release_lockspace(lockspace, force);
+ out:
+ return error;
+}
+
+/* Check the user's version matches ours */
+static int check_version(struct dlm_write_request *req)
+{
+ if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
+ (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
+ req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
+
+ printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+ "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+ current->comm,
+ current->pid,
+ req->version[0],
+ req->version[1],
+ req->version[2],
+ DLM_DEVICE_VERSION_MAJOR,
+ DLM_DEVICE_VERSION_MINOR,
+ DLM_DEVICE_VERSION_PATCH);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * device_write
+ *
+ * device_user_lock
+ * dlm_user_request -> request_lock
+ * dlm_user_convert -> convert_lock
+ *
+ * device_user_unlock
+ * dlm_user_unlock -> unlock_lock
+ * dlm_user_cancel -> cancel_lock
+ *
+ * device_create_lockspace
+ * dlm_new_lockspace
+ *
+ * device_remove_lockspace
+ * dlm_release_lockspace
+ */
+
+/* a write to a lockspace device is a lock or unlock request, a write
+ to the control device is to create/remove a lockspace */
+
+static ssize_t device_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct dlm_user_proc *proc = file->private_data;
+ struct dlm_write_request *kbuf;
+ sigset_t tmpsig, allsigs;
+ int error;
+
+#ifdef CONFIG_COMPAT
+ if (count < sizeof(struct dlm_write_request32))
+#else
+ if (count < sizeof(struct dlm_write_request))
+#endif
+ return -EINVAL;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ if (copy_from_user(kbuf, buf, count)) {
+ error = -EFAULT;
+ goto out_free;
+ }
+
+ if (check_version(kbuf)) {
+ error = -EBADE;
+ goto out_free;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (!kbuf->is64bit) {
+ struct dlm_write_request32 *k32buf;
+ k32buf = (struct dlm_write_request32 *)kbuf;
+ kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
+ sizeof(struct dlm_write_request32)), GFP_KERNEL);
+ if (!kbuf) {
+ /* don't leak the original 32-bit request buffer */
+ kfree(k32buf);
+ return -ENOMEM;
+ }
+
+ if (proc)
+ set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
+ compat_input(kbuf, k32buf);
+ kfree(k32buf);
+ }
+#endif
+
+ /* do we really need this? can a write happen after a close? */
+ if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
+ test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) {
+ error = -EINVAL;
+ goto out_free;
+ }
+
+ sigfillset(&allsigs);
+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+ error = -EINVAL;
+
+ switch (kbuf->cmd)
+ {
+ case DLM_USER_LOCK:
+ if (!proc) {
+ log_print("no locking on control device");
+ goto out_sig;
+ }
+ error = device_user_lock(proc, &kbuf->i.lock);
+ break;
+
+ case DLM_USER_UNLOCK:
+ if (!proc) {
+ log_print("no locking on control device");
+ goto out_sig;
+ }
+ error = device_user_unlock(proc, &kbuf->i.lock);
+ break;
+
+ case DLM_USER_CREATE_LOCKSPACE:
+ if (proc) {
+ log_print("create/remove only on control device");
+ goto out_sig;
+ }
+ error = device_create_lockspace(&kbuf->i.lspace);
+ break;
+
+ case DLM_USER_REMOVE_LOCKSPACE:
+ if (proc) {
+ log_print("create/remove only on control device");
+ goto out_sig;
+ }
+ error = device_remove_lockspace(&kbuf->i.lspace);
+ break;
+
+ default:
+ log_print("Unknown command passed to DLM device : %d\n",
+ kbuf->cmd);
+ }
+
+ out_sig:
+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+ recalc_sigpending();
+ out_free:
+ kfree(kbuf);
+ return error;
+}
+
+/* Every process that opens the lockspace device has its own "proc" structure
+ hanging off the open file that's used to keep track of locks owned by the
+ process and asts that need to be delivered to the process. */
+
+static int device_open(struct inode *inode, struct file *file)
+{
+ struct dlm_user_proc *proc;
+ struct dlm_ls *ls;
+
+ ls = dlm_find_lockspace_device(iminor(inode));
+ if (!ls)
+ return -ENOENT;
+
+ proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
+ if (!proc) {
+ dlm_put_lockspace(ls);
+ return -ENOMEM;
+ }
+
+ proc->lockspace = ls->ls_local_handle;
+ INIT_LIST_HEAD(&proc->asts);
+ INIT_LIST_HEAD(&proc->locks);
+ spin_lock_init(&proc->asts_spin);
+ spin_lock_init(&proc->locks_spin);
+ init_waitqueue_head(&proc->wait);
+ file->private_data = proc;
+
+ return 0;
+}
+
+static int device_close(struct inode *inode, struct file *file)
+{
+ struct dlm_user_proc *proc = file->private_data;
+ struct dlm_ls *ls;
+ sigset_t tmpsig, allsigs;
+
+ ls = dlm_find_lockspace_local(proc->lockspace);
+ if (!ls)
+ return -ENOENT;
+
+ sigfillset(&allsigs);
+ sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+ set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
+
+ dlm_clear_proc_locks(ls, proc);
+
+ /* at this point no more lkb's should exist for this lockspace,
+ so there's no chance of dlm_user_add_ast() being called and
+ looking for lkb->ua->proc */
+
+ kfree(proc);
+ file->private_data = NULL;
+
+ dlm_put_lockspace(ls);
+ dlm_put_lockspace(ls); /* for the find in device_open() */
+
+ /* FIXME: AUTOFREE: if this ls is no longer used do
+ device_remove_lockspace() */
+
+ sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+ recalc_sigpending();
+
+ return 0;
+}
+
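+/* Fill in a struct dlm_lock_result (followed by the lvb when appropriate)
+   and copy it to the user's read buffer. */
+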
+static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
+ int bmode, char __user *buf, size_t count)
+{
+#ifdef CONFIG_COMPAT
+ struct dlm_lock_result32 result32;
+#endif
+ struct dlm_lock_result result;
+ void *resultptr;
+ int error=0;
+ int len;
+ int struct_len;
+
+ memset(&result, 0, sizeof(struct dlm_lock_result));
+ memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+ result.user_lksb = ua->user_lksb;
+
+ /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
+ in a conversion unless the conversion is successful. See code
+ in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though,
+ notes that a new blocking AST address and parameter are set even if
+ the conversion fails, so maybe we should just do that. */
+
+ if (type == AST_BAST) {
+ result.user_astaddr = ua->bastaddr;
+ result.user_astparam = ua->bastparam;
+ result.bast_mode = bmode;
+ } else {
+ result.user_astaddr = ua->castaddr;
+ result.user_astparam = ua->castparam;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (compat)
+ len = sizeof(struct dlm_lock_result32);
+ else
+#endif
+ len = sizeof(struct dlm_lock_result);
+ struct_len = len;
+
+ /* copy lvb to userspace if there is one, it's been updated, and
+ the user buffer has space for it */
+
+ if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
+ count >= len + DLM_USER_LVB_LEN) {
+ if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
+ DLM_USER_LVB_LEN)) {
+ error = -EFAULT;
+ goto out;
+ }
+
+ result.lvb_offset = len;
+ len += DLM_USER_LVB_LEN;
+ }
+
+ result.length = len;
+ resultptr = &result;
+#ifdef CONFIG_COMPAT
+ if (compat) {
+ compat_output(&result, &result32);
+ resultptr = &result32;
+ }
+#endif
+
+ if (copy_to_user(buf, resultptr, struct_len))
+ error = -EFAULT;
+ else
+ error = len;
+ out:
+ return error;
+}
+
+/* a read returns a single ast described in a struct dlm_lock_result */
+
+static ssize_t device_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct dlm_user_proc *proc = file->private_data;
+ struct dlm_lkb *lkb;
+ struct dlm_user_args *ua;
+ DECLARE_WAITQUEUE(wait, current);
+ int error, type=0, bmode=0, removed = 0;
+
+#ifdef CONFIG_COMPAT
+ if (count < sizeof(struct dlm_lock_result32))
+#else
+ if (count < sizeof(struct dlm_lock_result))
+#endif
+ return -EINVAL;
+
+ /* do we really need this? can a read happen after a close? */
+ if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
+ return -EINVAL;
+
+ spin_lock(&proc->asts_spin);
+ if (list_empty(&proc->asts)) {
+ if (file->f_flags & O_NONBLOCK) {
+ spin_unlock(&proc->asts_spin);
+ return -EAGAIN;
+ }
+
+ add_wait_queue(&proc->wait, &wait);
+
+ repeat:
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty(&proc->asts) && !signal_pending(current)) {
+ spin_unlock(&proc->asts_spin);
+ schedule();
+ spin_lock(&proc->asts_spin);
+ goto repeat;
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&proc->wait, &wait);
+
+ if (signal_pending(current)) {
+ spin_unlock(&proc->asts_spin);
+ return -ERESTARTSYS;
+ }
+ }
+
+ if (list_empty(&proc->asts)) {
+ spin_unlock(&proc->asts_spin);
+ return -EAGAIN;
+ }
+
+ /* there may be both completion and blocking asts to return for
+ the lkb, don't remove lkb from asts list unless no asts remain */
+
+ lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
+
+ if (lkb->lkb_ast_type & AST_COMP) {
+ lkb->lkb_ast_type &= ~AST_COMP;
+ type = AST_COMP;
+ } else if (lkb->lkb_ast_type & AST_BAST) {
+ lkb->lkb_ast_type &= ~AST_BAST;
+ type = AST_BAST;
+ bmode = lkb->lkb_bastmode;
+ }
+
+ if (!lkb->lkb_ast_type) {
+ list_del(&lkb->lkb_astqueue);
+ removed = 1;
+ }
+ spin_unlock(&proc->asts_spin);
+
+ ua = (struct dlm_user_args *)lkb->lkb_astparam;
+ error = copy_result_to_user(ua,
+ test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+ type, bmode, buf, count);
+
+ /* removes reference for the proc->asts lists added by
+ dlm_user_add_ast() and may result in the lkb being freed */
+ if (removed)
+ dlm_put_lkb(lkb);
+
+ return error;
+}
+
+static unsigned int device_poll(struct file *file, poll_table *wait)
+{
+ struct dlm_user_proc *proc = file->private_data;
+
+ poll_wait(file, &proc->wait, wait);
+
+ spin_lock(&proc->asts_spin);
+ if (!list_empty(&proc->asts)) {
+ spin_unlock(&proc->asts_spin);
+ return POLLIN | POLLRDNORM;
+ }
+ spin_unlock(&proc->asts_spin);
+ return 0;
+}
+
+static int ctl_device_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return 0;
+}
+
+static int ctl_device_close(struct inode *inode, struct file *file)
+{
+ return 0;
+}
+
+static struct file_operations device_fops = {
+ .open = device_open,
+ .release = device_close,
+ .read = device_read,
+ .write = device_write,
+ .poll = device_poll,
+ .owner = THIS_MODULE,
+};
+
+static struct file_operations ctl_device_fops = {
+ .open = ctl_device_open,
+ .release = ctl_device_close,
+ .write = device_write,
+ .owner = THIS_MODULE,
+};
+
+int dlm_user_init(void)
+{
+ int error;
+
+ ctl_device.name = "dlm-control";
+ ctl_device.fops = &ctl_device_fops;
+ ctl_device.minor = MISC_DYNAMIC_MINOR;
+
+ error = misc_register(&ctl_device);
+ if (error)
+ log_print("misc_register failed for control device");
+
+ return error;
+}
+
+void dlm_user_exit(void)
+{
+ misc_deregister(&ctl_device);
+}
+
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __USER_DOT_H__
+#define __USER_DOT_H__
+
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+int dlm_user_init(void);
+void dlm_user_exit(void);
+
+#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "rcom.h"
+#include "util.h"
+
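+/* dlm messages are passed between nodes in little-endian byte order; these
+   helpers convert the common header fields on the way out and in. */
+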
+static void header_out(struct dlm_header *hd)
+{
+ hd->h_version = cpu_to_le32(hd->h_version);
+ hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
+ hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
+ hd->h_length = cpu_to_le16(hd->h_length);
+}
+
+static void header_in(struct dlm_header *hd)
+{
+ hd->h_version = le32_to_cpu(hd->h_version);
+ hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
+ hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
+ hd->h_length = le16_to_cpu(hd->h_length);
+}
+
+void dlm_message_out(struct dlm_message *ms)
+{
+ struct dlm_header *hd = (struct dlm_header *) ms;
+
+ header_out(hd);
+
+ ms->m_type = cpu_to_le32(ms->m_type);
+ ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
+ ms->m_pid = cpu_to_le32(ms->m_pid);
+ ms->m_lkid = cpu_to_le32(ms->m_lkid);
+ ms->m_remid = cpu_to_le32(ms->m_remid);
+ ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
+ ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
+ ms->m_exflags = cpu_to_le32(ms->m_exflags);
+ ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
+ ms->m_flags = cpu_to_le32(ms->m_flags);
+ ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
+ ms->m_hash = cpu_to_le32(ms->m_hash);
+ ms->m_status = cpu_to_le32(ms->m_status);
+ ms->m_grmode = cpu_to_le32(ms->m_grmode);
+ ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
+ ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
+ ms->m_asts = cpu_to_le32(ms->m_asts);
+ ms->m_result = cpu_to_le32(ms->m_result);
+}
+
+void dlm_message_in(struct dlm_message *ms)
+{
+ struct dlm_header *hd = (struct dlm_header *) ms;
+
+ header_in(hd);
+
+ ms->m_type = le32_to_cpu(ms->m_type);
+ ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
+ ms->m_pid = le32_to_cpu(ms->m_pid);
+ ms->m_lkid = le32_to_cpu(ms->m_lkid);
+ ms->m_remid = le32_to_cpu(ms->m_remid);
+ ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
+ ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
+ ms->m_exflags = le32_to_cpu(ms->m_exflags);
+ ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
+ ms->m_flags = le32_to_cpu(ms->m_flags);
+ ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
+ ms->m_hash = le32_to_cpu(ms->m_hash);
+ ms->m_status = le32_to_cpu(ms->m_status);
+ ms->m_grmode = le32_to_cpu(ms->m_grmode);
+ ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
+ ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
+ ms->m_asts = le32_to_cpu(ms->m_asts);
+ ms->m_result = le32_to_cpu(ms->m_result);
+}
+
+static void rcom_lock_out(struct rcom_lock *rl)
+{
+ rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
+ rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
+ rl->rl_remid = cpu_to_le32(rl->rl_remid);
+ rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
+ rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
+ rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
+ rl->rl_flags = cpu_to_le32(rl->rl_flags);
+ rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
+ rl->rl_result = cpu_to_le32(rl->rl_result);
+ rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
+ rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
+}
+
+static void rcom_lock_in(struct rcom_lock *rl)
+{
+ rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
+ rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
+ rl->rl_remid = le32_to_cpu(rl->rl_remid);
+ rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
+ rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
+ rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
+ rl->rl_flags = le32_to_cpu(rl->rl_flags);
+ rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
+ rl->rl_result = le32_to_cpu(rl->rl_result);
+ rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
+ rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
+}
+
+static void rcom_config_out(struct rcom_config *rf)
+{
+ rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
+ rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
+}
+
+static void rcom_config_in(struct rcom_config *rf)
+{
+ rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
+ rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
+}
+
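+/* Convert an rcom (recovery message) to wire byte order. The native rc_type
+   is saved first so the matching payload conversion can still be chosen
+   after the fixed fields have been swapped. */
+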
+void dlm_rcom_out(struct dlm_rcom *rc)
+{
+ struct dlm_header *hd = (struct dlm_header *) rc;
+ int type = rc->rc_type;
+
+ header_out(hd);
+
+ rc->rc_type = cpu_to_le32(rc->rc_type);
+ rc->rc_result = cpu_to_le32(rc->rc_result);
+ rc->rc_id = cpu_to_le64(rc->rc_id);
+
+ if (type == DLM_RCOM_LOCK)
+ rcom_lock_out((struct rcom_lock *) rc->rc_buf);
+
+ else if (type == DLM_RCOM_STATUS_REPLY)
+ rcom_config_out((struct rcom_config *) rc->rc_buf);
+}
+
+void dlm_rcom_in(struct dlm_rcom *rc)
+{
+ struct dlm_header *hd = (struct dlm_header *) rc;
+
+ header_in(hd);
+
+ rc->rc_type = le32_to_cpu(rc->rc_type);
+ rc->rc_result = le32_to_cpu(rc->rc_result);
+ rc->rc_id = le64_to_cpu(rc->rc_id);
+
+ if (rc->rc_type == DLM_RCOM_LOCK)
+ rcom_lock_in((struct rcom_lock *) rc->rc_buf);
+
+ else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
+ rcom_config_in((struct rcom_config *) rc->rc_buf);
+}
+
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+** Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+**
+** This copyrighted material is made available to anyone wishing to use,
+** modify, copy, or redistribute it subject to the terms and conditions
+** of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+void dlm_message_out(struct dlm_message *ms);
+void dlm_message_in(struct dlm_message *ms);
+void dlm_rcom_out(struct dlm_rcom *rc);
+void dlm_rcom_in(struct dlm_rcom *rc);
+
+#endif
+
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
new file mode 100644
index 000000000000..ca6562451eeb
--- /dev/null
+++ b/fs/ecryptfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux 2.6 eCryptfs
+#
+
+obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
+
+ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o crypto.o keystore.o debug.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
new file mode 100644
index 000000000000..ed35a9712fa1
--- /dev/null
+++ b/fs/ecryptfs/crypto.c
@@ -0,0 +1,1659 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ * Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/random.h>
+#include <linux/compiler.h>
+#include <linux/key.h>
+#include <linux/namei.h>
+#include <linux/crypto.h>
+#include <linux/file.h>
+#include <linux/scatterlist.h>
+#include "ecryptfs_kernel.h"
+
+static int
+ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *dst_page, int dst_offset,
+ struct page *src_page, int src_offset, int size,
+ unsigned char *iv);
+static int
+ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *dst_page, int dst_offset,
+ struct page *src_page, int src_offset, int size,
+ unsigned char *iv);
+
+/**
+ * ecryptfs_to_hex
+ * @dst: Buffer to take hex character representation of contents of
+ * src; must be at least of size (src_size * 2)
+ * @src: Buffer to be converted to a hex string representation
+ * @src_size: number of bytes to convert
+ */
+void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
+{
+ int x;
+
+ for (x = 0; x < src_size; x++)
+ sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
+}
+
+/**
+ * ecryptfs_from_hex
+ * @dst: Buffer to take the bytes from src hex; must be at least of
+ * size (src_size / 2)
+ * @src: Buffer to be converted from a hex string representation to raw value
+ * @dst_size: size of dst buffer, or number of hex character pairs to convert
+ */
+void ecryptfs_from_hex(char *dst, char *src, int dst_size)
+{
+ int x;
+ char tmp[3] = { 0, };
+
+ for (x = 0; x < dst_size; x++) {
+ tmp[0] = src[x * 2];
+ tmp[1] = src[x * 2 + 1];
+ dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16);
+ }
+}
+
+/**
+ * ecryptfs_calculate_md5 - calculates the md5 of @src
+ * @dst: Pointer to 16 bytes of allocated memory
+ * @crypt_stat: Pointer to crypt_stat struct for the current inode
+ * @src: Data to be md5'd
+ * @len: Length of @src
+ *
+ * Uses the allocated crypto context that crypt_stat references to
+ * generate the MD5 sum of the contents of src.
+ */
+static int ecryptfs_calculate_md5(char *dst,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ char *src, int len)
+{
+ int rc = 0;
+ struct scatterlist sg;
+
+ mutex_lock(&crypt_stat->cs_md5_tfm_mutex);
+ sg_init_one(&sg, (u8 *)src, len);
+ if (!crypt_stat->md5_tfm) {
+ crypt_stat->md5_tfm =
+ crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP);
+ if (!crypt_stat->md5_tfm) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Error attempting to "
+ "allocate crypto context\n");
+ goto out;
+ }
+ }
+ crypto_digest_init(crypt_stat->md5_tfm);
+ crypto_digest_update(crypt_stat->md5_tfm, &sg, 1);
+ crypto_digest_final(crypt_stat->md5_tfm, dst);
+ mutex_unlock(&crypt_stat->cs_md5_tfm_mutex);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_derive_iv
+ * @iv: destination for the derived iv vale
+ * @crypt_stat: Pointer to crypt_stat struct for the current inode
+ * @offset: Offset of the page whose IV we are to derive
+ *
+ * Generate the initialization vector from the given root IV and page
+ * offset.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+ pgoff_t offset)
+{
+ int rc = 0;
+ char dst[MD5_DIGEST_SIZE];
+ char src[ECRYPTFS_MAX_IV_BYTES + 16];
+
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "root iv:\n");
+ ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes);
+ }
+ /* TODO: It is probably secure to just cast the least
+ * significant bits of the root IV into an unsigned long and
+ * add the offset to that rather than go through all this
+ * hashing business. -Halcrow */
+ memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes);
+ memset((src + crypt_stat->iv_bytes), 0, 16);
+ snprintf((src + crypt_stat->iv_bytes), 16, "%ld", offset);
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "source:\n");
+ ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
+ }
+ rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
+ (crypt_stat->iv_bytes + 16));
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
+ "MD5 while generating IV for a page\n");
+ goto out;
+ }
+ memcpy(iv, dst, crypt_stat->iv_bytes);
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
+ ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
+ }
+out:
+ return rc;
+}
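+
+/* Worked illustration of the derivation above (numbers are illustrative,
+ * not from a real file): with iv_bytes = 16 and an offset value of 3,
+ * the buffer hashed is the 16 root IV bytes followed by a 16-byte field
+ * holding the ASCII string "3" and zero padding, and the derived IV is
+ * the first 16 bytes of MD5(root_iv || "3\0...\0"). Distinct offsets
+ * therefore produce unrelated per-extent IVs from the same root IV.
+ */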
+
+/**
+ * ecryptfs_init_crypt_stat
+ * @crypt_stat: Pointer to the crypt_stat struct to initialize.
+ *
+ * Initialize the crypt_stat structure.
+ */
+void
+ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
+ mutex_init(&crypt_stat->cs_mutex);
+ mutex_init(&crypt_stat->cs_tfm_mutex);
+ mutex_init(&crypt_stat->cs_md5_tfm_mutex);
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_STRUCT_INITIALIZED);
+}
+
+/**
+ * ecryptfs_destruct_crypt_stat
+ * @crypt_stat: Pointer to the crypt_stat struct to free.
+ *
+ * Releases all memory associated with a crypt_stat struct.
+ */
+void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ if (crypt_stat->tfm)
+ crypto_free_tfm(crypt_stat->tfm);
+ if (crypt_stat->md5_tfm)
+ crypto_free_tfm(crypt_stat->md5_tfm);
+ memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
+}
+
+void ecryptfs_destruct_mount_crypt_stat(
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+ if (mount_crypt_stat->global_auth_tok_key)
+ key_put(mount_crypt_stat->global_auth_tok_key);
+ if (mount_crypt_stat->global_key_tfm)
+ crypto_free_tfm(mount_crypt_stat->global_key_tfm);
+ memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat));
+}
+
+/**
+ * virt_to_scatterlist
+ * @addr: Virtual address
+ * @size: Size of data; should be an even multiple of the block size
+ * @sg: Pointer to scatterlist array; set to NULL to obtain only
+ * the number of scatterlist structs required in array
+ * @sg_size: Max array size
+ *
+ * Fills in a scatterlist array with page references for a passed
+ * virtual address.
+ *
+ * Returns the number of scatterlist structs used in the array
+ */
+int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
+ int sg_size)
+{
+ int i = 0;
+ struct page *pg;
+ int offset;
+ int remainder_of_page;
+
+ while (size > 0 && i < sg_size) {
+ pg = virt_to_page(addr);
+ offset = offset_in_page(addr);
+ if (sg) {
+ sg[i].page = pg;
+ sg[i].offset = offset;
+ }
+ remainder_of_page = PAGE_CACHE_SIZE - offset;
+ if (size >= remainder_of_page) {
+ if (sg)
+ sg[i].length = remainder_of_page;
+ addr += remainder_of_page;
+ size -= remainder_of_page;
+ } else {
+ if (sg)
+ sg[i].length = size;
+ addr += size;
+ size = 0;
+ }
+ i++;
+ }
+ if (size > 0)
+ return -ENOMEM;
+ return i;
+}
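+
+/* A typical two-pass use of the helper above (sketch only; "buf", "len",
+ * "sg", and "nents" are hypothetical locals): the first call passes a
+ * NULL array to learn how many entries are needed, and the second call
+ * fills them in:
+ *
+ *	struct scatterlist sg[ECRYPTFS_MAX_SCATTERLIST_LEN];
+ *	int nents;
+ *
+ *	nents = virt_to_scatterlist(buf, len, NULL, INT_MAX);
+ *	if (nents > 0 && nents <= ECRYPTFS_MAX_SCATTERLIST_LEN)
+ *		virt_to_scatterlist(buf, len, sg, nents);
+ */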
+
+/**
+ * encrypt_scatterlist
+ * @crypt_stat: Pointer to the crypt_stat struct to use for encryption.
+ * @dest_sg: Destination of encrypted data
+ * @src_sg: Data to be encrypted
+ * @size: Length of data to be encrypted
+ * @iv: iv to use during encryption
+ *
+ * Returns the number of bytes encrypted; negative value on error
+ */
+static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+ struct scatterlist *dest_sg,
+ struct scatterlist *src_sg, int size,
+ unsigned char *iv)
+{
+ int rc = 0;
+
+ BUG_ON(!crypt_stat || !crypt_stat->tfm
+ || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+ ECRYPTFS_STRUCT_INITIALIZED));
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+ crypt_stat->key_size);
+ ecryptfs_dump_hex(crypt_stat->key,
+ crypt_stat->key_size);
+ }
+ /* Consider doing this once, when the file is opened */
+ mutex_lock(&crypt_stat->cs_tfm_mutex);
+ rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
+ crypt_stat->key_size);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
+ rc);
+ mutex_unlock(&crypt_stat->cs_tfm_mutex);
+ rc = -EINVAL;
+ goto out;
+ }
+ ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
+ crypto_cipher_encrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size, iv);
+ rc = size;
+ mutex_unlock(&crypt_stat->cs_tfm_mutex);
+out:
+ return rc;
+}
+
+static void
+ecryptfs_extent_to_lwr_pg_idx_and_offset(unsigned long *lower_page_idx,
+ int *byte_offset,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ unsigned long extent_num)
+{
+ unsigned long lower_extent_num;
+ int extents_occupied_by_headers_at_front;
+ int bytes_occupied_by_headers_at_front;
+ int extent_offset;
+ int extents_per_page;
+
+ bytes_occupied_by_headers_at_front =
+ ( crypt_stat->header_extent_size
+ * crypt_stat->num_header_extents_at_front );
+ extents_occupied_by_headers_at_front =
+ ( bytes_occupied_by_headers_at_front
+ / crypt_stat->extent_size );
+ lower_extent_num = extents_occupied_by_headers_at_front + extent_num;
+ extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
+ (*lower_page_idx) = lower_extent_num / extents_per_page;
+ extent_offset = lower_extent_num % extents_per_page;
+ (*byte_offset) = extent_offset * crypt_stat->extent_size;
+ ecryptfs_printk(KERN_DEBUG, " * crypt_stat->header_extent_size = "
+ "[%d]\n", crypt_stat->header_extent_size);
+ ecryptfs_printk(KERN_DEBUG, " * crypt_stat->"
+ "num_header_extents_at_front = [%d]\n",
+ crypt_stat->num_header_extents_at_front);
+ ecryptfs_printk(KERN_DEBUG, " * extents_occupied_by_headers_at_"
+ "front = [%d]\n", extents_occupied_by_headers_at_front);
+ ecryptfs_printk(KERN_DEBUG, " * lower_extent_num = [0x%.16x]\n",
+ lower_extent_num);
+ ecryptfs_printk(KERN_DEBUG, " * extents_per_page = [%d]\n",
+ extents_per_page);
+ ecryptfs_printk(KERN_DEBUG, " * (*lower_page_idx) = [0x%.16x]\n",
+ (*lower_page_idx));
+ ecryptfs_printk(KERN_DEBUG, " * extent_offset = [%d]\n",
+ extent_offset);
+ ecryptfs_printk(KERN_DEBUG, " * (*byte_offset) = [%d]\n",
+ (*byte_offset));
+}
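+
+/* Worked example of the mapping above, assuming the defaults on a host
+ * with 4096-byte pages (extent_size 4096, header_extent_size 8192, one
+ * header extent at the front): the headers occupy 8192 / 4096 = 2 data
+ * extents and each lower page holds exactly one extent, so data extent N
+ * lands in lower page (N + 2) at byte offset 0. If the same file is read
+ * on a host with 32768-byte pages, each lower page holds 8 extents, so
+ * data extent 0 instead lands in lower page 0 at byte offset 8192.
+ */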
+
+static int ecryptfs_write_out_page(struct ecryptfs_page_crypt_context *ctx,
+ struct page *lower_page,
+ struct inode *lower_inode,
+ int byte_offset_in_page, int bytes_to_write)
+{
+ int rc = 0;
+
+ if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
+ rc = ecryptfs_commit_lower_page(lower_page, lower_inode,
+ ctx->param.lower_file,
+ byte_offset_in_page,
+ bytes_to_write);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error calling lower "
+ "commit; rc = [%d]\n", rc);
+ goto out;
+ }
+ } else {
+ rc = ecryptfs_writepage_and_release_lower_page(lower_page,
+ lower_inode,
+ ctx->param.wbc);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error calling lower "
+ "writepage(); rc = [%d]\n", rc);
+ goto out;
+ }
+ }
+out:
+ return rc;
+}
+
+static int ecryptfs_read_in_page(struct ecryptfs_page_crypt_context *ctx,
+ struct page **lower_page,
+ struct inode *lower_inode,
+ unsigned long lower_page_idx,
+ int byte_offset_in_page)
+{
+ int rc = 0;
+
+ if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
+ /* TODO: Limit this to only the data extents that are
+ * needed */
+ rc = ecryptfs_get_lower_page(lower_page, lower_inode,
+ ctx->param.lower_file,
+ lower_page_idx,
+ byte_offset_in_page,
+ (PAGE_CACHE_SIZE
+ - byte_offset_in_page));
+ if (rc) {
+ ecryptfs_printk(
+ KERN_ERR, "Error attempting to grab, map, "
+ "and prepare_write lower page with index "
+ "[0x%.16x]; rc = [%d]\n", lower_page_idx, rc);
+ goto out;
+ }
+ } else {
+ rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL,
+ lower_inode,
+ lower_page_idx);
+ if (rc) {
+ ecryptfs_printk(
+ KERN_ERR, "Error attempting to grab and map "
+ "lower page with index [0x%.16x]; rc = [%d]\n",
+ lower_page_idx, rc);
+ goto out;
+ }
+ }
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_encrypt_page
+ * @ctx: The context of the page
+ *
+ * Encrypt an eCryptfs page. This is done on a per-extent basis. Note
+ * that eCryptfs pages may straddle the lower pages -- for instance,
+ * if the file was created on a machine with an 8K page size
+ * (resulting in an 8K header), and then the file is copied onto a
+ * host with a 32K page size, then when reading page 0 of the eCryptfs
+ * file, 24K of page 0 of the lower file will be read and decrypted,
+ * and then 8K of page 1 of the lower file will be read and decrypted.
+ *
+ * The actual operations performed on each page depend on the
+ * contents of the ecryptfs_page_crypt_context struct.
+ *
+ * Returns zero on success; negative on error
+ */
+int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx)
+{
+ char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+ unsigned long base_extent;
+ unsigned long extent_offset = 0;
+ unsigned long lower_page_idx = 0;
+ unsigned long prior_lower_page_idx = 0;
+ struct page *lower_page;
+ struct inode *lower_inode;
+ struct ecryptfs_inode_info *inode_info;
+ struct ecryptfs_crypt_stat *crypt_stat;
+ int rc = 0;
+ int lower_byte_offset = 0;
+ int orig_byte_offset = 0;
+ int num_extents_per_page;
+#define ECRYPTFS_PAGE_STATE_UNREAD 0
+#define ECRYPTFS_PAGE_STATE_READ 1
+#define ECRYPTFS_PAGE_STATE_MODIFIED 2
+#define ECRYPTFS_PAGE_STATE_WRITTEN 3
+ int page_state;
+
+ lower_inode = ecryptfs_inode_to_lower(ctx->page->mapping->host);
+ inode_info = ecryptfs_inode_to_private(ctx->page->mapping->host);
+ crypt_stat = &inode_info->crypt_stat;
+ if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
+ rc = ecryptfs_copy_page_to_lower(ctx->page, lower_inode,
+ ctx->param.lower_file);
+ if (rc)
+ ecryptfs_printk(KERN_ERR, "Error attempting to copy "
+ "page at index [0x%.16x]\n",
+ ctx->page->index);
+ goto out;
+ }
+ num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
+ base_extent = (ctx->page->index * num_extents_per_page);
+ page_state = ECRYPTFS_PAGE_STATE_UNREAD;
+ while (extent_offset < num_extents_per_page) {
+ ecryptfs_extent_to_lwr_pg_idx_and_offset(
+ &lower_page_idx, &lower_byte_offset, crypt_stat,
+ (base_extent + extent_offset));
+ if (prior_lower_page_idx != lower_page_idx
+ && page_state == ECRYPTFS_PAGE_STATE_MODIFIED) {
+ rc = ecryptfs_write_out_page(ctx, lower_page,
+ lower_inode,
+ orig_byte_offset,
+ (PAGE_CACHE_SIZE
+ - orig_byte_offset));
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error attempting "
+ "to write out page; rc = [%d]"
+ "\n", rc);
+ goto out;
+ }
+ page_state = ECRYPTFS_PAGE_STATE_WRITTEN;
+ }
+ if (page_state == ECRYPTFS_PAGE_STATE_UNREAD
+ || page_state == ECRYPTFS_PAGE_STATE_WRITTEN) {
+ rc = ecryptfs_read_in_page(ctx, &lower_page,
+ lower_inode, lower_page_idx,
+ lower_byte_offset);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error attempting "
+ "to read in lower page with "
+ "index [0x%.16x]; rc = [%d]\n",
+ lower_page_idx, rc);
+ goto out;
+ }
+ orig_byte_offset = lower_byte_offset;
+ prior_lower_page_idx = lower_page_idx;
+ page_state = ECRYPTFS_PAGE_STATE_READ;
+ }
+ BUG_ON(!(page_state == ECRYPTFS_PAGE_STATE_MODIFIED
+ || page_state == ECRYPTFS_PAGE_STATE_READ));
+ rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
+ (base_extent + extent_offset));
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error attempting to "
+ "derive IV for extent [0x%.16x]; "
+ "rc = [%d]\n",
+ (base_extent + extent_offset), rc);
+ goto out;
+ }
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
+ "with iv:\n");
+ ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
+ ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
+ "encryption:\n");
+ ecryptfs_dump_hex((char *)
+ (page_address(ctx->page)
+ + (extent_offset
+ * crypt_stat->extent_size)), 8);
+ }
+ rc = ecryptfs_encrypt_page_offset(
+ crypt_stat, lower_page, lower_byte_offset, ctx->page,
+ (extent_offset * crypt_stat->extent_size),
+ crypt_stat->extent_size, extent_iv);
+ ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
+ "rc = [%d]\n",
+ (base_extent + extent_offset), rc);
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
+ "encryption:\n");
+ ecryptfs_dump_hex((char *)(page_address(lower_page)
+ + lower_byte_offset), 8);
+ }
+ page_state = ECRYPTFS_PAGE_STATE_MODIFIED;
+ extent_offset++;
+ }
+ BUG_ON(orig_byte_offset != 0);
+ rc = ecryptfs_write_out_page(ctx, lower_page, lower_inode, 0,
+ (lower_byte_offset
+ + crypt_stat->extent_size));
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error attempting to write out "
+ "page; rc = [%d]\n", rc);
+ goto out;
+ }
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_decrypt_page
+ * @file: The ecryptfs file
+ * @page: The page in ecryptfs to decrypt
+ *
+ * Decrypt an eCryptfs page. This is done on a per-extent basis. Note
+ * that eCryptfs pages may straddle the lower pages -- for instance,
+ * if the file was created on a machine with an 8K page size
+ * (resulting in an 8K header), and then the file is copied onto a
+ * host with a 32K page size, then when reading page 0 of the eCryptfs
+ * file, 24K of page 0 of the lower file will be read and decrypted,
+ * and then 8K of page 1 of the lower file will be read and decrypted.
+ *
+ * Returns zero on success; negative on error
+ */
+int ecryptfs_decrypt_page(struct file *file, struct page *page)
+{
+ char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+ unsigned long base_extent;
+ unsigned long extent_offset = 0;
+ unsigned long lower_page_idx = 0;
+ unsigned long prior_lower_page_idx = 0;
+ struct page *lower_page;
+ char *lower_page_virt = NULL;
+ struct inode *lower_inode;
+ struct ecryptfs_crypt_stat *crypt_stat;
+ int rc = 0;
+ int byte_offset;
+ int num_extents_per_page;
+ int page_state;
+
+ crypt_stat = &(ecryptfs_inode_to_private(
+ page->mapping->host)->crypt_stat);
+ lower_inode = ecryptfs_inode_to_lower(page->mapping->host);
+ if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
+ rc = ecryptfs_do_readpage(file, page, page->index);
+ if (rc)
+ ecryptfs_printk(KERN_ERR, "Error attempting to copy "
+ "page at index [0x%.16x]\n",
+ page->index);
+ goto out;
+ }
+ num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
+ base_extent = (page->index * num_extents_per_page);
+ lower_page_virt = kmem_cache_alloc(ecryptfs_lower_page_cache,
+ SLAB_KERNEL);
+ if (!lower_page_virt) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Error getting page for encrypted "
+ "lower page(s)\n");
+ goto out;
+ }
+ lower_page = virt_to_page(lower_page_virt);
+ page_state = ECRYPTFS_PAGE_STATE_UNREAD;
+ while (extent_offset < num_extents_per_page) {
+ ecryptfs_extent_to_lwr_pg_idx_and_offset(
+ &lower_page_idx, &byte_offset, crypt_stat,
+ (base_extent + extent_offset));
+ if (prior_lower_page_idx != lower_page_idx
+ || page_state == ECRYPTFS_PAGE_STATE_UNREAD) {
+ rc = ecryptfs_do_readpage(file, lower_page,
+ lower_page_idx);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error reading "
+ "lower encrypted page; rc = "
+ "[%d]\n", rc);
+ goto out;
+ }
+ prior_lower_page_idx = lower_page_idx;
+ page_state = ECRYPTFS_PAGE_STATE_READ;
+ }
+ rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
+ (base_extent + extent_offset));
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error attempting to "
+ "derive IV for extent [0x%.16x]; rc = "
+ "[%d]\n",
+ (base_extent + extent_offset), rc);
+ goto out;
+ }
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
+ "with iv:\n");
+ ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
+ ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
+ "decryption:\n");
+ ecryptfs_dump_hex((lower_page_virt + byte_offset), 8);
+ }
+ rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
+ (extent_offset
+ * crypt_stat->extent_size),
+ lower_page, byte_offset,
+ crypt_stat->extent_size,
+ extent_iv);
+ if (rc != crypt_stat->extent_size) {
+ ecryptfs_printk(KERN_ERR, "Error attempting to "
+ "decrypt extent [0x%.16x]\n",
+ (base_extent + extent_offset));
+ goto out;
+ }
+ rc = 0;
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
+ "decryption:\n");
+ ecryptfs_dump_hex((char *)(page_address(page)
+ + byte_offset), 8);
+ }
+ extent_offset++;
+ }
+out:
+ if (lower_page_virt)
+ kmem_cache_free(ecryptfs_lower_page_cache, lower_page_virt);
+ return rc;
+}
+
+/**
+ * decrypt_scatterlist
+ *
+ * Returns the number of bytes decrypted; negative value on error
+ */
+static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+ struct scatterlist *dest_sg,
+ struct scatterlist *src_sg, int size,
+ unsigned char *iv)
+{
+ int rc = 0;
+
+ /* Consider doing this once, when the file is opened */
+ mutex_lock(&crypt_stat->cs_tfm_mutex);
+ rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
+ crypt_stat->key_size);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
+ rc);
+ mutex_unlock(&crypt_stat->cs_tfm_mutex);
+ rc = -EINVAL;
+ goto out;
+ }
+ ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
+ rc = crypto_cipher_decrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size,
+ iv);
+ mutex_unlock(&crypt_stat->cs_tfm_mutex);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n",
+ rc);
+ goto out;
+ }
+ rc = size;
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_encrypt_page_offset
+ *
+ * Returns the number of bytes encrypted
+ */
+static int
+ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *dst_page, int dst_offset,
+ struct page *src_page, int src_offset, int size,
+ unsigned char *iv)
+{
+ struct scatterlist src_sg, dst_sg;
+
+ src_sg.page = src_page;
+ src_sg.offset = src_offset;
+ src_sg.length = size;
+ dst_sg.page = dst_page;
+ dst_sg.offset = dst_offset;
+ dst_sg.length = size;
+ return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
+}
+
+/**
+ * ecryptfs_decrypt_page_offset
+ *
+ * Returns the number of bytes decrypted
+ */
+static int
+ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+ struct page *dst_page, int dst_offset,
+ struct page *src_page, int src_offset, int size,
+ unsigned char *iv)
+{
+ struct scatterlist src_sg, dst_sg;
+
+ src_sg.page = src_page;
+ src_sg.offset = src_offset;
+ src_sg.length = size;
+ dst_sg.page = dst_page;
+ dst_sg.offset = dst_offset;
+ dst_sg.length = size;
+ return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
+}
+
+#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
+
+/**
+ * ecryptfs_init_crypt_ctx
+ * @crypt_stat: Uninitialized crypt_stat structure
+ *
+ * Initialize the crypto context.
+ *
+ * TODO: Performance: Keep a cache of initialized cipher contexts;
+ * only init if needed
+ */
+int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ int rc = -EINVAL;
+
+ if (!crypt_stat->cipher) {
+ ecryptfs_printk(KERN_ERR, "No cipher specified\n");
+ goto out;
+ }
+ ecryptfs_printk(KERN_DEBUG,
+ "Initializing cipher [%s]; strlen = [%d]; "
+ "key_size_bits = [%d]\n",
+ crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
+ crypt_stat->key_size << 3);
+ if (crypt_stat->tfm) {
+ rc = 0;
+ goto out;
+ }
+ mutex_lock(&crypt_stat->cs_tfm_mutex);
+ crypt_stat->tfm = crypto_alloc_tfm(crypt_stat->cipher,
+ ECRYPTFS_DEFAULT_CHAINING_MODE
+ | CRYPTO_TFM_REQ_WEAK_KEY);
+ mutex_unlock(&crypt_stat->cs_tfm_mutex);
+ if (!crypt_stat->tfm) {
+ ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
+ "Error initializing cipher [%s]\n",
+ crypt_stat->cipher);
+ goto out;
+ }
+ rc = 0;
+out:
+ return rc;
+}
+
+static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ int extent_size_tmp;
+
+ crypt_stat->extent_mask = 0xFFFFFFFF;
+ crypt_stat->extent_shift = 0;
+ if (crypt_stat->extent_size == 0)
+ return;
+ extent_size_tmp = crypt_stat->extent_size;
+ while ((extent_size_tmp & 0x01) == 0) {
+ extent_size_tmp >>= 1;
+ crypt_stat->extent_mask <<= 1;
+ crypt_stat->extent_shift++;
+ }
+}
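+
+/* With the default 4096-byte extent size, the loop above leaves
+ * extent_shift at 12 and extent_mask at 0xFFFFF000, so a byte offset
+ * splits into an extent index (offset >> extent_shift) and an offset
+ * within the extent (offset & ~extent_mask).
+ */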
+
+void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ /* Default values; may be overwritten as we are parsing the
+ * packets. */
+ crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
+ set_extent_mask_and_shift(crypt_stat);
+ crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
+ if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
+ crypt_stat->header_extent_size =
+ ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+ } else
+ crypt_stat->header_extent_size = PAGE_CACHE_SIZE;
+ crypt_stat->num_header_extents_at_front = 1;
+}
+
+/**
+ * ecryptfs_compute_root_iv
+ * @crypt_stat: The cryptographic context
+ *
+ * On error, sets the root IV to all 0's.
+ */
+int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ int rc = 0;
+ char dst[MD5_DIGEST_SIZE];
+
+ BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
+ BUG_ON(crypt_stat->iv_bytes <= 0);
+ if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID)) {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_WARNING, "Session key not valid; "
+ "cannot generate root IV\n");
+ goto out;
+ }
+ rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
+ crypt_stat->key_size);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
+ "MD5 while generating root IV\n");
+ goto out;
+ }
+ memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
+out:
+ if (rc) {
+ memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
+ ECRYPTFS_SET_FLAG(crypt_stat->flags,
+ ECRYPTFS_SECURITY_WARNING);
+ }
+ return rc;
+}
+
+static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ get_random_bytes(crypt_stat->key, crypt_stat->key_size);
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+ ecryptfs_compute_root_iv(crypt_stat);
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n");
+ ecryptfs_dump_hex(crypt_stat->key,
+ crypt_stat->key_size);
+ }
+}
+
+/**
+ * ecryptfs_set_default_crypt_stat_vals
+ * @crypt_stat: The cryptographic context to set to default values
+ * @mount_crypt_stat: The mount-wide cryptographic context to associate
+ *
+ * Default values in the event that policy does not override them.
+ */
+static void ecryptfs_set_default_crypt_stat_vals(
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+ ecryptfs_set_default_sizes(crypt_stat);
+ strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER);
+ crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES;
+ ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+ crypt_stat->file_version = ECRYPTFS_FILE_VERSION;
+ crypt_stat->mount_crypt_stat = mount_crypt_stat;
+}
+
+/**
+ * ecryptfs_new_file_context
+ * @ecryptfs_dentry: The eCryptfs dentry for the new file
+ *
+ * If the crypto context for the file has not yet been established,
+ * this is where we do that. Establishing a new crypto context
+ * involves the following decisions:
+ * - What cipher to use?
+ * - What set of authentication tokens to use?
+ * Here we just worry about getting enough information into the
+ * authentication tokens so that we know that they are available.
+ * We associate the available authentication tokens with the new file
+ * via the set of signatures in the crypt_stat struct. Later, when
+ * the headers are actually written out, we may again defer to
+ * userspace to perform the encryption of the session key; for the
+ * foreseeable future, this will be the case with public key packets.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+/* Associate an authentication token(s) with the file */
+int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry)
+{
+ int rc = 0;
+ struct ecryptfs_crypt_stat *crypt_stat =
+ &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+ &ecryptfs_superblock_to_private(
+ ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ int cipher_name_len;
+
+ ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat);
+ /* See if there are mount crypt options */
+ if (mount_crypt_stat->global_auth_tok) {
+ ecryptfs_printk(KERN_DEBUG, "Initializing context for new "
+ "file using mount_crypt_stat\n");
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+ memcpy(crypt_stat->keysigs[crypt_stat->num_keysigs++],
+ mount_crypt_stat->global_auth_tok_sig,
+ ECRYPTFS_SIG_SIZE_HEX);
+ cipher_name_len =
+ strlen(mount_crypt_stat->global_default_cipher_name);
+ memcpy(crypt_stat->cipher,
+ mount_crypt_stat->global_default_cipher_name,
+ cipher_name_len);
+ crypt_stat->cipher[cipher_name_len] = '\0';
+ crypt_stat->key_size =
+ mount_crypt_stat->global_default_cipher_key_size;
+ ecryptfs_generate_new_key(crypt_stat);
+ } else
+ /* We should not encounter this scenario since we
+ * should detect lack of global_auth_tok at mount time
+ * TODO: Applies to 0.1 release only; remove in future
+ * release */
+ BUG();
+ rc = ecryptfs_init_crypt_ctx(crypt_stat);
+ if (rc)
+ ecryptfs_printk(KERN_ERR, "Error initializing cryptographic "
+ "context for cipher [%s]: rc = [%d]\n",
+ crypt_stat->cipher, rc);
+ return rc;
+}
+
+/**
+ * contains_ecryptfs_marker - check for the ecryptfs marker
+ * @data: The data block in which to check
+ *
+ * Returns one if marker found; zero if not found
+ */
+int contains_ecryptfs_marker(char *data)
+{
+ u32 m_1, m_2;
+
+ memcpy(&m_1, data, 4);
+ m_1 = be32_to_cpu(m_1);
+ memcpy(&m_2, (data + 4), 4);
+ m_2 = be32_to_cpu(m_2);
+ if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
+ return 1;
+ ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
+ "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
+ MAGIC_ECRYPTFS_MARKER);
+ ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
+ "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
+ return 0;
+}
+
+struct ecryptfs_flag_map_elem {
+ u32 file_flag;
+ u32 local_flag;
+};
+
+/* Add support for additional flags by adding elements here. */
+static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
+ {0x00000001, ECRYPTFS_ENABLE_HMAC},
+ {0x00000002, ECRYPTFS_ENCRYPTED}
+};
+
+/**
+ * ecryptfs_process_flags
+ * @crypt_stat
+ * @page_virt: Source data to be parsed
+ * @bytes_read: Updated with the number of bytes read
+ *
+ * Returns zero on success; non-zero if the flag set is invalid
+ */
+static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
+ char *page_virt, int *bytes_read)
+{
+ int rc = 0;
+ int i;
+ u32 flags;
+
+ memcpy(&flags, page_virt, 4);
+ flags = be32_to_cpu(flags);
+ for (i = 0; i < ((sizeof(ecryptfs_flag_map)
+ / sizeof(struct ecryptfs_flag_map_elem))); i++)
+ if (flags & ecryptfs_flag_map[i].file_flag) {
+ ECRYPTFS_SET_FLAG(crypt_stat->flags,
+ ecryptfs_flag_map[i].local_flag);
+ } else
+ ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
+ ecryptfs_flag_map[i].local_flag);
+ /* Version is in top 8 bits of the 32-bit flag vector */
+ crypt_stat->file_version = ((flags >> 24) & 0xFF);
+ (*bytes_read) = 4;
+ return rc;
+}
+
+/**
+ * write_ecryptfs_marker
+ * @page_virt: The pointer into the page at which to begin writing the marker
+ * @written: Number of bytes written
+ *
+ * Marker = 0x3c81b7f5
+ */
+static void write_ecryptfs_marker(char *page_virt, size_t *written)
+{
+ u32 m_1, m_2;
+
+ get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+ m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
+ m_1 = cpu_to_be32(m_1);
+ memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+ m_2 = cpu_to_be32(m_2);
+ memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2,
+ (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+ (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
+}
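+
+/* Example of the marker scheme above with an arbitrary random draw: if
+ * get_random_bytes() yields m_1 = 0x11223344, then m_2 = 0x11223344 ^
+ * 0x3c81b7f5 = 0x2da384b1, and both words are stored big-endian.
+ * contains_ecryptfs_marker() later recomputes m_1 ^ MAGIC_ECRYPTFS_MARKER
+ * and accepts the file when the result equals m_2, so the on-disk marker
+ * differs from file to file while remaining verifiable.
+ */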
+
+static void
+write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
+ size_t *written)
+{
+ u32 flags = 0;
+ int i;
+
+ for (i = 0; i < ((sizeof(ecryptfs_flag_map)
+ / sizeof(struct ecryptfs_flag_map_elem))); i++)
+ if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+ ecryptfs_flag_map[i].local_flag))
+ flags |= ecryptfs_flag_map[i].file_flag;
+ /* Version is in top 8 bits of the 32-bit flag vector */
+ flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
+ flags = cpu_to_be32(flags);
+ memcpy(page_virt, &flags, 4);
+ (*written) = 4;
+}
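+
+/* Worked example of the flag word above for a newly created file in this
+ * release (ECRYPTFS_ENCRYPTED set, ECRYPTFS_ENABLE_HMAC clear,
+ * file_version 1): the assembled value is 0x01000000 | 0x00000002 =
+ * 0x01000002, written big-endian as the octets 01 00 00 02;
+ * ecryptfs_process_flags() recovers the version as (flags >> 24) & 0xFF.
+ */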
+
+struct ecryptfs_cipher_code_str_map_elem {
+ char cipher_str[16];
+ u16 cipher_code;
+};
+
+/* Add support for additional ciphers by adding elements here. The
+ * cipher_code is whatever OpenPGP applications use to identify the
+ * ciphers. List in order of probability. */
+static struct ecryptfs_cipher_code_str_map_elem
+ecryptfs_cipher_code_str_map[] = {
+ {"aes",RFC2440_CIPHER_AES_128 },
+ {"blowfish", RFC2440_CIPHER_BLOWFISH},
+ {"des3_ede", RFC2440_CIPHER_DES3_EDE},
+ {"cast5", RFC2440_CIPHER_CAST_5},
+ {"twofish", RFC2440_CIPHER_TWOFISH},
+ {"cast6", RFC2440_CIPHER_CAST_6},
+ {"aes", RFC2440_CIPHER_AES_192},
+ {"aes", RFC2440_CIPHER_AES_256}
+};
+
+/**
+ * ecryptfs_code_for_cipher_string
+ * @crypt_stat: The cryptographic context providing the cipher name and key size
+ *
+ * Returns zero on no match, or the cipher code on match
+ */
+u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ int i;
+ u16 code = 0;
+ struct ecryptfs_cipher_code_str_map_elem *map =
+ ecryptfs_cipher_code_str_map;
+
+ if (strcmp(crypt_stat->cipher, "aes") == 0) {
+ switch (crypt_stat->key_size) {
+ case 16:
+ code = RFC2440_CIPHER_AES_128;
+ break;
+ case 24:
+ code = RFC2440_CIPHER_AES_192;
+ break;
+ case 32:
+ code = RFC2440_CIPHER_AES_256;
+ }
+ } else {
+ for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
+ if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
+ code = map[i].cipher_code;
+ break;
+ }
+ }
+ return code;
+}
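+
+/* For example, cipher "aes" with a 16-byte key maps to
+ * RFC2440_CIPHER_AES_128 (0x07), a 32-byte AES key maps to
+ * RFC2440_CIPHER_AES_256 (0x09), and "blowfish" maps to
+ * RFC2440_CIPHER_BLOWFISH (0x04) regardless of key size; an unrecognized
+ * cipher name yields 0.
+ */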
+
+/**
+ * ecryptfs_cipher_code_to_string
+ * @str: Destination to write out the cipher name
+ * @cipher_code: The code to convert to cipher name string
+ *
+ * Returns zero on success
+ */
+int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code)
+{
+ int rc = 0;
+ int i;
+
+ str[0] = '\0';
+ for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
+ if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code)
+ strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str);
+ if (str[0] == '\0') {
+ ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: "
+ "[%d]\n", cipher_code);
+ rc = -EINVAL;
+ }
+ return rc;
+}
+
+/**
+ * ecryptfs_read_header_region
+ * @data: Buffer into which to read the first extent-sized region of header data
+ * @dentry: The dentry to open for reading the header region
+ * @mnt: The vfsmount on which the dentry resides
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_read_header_region(char *data, struct dentry *dentry,
+ struct vfsmount *mnt)
+{
+ struct file *file;
+ mm_segment_t oldfs;
+ int rc;
+
+ mnt = mntget(mnt);
+ file = dentry_open(dentry, mnt, O_RDONLY);
+ if (IS_ERR(file)) {
+ ecryptfs_printk(KERN_DEBUG, "Error opening file to "
+ "read header region\n");
+ mntput(mnt);
+ rc = PTR_ERR(file);
+ goto out;
+ }
+ file->f_pos = 0;
+ oldfs = get_fs();
+ set_fs(get_ds());
+ /* For releases 0.1 and 0.2, all of the header information
+ * fits in the first data extent-sized region. */
+ rc = file->f_op->read(file, (char __user *)data,
+ ECRYPTFS_DEFAULT_EXTENT_SIZE, &file->f_pos);
+ set_fs(oldfs);
+ fput(file);
+ rc = 0;
+out:
+ return rc;
+}
+
+static void
+write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat,
+ size_t *written)
+{
+ u32 header_extent_size;
+ u16 num_header_extents_at_front;
+
+ header_extent_size = (u32)crypt_stat->header_extent_size;
+ num_header_extents_at_front =
+ (u16)crypt_stat->num_header_extents_at_front;
+ header_extent_size = cpu_to_be32(header_extent_size);
+ memcpy(virt, &header_extent_size, 4);
+ virt += 4;
+ num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front);
+ memcpy(virt, &num_header_extents_at_front, 2);
+ (*written) = 6;
+}
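+
+/* Worked example of the metadata written above, assuming the defaults
+ * from ecryptfs_set_default_sizes() on a host with 4096-byte pages
+ * (header_extent_size 8192, one header extent at the front): the six
+ * bytes emitted are 00 00 20 00 00 01, which occupy octets 20-25 of the
+ * header extent per the layout documented at ecryptfs_write_headers_virt().
+ */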
+
+struct kmem_cache *ecryptfs_header_cache_0;
+struct kmem_cache *ecryptfs_header_cache_1;
+struct kmem_cache *ecryptfs_header_cache_2;
+
+/**
+ * ecryptfs_write_headers_virt
+ * @page_virt: The virtual address at which to write out the headers
+ * @crypt_stat: The cryptographic context
+ * @ecryptfs_dentry: The eCryptfs dentry
+ *
+ * Format version: 1
+ *
+ * Header Extent:
+ * Octets 0-7: Unencrypted file size (big-endian)
+ * Octets 8-15: eCryptfs special marker
+ * Octets 16-19: Flags
+ * Octet 16: File format version number (between 0 and 255)
+ * Octets 17-18: Reserved
+ * Octet 19: Bit 1 (lsb): Reserved
+ * Bit 2: Encrypted?
+ * Bits 3-8: Reserved
+ * Octets 20-23: Header extent size (big-endian)
+ * Octets 24-25: Number of header extents at front of file
+ * (big-endian)
+ * Octet 26: Begin RFC 2440 authentication token packet set
+ * Data Extent 0:
+ * Lower data (CBC encrypted)
+ * Data Extent 1:
+ * Lower data (CBC encrypted)
+ * ...
+ *
+ * Returns zero on success
+ */
+int ecryptfs_write_headers_virt(char *page_virt,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct dentry *ecryptfs_dentry)
+{
+ int rc;
+ size_t written;
+ size_t offset;
+
+ offset = ECRYPTFS_FILE_SIZE_BYTES;
+ write_ecryptfs_marker((page_virt + offset), &written);
+ offset += written;
+ write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
+ offset += written;
+ write_header_metadata((page_virt + offset), crypt_stat, &written);
+ offset += written;
+ rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
+ ecryptfs_dentry, &written,
+ PAGE_CACHE_SIZE - offset);
+ if (rc)
+ ecryptfs_printk(KERN_WARNING, "Error generating key packet "
+ "set; rc = [%d]\n", rc);
+ return rc;
+}
+
+/**
+ * ecryptfs_write_headers
+ * @ecryptfs_dentry: The eCryptfs dentry
+ * @lower_file: The lower file struct, which was returned from dentry_open
+ *
+ * Write the file headers out. This will likely involve a userspace
+ * callout, in which the session key is encrypted with one or more
+ * public keys and/or the passphrase necessary to do the encryption is
+ * retrieved via a prompt. Exactly what happens at this point should
+ * be policy-dependent.
+ *
+ * Returns zero on success; non-zero on error
+ */
+int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
+ struct file *lower_file)
+{
+ mm_segment_t oldfs;
+ struct ecryptfs_crypt_stat *crypt_stat;
+ char *page_virt;
+ int current_header_page;
+ int header_pages;
+ int rc = 0;
+
+ crypt_stat = &ecryptfs_inode_to_private(
+ ecryptfs_dentry->d_inode)->crypt_stat;
+ if (likely(ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+ ECRYPTFS_ENCRYPTED))) {
+ if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+ ECRYPTFS_KEY_VALID)) {
+ ecryptfs_printk(KERN_DEBUG, "Key is "
+ "invalid; bailing out\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ } else {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_WARNING,
+ "Called with crypt_stat->encrypted == 0\n");
+ goto out;
+ }
+ /* Released in this function */
+ page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, SLAB_USER);
+ if (!page_virt) {
+ ecryptfs_printk(KERN_ERR, "Out of memory\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ memset(page_virt, 0, PAGE_CACHE_SIZE);
+ rc = ecryptfs_write_headers_virt(page_virt, crypt_stat,
+ ecryptfs_dentry);
+ if (unlikely(rc)) {
+ ecryptfs_printk(KERN_ERR, "Error whilst writing headers\n");
+ memset(page_virt, 0, PAGE_CACHE_SIZE);
+ goto out_free;
+ }
+ ecryptfs_printk(KERN_DEBUG,
+ "Writing key packet set to underlying file\n");
+ lower_file->f_pos = 0;
+ oldfs = get_fs();
+ set_fs(get_ds());
+ ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
+ "write() w/ header page; lower_file->f_pos = "
+ "[0x%.16x]\n", lower_file->f_pos);
+ lower_file->f_op->write(lower_file, (char __user *)page_virt,
+ PAGE_CACHE_SIZE, &lower_file->f_pos);
+ header_pages = ((crypt_stat->header_extent_size
+ * crypt_stat->num_header_extents_at_front)
+ / PAGE_CACHE_SIZE);
+ memset(page_virt, 0, PAGE_CACHE_SIZE);
+ current_header_page = 1;
+ while (current_header_page < header_pages) {
+ ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
+ "write() w/ zero'd page; lower_file->f_pos = "
+ "[0x%.16x]\n", lower_file->f_pos);
+ lower_file->f_op->write(lower_file, (char __user *)page_virt,
+ PAGE_CACHE_SIZE, &lower_file->f_pos);
+ current_header_page++;
+ }
+ set_fs(oldfs);
+ ecryptfs_printk(KERN_DEBUG,
+ "Done writing key packet set to underlying file.\n");
+out_free:
+ kmem_cache_free(ecryptfs_header_cache_0, page_virt);
+out:
+ return rc;
+}
+
+static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
+ char *virt, int *bytes_read)
+{
+ int rc = 0;
+ u32 header_extent_size;
+ u16 num_header_extents_at_front;
+
+ memcpy(&header_extent_size, virt, 4);
+ header_extent_size = be32_to_cpu(header_extent_size);
+ virt += 4;
+ memcpy(&num_header_extents_at_front, virt, 2);
+ num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front);
+ crypt_stat->header_extent_size = (int)header_extent_size;
+ crypt_stat->num_header_extents_at_front =
+ (int)num_header_extents_at_front;
+ (*bytes_read) = 6;
+ if ((crypt_stat->header_extent_size
+ * crypt_stat->num_header_extents_at_front)
+ < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_WARNING, "Invalid header extent size: "
+ "[%d]\n", crypt_stat->header_extent_size);
+ }
+ return rc;
+}
+
+/**
+ * set_default_header_data
+ *
+ * For version 0 file format; this function is only for backwards
+ * compatibility for files created with the prior versions of
+ * eCryptfs.
+ */
+static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
+{
+ crypt_stat->header_extent_size = 4096;
+ crypt_stat->num_header_extents_at_front = 1;
+}
+
+/**
+ * ecryptfs_read_headers_virt
+ *
+ * Read/parse the header data. The header format is detailed in the
+ * comment block for the ecryptfs_write_headers_virt() function.
+ *
+ * Returns zero on success
+ */
+static int ecryptfs_read_headers_virt(char *page_virt,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct dentry *ecryptfs_dentry)
+{
+ int rc = 0;
+ int offset;
+ int bytes_read;
+
+ ecryptfs_set_default_sizes(crypt_stat);
+ crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
+ ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ offset = ECRYPTFS_FILE_SIZE_BYTES;
+ rc = contains_ecryptfs_marker(page_virt + offset);
+ if (rc == 0) {
+ rc = -EINVAL;
+ goto out;
+ }
+ offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
+ rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
+ &bytes_read);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error processing flags\n");
+ goto out;
+ }
+ if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) {
+ ecryptfs_printk(KERN_WARNING, "File version is [%d]; only "
+ "file version [%d] is supported by this "
+ "version of eCryptfs\n",
+ crypt_stat->file_version,
+ ECRYPTFS_SUPPORTED_FILE_VERSION);
+ rc = -EINVAL;
+ goto out;
+ }
+ offset += bytes_read;
+ if (crypt_stat->file_version >= 1) {
+ rc = parse_header_metadata(crypt_stat, (page_virt + offset),
+ &bytes_read);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error reading header "
+ "metadata; rc = [%d]\n", rc);
+ }
+ offset += bytes_read;
+ } else
+ set_default_header_data(crypt_stat);
+ rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset),
+ ecryptfs_dentry);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_read_headers
+ *
+ * Returns zero if valid headers found and parsed; non-zero otherwise
+ */
+int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
+ struct file *lower_file)
+{
+ int rc = 0;
+ char *page_virt = NULL;
+ mm_segment_t oldfs;
+ ssize_t bytes_read;
+ struct ecryptfs_crypt_stat *crypt_stat =
+ &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+
+ /* Read the first page from the underlying file */
+ page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, SLAB_USER);
+ if (!page_virt) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Unable to allocate page_virt\n");
+ goto out;
+ }
+ lower_file->f_pos = 0;
+ oldfs = get_fs();
+ set_fs(get_ds());
+ bytes_read = lower_file->f_op->read(lower_file,
+ (char __user *)page_virt,
+ ECRYPTFS_DEFAULT_EXTENT_SIZE,
+ &lower_file->f_pos);
+ set_fs(oldfs);
+ if (bytes_read != ECRYPTFS_DEFAULT_EXTENT_SIZE) {
+ rc = -EINVAL;
+ goto out;
+ }
+ rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
+ ecryptfs_dentry);
+ if (rc) {
+ ecryptfs_printk(KERN_DEBUG, "Valid eCryptfs headers not "
+ "found\n");
+ rc = -EINVAL;
+ }
+out:
+ if (page_virt) {
+ memset(page_virt, 0, PAGE_CACHE_SIZE);
+ kmem_cache_free(ecryptfs_header_cache_1, page_virt);
+ }
+ return rc;
+}
+
+/**
+ * ecryptfs_encode_filename - converts a plaintext file name to cipher text
+ * @crypt_stat: The crypt_stat struct associated with the file name to encode
+ * @name: The plaintext name
+ * @length: The length of the plaintext
+ * @encoded_name: The encrypted name
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * TODO: Implement filename encoding and encryption here, in place of
+ * memcpy. We are keeping the framework around for now to (1)
+ * facilitate testing of the components needed to implement filename
+ * encryption and (2) to provide a code base from which other
+ * developers in the community can easily implement this feature.
+ *
+ * Returns the length of encoded filename; negative if error
+ */
+int
+ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+ const char *name, int length, char **encoded_name)
+{
+ int error = 0;
+
+ (*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
+ if (!(*encoded_name)) {
+ error = -ENOMEM;
+ goto out;
+ }
+ /* TODO: Filename encryption is a scheduled feature for a
+ * future version of eCryptfs. This function is here only for
+ * the purpose of providing a framework for other developers
+ * to easily implement filename encryption. Hint: Replace this
+ * memcpy() with a call to encrypt and encode the
+ * filename, then set the length accordingly. */
+ memcpy((void *)(*encoded_name), (void *)name, length);
+ (*encoded_name)[length] = '\0';
+ error = length + 1;
+out:
+ return error;
+}
+
+/**
+ * ecryptfs_decode_filename - converts the cipher text name to plaintext
+ * @crypt_stat: The crypt_stat struct associated with the file
+ * @name: The filename in cipher text
+ * @length: The length of the cipher text name
+ * @decrypted_name: The plaintext name
+ *
+ * Decodes and decrypts the filename.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * TODO: Implement filename decoding and decryption here, in place of
+ * memcpy. We are keeping the framework around for now to (1)
+ * facilitate testing of the components needed to implement filename
+ * encryption and (2) to provide a code base from which other
+ * developers in the community can easily implement this feature.
+ *
+ * Returns the length of decoded filename; negative if error
+ */
+int
+ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+ const char *name, int length, char **decrypted_name)
+{
+ int error = 0;
+
+ (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
+ if (!(*decrypted_name)) {
+ error = -ENOMEM;
+ goto out;
+ }
+ /* TODO: Filename encryption is a scheduled feature for a
+ * future version of eCryptfs. This function is here only for
+ * the purpose of providing a framework for other developers
+ * to easily implement filename encryption. Hint: Replace this
+ * memcpy() with a call to decode and decrypt the
+ * filename, then set the length accordingly. */
+ memcpy((void *)(*decrypted_name), (void *)name, length);
+ (*decrypted_name)[length + 1] = '\0'; /* Only for convenience
+ * in printing out the
+ * string in debug
+ * messages */
+ error = length;
+out:
+ return error;
+}
+
+/**
+ * ecryptfs_process_cipher - Perform cipher initialization.
+ * @tfm: Crypto context set by this function
+ * @key_tfm: Crypto context for key material, set by this function
+ * @cipher_name: Name of the cipher.
+ * @key_size: Size of the key in bytes.
+ *
+ * Returns zero on success. Any crypto_tfm structs allocated here
+ * should be released by other functions, such as on a superblock put
+ * event, regardless of whether this function succeeds or fails.
+ */
+int
+ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
+ char *cipher_name, size_t key_size)
+{
+ char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
+ int rc;
+
+ *tfm = *key_tfm = NULL;
+ if (key_size > ECRYPTFS_MAX_KEY_BYTES) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
+ "allowable is [%d]\n", key_size, ECRYPTFS_MAX_KEY_BYTES);
+ goto out;
+ }
+ *tfm = crypto_alloc_tfm(cipher_name, (ECRYPTFS_DEFAULT_CHAINING_MODE
+ | CRYPTO_TFM_REQ_WEAK_KEY));
+ if (!(*tfm)) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Unable to allocate crypto cipher with name "
+ "[%s]\n", cipher_name);
+ goto out;
+ }
+ *key_tfm = crypto_alloc_tfm(cipher_name, CRYPTO_TFM_REQ_WEAK_KEY);
+ if (!(*key_tfm)) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Unable to allocate crypto cipher with name "
+ "[%s]\n", cipher_name);
+ goto out;
+ }
+ if (key_size < crypto_tfm_alg_min_keysize(*tfm)) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Request key size is [%Zd]; minimum key size "
+ "supported by cipher [%s] is [%d]\n", key_size,
+ cipher_name, crypto_tfm_alg_min_keysize(*tfm));
+ goto out;
+ }
+ if (key_size < crypto_tfm_alg_min_keysize(*key_tfm)) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Request key size is [%Zd]; minimum key size "
+ "supported by cipher [%s] is [%d]\n", key_size,
+ cipher_name, crypto_tfm_alg_min_keysize(*key_tfm));
+ goto out;
+ }
+ if (key_size > crypto_tfm_alg_max_keysize(*tfm)) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Request key size is [%Zd]; maximum key size "
+ "supported by cipher [%s] is [%d]\n", key_size,
+ cipher_name, crypto_tfm_alg_min_keysize(*tfm));
+ goto out;
+ }
+ if (key_size > crypto_tfm_alg_max_keysize(*key_tfm)) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Request key size is [%Zd]; maximum key size "
+ "supported by cipher [%s] is [%d]\n", key_size,
+ cipher_name, crypto_tfm_alg_min_keysize(*key_tfm));
+ goto out;
+ }
+ get_random_bytes(dummy_key, key_size);
+ rc = crypto_cipher_setkey(*tfm, dummy_key, key_size);
+ if (rc) {
+ printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+ "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
+ rc = -EINVAL;
+ goto out;
+ }
+ rc = crypto_cipher_setkey(*key_tfm, dummy_key, key_size);
+ if (rc) {
+ printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+ "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
+ rc = -EINVAL;
+ goto out;
+ }
+out:
+ return rc;
+}
diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c
new file mode 100644
index 000000000000..61f8e894284f
--- /dev/null
+++ b/fs/ecryptfs/debug.c
@@ -0,0 +1,123 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * Functions only useful for debugging.
+ *
+ * Copyright (C) 2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_dump_auth_tok - debug function to print auth toks
+ *
+ * This function will print the contents of an ecryptfs authentication
+ * token.
+ */
+void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok)
+{
+ char salt[ECRYPTFS_SALT_SIZE * 2 + 1];
+ char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
+
+ ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n",
+ auth_tok);
+ if (ECRYPTFS_CHECK_FLAG(auth_tok->flags, ECRYPTFS_PRIVATE_KEY)) {
+ ecryptfs_printk(KERN_DEBUG, " * private key type\n");
+ ecryptfs_printk(KERN_DEBUG, " * (NO PRIVATE KEY SUPPORT "
+ "IN ECRYPTFS VERSION 0.1)\n");
+ } else {
+ ecryptfs_printk(KERN_DEBUG, " * passphrase type\n");
+ ecryptfs_to_hex(salt, auth_tok->token.password.salt,
+ ECRYPTFS_SALT_SIZE);
+ salt[ECRYPTFS_SALT_SIZE * 2] = '\0';
+ ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt);
+ if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
+ ECRYPTFS_PERSISTENT_PASSWORD)) {
+ ecryptfs_printk(KERN_DEBUG, " * persistent\n");
+ }
+ memcpy(sig, auth_tok->token.password.signature,
+ ECRYPTFS_SIG_SIZE_HEX);
+ sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+ ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig);
+ }
+ ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n",
+ auth_tok->session_key.flags);
+ if (auth_tok->session_key.flags
+ & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT)
+ ecryptfs_printk(KERN_DEBUG,
+ " * Userspace decrypt request set\n");
+ if (auth_tok->session_key.flags
+ & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT)
+ ecryptfs_printk(KERN_DEBUG,
+ " * Userspace encrypt request set\n");
+ if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) {
+ ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n");
+ ecryptfs_printk(KERN_DEBUG,
+ " * session_key.decrypted_key_size = [0x%x]\n",
+ auth_tok->session_key.decrypted_key_size);
+ ecryptfs_printk(KERN_DEBUG, " * Decrypted session key "
+ "dump:\n");
+ if (ecryptfs_verbosity > 0)
+ ecryptfs_dump_hex(auth_tok->session_key.decrypted_key,
+ ECRYPTFS_DEFAULT_KEY_BYTES);
+ }
+ if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) {
+ ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n");
+ ecryptfs_printk(KERN_DEBUG,
+ " * session_key.encrypted_key_size = [0x%x]\n",
+ auth_tok->session_key.encrypted_key_size);
+ ecryptfs_printk(KERN_DEBUG, " * Encrypted session key "
+ "dump:\n");
+ if (ecryptfs_verbosity > 0)
+ ecryptfs_dump_hex(auth_tok->session_key.encrypted_key,
+ auth_tok->session_key.
+ encrypted_key_size);
+ }
+}
+
+/**
+ * ecryptfs_dump_hex - debug hex printer
+ * @data: string of bytes to be printed
+ * @bytes: number of bytes to print
+ *
+ * Dump hexadecimal representation of char array
+ */
+void ecryptfs_dump_hex(char *data, int bytes)
+{
+ int i = 0;
+ int add_newline = 1;
+
+ if (ecryptfs_verbosity < 1)
+ return;
+ if (bytes != 0) {
+ printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]);
+ i++;
+ }
+ while (i < bytes) {
+ printk("0x%.2x.", (unsigned char)data[i]);
+ i++;
+ if (i % 16 == 0) {
+ printk("\n");
+ add_newline = 0;
+ } else
+ add_newline = 1;
+ }
+ if (add_newline)
+ printk("\n");
+}
+
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
new file mode 100644
index 000000000000..f0d2a433242b
--- /dev/null
+++ b/fs/ecryptfs/dentry.c
@@ -0,0 +1,87 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_d_revalidate - revalidate an ecryptfs dentry
+ * @dentry: The ecryptfs dentry
+ * @nd: The associated nameidata
+ *
+ * Called when the VFS needs to revalidate a dentry. This
+ * is called whenever a name lookup finds a dentry in the
+ * dcache. Most filesystems leave this as NULL, because all their
+ * dentries in the dcache are valid.
+ *
+ * Returns 1 if valid, 0 otherwise.
+ *
+ */
+static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+ struct dentry *dentry_save;
+ struct vfsmount *vfsmount_save;
+ int rc = 1;
+
+ if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
+ goto out;
+ dentry_save = nd->dentry;
+ vfsmount_save = nd->mnt;
+ nd->dentry = lower_dentry;
+ nd->mnt = lower_mnt;
+ rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
+ nd->dentry = dentry_save;
+ nd->mnt = vfsmount_save;
+out:
+ return rc;
+}
+
+struct kmem_cache *ecryptfs_dentry_info_cache;
+
+/**
+ * ecryptfs_d_release
+ * @dentry: The ecryptfs dentry
+ *
+ * Called when a dentry is really deallocated.
+ */
+static void ecryptfs_d_release(struct dentry *dentry)
+{
+ struct dentry *lower_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ if (ecryptfs_dentry_to_private(dentry))
+ kmem_cache_free(ecryptfs_dentry_info_cache,
+ ecryptfs_dentry_to_private(dentry));
+ if (lower_dentry)
+ dput(lower_dentry);
+ return;
+}
+
+struct dentry_operations ecryptfs_dops = {
+ .d_revalidate = ecryptfs_d_revalidate,
+ .d_release = ecryptfs_d_release,
+};
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
new file mode 100644
index 000000000000..872c9958531a
--- /dev/null
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -0,0 +1,482 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * Kernel declarations.
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#ifndef ECRYPTFS_KERNEL_H
+#define ECRYPTFS_KERNEL_H
+
+#include <keys/user-type.h>
+#include <linux/fs.h>
+#include <linux/scatterlist.h>
+
+/* Version verification for shared data structures w/ userspace */
+#define ECRYPTFS_VERSION_MAJOR 0x00
+#define ECRYPTFS_VERSION_MINOR 0x04
+#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x01
+/* These flags indicate which features are supported by the kernel
+ * module; userspace tools such as the mount helper read
+ * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine
+ * how to behave. */
+#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001
+#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002
+#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004
+#define ECRYPTFS_VERSIONING_POLICY 0x00000008
+#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
+ | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH)
+
+#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
+#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
+#define ECRYPTFS_SALT_SIZE 8
+#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2)
+/* The original signature size is only for what is stored on disk; all
+ * in-memory representations are expanded hex, so that it is better
+ * adapted to being passed around or referenced on the command line */
+#define ECRYPTFS_SIG_SIZE 8
+#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2)
+#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX
+#define ECRYPTFS_MAX_KEY_BYTES 64
+#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512
+#define ECRYPTFS_DEFAULT_IV_BYTES 16
+#define ECRYPTFS_FILE_VERSION 0x01
+#define ECRYPTFS_DEFAULT_HEADER_EXTENT_SIZE 8192
+#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
+#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192
+
+#define RFC2440_CIPHER_DES3_EDE 0x02
+#define RFC2440_CIPHER_CAST_5 0x03
+#define RFC2440_CIPHER_BLOWFISH 0x04
+#define RFC2440_CIPHER_AES_128 0x07
+#define RFC2440_CIPHER_AES_192 0x08
+#define RFC2440_CIPHER_AES_256 0x09
+#define RFC2440_CIPHER_TWOFISH 0x0a
+#define RFC2440_CIPHER_CAST_6 0x0b
+
+#define ECRYPTFS_SET_FLAG(flag_bit_vector, flag) (flag_bit_vector |= (flag))
+#define ECRYPTFS_CLEAR_FLAG(flag_bit_vector, flag) (flag_bit_vector &= ~(flag))
+#define ECRYPTFS_CHECK_FLAG(flag_bit_vector, flag) (flag_bit_vector & (flag))
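+/* Illustrative usage (these macros operate on the plain u32 flag
+ * fields in the structures below):
+ *
+ *	ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+ *	if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED))
+ *		...
+ */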
+
+/**
+ * For convenience, we may need to pass around the encrypted session
+ * key between kernel and userspace because the authentication token
+ * may not be extractable. For example, the TPM may not release the
+ * private key, instead requiring the encrypted data and returning the
+ * decrypted data.
+ */
+struct ecryptfs_session_key {
+#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001
+#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002
+#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004
+#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008
+ u32 flags;
+ u32 encrypted_key_size;
+ u32 decrypted_key_size;
+ u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
+ u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES];
+};
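+/* Typical flow suggested by the flags above: when the private key
+ * cannot leave the token (e.g. a TPM), the kernel sets
+ * ECRYPTFS_CONTAINS_ENCRYPTED_KEY along with
+ * ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT and hands the blob to
+ * userspace, which fills in decrypted_key and sets
+ * ECRYPTFS_CONTAINS_DECRYPTED_KEY on the way back. */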
+
+struct ecryptfs_password {
+ u32 password_bytes;
+ s32 hash_algo;
+ u32 hash_iterations;
+ u32 session_key_encryption_key_bytes;
+#define ECRYPTFS_PERSISTENT_PASSWORD 0x01
+#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02
+ u32 flags;
+ /* Iterated-hash concatenation of salt and passphrase */
+ u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
+ u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
+ /* Always in expanded hex */
+ u8 salt[ECRYPTFS_SALT_SIZE];
+};
+
+enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY};
+
+/* May be a password or a private key */
+struct ecryptfs_auth_tok {
+ u16 version; /* 8-bit major and 8-bit minor */
+ u16 token_type;
+ u32 flags;
+ struct ecryptfs_session_key session_key;
+ u8 reserved[32];
+ union {
+ struct ecryptfs_password password;
+		/* A private key member will be added in a future eCryptfs release */
+ } token;
+} __attribute__ ((packed));
+
+void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
+extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
+extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
+
+struct ecryptfs_key_record {
+ unsigned char type;
+ size_t enc_key_size;
+ unsigned char sig[ECRYPTFS_SIG_SIZE];
+ unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
+};
+
+struct ecryptfs_auth_tok_list {
+ struct ecryptfs_auth_tok *auth_tok;
+ struct list_head list;
+};
+
+struct ecryptfs_crypt_stat;
+struct ecryptfs_mount_crypt_stat;
+
+struct ecryptfs_page_crypt_context {
+ struct page *page;
+#define ECRYPTFS_PREPARE_COMMIT_MODE 0
+#define ECRYPTFS_WRITEPAGE_MODE 1
+ unsigned int mode;
+ union {
+ struct file *lower_file;
+ struct writeback_control *wbc;
+ } param;
+};
+
+static inline struct ecryptfs_auth_tok *
+ecryptfs_get_key_payload_data(struct key *key)
+{
+ return (struct ecryptfs_auth_tok *)
+ (((struct user_key_payload*)key->payload.data)->data);
+}
+
+#define ECRYPTFS_SUPER_MAGIC 0xf15f
+#define ECRYPTFS_MAX_KEYSET_SIZE 1024
+#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
+#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
+#define ECRYPTFS_MAX_NUM_KEYSIGS 2 /* TODO: Make this a linked list */
+#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */
+#define ECRYPTFS_SALT_BYTES 2
+#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
+#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */
+#define ECRYPTFS_FILE_SIZE_BYTES 8
+#define ECRYPTFS_DEFAULT_CIPHER "aes"
+#define ECRYPTFS_DEFAULT_KEY_BYTES 16
+#define ECRYPTFS_DEFAULT_CHAINING_MODE CRYPTO_TFM_MODE_CBC
+#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
+#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
+#define MD5_DIGEST_SIZE 16
+
+/**
+ * This is the primary struct associated with each encrypted file.
+ *
+ * TODO: cache align/pack?
+ */
+struct ecryptfs_crypt_stat {
+#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
+#define ECRYPTFS_POLICY_APPLIED 0x00000002
+#define ECRYPTFS_NEW_FILE 0x00000004
+#define ECRYPTFS_ENCRYPTED 0x00000008
+#define ECRYPTFS_SECURITY_WARNING 0x00000010
+#define ECRYPTFS_ENABLE_HMAC 0x00000020
+#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
+#define ECRYPTFS_KEY_VALID 0x00000080
+ u32 flags;
+ unsigned int file_version;
+ size_t iv_bytes;
+ size_t num_keysigs;
+ size_t header_extent_size;
+ size_t num_header_extents_at_front;
+ size_t extent_size; /* Data extent size; default is 4096 */
+ size_t key_size;
+ size_t extent_shift;
+ unsigned int extent_mask;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+ struct crypto_tfm *tfm;
+ struct crypto_tfm *md5_tfm; /* Crypto context for generating
+ * the initialization vectors */
+ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+ unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
+ unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
+ unsigned char keysigs[ECRYPTFS_MAX_NUM_KEYSIGS][ECRYPTFS_SIG_SIZE_HEX];
+ struct mutex cs_tfm_mutex;
+ struct mutex cs_md5_tfm_mutex;
+ struct mutex cs_mutex;
+};
+
+/* inode private data. */
+struct ecryptfs_inode_info {
+ struct inode vfs_inode;
+ struct inode *wii_inode;
+ struct ecryptfs_crypt_stat crypt_stat;
+};
+
+/* dentry private data. Each dentry must keep track of a lower
+ * vfsmount too. */
+struct ecryptfs_dentry_info {
+ struct dentry *wdi_dentry;
+ struct vfsmount *lower_mnt;
+ struct ecryptfs_crypt_stat *crypt_stat;
+};
+
+/**
+ * This struct is to enable a mount-wide passphrase/salt combo. This
+ * is more or less a stopgap to provide similar functionality to other
+ * crypto filesystems like EncFS or CFS until full policy support is
+ * implemented in eCryptfs.
+ */
+struct ecryptfs_mount_crypt_stat {
+ /* Pointers to memory we do not own, do not free these */
+#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001
+ u32 flags;
+ struct ecryptfs_auth_tok *global_auth_tok;
+ struct key *global_auth_tok_key;
+ size_t global_default_cipher_key_size;
+ struct crypto_tfm *global_key_tfm;
+ struct mutex global_key_tfm_mutex;
+ unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
+ + 1];
+ unsigned char global_auth_tok_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
+};
+
+/* superblock private data. */
+struct ecryptfs_sb_info {
+ struct super_block *wsi_sb;
+ struct ecryptfs_mount_crypt_stat mount_crypt_stat;
+};
+
+/* file private data. */
+struct ecryptfs_file_info {
+ struct file *wfi_file;
+ struct ecryptfs_crypt_stat *crypt_stat;
+};
+
+/* auth_tok <=> encrypted_session_key mappings */
+struct ecryptfs_auth_tok_list_item {
+ unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES];
+ struct list_head list;
+ struct ecryptfs_auth_tok auth_tok;
+};
+
+static inline struct ecryptfs_file_info *
+ecryptfs_file_to_private(struct file *file)
+{
+ return (struct ecryptfs_file_info *)file->private_data;
+}
+
+static inline void
+ecryptfs_set_file_private(struct file *file,
+ struct ecryptfs_file_info *file_info)
+{
+ file->private_data = file_info;
+}
+
+static inline struct file *ecryptfs_file_to_lower(struct file *file)
+{
+ return ((struct ecryptfs_file_info *)file->private_data)->wfi_file;
+}
+
+static inline void
+ecryptfs_set_file_lower(struct file *file, struct file *lower_file)
+{
+ ((struct ecryptfs_file_info *)file->private_data)->wfi_file =
+ lower_file;
+}
+
+static inline struct ecryptfs_inode_info *
+ecryptfs_inode_to_private(struct inode *inode)
+{
+ return container_of(inode, struct ecryptfs_inode_info, vfs_inode);
+}
+
+static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode)
+{
+ return ecryptfs_inode_to_private(inode)->wii_inode;
+}
+
+static inline void
+ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode)
+{
+ ecryptfs_inode_to_private(inode)->wii_inode = lower_inode;
+}
+
+static inline struct ecryptfs_sb_info *
+ecryptfs_superblock_to_private(struct super_block *sb)
+{
+ return (struct ecryptfs_sb_info *)sb->s_fs_info;
+}
+
+static inline void
+ecryptfs_set_superblock_private(struct super_block *sb,
+ struct ecryptfs_sb_info *sb_info)
+{
+ sb->s_fs_info = sb_info;
+}
+
+static inline struct super_block *
+ecryptfs_superblock_to_lower(struct super_block *sb)
+{
+ return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb;
+}
+
+static inline void
+ecryptfs_set_superblock_lower(struct super_block *sb,
+ struct super_block *lower_sb)
+{
+ ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb;
+}
+
+static inline struct ecryptfs_dentry_info *
+ecryptfs_dentry_to_private(struct dentry *dentry)
+{
+ return (struct ecryptfs_dentry_info *)dentry->d_fsdata;
+}
+
+static inline void
+ecryptfs_set_dentry_private(struct dentry *dentry,
+ struct ecryptfs_dentry_info *dentry_info)
+{
+ dentry->d_fsdata = dentry_info;
+}
+
+static inline struct dentry *
+ecryptfs_dentry_to_lower(struct dentry *dentry)
+{
+ return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry;
+}
+
+static inline void
+ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry)
+{
+ ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry =
+ lower_dentry;
+}
+
+static inline struct vfsmount *
+ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
+{
+ return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt;
+}
+
+static inline void
+ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
+{
+ ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt =
+ lower_mnt;
+}
+
+#define ecryptfs_printk(type, fmt, arg...) \
+ __ecryptfs_printk(type "%s: " fmt, __FUNCTION__, ## arg);
+void __ecryptfs_printk(const char *fmt, ...);
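+/* For example, ecryptfs_printk(KERN_DEBUG, "rc = [%d]\n", rc) expands
+ * to __ecryptfs_printk(KERN_DEBUG "%s: rc = [%d]\n", __FUNCTION__, rc),
+ * so every message is prefixed with the calling function's name. */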
+
+extern const struct file_operations ecryptfs_main_fops;
+extern const struct file_operations ecryptfs_dir_fops;
+extern struct inode_operations ecryptfs_main_iops;
+extern struct inode_operations ecryptfs_dir_iops;
+extern struct inode_operations ecryptfs_symlink_iops;
+extern struct super_operations ecryptfs_sops;
+extern struct dentry_operations ecryptfs_dops;
+extern struct address_space_operations ecryptfs_aops;
+extern int ecryptfs_verbosity;
+
+extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
+extern struct kmem_cache *ecryptfs_file_info_cache;
+extern struct kmem_cache *ecryptfs_dentry_info_cache;
+extern struct kmem_cache *ecryptfs_inode_info_cache;
+extern struct kmem_cache *ecryptfs_sb_info_cache;
+extern struct kmem_cache *ecryptfs_header_cache_0;
+extern struct kmem_cache *ecryptfs_header_cache_1;
+extern struct kmem_cache *ecryptfs_header_cache_2;
+extern struct kmem_cache *ecryptfs_lower_page_cache;
+
+int ecryptfs_interpose(struct dentry *hidden_dentry,
+ struct dentry *this_dentry, struct super_block *sb,
+ int flag);
+int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
+int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+ const char *name, int length,
+ char **decrypted_name);
+int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+ const char *name, int length,
+ char **encoded_name);
+struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
+void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src);
+void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src);
+void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src);
+void ecryptfs_dump_hex(char *data, int bytes);
+int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
+ int sg_size);
+int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_rotate_iv(unsigned char *iv);
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_destruct_mount_crypt_stat(
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
+int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_write_inode_size_to_header(struct file *lower_file,
+ struct inode *lower_inode,
+ struct inode *inode);
+int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
+ struct file *lower_file,
+ unsigned long lower_page_index, int byte_offset,
+ int region_bytes);
+int
+ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
+ struct file *lower_file, int byte_offset,
+ int region_size);
+int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
+ struct file *lower_file);
+int ecryptfs_do_readpage(struct file *file, struct page *page,
+ pgoff_t lower_page_index);
+int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
+ char **lower_virt,
+ struct inode *lower_inode,
+ unsigned long lower_page_index);
+int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
+ struct inode *lower_inode,
+ struct writeback_control *wbc);
+int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx);
+int ecryptfs_decrypt_page(struct file *file, struct page *page);
+int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
+ struct file *lower_file);
+int ecryptfs_write_headers_virt(char *page_virt,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct dentry *ecryptfs_dentry);
+int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
+ struct file *lower_file);
+int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
+int contains_ecryptfs_marker(char *data);
+int ecryptfs_read_header_region(char *data, struct dentry *dentry,
+ struct vfsmount *mnt);
+u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code);
+void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_generate_key_packet_set(char *dest_base,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct dentry *ecryptfs_dentry,
+ size_t *len, size_t max);
+int process_request_key_err(long err_code);
+int
+ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
+ unsigned char *src, struct dentry *ecryptfs_dentry);
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
+int
+ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
+ char *cipher_name, size_t key_size);
+int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
+int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
+void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
+
+#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
new file mode 100644
index 000000000000..c8550c9f9cd2
--- /dev/null
+++ b/fs/ecryptfs/file.c
@@ -0,0 +1,440 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ * Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/smp_lock.h>
+#include <linux/compat.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_llseek
+ * @file: File we are seeking in
+ * @offset: The offset to seek to
+ * @origin: 2 (SEEK_END) - offset from i_size; 1 (SEEK_CUR) - offset
+ *          from f_pos; otherwise (SEEK_SET) - offset from the start
+ *
+ * Returns the new file position, or a negative value on error
+ */
+static loff_t ecryptfs_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t rv;
+ loff_t new_end_pos;
+ int rc;
+ int expanding_file = 0;
+ struct inode *inode = file->f_mapping->host;
+
+ /* If our offset is past the end of our file, we're going to
+ * need to grow it so we have a valid length of 0's */
+ new_end_pos = offset;
+ switch (origin) {
+ case 2:
+ new_end_pos += i_size_read(inode);
+ expanding_file = 1;
+ break;
+ case 1:
+ new_end_pos += file->f_pos;
+ if (new_end_pos > i_size_read(inode)) {
+ ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
+ "> i_size_read(inode)(=[0x%.16x])\n",
+ new_end_pos, i_size_read(inode));
+ expanding_file = 1;
+ }
+ break;
+ default:
+ if (new_end_pos > i_size_read(inode)) {
+ ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
+ "> i_size_read(inode)(=[0x%.16x])\n",
+ new_end_pos, i_size_read(inode));
+ expanding_file = 1;
+ }
+ }
+ ecryptfs_printk(KERN_DEBUG, "new_end_pos = [0x%.16x]\n", new_end_pos);
+ if (expanding_file) {
+ rc = ecryptfs_truncate(file->f_dentry, new_end_pos);
+ if (rc) {
+ rv = rc;
+ ecryptfs_printk(KERN_ERR, "Error on attempt to "
+ "truncate to (higher) offset [0x%.16x];"
+ " rc = [%d]\n", new_end_pos, rc);
+ goto out;
+ }
+ }
+ rv = generic_file_llseek(file, offset, origin);
+out:
+ return rv;
+}
+
+/**
+ * ecryptfs_read_update_atime
+ *
+ * generic_file_read() updates the atime of the upper-layer inode, but
+ * it doesn't give us a chance to update the atime of the lower-layer
+ * inode. This function is a wrapper around generic_file_aio_read();
+ * it updates the atime of the lower inode if the read returns without
+ * error. This is to be used only for file reads; directory reads go
+ * through ecryptfs_readdir() instead.
+ */
+static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ int rc;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_vfsmount;
+ struct file *file = iocb->ki_filp;
+
+ rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ /*
+ * Even though this is a async interface, we need to wait
+ * for IO to finish to update atime
+ */
+ if (-EIOCBQUEUED == rc)
+ rc = wait_on_sync_kiocb(iocb);
+ if (rc >= 0) {
+ lower_dentry = ecryptfs_dentry_to_lower(file->f_dentry);
+ lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_dentry);
+ touch_atime(lower_vfsmount, lower_dentry);
+ }
+ return rc;
+}
+
+struct ecryptfs_getdents_callback {
+ void *dirent;
+ struct dentry *dentry;
+ filldir_t filldir;
+ int err;
+ int filldir_called;
+ int entries_written;
+};
+
+/* Inspired by generic filldir in fs/readdir.c */
+static int
+ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
+ u64 ino, unsigned int d_type)
+{
+ struct ecryptfs_crypt_stat *crypt_stat;
+ struct ecryptfs_getdents_callback *buf =
+ (struct ecryptfs_getdents_callback *)dirent;
+ int rc;
+ int decoded_length;
+ char *decoded_name;
+
+ crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
+ buf->filldir_called++;
+ decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
+ &decoded_name);
+ if (decoded_length < 0) {
+ rc = decoded_length;
+ goto out;
+ }
+ rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
+ ino, d_type);
+ kfree(decoded_name);
+ if (rc >= 0)
+ buf->entries_written++;
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_readdir
+ * @file: The ecryptfs file struct
+ * @dirent: Directory entry
+ * @filldir: The filldir callback function
+ */
+static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+ int rc;
+ struct file *lower_file;
+ struct inode *inode;
+ struct ecryptfs_getdents_callback buf;
+
+ lower_file = ecryptfs_file_to_lower(file);
+ lower_file->f_pos = file->f_pos;
+ inode = file->f_dentry->d_inode;
+ memset(&buf, 0, sizeof(buf));
+ buf.dirent = dirent;
+ buf.dentry = file->f_dentry;
+ buf.filldir = filldir;
+retry:
+ buf.filldir_called = 0;
+ buf.entries_written = 0;
+ buf.err = 0;
+ rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
+ if (buf.err)
+ rc = buf.err;
+ if (buf.filldir_called && !buf.entries_written)
+ goto retry;
+ file->f_pos = lower_file->f_pos;
+ if (rc >= 0)
+ ecryptfs_copy_attr_atime(inode, lower_file->f_dentry->d_inode);
+ return rc;
+}
+
+struct kmem_cache *ecryptfs_file_info_cache;
+
+/**
+ * ecryptfs_open
+ * @inode: inode specifying the file to open
+ * @file: Structure to return filled in
+ *
+ * Opens the file specified by inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_open(struct inode *inode, struct file *file)
+{
+ int rc = 0;
+ struct ecryptfs_crypt_stat *crypt_stat = NULL;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+ struct dentry *ecryptfs_dentry = file->f_dentry;
+ /* Private value of ecryptfs_dentry allocated in
+ * ecryptfs_lookup() */
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+ struct inode *lower_inode = NULL;
+ struct file *lower_file = NULL;
+ struct vfsmount *lower_mnt;
+ struct ecryptfs_file_info *file_info;
+ int lower_flags;
+
+ /* Released in ecryptfs_release or end of function if failure */
+ file_info = kmem_cache_alloc(ecryptfs_file_info_cache, SLAB_KERNEL);
+ ecryptfs_set_file_private(file, file_info);
+ if (!file_info) {
+ ecryptfs_printk(KERN_ERR,
+ "Error attempting to allocate memory\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ memset(file_info, 0, sizeof(*file_info));
+ lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+ mount_crypt_stat = &ecryptfs_superblock_to_private(
+ ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ mutex_lock(&crypt_stat->cs_mutex);
+ if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) {
+ ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n");
+ /* Policy code enabled in future release */
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED);
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+ }
+ mutex_unlock(&crypt_stat->cs_mutex);
+ /* This mntget & dget is undone via fput when the file is released */
+ dget(lower_dentry);
+ lower_flags = file->f_flags;
+ if ((lower_flags & O_ACCMODE) == O_WRONLY)
+		lower_flags = (lower_flags & ~O_ACCMODE) | O_RDWR;
+ if (file->f_flags & O_APPEND)
+ lower_flags &= ~O_APPEND;
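+	/* The lower file is always opened O_RDWR (with O_APPEND
+	 * stripped), presumably so the crypto layer can read existing
+	 * headers and extents while servicing writes at arbitrary
+	 * offsets. */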
+ lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
+ mntget(lower_mnt);
+ /* Corresponding fput() in ecryptfs_release() */
+ lower_file = dentry_open(lower_dentry, lower_mnt, lower_flags);
+ if (IS_ERR(lower_file)) {
+ rc = PTR_ERR(lower_file);
+ ecryptfs_printk(KERN_ERR, "Error opening lower file\n");
+ goto out_puts;
+ }
+ ecryptfs_set_file_lower(file, lower_file);
+ /* Isn't this check the same as the one in lookup? */
+ lower_inode = lower_dentry->d_inode;
+ if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+ ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
+ ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+ rc = 0;
+ goto out;
+ }
+ mutex_lock(&crypt_stat->cs_mutex);
+ if (i_size_read(lower_inode) < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
+ if (!(mount_crypt_stat->flags
+ & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
+ rc = -EIO;
+ printk(KERN_WARNING "Attempt to read file that is "
+ "not in a valid eCryptfs format, and plaintext "
+ "passthrough mode is not enabled; returning "
+ "-EIO\n");
+ mutex_unlock(&crypt_stat->cs_mutex);
+ goto out_puts;
+ }
+ crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
+ rc = 0;
+ mutex_unlock(&crypt_stat->cs_mutex);
+ goto out;
+ } else if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+ ECRYPTFS_POLICY_APPLIED)
+ || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+ ECRYPTFS_KEY_VALID)) {
+ rc = ecryptfs_read_headers(ecryptfs_dentry, lower_file);
+ if (rc) {
+ ecryptfs_printk(KERN_DEBUG,
+ "Valid headers not found\n");
+ if (!(mount_crypt_stat->flags
+ & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
+ rc = -EIO;
+ printk(KERN_WARNING "Attempt to read file that "
+ "is not in a valid eCryptfs format, "
+ "and plaintext passthrough mode is not "
+ "enabled; returning -EIO\n");
+ mutex_unlock(&crypt_stat->cs_mutex);
+ goto out_puts;
+ }
+ ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
+ ECRYPTFS_ENCRYPTED);
+ rc = 0;
+ mutex_unlock(&crypt_stat->cs_mutex);
+ goto out;
+ }
+ }
+ mutex_unlock(&crypt_stat->cs_mutex);
+ ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
+ "size: [0x%.16x]\n", inode, inode->i_ino,
+ i_size_read(inode));
+ ecryptfs_set_file_lower(file, lower_file);
+ goto out;
+out_puts:
+ mntput(lower_mnt);
+ dput(lower_dentry);
+ kmem_cache_free(ecryptfs_file_info_cache,
+ ecryptfs_file_to_private(file));
+out:
+ return rc;
+}
+
+static int ecryptfs_flush(struct file *file, fl_owner_t td)
+{
+ int rc = 0;
+ struct file *lower_file = NULL;
+
+ lower_file = ecryptfs_file_to_lower(file);
+ if (lower_file->f_op && lower_file->f_op->flush)
+ rc = lower_file->f_op->flush(lower_file, td);
+ return rc;
+}
+
+static int ecryptfs_release(struct inode *inode, struct file *file)
+{
+ struct file *lower_file = ecryptfs_file_to_lower(file);
+ struct ecryptfs_file_info *file_info = ecryptfs_file_to_private(file);
+ struct inode *lower_inode = ecryptfs_inode_to_lower(inode);
+
+ fput(lower_file);
+ inode->i_blocks = lower_inode->i_blocks;
+ kmem_cache_free(ecryptfs_file_info_cache, file_info);
+ return 0;
+}
+
+static int
+ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct file *lower_file = ecryptfs_file_to_lower(file);
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ struct inode *lower_inode = lower_dentry->d_inode;
+ int rc = -EINVAL;
+
+ if (lower_inode->i_fop->fsync) {
+ mutex_lock(&lower_inode->i_mutex);
+ rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
+ datasync);
+ mutex_unlock(&lower_inode->i_mutex);
+ }
+ return rc;
+}
+
+static int ecryptfs_fasync(int fd, struct file *file, int flag)
+{
+ int rc = 0;
+ struct file *lower_file = NULL;
+
+ lower_file = ecryptfs_file_to_lower(file);
+ if (lower_file->f_op && lower_file->f_op->fasync)
+ rc = lower_file->f_op->fasync(fd, lower_file, flag);
+ return rc;
+}
+
+static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos,
+ size_t count, read_actor_t actor, void *target)
+{
+ struct file *lower_file = NULL;
+ int rc = -EINVAL;
+
+ lower_file = ecryptfs_file_to_lower(file);
+ if (lower_file->f_op && lower_file->f_op->sendfile)
+ rc = lower_file->f_op->sendfile(lower_file, ppos, count,
+ actor, target);
+
+ return rc;
+}
+
+static int ecryptfs_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg);
+
+const struct file_operations ecryptfs_dir_fops = {
+ .readdir = ecryptfs_readdir,
+ .ioctl = ecryptfs_ioctl,
+ .mmap = generic_file_mmap,
+ .open = ecryptfs_open,
+ .flush = ecryptfs_flush,
+ .release = ecryptfs_release,
+ .fsync = ecryptfs_fsync,
+ .fasync = ecryptfs_fasync,
+ .sendfile = ecryptfs_sendfile,
+};
+
+const struct file_operations ecryptfs_main_fops = {
+ .llseek = ecryptfs_llseek,
+ .read = do_sync_read,
+ .aio_read = ecryptfs_read_update_atime,
+ .write = do_sync_write,
+ .aio_write = generic_file_aio_write,
+ .readdir = ecryptfs_readdir,
+ .ioctl = ecryptfs_ioctl,
+ .mmap = generic_file_mmap,
+ .open = ecryptfs_open,
+ .flush = ecryptfs_flush,
+ .release = ecryptfs_release,
+ .fsync = ecryptfs_fsync,
+ .fasync = ecryptfs_fasync,
+ .sendfile = ecryptfs_sendfile,
+};
+
+static int
+ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ int rc = 0;
+ struct file *lower_file = NULL;
+
+ if (ecryptfs_file_to_private(file))
+ lower_file = ecryptfs_file_to_lower(file);
+ if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
+ rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
+ lower_file, cmd, arg);
+ else
+ rc = -ENOTTY;
+ return rc;
+}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
new file mode 100644
index 000000000000..efdd2b7b62d7
--- /dev/null
+++ b/fs/ecryptfs/inode.c
@@ -0,0 +1,1079 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ * Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/crypto.h>
+#include "ecryptfs_kernel.h"
+
+static struct dentry *lock_parent(struct dentry *dentry)
+{
+ struct dentry *dir;
+
+ dir = dget(dentry->d_parent);
+ mutex_lock(&(dir->d_inode->i_mutex));
+ return dir;
+}
+
+static void unlock_parent(struct dentry *dentry)
+{
+ mutex_unlock(&(dentry->d_parent->d_inode->i_mutex));
+ dput(dentry->d_parent);
+}
+
+static void unlock_dir(struct dentry *dir)
+{
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(dir);
+}
+
+void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src)
+{
+ i_size_write(dst, i_size_read((struct inode *)src));
+ dst->i_blocks = src->i_blocks;
+}
+
+void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src)
+{
+ dest->i_atime = src->i_atime;
+}
+
+static void ecryptfs_copy_attr_times(struct inode *dest,
+ const struct inode *src)
+{
+ dest->i_atime = src->i_atime;
+ dest->i_mtime = src->i_mtime;
+ dest->i_ctime = src->i_ctime;
+}
+
+static void ecryptfs_copy_attr_timesizes(struct inode *dest,
+ const struct inode *src)
+{
+ dest->i_atime = src->i_atime;
+ dest->i_mtime = src->i_mtime;
+ dest->i_ctime = src->i_ctime;
+ ecryptfs_copy_inode_size(dest, src);
+}
+
+void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src)
+{
+ dest->i_mode = src->i_mode;
+ dest->i_nlink = src->i_nlink;
+ dest->i_uid = src->i_uid;
+ dest->i_gid = src->i_gid;
+ dest->i_rdev = src->i_rdev;
+ dest->i_atime = src->i_atime;
+ dest->i_mtime = src->i_mtime;
+ dest->i_ctime = src->i_ctime;
+ dest->i_blkbits = src->i_blkbits;
+ dest->i_flags = src->i_flags;
+}
+
+/**
+ * ecryptfs_create_underlying_file
+ * @lower_dir_inode: inode of the parent in the lower fs of the new file
+ * @dentry: New file's dentry in eCryptfs (the lower dentry and
+ *          vfsmount are derived from it)
+ * @mode: The mode of the new file
+ * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
+ *
+ * Creates the file in the lower file system.
+ *
+ * Returns zero on success; non-zero on error condition
+ */
+static int
+ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
+ struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+ struct dentry *dentry_save;
+ struct vfsmount *vfsmount_save;
+ int rc;
+
+ dentry_save = nd->dentry;
+ vfsmount_save = nd->mnt;
+ nd->dentry = lower_dentry;
+ nd->mnt = lower_mnt;
+ rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
+ nd->dentry = dentry_save;
+ nd->mnt = vfsmount_save;
+ return rc;
+}
+
+/**
+ * ecryptfs_do_create
+ * @directory_inode: inode of the new file's dentry's parent in ecryptfs
+ * @ecryptfs_dentry: New file's dentry in ecryptfs
+ * @mode: The mode of the new file
+ * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
+ *
+ * Creates the underlying file and the eCryptfs inode which will link to
+ * it. It will also update the eCryptfs directory inode to mimic the
+ * stat of the lower directory inode.
+ *
+ * Returns zero on success; non-zero on error condition
+ */
+static int
+ecryptfs_do_create(struct inode *directory_inode,
+ struct dentry *ecryptfs_dentry, int mode,
+ struct nameidata *nd)
+{
+ int rc;
+ struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
+ if (unlikely(IS_ERR(lower_dir_dentry))) {
+ ecryptfs_printk(KERN_ERR, "Error locking directory of "
+ "dentry\n");
+ rc = PTR_ERR(lower_dir_dentry);
+ goto out;
+ }
+ rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
+ ecryptfs_dentry, mode, nd);
+ if (unlikely(rc)) {
+ ecryptfs_printk(KERN_ERR,
+ "Failure to create underlying file\n");
+ goto out_lock;
+ }
+ rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
+ directory_inode->i_sb, 0);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
+ goto out_lock;
+ }
+ ecryptfs_copy_attr_timesizes(directory_inode,
+ lower_dir_dentry->d_inode);
+out_lock:
+ unlock_dir(lower_dir_dentry);
+out:
+ return rc;
+}
+
+/**
+ * grow_file
+ * @ecryptfs_dentry: the ecryptfs dentry
+ * @lower_file: The lower file
+ * @inode: The ecryptfs inode
+ * @lower_inode: The lower inode
+ *
+ * This is the code which will grow the file to its correct size.
+ */
+static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file,
+ struct inode *inode, struct inode *lower_inode)
+{
+ int rc = 0;
+ struct file fake_file;
+ struct ecryptfs_file_info tmp_file_info;
+
+ memset(&fake_file, 0, sizeof(fake_file));
+ fake_file.f_dentry = ecryptfs_dentry;
+ memset(&tmp_file_info, 0, sizeof(tmp_file_info));
+ ecryptfs_set_file_private(&fake_file, &tmp_file_info);
+ ecryptfs_set_file_lower(&fake_file, lower_file);
+ rc = ecryptfs_fill_zeros(&fake_file, 1);
+ if (rc) {
+ ECRYPTFS_SET_FLAG(
+ ecryptfs_inode_to_private(inode)->crypt_stat.flags,
+ ECRYPTFS_SECURITY_WARNING);
+ ecryptfs_printk(KERN_WARNING, "Error attempting to fill zeros "
+ "in file; rc = [%d]\n", rc);
+ goto out;
+ }
+ i_size_write(inode, 0);
+ ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
+ ECRYPTFS_SET_FLAG(ecryptfs_inode_to_private(inode)->crypt_stat.flags,
+ ECRYPTFS_NEW_FILE);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_initialize_file
+ *
+ * Cause the file to be changed from a basic empty file to an ecryptfs
+ * file with a header and first data page.
+ *
+ * Returns zero on success
+ */
+static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
+{
+ int rc = 0;
+ int lower_flags;
+ struct ecryptfs_crypt_stat *crypt_stat;
+ struct dentry *lower_dentry;
+ struct dentry *tlower_dentry = NULL;
+ struct file *lower_file;
+ struct inode *inode, *lower_inode;
+ struct vfsmount *lower_mnt;
+
+ lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+ ecryptfs_printk(KERN_DEBUG, "lower_dentry->d_name.name = [%s]\n",
+ lower_dentry->d_name.name);
+ inode = ecryptfs_dentry->d_inode;
+ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+ tlower_dentry = dget(lower_dentry);
+ if (!tlower_dentry) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry\n");
+ goto out;
+ }
+	lower_flags = ((O_CREAT | O_WRONLY | O_TRUNC) & ~O_ACCMODE) | O_RDWR;
+#if BITS_PER_LONG != 32
+ lower_flags |= O_LARGEFILE;
+#endif
+ lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
+ mntget(lower_mnt);
+ /* Corresponding fput() at end of this function */
+ lower_file = dentry_open(tlower_dentry, lower_mnt, lower_flags);
+ if (IS_ERR(lower_file)) {
+ rc = PTR_ERR(lower_file);
+ ecryptfs_printk(KERN_ERR,
+ "Error opening dentry; rc = [%i]\n", rc);
+ goto out;
+ }
+ /* fput(lower_file) should handle the puts if we do this */
+ lower_file->f_dentry = tlower_dentry;
+ lower_file->f_vfsmnt = lower_mnt;
+ lower_inode = tlower_dentry->d_inode;
+ if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+ ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
+ ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+ goto out_fput;
+ }
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
+ ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
+ rc = ecryptfs_new_file_context(ecryptfs_dentry);
+ if (rc) {
+ ecryptfs_printk(KERN_DEBUG, "Error creating new file "
+ "context\n");
+ goto out_fput;
+ }
+ rc = ecryptfs_write_headers(ecryptfs_dentry, lower_file);
+ if (rc) {
+ ecryptfs_printk(KERN_DEBUG, "Error writing headers\n");
+ goto out_fput;
+ }
+ rc = grow_file(ecryptfs_dentry, lower_file, inode, lower_inode);
+out_fput:
+ fput(lower_file);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_create
+ * @directory_inode: inode of the directory in which to create the file
+ * @ecryptfs_dentry: The eCryptfs dentry
+ * @mode: The mode of the new file.
+ * @nd: nameidata
+ *
+ * Creates a new file.
+ *
+ * Returns zero on success; non-zero on error condition
+ */
+static int
+ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
+ int mode, struct nameidata *nd)
+{
+ int rc;
+
+ rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
+ if (unlikely(rc)) {
+		ecryptfs_printk(KERN_WARNING, "Failed to create file in "
+				"lower filesystem\n");
+ goto out;
+ }
+ /* At this point, a file exists on "disk"; we need to make sure
+ * that this on disk file is prepared to be an ecryptfs file */
+ rc = ecryptfs_initialize_file(ecryptfs_dentry);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_lookup
+ * @dir: inode
+ * @dentry: The dentry
+ * @nd: nameidata, may be NULL
+ *
+ * Find a file on disk. If the file does not exist, a negative dentry
+ * is added to the dentry cache. Otherwise the lower inode is
+ * interposed and, for regular files, the header region is read in to
+ * recover the file size.
+ */
+static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ int rc = 0;
+ struct dentry *lower_dir_dentry;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct dentry *tlower_dentry = NULL;
+ char *encoded_name;
+	int encoded_namelen;
+ struct ecryptfs_crypt_stat *crypt_stat = NULL;
+ char *page_virt = NULL;
+ struct inode *lower_inode;
+ u64 file_size;
+
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ dentry->d_op = &ecryptfs_dops;
+ if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
+ || (dentry->d_name.len == 2 && !strcmp(dentry->d_name.name, "..")))
+ goto out_drop;
+ encoded_namelen = ecryptfs_encode_filename(crypt_stat,
+ dentry->d_name.name,
+ dentry->d_name.len,
+ &encoded_name);
+ if (encoded_namelen < 0) {
+ rc = encoded_namelen;
+ goto out_drop;
+ }
+ ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
+ "= [%d]\n", encoded_name, encoded_namelen);
+ lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
+ encoded_namelen - 1);
+ kfree(encoded_name);
+ lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+ if (IS_ERR(lower_dentry)) {
+ ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
+ rc = PTR_ERR(lower_dentry);
+ goto out_drop;
+ }
+ ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
+ "d_name.name = [%s]\n", lower_dentry,
+ lower_dentry->d_name.name);
+ lower_inode = lower_dentry->d_inode;
+ ecryptfs_copy_attr_atime(dir, lower_dir_dentry->d_inode);
+ BUG_ON(!atomic_read(&lower_dentry->d_count));
+ ecryptfs_set_dentry_private(dentry,
+ kmem_cache_alloc(ecryptfs_dentry_info_cache,
+ SLAB_KERNEL));
+ if (!ecryptfs_dentry_to_private(dentry)) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
+ "to allocate ecryptfs_dentry_info struct\n");
+ goto out_dput;
+ }
+ ecryptfs_set_dentry_lower(dentry, lower_dentry);
+ ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+ if (!lower_dentry->d_inode) {
+ /* We want to add because we couldn't find in lower */
+ d_add(dentry, NULL);
+ goto out;
+ }
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error interposing\n");
+ goto out_dput;
+ }
+ if (S_ISDIR(lower_inode->i_mode)) {
+ ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
+ goto out;
+ }
+ if (S_ISLNK(lower_inode->i_mode)) {
+ ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
+ goto out;
+ }
+ if (!nd) {
+		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave "
+				"as we *think* we are about to unlink\n");
+ goto out;
+ }
+ tlower_dentry = dget(lower_dentry);
+ if (!tlower_dentry || IS_ERR(tlower_dentry)) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Cannot dget lower_dentry\n");
+ goto out_dput;
+ }
+ /* Released in this function */
+ page_virt =
+ (char *)kmem_cache_alloc(ecryptfs_header_cache_2,
+ SLAB_USER);
+ if (!page_virt) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR,
+ "Cannot ecryptfs_kmalloc a page\n");
+ goto out_dput;
+ }
+ memset(page_virt, 0, PAGE_CACHE_SIZE);
+ rc = ecryptfs_read_header_region(page_virt, tlower_dentry, nd->mnt);
+ crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+ if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED))
+ ecryptfs_set_default_sizes(crypt_stat);
+ if (rc) {
+ rc = 0;
+ ecryptfs_printk(KERN_WARNING, "Error reading header region;"
+ " assuming unencrypted\n");
+ } else {
+ if (!contains_ecryptfs_marker(page_virt
+ + ECRYPTFS_FILE_SIZE_BYTES)) {
+ kmem_cache_free(ecryptfs_header_cache_2, page_virt);
+ goto out;
+ }
+ memcpy(&file_size, page_virt, sizeof(file_size));
+ file_size = be64_to_cpu(file_size);
+ i_size_write(dentry->d_inode, (loff_t)file_size);
+ }
+ kmem_cache_free(ecryptfs_header_cache_2, page_virt);
+ goto out;
+
+out_dput:
+ dput(lower_dentry);
+ if (tlower_dentry)
+ dput(tlower_dentry);
+out_drop:
+ d_drop(dentry);
+out:
+ return ERR_PTR(rc);
+}
+
+static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *new_dentry)
+{
+ struct dentry *lower_old_dentry;
+ struct dentry *lower_new_dentry;
+ struct dentry *lower_dir_dentry;
+ u64 file_size_save;
+ int rc;
+
+ file_size_save = i_size_read(old_dentry->d_inode);
+ lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
+ lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
+ dget(lower_old_dentry);
+ dget(lower_new_dentry);
+ lower_dir_dentry = lock_parent(lower_new_dentry);
+ rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
+ lower_new_dentry);
+ if (rc || !lower_new_dentry->d_inode)
+ goto out_lock;
+ rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
+ if (rc)
+ goto out_lock;
+ ecryptfs_copy_attr_timesizes(dir, lower_new_dentry->d_inode);
+ old_dentry->d_inode->i_nlink =
+ ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
+ i_size_write(new_dentry->d_inode, file_size_save);
+out_lock:
+ unlock_dir(lower_dir_dentry);
+ dput(lower_new_dentry);
+ dput(lower_old_dentry);
+ if (!new_dentry->d_inode)
+ d_drop(new_dentry);
+ return rc;
+}
+
+static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int rc = 0;
+ struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
+
+ lock_parent(lower_dentry);
+ rc = vfs_unlink(lower_dir_inode, lower_dentry);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error in vfs_unlink\n");
+ goto out_unlock;
+ }
+ ecryptfs_copy_attr_times(dir, lower_dir_inode);
+ dentry->d_inode->i_nlink =
+ ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
+ dentry->d_inode->i_ctime = dir->i_ctime;
+out_unlock:
+ unlock_parent(lower_dentry);
+ return rc;
+}
+
+static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ int rc;
+ struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
+ umode_t mode;
+ char *encoded_symname;
+	int encoded_symlen;
+ struct ecryptfs_crypt_stat *crypt_stat = NULL;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ dget(lower_dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
+ mode = S_IALLUGO;
+ encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
+ strlen(symname),
+ &encoded_symname);
+ if (encoded_symlen < 0) {
+ rc = encoded_symlen;
+ goto out_lock;
+ }
+ rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
+ encoded_symname, mode);
+ kfree(encoded_symname);
+ if (rc || !lower_dentry->d_inode)
+ goto out_lock;
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ if (rc)
+ goto out_lock;
+ ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
+out_lock:
+ unlock_dir(lower_dir_dentry);
+ dput(lower_dentry);
+ if (!dentry->d_inode)
+ d_drop(dentry);
+ return rc;
+}
+
+static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ int rc;
+ struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
+ rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
+ if (rc || !lower_dentry->d_inode)
+ goto out;
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ if (rc)
+ goto out;
+ ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
+ dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
+out:
+ unlock_dir(lower_dir_dentry);
+ if (!dentry->d_inode)
+ d_drop(dentry);
+ return rc;
+}
+
+static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int rc = 0;
+ struct dentry *tdentry = NULL;
+ struct dentry *lower_dentry;
+ struct dentry *tlower_dentry = NULL;
+ struct dentry *lower_dir_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ if (!(tdentry = dget(dentry))) {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_ERR, "Error dget'ing dentry [%p]\n",
+ dentry);
+ goto out;
+ }
+ lower_dir_dentry = lock_parent(lower_dentry);
+ if (!(tlower_dentry = dget(lower_dentry))) {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry "
+ "[%p]\n", lower_dentry);
+ goto out;
+ }
+ rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
+ if (!rc) {
+ d_delete(tlower_dentry);
+ tlower_dentry = NULL;
+ }
+ ecryptfs_copy_attr_times(dir, lower_dir_dentry->d_inode);
+ dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
+ unlock_dir(lower_dir_dentry);
+ if (!rc)
+ d_drop(dentry);
+out:
+ if (tdentry)
+ dput(tdentry);
+ if (tlower_dentry)
+ dput(tlower_dentry);
+ return rc;
+}
+
+static int
+ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+ int rc;
+ struct dentry *lower_dentry;
+ struct dentry *lower_dir_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
+ rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
+ if (rc || !lower_dentry->d_inode)
+ goto out;
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+ if (rc)
+ goto out;
+ ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
+out:
+ unlock_dir(lower_dir_dentry);
+ if (!dentry->d_inode)
+ d_drop(dentry);
+ return rc;
+}
+
+static int
+ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ int rc;
+ struct dentry *lower_old_dentry;
+ struct dentry *lower_new_dentry;
+ struct dentry *lower_old_dir_dentry;
+ struct dentry *lower_new_dir_dentry;
+
+ lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
+ lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
+ dget(lower_old_dentry);
+ dget(lower_new_dentry);
+ lower_old_dir_dentry = dget_parent(lower_old_dentry);
+ lower_new_dir_dentry = dget_parent(lower_new_dentry);
+ lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
+ lower_new_dir_dentry->d_inode, lower_new_dentry);
+ if (rc)
+ goto out_lock;
+ ecryptfs_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
+ if (new_dir != old_dir)
+ ecryptfs_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
+out_lock:
+ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ dput(lower_new_dentry);
+ dput(lower_old_dentry);
+ return rc;
+}
+
+static int
+ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
+{
+ int rc;
+ struct dentry *lower_dentry;
+ char *decoded_name;
+ char *lower_buf;
+ mm_segment_t old_fs;
+ struct ecryptfs_crypt_stat *crypt_stat;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ if (!lower_dentry->d_inode->i_op ||
+ !lower_dentry->d_inode->i_op->readlink) {
+ rc = -EINVAL;
+ goto out;
+ }
+ /* Released in this function */
+ lower_buf = kmalloc(bufsiz, GFP_KERNEL);
+ if (lower_buf == NULL) {
+ ecryptfs_printk(KERN_ERR, "Out of memory\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ old_fs = get_fs();
+ set_fs(get_ds());
+ ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
+ "lower_dentry->d_name.name = [%s]\n",
+ lower_dentry->d_name.name);
+ rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
+ (char __user *)lower_buf,
+ bufsiz);
+ set_fs(old_fs);
+ if (rc >= 0) {
+ crypt_stat = NULL;
+ rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
+ &decoded_name);
+ if (rc == -ENOMEM)
+ goto out_free_lower_buf;
+ if (rc > 0) {
+			ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
+					"to userspace: [%.*s]\n", rc, rc,
+					decoded_name);
+ if (copy_to_user(buf, decoded_name, rc))
+ rc = -EFAULT;
+ }
+ kfree(decoded_name);
+ ecryptfs_copy_attr_atime(dentry->d_inode,
+ lower_dentry->d_inode);
+ }
+out_free_lower_buf:
+ kfree(lower_buf);
+out:
+ return rc;
+}
+
+static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ char *buf;
+ int len = PAGE_SIZE, rc;
+ mm_segment_t old_fs;
+
+ /* Released in ecryptfs_put_link(); only release here on error */
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ old_fs = get_fs();
+ set_fs(get_ds());
+ ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
+ "dentry->d_name.name = [%s]\n", dentry->d_name.name);
+	rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf,
+					     len - 1);
+	set_fs(old_fs);
+	if (rc < 0)
+		goto out_free;
+	buf[rc] = '\0';
+ rc = 0;
+ nd_set_link(nd, buf);
+ goto out;
+out_free:
+ kfree(buf);
+out:
+ return ERR_PTR(rc);
+}
+
+static void
+ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
+{
+ /* Free the char* */
+ kfree(nd_get_link(nd));
+}
+
+/**
+ * upper_size_to_lower_size
+ * @crypt_stat: Crypt_stat associated with file
+ * @upper_size: Size of the upper file
+ *
+ * Calculate the required size of the lower file based on the
+ * specified size of the upper file. This calculation is based on the
+ * number of headers in the underlying file and the extent size.
+ *
+ * Returns the calculated size of the lower file.
+ */
+static loff_t
+upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
+ loff_t upper_size)
+{
+ loff_t lower_size;
+
+	lower_size = crypt_stat->header_extent_size
+		     * crypt_stat->num_header_extents_at_front;
+ if (upper_size != 0) {
+ loff_t num_extents;
+
+ num_extents = upper_size >> crypt_stat->extent_shift;
+ if (upper_size & ~crypt_stat->extent_mask)
+ num_extents++;
+ lower_size += (num_extents * crypt_stat->extent_size);
+ }
+ return lower_size;
+}
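+/* Worked example (illustrative, assuming the defaults declared in
+ * ecryptfs_kernel.h and a single 8192-byte header extent at the
+ * front): an upper size of 5000 bytes spans two 4096-byte data
+ * extents, so the lower size is 8192 + 2 * 4096 = 16384 bytes. */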
+
+/**
+ * ecryptfs_truncate
+ * @dentry: The ecryptfs layer dentry
+ * @new_length: The length to expand the file to
+ *
+ * Function to handle truncations modifying the size of the file. Note
+ * that the eCryptfs file size is mapped onto a lower file size made up
+ * of the header extents plus whole data extents. When expanding, we
+ * simply write strings of 0's out. When shrinking, we reduce the
+ * underlying file size according to that mapping.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+{
+ int rc = 0;
+ struct inode *inode = dentry->d_inode;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ struct file fake_ecryptfs_file, *lower_file = NULL;
+ struct ecryptfs_crypt_stat *crypt_stat;
+ loff_t i_size = i_size_read(inode);
+ loff_t lower_size_before_truncate;
+ loff_t lower_size_after_truncate;
+
+ if (unlikely((new_length == i_size)))
+ goto out;
+ crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+ /* Set up a fake ecryptfs file, this is used to interface with
+ * the file in the underlying filesystem so that the
+ * truncation has an effect there as well. */
+ memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
+ fake_ecryptfs_file.f_dentry = dentry;
+ /* Released at out_free: label */
+ ecryptfs_set_file_private(&fake_ecryptfs_file,
+ kmem_cache_alloc(ecryptfs_file_info_cache,
+ SLAB_KERNEL));
+ if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ /* This dget & mntget is released through fput at out_fput: */
+ dget(lower_dentry);
+ lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+ mntget(lower_mnt);
+ lower_file = dentry_open(lower_dentry, lower_mnt, O_RDWR);
+ if (unlikely(IS_ERR(lower_file))) {
+ rc = PTR_ERR(lower_file);
+ goto out_free;
+ }
+ ecryptfs_set_file_lower(&fake_ecryptfs_file, lower_file);
+ /* Switch on growing or shrinking file */
+ if (new_length > i_size) {
+ rc = ecryptfs_fill_zeros(&fake_ecryptfs_file, new_length);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR,
+ "Problem with fill_zeros\n");
+ goto out_fput;
+ }
+ i_size_write(inode, new_length);
+ rc = ecryptfs_write_inode_size_to_header(lower_file,
+ lower_dentry->d_inode,
+ inode);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR,
+ "Problem with ecryptfs_write"
+ "_inode_size\n");
+ goto out_fput;
+ }
+ } else { /* new_length < i_size_read(inode) */
+ vmtruncate(inode, new_length);
+ ecryptfs_write_inode_size_to_header(lower_file,
+ lower_dentry->d_inode,
+ inode);
+ /* We are reducing the size of the ecryptfs file, and need to
+ * know if we need to reduce the size of the lower file. */
+ lower_size_before_truncate =
+ upper_size_to_lower_size(crypt_stat, i_size);
+ lower_size_after_truncate =
+ upper_size_to_lower_size(crypt_stat, new_length);
+ if (lower_size_after_truncate < lower_size_before_truncate)
+ vmtruncate(lower_dentry->d_inode,
+ lower_size_after_truncate);
+ }
+ /* Update the access times */
+ lower_dentry->d_inode->i_mtime = lower_dentry->d_inode->i_ctime
+ = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+out_fput:
+ fput(lower_file);
+out_free:
+ if (ecryptfs_file_to_private(&fake_ecryptfs_file))
+ kmem_cache_free(ecryptfs_file_info_cache,
+ ecryptfs_file_to_private(&fake_ecryptfs_file));
+out:
+ return rc;
+}
+
+static int
+ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ int rc;
+
+ if (nd) {
+ struct vfsmount *vfsmnt_save = nd->mnt;
+ struct dentry *dentry_save = nd->dentry;
+
+ nd->mnt = ecryptfs_dentry_to_lower_mnt(nd->dentry);
+ nd->dentry = ecryptfs_dentry_to_lower(nd->dentry);
+ rc = permission(ecryptfs_inode_to_lower(inode), mask, nd);
+ nd->mnt = vfsmnt_save;
+ nd->dentry = dentry_save;
+ } else
+ rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL);
+ return rc;
+}
+
+/**
+ * ecryptfs_setattr
+ * @dentry: dentry handle to the inode to modify
+ * @ia: Structure with flags of what to change and values
+ *
+ * Updates the metadata of an inode. If the update is to the size
+ * i.e. truncation, then ecryptfs_truncate will handle the size modification
+ * of both the ecryptfs inode and the lower inode.
+ *
+ * All other metadata changes will be passed right to the lower filesystem,
+ * and we will just update our inode to look like the lower.
+ */
+static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+ int rc = 0;
+ struct dentry *lower_dentry;
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct ecryptfs_crypt_stat *crypt_stat;
+
+ crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ inode = dentry->d_inode;
+ lower_inode = ecryptfs_inode_to_lower(inode);
+ if (ia->ia_valid & ATTR_SIZE) {
+ ecryptfs_printk(KERN_DEBUG,
+ "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
+ ia->ia_valid, ATTR_SIZE);
+ rc = ecryptfs_truncate(dentry, ia->ia_size);
+ /* ecryptfs_truncate handles resizing of the lower file */
+ ia->ia_valid &= ~ATTR_SIZE;
+ ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
+ ia->ia_valid);
+ if (rc < 0)
+ goto out;
+ }
+ rc = notify_change(lower_dentry, ia);
+out:
+ ecryptfs_copy_attr_all(inode, lower_inode);
+ return rc;
+}
+
+static int
+ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ int rc = 0;
+ struct dentry *lower_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ if (!lower_dentry->d_inode->i_op->setxattr) {
+ rc = -ENOSYS;
+ goto out;
+ }
+ mutex_lock(&lower_dentry->d_inode->i_mutex);
+ rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
+ size, flags);
+ mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+ return rc;
+}
+
+static ssize_t
+ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value,
+ size_t size)
+{
+ int rc = 0;
+ struct dentry *lower_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ if (!lower_dentry->d_inode->i_op->getxattr) {
+ rc = -ENOSYS;
+ goto out;
+ }
+ mutex_lock(&lower_dentry->d_inode->i_mutex);
+ rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value,
+ size);
+ mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+ return rc;
+}
+
+static ssize_t
+ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+ int rc = 0;
+ struct dentry *lower_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ if (!lower_dentry->d_inode->i_op->listxattr) {
+ rc = -ENOSYS;
+ goto out;
+ }
+ mutex_lock(&lower_dentry->d_inode->i_mutex);
+ rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size);
+ mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+ return rc;
+}
+
+static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
+{
+ int rc = 0;
+ struct dentry *lower_dentry;
+
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ if (!lower_dentry->d_inode->i_op->removexattr) {
+ rc = -ENOSYS;
+ goto out;
+ }
+ mutex_lock(&lower_dentry->d_inode->i_mutex);
+ rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name);
+ mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+ return rc;
+}
+
+int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
+{
+ if ((ecryptfs_inode_to_lower(inode)
+ == (struct inode *)candidate_lower_inode))
+ return 1;
+ else
+ return 0;
+}
+
+int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
+{
+ ecryptfs_init_inode(inode, (struct inode *)lower_inode);
+ return 0;
+}
+
+struct inode_operations ecryptfs_symlink_iops = {
+ .readlink = ecryptfs_readlink,
+ .follow_link = ecryptfs_follow_link,
+ .put_link = ecryptfs_put_link,
+ .permission = ecryptfs_permission,
+ .setattr = ecryptfs_setattr,
+ .setxattr = ecryptfs_setxattr,
+ .getxattr = ecryptfs_getxattr,
+ .listxattr = ecryptfs_listxattr,
+ .removexattr = ecryptfs_removexattr
+};
+
+struct inode_operations ecryptfs_dir_iops = {
+ .create = ecryptfs_create,
+ .lookup = ecryptfs_lookup,
+ .link = ecryptfs_link,
+ .unlink = ecryptfs_unlink,
+ .symlink = ecryptfs_symlink,
+ .mkdir = ecryptfs_mkdir,
+ .rmdir = ecryptfs_rmdir,
+ .mknod = ecryptfs_mknod,
+ .rename = ecryptfs_rename,
+ .permission = ecryptfs_permission,
+ .setattr = ecryptfs_setattr,
+ .setxattr = ecryptfs_setxattr,
+ .getxattr = ecryptfs_getxattr,
+ .listxattr = ecryptfs_listxattr,
+ .removexattr = ecryptfs_removexattr
+};
+
+struct inode_operations ecryptfs_main_iops = {
+ .permission = ecryptfs_permission,
+ .setattr = ecryptfs_setattr,
+ .setxattr = ecryptfs_setxattr,
+ .getxattr = ecryptfs_getxattr,
+ .listxattr = ecryptfs_listxattr,
+ .removexattr = ecryptfs_removexattr
+};
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
new file mode 100644
index 000000000000..ba454785a0c5
--- /dev/null
+++ b/fs/ecryptfs/keystore.c
@@ -0,0 +1,1061 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * In-kernel key management code. Includes functions to parse and
+ * write authentication token-related packets with the underlying
+ * file.
+ *
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ * Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * request_key returned an error instead of a valid key address;
+ * determine the type of error, make appropriate log entries, and
+ * return an error code.
+ */
+int process_request_key_err(long err_code)
+{
+ int rc = 0;
+
+	/* err_code arrives as a negative errno (e.g. via PTR_ERR()) */
+	switch (err_code) {
+	case -ENOKEY:
+		ecryptfs_printk(KERN_WARNING, "No key\n");
+		rc = -ENOENT;
+		break;
+	case -EKEYEXPIRED:
+		ecryptfs_printk(KERN_WARNING, "Key expired\n");
+		rc = -ETIME;
+		break;
+	case -EKEYREVOKED:
+		ecryptfs_printk(KERN_WARNING, "Key revoked\n");
+		rc = -EINVAL;
+		break;
+ default:
+ ecryptfs_printk(KERN_WARNING, "Unknown error code: "
+ "[0x%.16x]\n", err_code);
+ rc = -EINVAL;
+ }
+ return rc;
+}
+
+static void wipe_auth_tok_list(struct list_head *auth_tok_list_head)
+{
+ struct list_head *walker;
+ struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+
+ walker = auth_tok_list_head->next;
+ while (walker != auth_tok_list_head) {
+ auth_tok_list_item =
+ list_entry(walker, struct ecryptfs_auth_tok_list_item,
+ list);
+ walker = auth_tok_list_item->list.next;
+ memset(auth_tok_list_item, 0,
+ sizeof(struct ecryptfs_auth_tok_list_item));
+ kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
+ auth_tok_list_item);
+ }
+}
+
+struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
+
+/**
+ * parse_packet_length
+ * @data: Pointer to memory containing length at offset
+ * @size: This function writes the decoded size to this memory
+ * address; zero on error
+ * @length_size: The number of bytes occupied by the encoded length
+ *
+ * Returns zero on success; non-zero on error
+ */
+static int parse_packet_length(unsigned char *data, size_t *size,
+ size_t *length_size)
+{
+ int rc = 0;
+
+ (*length_size) = 0;
+ (*size) = 0;
+ if (data[0] < 192) {
+ /* One-byte length */
+ (*size) = data[0];
+ (*length_size) = 1;
+ } else if (data[0] < 224) {
+ /* Two-byte length */
+ (*size) = ((data[0] - 192) * 256);
+ (*size) += (data[1] + 192);
+ (*length_size) = 2;
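+		/* e.g., the octets 0xC1 0x05 decode to
+		 * ((0xC1 - 192) * 256) + 0x05 + 192 = 453 */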
+ } else if (data[0] == 255) {
+ /* Five-byte length; we're not supposed to see this */
+ ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
+ "supported\n");
+ rc = -EINVAL;
+ goto out;
+ } else {
+ ecryptfs_printk(KERN_ERR, "Error parsing packet length\n");
+ rc = -EINVAL;
+ goto out;
+ }
+out:
+ return rc;
+}
+
+/**
+ * write_packet_length
+ * @dest: The byte array target into which to write the
+ * length. Must have at least 5 bytes allocated.
+ * @size: The length to write.
+ * @packet_size_length: The number of bytes used to encode the
+ * packet length is written to this address.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int write_packet_length(char *dest, size_t size,
+ size_t *packet_size_length)
+{
+ int rc = 0;
+
+ if (size < 192) {
+ dest[0] = size;
+ (*packet_size_length) = 1;
+ } else if (size < 65536) {
+ dest[0] = (((size - 192) / 256) + 192);
+ dest[1] = ((size - 192) % 256);
+ (*packet_size_length) = 2;
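+		/* e.g., a size of 453 is encoded as 0xC1 0x05:
+		 * ((453 - 192) / 256) + 192 = 0xC1 and (453 - 192) % 256 = 0x05 */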
+ } else {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_WARNING,
+ "Unsupported packet size: [%d]\n", size);
+ }
+ return rc;
+}
+
+/**
+ * parse_tag_3_packet
+ * @crypt_stat: The cryptographic context to modify based on packet
+ * contents.
+ * @data: The raw bytes of the packet.
+ * @auth_tok_list: eCryptfs parses packets into authentication tokens;
+ * a new authentication token will be placed at the end
+ * of this list for this packet.
+ * @new_auth_tok: Pointer to a pointer to memory that this function
+ * allocates; sets the memory address of the pointer to
+ * NULL on error. This object is added to the
+ * auth_tok_list.
+ * @packet_size: This function writes the size of the parsed packet
+ * into this memory location; zero on error.
+ * @max_packet_size: maximum number of bytes to parse
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
+ unsigned char *data, struct list_head *auth_tok_list,
+ struct ecryptfs_auth_tok **new_auth_tok,
+ size_t *packet_size, size_t max_packet_size)
+{
+ int rc = 0;
+ size_t body_size;
+ struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+ size_t length_size;
+
+ (*packet_size) = 0;
+ (*new_auth_tok) = NULL;
+
+ /* we check that:
+ * one byte for the Tag 3 ID flag
+ * two bytes for the body size
+ * do not exceed the maximum_packet_size
+ */
+ if (unlikely((*packet_size) + 3 > max_packet_size)) {
+ ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+	/* check for Tag 3 identifier - one byte */
+ if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) {
+ ecryptfs_printk(KERN_ERR, "Enter w/ first byte != 0x%.2x\n",
+ ECRYPTFS_TAG_3_PACKET_TYPE);
+ rc = -EINVAL;
+ goto out;
+ }
+ /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
+ * at end of function upon failure */
+ auth_tok_list_item =
+ kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, SLAB_KERNEL);
+ if (!auth_tok_list_item) {
+ ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ memset(auth_tok_list_item, 0,
+ sizeof(struct ecryptfs_auth_tok_list_item));
+ (*new_auth_tok) = &auth_tok_list_item->auth_tok;
+
+ /* check for body size - one to two bytes */
+ rc = parse_packet_length(&data[(*packet_size)], &body_size,
+ &length_size);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error parsing packet length; "
+ "rc = [%d]\n", rc);
+ goto out_free;
+ }
+ if (unlikely(body_size < (0x05 + ECRYPTFS_SALT_SIZE))) {
+ ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
+ body_size);
+ rc = -EINVAL;
+ goto out_free;
+ }
+ (*packet_size) += length_size;
+
+	/* now we know the length of the remaining Tag 3 packet size:
+	 * 5 fixed bytes for: version octet, cipher, S2K ID, hash algo,
+ * number of hash iterations
+ * ECRYPTFS_SALT_SIZE bytes for salt
+ * body_size bytes minus the stuff above is the encrypted key size
+ */
+ if (unlikely((*packet_size) + body_size > max_packet_size)) {
+ ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+ rc = -EINVAL;
+ goto out_free;
+ }
+
+ /* There are 5 characters of additional information in the
+ * packet */
+ (*new_auth_tok)->session_key.encrypted_key_size =
+ body_size - (0x05 + ECRYPTFS_SALT_SIZE);
+ ecryptfs_printk(KERN_DEBUG, "Encrypted key size = [%d]\n",
+ (*new_auth_tok)->session_key.encrypted_key_size);
+
+ /* Version 4 (from RFC2440) - one byte */
+ if (unlikely(data[(*packet_size)++] != 0x04)) {
+ ecryptfs_printk(KERN_DEBUG, "Unknown version number "
+ "[%d]\n", data[(*packet_size) - 1]);
+ rc = -EINVAL;
+ goto out_free;
+ }
+
+ /* cipher - one byte */
+ ecryptfs_cipher_code_to_string(crypt_stat->cipher,
+ (u16)data[(*packet_size)]);
+ /* A little extra work to differentiate among the AES key
+ * sizes; see RFC2440 */
+ switch(data[(*packet_size)++]) {
+ case RFC2440_CIPHER_AES_192:
+ crypt_stat->key_size = 24;
+ break;
+ default:
+ crypt_stat->key_size =
+ (*new_auth_tok)->session_key.encrypted_key_size;
+ }
+ ecryptfs_init_crypt_ctx(crypt_stat);
+ /* S2K identifier 3 (from RFC2440) */
+ if (unlikely(data[(*packet_size)++] != 0x03)) {
+ ecryptfs_printk(KERN_ERR, "Only S2K ID 3 is currently "
+ "supported\n");
+ rc = -ENOSYS;
+ goto out_free;
+ }
+
+ /* TODO: finish the hash mapping */
+ /* hash algorithm - one byte */
+ switch (data[(*packet_size)++]) {
+ case 0x01: /* See RFC2440 for these numbers and their mappings */
+ /* Choose MD5 */
+ /* salt - ECRYPTFS_SALT_SIZE bytes */
+ memcpy((*new_auth_tok)->token.password.salt,
+ &data[(*packet_size)], ECRYPTFS_SALT_SIZE);
+ (*packet_size) += ECRYPTFS_SALT_SIZE;
+
+ /* This conversion was taken straight from RFC2440 */
+ /* number of hash iterations - one byte */
+ (*new_auth_tok)->token.password.hash_iterations =
+ ((u32) 16 + (data[(*packet_size)] & 15))
+ << ((data[(*packet_size)] >> 4) + 6);
+ (*packet_size)++;
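+		/* e.g., a coded count octet of 0x60 decodes to
+		 * (16 + 0) << (0x6 + 6) = 65536 iterations */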
+
+ /* encrypted session key -
+ * (body_size-5-ECRYPTFS_SALT_SIZE) bytes */
+ memcpy((*new_auth_tok)->session_key.encrypted_key,
+ &data[(*packet_size)],
+ (*new_auth_tok)->session_key.encrypted_key_size);
+ (*packet_size) +=
+ (*new_auth_tok)->session_key.encrypted_key_size;
+ (*new_auth_tok)->session_key.flags &=
+ ~ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+ (*new_auth_tok)->session_key.flags |=
+ ECRYPTFS_CONTAINS_ENCRYPTED_KEY;
+ (*new_auth_tok)->token.password.hash_algo = 0x01;
+ break;
+ default:
+ ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: "
+ "[%d]\n", data[(*packet_size) - 1]);
+ rc = -ENOSYS;
+ goto out_free;
+ }
+ (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD;
+	/* TODO: Parameterize; we might actually want userspace to
+ * decrypt the session key. */
+ ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
+ ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT);
+ ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
+ ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT);
+ list_add(&auth_tok_list_item->list, auth_tok_list);
+ goto out;
+out_free:
+ (*new_auth_tok) = NULL;
+ memset(auth_tok_list_item, 0,
+ sizeof(struct ecryptfs_auth_tok_list_item));
+ kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
+ auth_tok_list_item);
+out:
+ if (rc)
+ (*packet_size) = 0;
+ return rc;
+}
+
+/**
+ * parse_tag_11_packet
+ * @data: The raw bytes of the packet
+ * @contents: This function writes the data contents of the literal
+ * packet into this memory location
+ * @max_contents_bytes: The maximum number of bytes that this function
+ * is allowed to write into contents
+ * @tag_11_contents_size: This function writes the size of the parsed
+ * contents into this memory location; zero on
+ * error
+ * @packet_size: This function writes the size of the parsed packet
+ * into this memory location; zero on error
+ * @max_packet_size: maximum number of bytes to parse
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+parse_tag_11_packet(unsigned char *data, unsigned char *contents,
+ size_t max_contents_bytes, size_t *tag_11_contents_size,
+ size_t *packet_size, size_t max_packet_size)
+{
+ int rc = 0;
+ size_t body_size;
+ size_t length_size;
+
+ (*packet_size) = 0;
+ (*tag_11_contents_size) = 0;
+
+ /* check that:
+ * one byte for the Tag 11 ID flag
+ * two bytes for the Tag 11 length
+ * do not exceed the maximum_packet_size
+ */
+ if (unlikely((*packet_size) + 3 > max_packet_size)) {
+ ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+	/* check for Tag 11 identifier - one byte */
+ if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) {
+ ecryptfs_printk(KERN_WARNING,
+ "Invalid tag 11 packet format\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* get Tag 11 content length - one or two bytes */
+ rc = parse_packet_length(&data[(*packet_size)], &body_size,
+ &length_size);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING,
+ "Invalid tag 11 packet format\n");
+ goto out;
+ }
+ (*packet_size) += length_size;
+
+ if (body_size < 13) {
+ ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
+ body_size);
+ rc = -EINVAL;
+ goto out;
+ }
+ /* We have 13 bytes of surrounding packet values */
+ (*tag_11_contents_size) = (body_size - 13);
+
+	/* now we know the length of the remaining Tag 11 packet size:
+	 * 14 fixed bytes for: special flag one, special flag two,
+ * 12 skipped bytes
+ * body_size bytes minus the stuff above is the Tag 11 content
+ */
+ /* FIXME why is the body size one byte smaller than the actual
+ * size of the body?
+ * this seems to be an error here as well as in
+ * write_tag_11_packet() */
+ if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) {
+ ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* special flag one - one byte */
+ if (data[(*packet_size)++] != 0x62) {
+ ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* special flag two - one byte */
+ if (data[(*packet_size)++] != 0x08) {
+ ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* skip the next 12 bytes */
+ (*packet_size) += 12; /* We don't care about the filename or
+ * the timestamp */
+
+ /* get the Tag 11 contents - tag_11_contents_size bytes */
+ memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size));
+ (*packet_size) += (*tag_11_contents_size);
+
+out:
+ if (rc) {
+ (*packet_size) = 0;
+ (*tag_11_contents_size) = 0;
+ }
+ return rc;
+}
+
+/**
+ * decrypt_session_key - Decrypt the session key with the given auth_tok.
+ *
+ * Returns zero on success; non-zero error otherwise.
+ */
+static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
+ struct ecryptfs_crypt_stat *crypt_stat)
+{
+ int rc = 0;
+ struct ecryptfs_password *password_s_ptr;
+ struct crypto_tfm *tfm = NULL;
+ struct scatterlist src_sg[2], dst_sg[2];
+ struct mutex *tfm_mutex = NULL;
+ /* TODO: Use virt_to_scatterlist for these */
+ char *encrypted_session_key;
+ char *session_key;
+
+ password_s_ptr = &auth_tok->token.password;
+ if (ECRYPTFS_CHECK_FLAG(password_s_ptr->flags,
+ ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET))
+ ecryptfs_printk(KERN_DEBUG, "Session key encryption key "
+ "set; skipping key generation\n");
+ ecryptfs_printk(KERN_DEBUG, "Session key encryption key (size [%d])"
+ ":\n",
+ password_s_ptr->session_key_encryption_key_bytes);
+ if (ecryptfs_verbosity > 0)
+ ecryptfs_dump_hex(password_s_ptr->session_key_encryption_key,
+ password_s_ptr->
+ session_key_encryption_key_bytes);
+ if (!strcmp(crypt_stat->cipher,
+ crypt_stat->mount_crypt_stat->global_default_cipher_name)
+ && crypt_stat->mount_crypt_stat->global_key_tfm) {
+ tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
+ tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
+ } else {
+ tfm = crypto_alloc_tfm(crypt_stat->cipher,
+ CRYPTO_TFM_REQ_WEAK_KEY);
+ if (!tfm) {
+ printk(KERN_ERR "Error allocating crypto context\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+ if (password_s_ptr->session_key_encryption_key_bytes
+ < crypto_tfm_alg_min_keysize(tfm)) {
+ printk(KERN_WARNING "Session key encryption key is [%d] bytes; "
+ "minimum keysize for selected cipher is [%d] bytes.\n",
+ password_s_ptr->session_key_encryption_key_bytes,
+ crypto_tfm_alg_min_keysize(tfm));
+ rc = -EINVAL;
+ goto out;
+ }
+ if (tfm_mutex)
+ mutex_lock(tfm_mutex);
+ crypto_cipher_setkey(tfm, password_s_ptr->session_key_encryption_key,
+ crypt_stat->key_size);
+ /* TODO: virt_to_scatterlist */
+ encrypted_session_key = (char *)__get_free_page(GFP_KERNEL);
+ if (!encrypted_session_key) {
+ ecryptfs_printk(KERN_ERR, "Out of memory\n");
+ rc = -ENOMEM;
+ goto out_free_tfm;
+ }
+ session_key = (char *)__get_free_page(GFP_KERNEL);
+ if (!session_key) {
+ kfree(encrypted_session_key);
+ ecryptfs_printk(KERN_ERR, "Out of memory\n");
+ rc = -ENOMEM;
+ goto out_free_tfm;
+ }
+ memcpy(encrypted_session_key, auth_tok->session_key.encrypted_key,
+ auth_tok->session_key.encrypted_key_size);
+ src_sg[0].page = virt_to_page(encrypted_session_key);
+ src_sg[0].offset = 0;
+ BUG_ON(auth_tok->session_key.encrypted_key_size > PAGE_CACHE_SIZE);
+ src_sg[0].length = auth_tok->session_key.encrypted_key_size;
+ dst_sg[0].page = virt_to_page(session_key);
+ dst_sg[0].offset = 0;
+ auth_tok->session_key.decrypted_key_size =
+ auth_tok->session_key.encrypted_key_size;
+ dst_sg[0].length = auth_tok->session_key.encrypted_key_size;
+ /* TODO: Handle error condition */
+ crypto_cipher_decrypt(tfm, dst_sg, src_sg,
+ auth_tok->session_key.encrypted_key_size);
+ auth_tok->session_key.decrypted_key_size =
+ auth_tok->session_key.encrypted_key_size;
+ memcpy(auth_tok->session_key.decrypted_key, session_key,
+ auth_tok->session_key.decrypted_key_size);
+ auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+ memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key,
+ auth_tok->session_key.decrypted_key_size);
+ ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+ ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n");
+ if (ecryptfs_verbosity > 0)
+ ecryptfs_dump_hex(crypt_stat->key,
+ crypt_stat->key_size);
+ memset(encrypted_session_key, 0, PAGE_CACHE_SIZE);
+ free_page((unsigned long)encrypted_session_key);
+ memset(session_key, 0, PAGE_CACHE_SIZE);
+ free_page((unsigned long)session_key);
+out_free_tfm:
+ if (tfm_mutex)
+ mutex_unlock(tfm_mutex);
+ else
+ crypto_free_tfm(tfm);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_parse_packet_set
+ * @crypt_stat: The cryptographic context to fill in from the parsed packets
+ * @src: The raw packet set read from the file header
+ * @ecryptfs_dentry: The eCryptfs dentry; used to retrieve the mount-wide
+ *                   crypt stat
+ *
+ * Get crypt_stat to have the file's session key if the requisite key
+ * is available to decrypt the session key.
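+ *
+ * As parsed below, each authentication token in the header is stored
+ * as a Tag 3 (symmetrically encrypted session key) packet immediately
+ * followed by a Tag 11 (literal data) packet carrying the signature
+ * of that token; see RFC 2440 for the packet formats.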
+ *
+ * Returns zero if a valid authentication token was retrieved and
+ * processed; negative value for file not encrypted or for error
+ * conditions.
+ */
+int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
+ unsigned char *src,
+ struct dentry *ecryptfs_dentry)
+{
+ size_t i = 0;
+ int rc = 0;
+ size_t found_auth_tok = 0;
+ size_t next_packet_is_auth_tok_packet;
+ char sig[ECRYPTFS_SIG_SIZE_HEX];
+ struct list_head auth_tok_list;
+ struct list_head *walker;
+ struct ecryptfs_auth_tok *chosen_auth_tok = NULL;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+ &ecryptfs_superblock_to_private(
+ ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ struct ecryptfs_auth_tok *candidate_auth_tok = NULL;
+ size_t packet_size;
+ struct ecryptfs_auth_tok *new_auth_tok;
+ unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE];
+ size_t tag_11_contents_size;
+ size_t tag_11_packet_size;
+
+ INIT_LIST_HEAD(&auth_tok_list);
+	/* Parse the header to find as many packets as we can; these will be
+	 * added to our &auth_tok_list */
+ next_packet_is_auth_tok_packet = 1;
+ while (next_packet_is_auth_tok_packet) {
+ size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
+
+ switch (src[i]) {
+ case ECRYPTFS_TAG_3_PACKET_TYPE:
+ rc = parse_tag_3_packet(crypt_stat,
+ (unsigned char *)&src[i],
+ &auth_tok_list, &new_auth_tok,
+ &packet_size, max_packet_size);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error parsing "
+ "tag 3 packet\n");
+ rc = -EIO;
+ goto out_wipe_list;
+ }
+ i += packet_size;
+ rc = parse_tag_11_packet((unsigned char *)&src[i],
+ sig_tmp_space,
+ ECRYPTFS_SIG_SIZE,
+ &tag_11_contents_size,
+ &tag_11_packet_size,
+ max_packet_size);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "No valid "
+ "(ecryptfs-specific) literal "
+ "packet containing "
+ "authentication token "
+ "signature found after "
+ "tag 3 packet\n");
+ rc = -EIO;
+ goto out_wipe_list;
+ }
+ i += tag_11_packet_size;
+ if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
+ ecryptfs_printk(KERN_ERR, "Expected "
+ "signature of size [%d]; "
+ "read size [%d]\n",
+ ECRYPTFS_SIG_SIZE,
+ tag_11_contents_size);
+ rc = -EIO;
+ goto out_wipe_list;
+ }
+ ecryptfs_to_hex(new_auth_tok->token.password.signature,
+ sig_tmp_space, tag_11_contents_size);
+ new_auth_tok->token.password.signature[
+ ECRYPTFS_PASSWORD_SIG_SIZE] = '\0';
+ ECRYPTFS_SET_FLAG(crypt_stat->flags,
+ ECRYPTFS_ENCRYPTED);
+ break;
+ case ECRYPTFS_TAG_11_PACKET_TYPE:
+ ecryptfs_printk(KERN_WARNING, "Invalid packet set "
+ "(Tag 11 not allowed by itself)\n");
+ rc = -EIO;
+ goto out_wipe_list;
+ break;
+ default:
+ ecryptfs_printk(KERN_DEBUG, "No packet at offset "
+ "[%d] of the file header; hex value of "
+ "character is [0x%.2x]\n", i, src[i]);
+ next_packet_is_auth_tok_packet = 0;
+ }
+ }
+ if (list_empty(&auth_tok_list)) {
+ rc = -EINVAL; /* Do not support non-encrypted files in
+ * the 0.1 release */
+ goto out;
+ }
+ /* If we have a global auth tok, then we should try to use
+ * it */
+ if (mount_crypt_stat->global_auth_tok) {
+ memcpy(sig, mount_crypt_stat->global_auth_tok_sig,
+ ECRYPTFS_SIG_SIZE_HEX);
+ chosen_auth_tok = mount_crypt_stat->global_auth_tok;
+ } else
+ BUG(); /* We should always have a global auth tok in
+ * the 0.1 release */
+ /* Scan list to see if our chosen_auth_tok works */
+ list_for_each(walker, &auth_tok_list) {
+ struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+ auth_tok_list_item =
+ list_entry(walker, struct ecryptfs_auth_tok_list_item,
+ list);
+ candidate_auth_tok = &auth_tok_list_item->auth_tok;
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG,
+					"Considering candidate auth tok:\n");
+ ecryptfs_dump_auth_tok(candidate_auth_tok);
+ }
+ /* TODO: Replace ECRYPTFS_SIG_SIZE_HEX w/ dynamic value */
+ if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD
+ && !strncmp(candidate_auth_tok->token.password.signature,
+ sig, ECRYPTFS_SIG_SIZE_HEX)) {
+ found_auth_tok = 1;
+ goto leave_list;
+ /* TODO: Transfer the common salt into the
+ * crypt_stat salt */
+ }
+ }
+leave_list:
+ if (!found_auth_tok) {
+ ecryptfs_printk(KERN_ERR, "Could not find authentication "
+ "token on temporary list for sig [%.*s]\n",
+ ECRYPTFS_SIG_SIZE_HEX, sig);
+ rc = -EIO;
+ goto out_wipe_list;
+ } else {
+ memcpy(&(candidate_auth_tok->token.password),
+ &(chosen_auth_tok->token.password),
+ sizeof(struct ecryptfs_password));
+ rc = decrypt_session_key(candidate_auth_tok, crypt_stat);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error decrypting the "
+ "session key\n");
+ goto out_wipe_list;
+ }
+ rc = ecryptfs_compute_root_iv(crypt_stat);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error computing "
+ "the root IV\n");
+ goto out_wipe_list;
+ }
+ }
+ rc = ecryptfs_init_crypt_ctx(crypt_stat);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error initializing crypto "
+ "context for cipher [%s]; rc = [%d]\n",
+ crypt_stat->cipher, rc);
+ }
+out_wipe_list:
+ wipe_auth_tok_list(&auth_tok_list);
+out:
+ return rc;
+}
+
+/**
+ * write_tag_11_packet
+ * @dest: Target into which Tag 11 packet is to be written
+ * @max: Maximum packet length
+ * @contents: Byte array of contents to copy in
+ * @contents_length: Number of bytes in contents
+ * @packet_length: Length of the Tag 11 packet written; zero on error
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+write_tag_11_packet(char *dest, int max, char *contents, size_t contents_length,
+ size_t *packet_length)
+{
+ int rc = 0;
+ size_t packet_size_length;
+
+ (*packet_length) = 0;
+ if ((13 + contents_length) > max) {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_ERR, "Packet length larger than "
+ "maximum allowable\n");
+ goto out;
+ }
+ /* General packet header */
+ /* Packet tag */
+ dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE;
+ /* Packet length */
+ rc = write_packet_length(&dest[(*packet_length)],
+ (13 + contents_length), &packet_size_length);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error generating tag 11 packet "
+ "header; cannot generate packet length\n");
+ goto out;
+ }
+ (*packet_length) += packet_size_length;
+ /* Tag 11 specific */
+ /* One-octet field that describes how the data is formatted */
+ dest[(*packet_length)++] = 0x62; /* binary data */
+ /* One-octet filename length followed by filename */
+ dest[(*packet_length)++] = 8;
+ memcpy(&dest[(*packet_length)], "_CONSOLE", 8);
+ (*packet_length) += 8;
+ /* Four-octet number indicating modification date */
+ memset(&dest[(*packet_length)], 0x00, 4);
+ (*packet_length) += 4;
+ /* Remainder is literal data */
+ memcpy(&dest[(*packet_length)], contents, contents_length);
+ (*packet_length) += contents_length;
+ out:
+ if (rc)
+ (*packet_length) = 0;
+ return rc;
+}
+
+/**
+ * write_tag_3_packet
+ * @dest: Buffer into which to write the packet
+ * @max: Maximum number of bytes that can be written
+ * @auth_tok: Authentication token
+ * @crypt_stat: The cryptographic context
+ * @key_rec: encrypted key
+ * @packet_size: This function will write the number of bytes that end
+ * up constituting the packet; set to zero on error
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct ecryptfs_key_record *key_rec, size_t *packet_size)
+{
+ int rc = 0;
+
+ size_t i;
+ size_t signature_is_valid = 0;
+ size_t encrypted_session_key_valid = 0;
+ char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
+ struct scatterlist dest_sg[2];
+ struct scatterlist src_sg[2];
+ struct crypto_tfm *tfm = NULL;
+ struct mutex *tfm_mutex = NULL;
+ size_t key_rec_size;
+ size_t packet_size_length;
+ size_t cipher_code;
+
+ (*packet_size) = 0;
+ /* Check for a valid signature on the auth_tok */
+ for (i = 0; i < ECRYPTFS_SIG_SIZE_HEX; i++)
+ signature_is_valid |= auth_tok->token.password.signature[i];
+ if (!signature_is_valid)
+ BUG();
+ ecryptfs_from_hex((*key_rec).sig, auth_tok->token.password.signature,
+ ECRYPTFS_SIG_SIZE);
+ encrypted_session_key_valid = 0;
+ for (i = 0; i < crypt_stat->key_size; i++)
+ encrypted_session_key_valid |=
+ auth_tok->session_key.encrypted_key[i];
+ if (encrypted_session_key_valid) {
+ memcpy((*key_rec).enc_key,
+ auth_tok->session_key.encrypted_key,
+ auth_tok->session_key.encrypted_key_size);
+ goto encrypted_session_key_set;
+ }
+ if (auth_tok->session_key.encrypted_key_size == 0)
+ auth_tok->session_key.encrypted_key_size =
+ crypt_stat->key_size;
+ if (crypt_stat->key_size == 24
+ && strcmp("aes", crypt_stat->cipher) == 0) {
+ memset((crypt_stat->key + 24), 0, 8);
+ auth_tok->session_key.encrypted_key_size = 32;
+ }
+ (*key_rec).enc_key_size =
+ auth_tok->session_key.encrypted_key_size;
+ if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
+ ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) {
+ ecryptfs_printk(KERN_DEBUG, "Using previously generated "
+ "session key encryption key of size [%d]\n",
+ auth_tok->token.password.
+ session_key_encryption_key_bytes);
+ memcpy(session_key_encryption_key,
+ auth_tok->token.password.session_key_encryption_key,
+ crypt_stat->key_size);
+ ecryptfs_printk(KERN_DEBUG,
+ "Cached session key " "encryption key: \n");
+ if (ecryptfs_verbosity > 0)
+ ecryptfs_dump_hex(session_key_encryption_key, 16);
+ }
+ if (unlikely(ecryptfs_verbosity > 0)) {
+ ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n");
+ ecryptfs_dump_hex(session_key_encryption_key, 16);
+ }
+ rc = virt_to_scatterlist(crypt_stat->key,
+ (*key_rec).enc_key_size, src_sg, 2);
+ if (!rc) {
+ ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
+ "for crypt_stat session key\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ rc = virt_to_scatterlist((*key_rec).enc_key,
+ (*key_rec).enc_key_size, dest_sg, 2);
+ if (!rc) {
+ ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
+ "for crypt_stat encrypted session key\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ if (!strcmp(crypt_stat->cipher,
+ crypt_stat->mount_crypt_stat->global_default_cipher_name)
+ && crypt_stat->mount_crypt_stat->global_key_tfm) {
+ tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
+ tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
+ } else
+ tfm = crypto_alloc_tfm(crypt_stat->cipher, 0);
+ if (!tfm) {
+ ecryptfs_printk(KERN_ERR, "Could not initialize crypto "
+ "context for cipher [%s]\n",
+ crypt_stat->cipher);
+ rc = -EINVAL;
+ goto out;
+ }
+ if (tfm_mutex)
+ mutex_lock(tfm_mutex);
+ rc = crypto_cipher_setkey(tfm, session_key_encryption_key,
+ crypt_stat->key_size);
+ if (rc < 0) {
+ if (tfm_mutex)
+ mutex_unlock(tfm_mutex);
+ ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
+ "context\n");
+ goto out;
+ }
+ rc = 0;
+ ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+ crypt_stat->key_size);
+ crypto_cipher_encrypt(tfm, dest_sg, src_sg,
+ (*key_rec).enc_key_size);
+ if (tfm_mutex)
+ mutex_unlock(tfm_mutex);
+ ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
+ if (ecryptfs_verbosity > 0)
+ ecryptfs_dump_hex((*key_rec).enc_key,
+ (*key_rec).enc_key_size);
+encrypted_session_key_set:
+ /* Now we have a valid key_rec. Append it to the
+ * key_rec set. */
+ key_rec_size = (sizeof(struct ecryptfs_key_record)
+ - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES
+ + ((*key_rec).enc_key_size));
+ /* TODO: Include a packet size limit as a parameter to this
+ * function once we have multi-packet headers (for versions
+	 * later than 0.1) */
+ if (key_rec_size >= ECRYPTFS_MAX_KEYSET_SIZE) {
+ ecryptfs_printk(KERN_ERR, "Keyset too large\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ /* TODO: Packet size limit */
+ /* We have 5 bytes of surrounding packet data */
+ if ((0x05 + ECRYPTFS_SALT_SIZE
+ + (*key_rec).enc_key_size) >= max) {
+ ecryptfs_printk(KERN_ERR, "Authentication token is too "
+ "large\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ /* This format is inspired by OpenPGP; see RFC 2440
+ * packet tag 3 */
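+	/* Layout written below: tag byte, packet length, version (0x04),
+	 * cipher code, S2K identifier (0x03), hash identifier (0x01 = MD5),
+	 * salt, coded iteration count octet, encrypted session key */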
+ dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE;
+ /* ver+cipher+s2k+hash+salt+iter+enc_key */
+ rc = write_packet_length(&dest[(*packet_size)],
+ (0x05 + ECRYPTFS_SALT_SIZE
+ + (*key_rec).enc_key_size),
+ &packet_size_length);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error generating tag 3 packet "
+ "header; cannot generate packet length\n");
+ goto out;
+ }
+ (*packet_size) += packet_size_length;
+ dest[(*packet_size)++] = 0x04; /* version 4 */
+ cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
+ if (cipher_code == 0) {
+ ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
+ "cipher [%s]\n", crypt_stat->cipher);
+ rc = -EINVAL;
+ goto out;
+ }
+ dest[(*packet_size)++] = cipher_code;
+ dest[(*packet_size)++] = 0x03; /* S2K */
+ dest[(*packet_size)++] = 0x01; /* MD5 (TODO: parameterize) */
+ memcpy(&dest[(*packet_size)], auth_tok->token.password.salt,
+ ECRYPTFS_SALT_SIZE);
+ (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */
+ dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */
+ memcpy(&dest[(*packet_size)], (*key_rec).enc_key,
+ (*key_rec).enc_key_size);
+ (*packet_size) += (*key_rec).enc_key_size;
+out:
+ if (tfm && !tfm_mutex)
+ crypto_free_tfm(tfm);
+ if (rc)
+ (*packet_size) = 0;
+ return rc;
+}
+
+/**
+ * ecryptfs_generate_key_packet_set
+ * @dest_base: Virtual address at which to write the key packet set
+ * @crypt_stat: The cryptographic context from which the
+ * authentication tokens will be retrieved
+ * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat
+ * for the global parameters
+ * @len: The amount written
+ * @max: The maximum amount of data allowed to be written
+ *
+ * Generates a key packet set and writes it to the virtual address
+ * passed in.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+int
+ecryptfs_generate_key_packet_set(char *dest_base,
+ struct ecryptfs_crypt_stat *crypt_stat,
+ struct dentry *ecryptfs_dentry, size_t *len,
+ size_t max)
+{
+ int rc = 0;
+ struct ecryptfs_auth_tok *auth_tok;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+ &ecryptfs_superblock_to_private(
+ ecryptfs_dentry->d_sb)->mount_crypt_stat;
+ size_t written;
+ struct ecryptfs_key_record key_rec;
+
+ (*len) = 0;
+ if (mount_crypt_stat->global_auth_tok) {
+ auth_tok = mount_crypt_stat->global_auth_tok;
+ if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
+ rc = write_tag_3_packet((dest_base + (*len)),
+ max, auth_tok,
+ crypt_stat, &key_rec,
+ &written);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error "
+ "writing tag 3 packet\n");
+ goto out;
+ }
+ (*len) += written;
+ /* Write auth tok signature packet */
+ rc = write_tag_11_packet(
+ (dest_base + (*len)),
+ (max - (*len)),
+ key_rec.sig, ECRYPTFS_SIG_SIZE, &written);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error writing "
+ "auth tok signature packet\n");
+ goto out;
+ }
+ (*len) += written;
+ } else {
+ ecryptfs_printk(KERN_WARNING, "Unsupported "
+ "authentication token type\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error writing "
+ "authentication token packet with sig "
+ "= [%s]\n",
+ mount_crypt_stat->global_auth_tok_sig);
+ rc = -EIO;
+ goto out;
+ }
+ } else
+ BUG();
+ if (likely((max - (*len)) > 0)) {
+ dest_base[(*len)] = 0x00;
+ } else {
+ ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n");
+ rc = -EIO;
+ }
+out:
+ if (rc)
+ (*len) = 0;
+ return rc;
+}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
new file mode 100644
index 000000000000..5938a232d11b
--- /dev/null
+++ b/fs/ecryptfs/main.c
@@ -0,0 +1,828 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ * Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <linux/netlink.h>
+#include <linux/mount.h>
+#include <linux/dcache.h>
+#include <linux/pagemap.h>
+#include <linux/key.h>
+#include <linux/parser.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * Module parameter that defines the ecryptfs_verbosity level.
+ */
+int ecryptfs_verbosity = 0;
+
+module_param(ecryptfs_verbosity, int, 0);
+MODULE_PARM_DESC(ecryptfs_verbosity,
+ "Initial verbosity level (0 or 1; defaults to "
+ "0, which is Quiet)");
+
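+/* KERN_DEBUG ("<7>") messages are suppressed unless ecryptfs_verbosity
+ * is at least 1; messages at all other log levels are always printed. */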
+void __ecryptfs_printk(const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ if (fmt[1] == '7') { /* KERN_DEBUG */
+ if (ecryptfs_verbosity >= 1)
+ vprintk(fmt, args);
+ } else
+ vprintk(fmt, args);
+ va_end(args);
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ * @flag: If set to true, then d_add is called, else d_instantiate is called
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
+ struct super_block *sb, int flag)
+{
+ struct inode *lower_inode;
+ struct inode *inode;
+ int rc = 0;
+
+ lower_inode = lower_dentry->d_inode;
+ if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
+ rc = -EXDEV;
+ goto out;
+ }
+ if (!igrab(lower_inode)) {
+ rc = -ESTALE;
+ goto out;
+ }
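+	/* The lower inode's address serves as the hash key;
+	 * ecryptfs_inode_test() and ecryptfs_inode_set() match and
+	 * initialize the eCryptfs inode against it. */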
+ inode = iget5_locked(sb, (unsigned long)lower_inode,
+ ecryptfs_inode_test, ecryptfs_inode_set,
+ lower_inode);
+ if (!inode) {
+ rc = -EACCES;
+ iput(lower_inode);
+ goto out;
+ }
+ if (inode->i_state & I_NEW)
+ unlock_new_inode(inode);
+ else
+ iput(lower_inode);
+ if (S_ISLNK(lower_inode->i_mode))
+ inode->i_op = &ecryptfs_symlink_iops;
+ else if (S_ISDIR(lower_inode->i_mode))
+ inode->i_op = &ecryptfs_dir_iops;
+ if (S_ISDIR(lower_inode->i_mode))
+ inode->i_fop = &ecryptfs_dir_fops;
+ if (special_file(lower_inode->i_mode))
+ init_special_inode(inode, lower_inode->i_mode,
+ lower_inode->i_rdev);
+ dentry->d_op = &ecryptfs_dops;
+ if (flag)
+ d_add(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
+ ecryptfs_copy_attr_all(inode, lower_inode);
+ /* This size will be overwritten for real files w/ headers and
+ * other metadata */
+ ecryptfs_copy_inode_size(inode, lower_inode);
+out:
+ return rc;
+}
+
+enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_debug,
+ ecryptfs_opt_ecryptfs_debug, ecryptfs_opt_cipher,
+ ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes,
+ ecryptfs_opt_passthrough, ecryptfs_opt_err };
+
+static match_table_t tokens = {
+ {ecryptfs_opt_sig, "sig=%s"},
+ {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
+ {ecryptfs_opt_debug, "debug=%u"},
+ {ecryptfs_opt_ecryptfs_debug, "ecryptfs_debug=%u"},
+ {ecryptfs_opt_cipher, "cipher=%s"},
+ {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"},
+ {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"},
+ {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
+ {ecryptfs_opt_err, NULL}
+};
+
+/**
+ * ecryptfs_verify_version
+ * @version: The version number to confirm
+ *
+ * Returns zero on good version; non-zero otherwise
+ */
+static int ecryptfs_verify_version(u16 version)
+{
+ int rc = 0;
+ unsigned char major;
+ unsigned char minor;
+
+ major = ((version >> 8) & 0xFF);
+ minor = (version & 0xFF);
+ if (major != ECRYPTFS_VERSION_MAJOR) {
+ ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
+ "Expected [%d]; got [%d]\n",
+ ECRYPTFS_VERSION_MAJOR, major);
+ rc = -EINVAL;
+ goto out;
+ }
+ if (minor != ECRYPTFS_VERSION_MINOR) {
+ ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
+ "Expected [%d]; got [%d]\n",
+ ECRYPTFS_VERSION_MINOR, minor);
+ rc = -EINVAL;
+ goto out;
+ }
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_parse_options
+ * @sb: The ecryptfs super block
+ * @options: The options passed to the kernel
+ *
+ * Parse mount options:
+ * debug=N - ecryptfs_verbosity level for debug output
+ * sig=XXX - description(signature) of the key to use
+ *
+ * We want to mount our stackable file system on top of the lower-level
+ * (interposed) directory, which is supplied separately as the mount
+ * source and handled in ecryptfs_read_super().
+ *
+ * The signature of the key to use must be the description of a key
+ * already in the keyring. Mounting will fail if the key can not be
+ * found.
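+ *
+ * Example (illustrative):
+ *   mount -t ecryptfs /lower/dir /mnt/ecryptfs \
+ *         -o sig=<auth tok signature in hex>,cipher=aes,ecryptfs_key_bytes=16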
+ *
+ * Returns zero on success; non-zero on error
+ */
+static int ecryptfs_parse_options(struct super_block *sb, char *options)
+{
+ char *p;
+ int rc = 0;
+ int sig_set = 0;
+ int cipher_name_set = 0;
+ int cipher_key_bytes;
+ int cipher_key_bytes_set = 0;
+ struct key *auth_tok_key = NULL;
+ struct ecryptfs_auth_tok *auth_tok = NULL;
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+ &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+ substring_t args[MAX_OPT_ARGS];
+ int token;
+ char *sig_src;
+ char *sig_dst;
+ char *debug_src;
+ char *cipher_name_dst;
+ char *cipher_name_src;
+ char *cipher_key_bytes_src;
+ struct crypto_tfm *tmp_tfm;
+ int cipher_name_len;
+
+ if (!options) {
+ rc = -EINVAL;
+ goto out;
+ }
+ while ((p = strsep(&options, ",")) != NULL) {
+ if (!*p)
+ continue;
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case ecryptfs_opt_sig:
+ case ecryptfs_opt_ecryptfs_sig:
+ sig_src = args[0].from;
+ sig_dst =
+ mount_crypt_stat->global_auth_tok_sig;
+ memcpy(sig_dst, sig_src, ECRYPTFS_SIG_SIZE_HEX);
+ sig_dst[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+ ecryptfs_printk(KERN_DEBUG,
+ "The mount_crypt_stat "
+ "global_auth_tok_sig set to: "
+ "[%s]\n", sig_dst);
+ sig_set = 1;
+ break;
+ case ecryptfs_opt_debug:
+ case ecryptfs_opt_ecryptfs_debug:
+ debug_src = args[0].from;
+ ecryptfs_verbosity =
+ (int)simple_strtol(debug_src, &debug_src,
+ 0);
+ ecryptfs_printk(KERN_DEBUG,
+ "Verbosity set to [%d]" "\n",
+ ecryptfs_verbosity);
+ break;
+ case ecryptfs_opt_cipher:
+ case ecryptfs_opt_ecryptfs_cipher:
+ cipher_name_src = args[0].from;
+ cipher_name_dst =
+ mount_crypt_stat->
+ global_default_cipher_name;
+ strncpy(cipher_name_dst, cipher_name_src,
+ ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+ ecryptfs_printk(KERN_DEBUG,
+ "The mount_crypt_stat "
+ "global_default_cipher_name set to: "
+ "[%s]\n", cipher_name_dst);
+ cipher_name_set = 1;
+ break;
+ case ecryptfs_opt_ecryptfs_key_bytes:
+ cipher_key_bytes_src = args[0].from;
+ cipher_key_bytes =
+ (int)simple_strtol(cipher_key_bytes_src,
+ &cipher_key_bytes_src, 0);
+ mount_crypt_stat->global_default_cipher_key_size =
+ cipher_key_bytes;
+ ecryptfs_printk(KERN_DEBUG,
+ "The mount_crypt_stat "
+ "global_default_cipher_key_size "
+ "set to: [%d]\n", mount_crypt_stat->
+ global_default_cipher_key_size);
+ cipher_key_bytes_set = 1;
+ break;
+ case ecryptfs_opt_passthrough:
+ mount_crypt_stat->flags |=
+ ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED;
+ break;
+ case ecryptfs_opt_err:
+ default:
+ ecryptfs_printk(KERN_WARNING,
+ "eCryptfs: unrecognized option '%s'\n",
+ p);
+ }
+ }
+ /* Do not support lack of mount-wide signature in 0.1
+ * release */
+ if (!sig_set) {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_ERR, "You must supply a valid "
+ "passphrase auth tok signature as a mount "
+ "parameter; see the eCryptfs README\n");
+ goto out;
+ }
+ if (!cipher_name_set) {
+ cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
+ if (unlikely(cipher_name_len
+ >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) {
+ rc = -EINVAL;
+ BUG();
+ goto out;
+ }
+ memcpy(mount_crypt_stat->global_default_cipher_name,
+ ECRYPTFS_DEFAULT_CIPHER, cipher_name_len);
+ mount_crypt_stat->global_default_cipher_name[cipher_name_len]
+ = '\0';
+ }
+ if (!cipher_key_bytes_set) {
+ mount_crypt_stat->global_default_cipher_key_size =
+ ECRYPTFS_DEFAULT_KEY_BYTES;
+ ecryptfs_printk(KERN_DEBUG, "Cipher key size was not "
+ "specified. Defaulting to [%d]\n",
+ mount_crypt_stat->
+ global_default_cipher_key_size);
+ }
+ rc = ecryptfs_process_cipher(
+ &tmp_tfm,
+ &mount_crypt_stat->global_key_tfm,
+ mount_crypt_stat->global_default_cipher_name,
+ mount_crypt_stat->global_default_cipher_key_size);
+ if (tmp_tfm)
+ crypto_free_tfm(tmp_tfm);
+ if (rc) {
+ printk(KERN_ERR "Error attempting to initialize cipher [%s] "
+ "with key size [%Zd] bytes; rc = [%d]\n",
+ mount_crypt_stat->global_default_cipher_name,
+ mount_crypt_stat->global_default_cipher_key_size, rc);
+ rc = -EINVAL;
+ goto out;
+ }
+ mutex_init(&mount_crypt_stat->global_key_tfm_mutex);
+ ecryptfs_printk(KERN_DEBUG, "Requesting the key with description: "
+ "[%s]\n", mount_crypt_stat->global_auth_tok_sig);
+	/* The reference to this key is held until umount is done. The
+ * call to key_put is done in ecryptfs_put_super() */
+ auth_tok_key = request_key(&key_type_user,
+ mount_crypt_stat->global_auth_tok_sig,
+ NULL);
+ if (!auth_tok_key || IS_ERR(auth_tok_key)) {
+ ecryptfs_printk(KERN_ERR, "Could not find key with "
+ "description: [%s]\n",
+ mount_crypt_stat->global_auth_tok_sig);
+ process_request_key_err(PTR_ERR(auth_tok_key));
+ rc = -EINVAL;
+ goto out;
+ }
+ auth_tok = ecryptfs_get_key_payload_data(auth_tok_key);
+ if (ecryptfs_verify_version(auth_tok->version)) {
+ ecryptfs_printk(KERN_ERR, "Data structure version mismatch. "
+ "Userspace tools must match eCryptfs kernel "
+ "module with major version [%d] and minor "
+ "version [%d]\n", ECRYPTFS_VERSION_MAJOR,
+ ECRYPTFS_VERSION_MINOR);
+ rc = -EINVAL;
+ goto out;
+ }
+ if (auth_tok->token_type != ECRYPTFS_PASSWORD) {
+ ecryptfs_printk(KERN_ERR, "Invalid auth_tok structure "
+ "returned from key\n");
+ rc = -EINVAL;
+ goto out;
+ }
+ mount_crypt_stat->global_auth_tok_key = auth_tok_key;
+ mount_crypt_stat->global_auth_tok = auth_tok;
+out:
+ return rc;
+}
+
+struct kmem_cache *ecryptfs_sb_info_cache;
+
+/**
+ * ecryptfs_fill_super
+ * @sb: The ecryptfs super block
+ * @raw_data: The options passed to mount
+ * @silent: Not used but required by function prototype
+ *
+ * Sets up what we can of the sb, rest is done in ecryptfs_read_super
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int
+ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
+{
+ int rc = 0;
+
+ /* Released in ecryptfs_put_super() */
+ ecryptfs_set_superblock_private(sb,
+ kmem_cache_alloc(ecryptfs_sb_info_cache,
+ SLAB_KERNEL));
+ if (!ecryptfs_superblock_to_private(sb)) {
+ ecryptfs_printk(KERN_WARNING, "Out of memory\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ memset(ecryptfs_superblock_to_private(sb), 0,
+ sizeof(struct ecryptfs_sb_info));
+ sb->s_op = &ecryptfs_sops;
+ /* Released through deactivate_super(sb) from get_sb_nodev */
+ sb->s_root = d_alloc(NULL, &(const struct qstr) {
+ .hash = 0,.name = "/",.len = 1});
+ if (!sb->s_root) {
+ ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ sb->s_root->d_op = &ecryptfs_dops;
+ sb->s_root->d_sb = sb;
+ sb->s_root->d_parent = sb->s_root;
+ /* Released in d_release when dput(sb->s_root) is called */
+ /* through deactivate_super(sb) from get_sb_nodev() */
+ ecryptfs_set_dentry_private(sb->s_root,
+ kmem_cache_alloc(ecryptfs_dentry_info_cache,
+ SLAB_KERNEL));
+ if (!ecryptfs_dentry_to_private(sb->s_root)) {
+ ecryptfs_printk(KERN_ERR,
+ "dentry_info_cache alloc failed\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+ memset(ecryptfs_dentry_to_private(sb->s_root), 0,
+ sizeof(struct ecryptfs_dentry_info));
+ rc = 0;
+out:
+ /* Should be able to rely on deactivate_super called from
+ * get_sb_nodev */
+ return rc;
+}
+
+/**
+ * ecryptfs_read_super
+ * @sb: The ecryptfs super block
+ * @dev_name: The path to mount over
+ *
+ * Read the super block of the lower filesystem, and use
+ * ecryptfs_interpose to create our initial inode and super block
+ * struct.
+ */
+static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
+{
+ int rc;
+ struct nameidata nd;
+ struct dentry *lower_root;
+ struct vfsmount *lower_mnt;
+
+ memset(&nd, 0, sizeof(struct nameidata));
+ rc = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
+ goto out_free;
+ }
+ lower_root = nd.dentry;
+ if (!lower_root->d_inode) {
+ ecryptfs_printk(KERN_WARNING,
+ "No directory to interpose on\n");
+ rc = -ENOENT;
+ goto out_free;
+ }
+ lower_mnt = nd.mnt;
+ ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
+ sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
+ ecryptfs_set_dentry_lower(sb->s_root, lower_root);
+ ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
+ if ((rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0)))
+ goto out_free;
+ rc = 0;
+ goto out;
+out_free:
+ path_release(&nd);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_get_sb
+ * @fs_type
+ * @flags
+ * @dev_name: The path to mount over
+ * @raw_data: The options passed into the kernel
+ *
+ * The whole ecryptfs_get_sb process is broken into 4 functions:
+ * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
+ * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
+ * with as much information as it can before needing
+ * the lower filesystem.
+ * ecryptfs_read_super(): this accesses the lower filesystem and uses
+ *                         ecryptfs_interpose to perform most of the linking
+ *  ecryptfs_interpose(): links the lower filesystem into ecryptfs
+ */
+static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *raw_data,
+ struct vfsmount *mnt)
+{
+ int rc;
+ struct super_block *sb;
+
+ rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
+ if (rc < 0) {
+ printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
+ goto out;
+ }
+ sb = mnt->mnt_sb;
+ rc = ecryptfs_parse_options(sb, raw_data);
+ if (rc) {
+ printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
+ goto out_abort;
+ }
+ rc = ecryptfs_read_super(sb, dev_name);
+ if (rc) {
+ printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
+ goto out_abort;
+ }
+ goto out;
+out_abort:
+ dput(sb->s_root);
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_kill_block_super
+ * @sb: The ecryptfs super block
+ *
+ * Used to bring the superblock down and free the private data.
+ * Private data is freed in ecryptfs_put_super()
+ */
+static void ecryptfs_kill_block_super(struct super_block *sb)
+{
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type ecryptfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ecryptfs",
+ .get_sb = ecryptfs_get_sb,
+ .kill_sb = ecryptfs_kill_block_super,
+ .fs_flags = 0
+};
+
+/**
+ * inode_info_init_once
+ *
+ * Initializes the ecryptfs_inode_info_cache when it is created
+ */
+static void
+inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
+{
+ struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
+
+ if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(&ei->vfs_inode);
+}
+
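+/* Table describing every kmem cache that eCryptfs creates at module
+ * load and destroys at unload; see ecryptfs_init_kmem_caches() and
+ * ecryptfs_free_kmem_caches() below. */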
+static struct ecryptfs_cache_info {
+ kmem_cache_t **cache;
+ const char *name;
+ size_t size;
+ void (*ctor)(void*, struct kmem_cache *, unsigned long);
+} ecryptfs_cache_infos[] = {
+ {
+ .cache = &ecryptfs_auth_tok_list_item_cache,
+ .name = "ecryptfs_auth_tok_list_item",
+ .size = sizeof(struct ecryptfs_auth_tok_list_item),
+ },
+ {
+ .cache = &ecryptfs_file_info_cache,
+ .name = "ecryptfs_file_cache",
+ .size = sizeof(struct ecryptfs_file_info),
+ },
+ {
+ .cache = &ecryptfs_dentry_info_cache,
+ .name = "ecryptfs_dentry_info_cache",
+ .size = sizeof(struct ecryptfs_dentry_info),
+ },
+ {
+ .cache = &ecryptfs_inode_info_cache,
+ .name = "ecryptfs_inode_cache",
+ .size = sizeof(struct ecryptfs_inode_info),
+ .ctor = inode_info_init_once,
+ },
+ {
+ .cache = &ecryptfs_sb_info_cache,
+ .name = "ecryptfs_sb_cache",
+ .size = sizeof(struct ecryptfs_sb_info),
+ },
+ {
+ .cache = &ecryptfs_header_cache_0,
+ .name = "ecryptfs_headers_0",
+ .size = PAGE_CACHE_SIZE,
+ },
+ {
+ .cache = &ecryptfs_header_cache_1,
+ .name = "ecryptfs_headers_1",
+ .size = PAGE_CACHE_SIZE,
+ },
+ {
+ .cache = &ecryptfs_header_cache_2,
+ .name = "ecryptfs_headers_2",
+ .size = PAGE_CACHE_SIZE,
+ },
+ {
+ .cache = &ecryptfs_lower_page_cache,
+ .name = "ecryptfs_lower_page_cache",
+ .size = PAGE_CACHE_SIZE,
+ },
+};
+
+static void ecryptfs_free_kmem_caches(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
+ struct ecryptfs_cache_info *info;
+
+ info = &ecryptfs_cache_infos[i];
+ if (*(info->cache))
+ kmem_cache_destroy(*(info->cache));
+ }
+}
+
+/**
+ * ecryptfs_init_kmem_caches
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_init_kmem_caches(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
+ struct ecryptfs_cache_info *info;
+
+ info = &ecryptfs_cache_infos[i];
+ *(info->cache) = kmem_cache_create(info->name, info->size,
+ 0, SLAB_HWCACHE_ALIGN, info->ctor, NULL);
+ if (!*(info->cache)) {
+ ecryptfs_free_kmem_caches();
+ ecryptfs_printk(KERN_WARNING, "%s: "
+ "kmem_cache_create failed\n",
+ info->name);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+struct ecryptfs_obj {
+ char *name;
+ struct list_head slot_list;
+ struct kobject kobj;
+};
+
+struct ecryptfs_attribute {
+ struct attribute attr;
+ ssize_t(*show) (struct ecryptfs_obj *, char *);
+ ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
+};
+
+static ssize_t
+ecryptfs_attr_store(struct kobject *kobj,
+ struct attribute *attr, const char *buf, size_t len)
+{
+ struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
+ kobj);
+ struct ecryptfs_attribute *attribute =
+ container_of(attr, struct ecryptfs_attribute, attr);
+
+ return (attribute->store ? attribute->store(obj, buf, len) : 0);
+}
+
+static ssize_t
+ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
+ kobj);
+ struct ecryptfs_attribute *attribute =
+ container_of(attr, struct ecryptfs_attribute, attr);
+
+ return (attribute->show ? attribute->show(obj, buf) : 0);
+}
+
+static struct sysfs_ops ecryptfs_sysfs_ops = {
+ .show = ecryptfs_attr_show,
+ .store = ecryptfs_attr_store
+};
+
+static struct kobj_type ecryptfs_ktype = {
+ .sysfs_ops = &ecryptfs_sysfs_ops
+};
+
+static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
+
+static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
+{
+ return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
+}
+
+static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
+
+struct ecryptfs_version_str_map_elem {
+ u32 flag;
+ char *str;
+} ecryptfs_version_str_map[] = {
+ {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
+ {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
+ {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
+ {ECRYPTFS_VERSIONING_POLICY, "policy"}
+};
+
+static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
+{
+ int i;
+ int remaining = PAGE_SIZE;
+ int total_written = 0;
+
+ buff[0] = '\0';
+ for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
+ int entry_size;
+
+ if (!(ECRYPTFS_VERSIONING_MASK
+ & ecryptfs_version_str_map[i].flag))
+ continue;
+ entry_size = strlen(ecryptfs_version_str_map[i].str);
+ if ((entry_size + 2) > remaining)
+ goto out;
+ memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
+ buff[entry_size++] = '\n';
+ buff[entry_size] = '\0';
+ buff += entry_size;
+ total_written += entry_size;
+ remaining -= entry_size;
+ }
+out:
+ return total_written;
+}
+
+static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
+
+static int do_sysfs_registration(void)
+{
+ int rc;
+
+ if ((rc = subsystem_register(&ecryptfs_subsys))) {
+ printk(KERN_ERR
+ "Unable to register ecryptfs sysfs subsystem\n");
+ goto out;
+ }
+ rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+ &sysfs_attr_version.attr);
+ if (rc) {
+ printk(KERN_ERR
+ "Unable to create ecryptfs version attribute\n");
+ subsystem_unregister(&ecryptfs_subsys);
+ goto out;
+ }
+ rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+ &sysfs_attr_version_str.attr);
+ if (rc) {
+ printk(KERN_ERR
+ "Unable to create ecryptfs version_str attribute\n");
+ sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+ &sysfs_attr_version.attr);
+ subsystem_unregister(&ecryptfs_subsys);
+ goto out;
+ }
+out:
+ return rc;
+}
+
+static int __init ecryptfs_init(void)
+{
+ int rc;
+
+ if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
+ rc = -EINVAL;
+ ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
+ "larger than the host's page size, and so "
+ "eCryptfs cannot run on this system. The "
+ "default eCryptfs extent size is [%d] bytes; "
+ "the page size is [%d] bytes.\n",
+ ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+ goto out;
+ }
+ rc = ecryptfs_init_kmem_caches();
+ if (rc) {
+ printk(KERN_ERR
+ "Failed to allocate one or more kmem_cache objects\n");
+ goto out;
+ }
+ rc = register_filesystem(&ecryptfs_fs_type);
+ if (rc) {
+ printk(KERN_ERR "Failed to register filesystem\n");
+ ecryptfs_free_kmem_caches();
+ goto out;
+ }
+ kset_set_kset_s(&ecryptfs_subsys, fs_subsys);
+ sysfs_attr_version.attr.owner = THIS_MODULE;
+ sysfs_attr_version_str.attr.owner = THIS_MODULE;
+ rc = do_sysfs_registration();
+ if (rc) {
+ printk(KERN_ERR "sysfs registration failed\n");
+ unregister_filesystem(&ecryptfs_fs_type);
+ ecryptfs_free_kmem_caches();
+ goto out;
+ }
+out:
+ return rc;
+}
+
+static void __exit ecryptfs_exit(void)
+{
+ sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+ &sysfs_attr_version.attr);
+ sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+ &sysfs_attr_version_str.attr);
+ subsystem_unregister(&ecryptfs_subsys);
+ unregister_filesystem(&ecryptfs_fs_type);
+ ecryptfs_free_kmem_caches();
+}
+
+MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>");
+MODULE_DESCRIPTION("eCryptfs");
+
+MODULE_LICENSE("GPL");
+
+module_init(ecryptfs_init)
+module_exit(ecryptfs_exit)
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
new file mode 100644
index 000000000000..924dd90a4cf5
--- /dev/null
+++ b/fs/ecryptfs/mmap.c
@@ -0,0 +1,788 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * This is where eCryptfs coordinates the symmetric encryption and
+ * decryption of the file data as it passes between the lower
+ * encrypted file and the upper decrypted file.
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/page-flags.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include "ecryptfs_kernel.h"
+
+struct kmem_cache *ecryptfs_lower_page_cache;
+
+/**
+ * ecryptfs_get1page
+ *
+ * Get one page from cache or lower f/s, return error otherwise.
+ *
+ * Returns unlocked and up-to-date page (if ok), with increased
+ * refcnt.
+ */
+static struct page *ecryptfs_get1page(struct file *file, int index)
+{
+ struct page *page;
+ struct dentry *dentry;
+ struct inode *inode;
+ struct address_space *mapping;
+
+ dentry = file->f_dentry;
+ inode = dentry->d_inode;
+ mapping = inode->i_mapping;
+ page = read_cache_page(mapping, index,
+ (filler_t *)mapping->a_ops->readpage,
+ (void *)file);
+ if (IS_ERR(page))
+ goto out;
+ wait_on_page_locked(page);
+out:
+ return page;
+}
+
+static
+int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros);
+
+/**
+ * ecryptfs_fill_zeros
+ * @file: The ecryptfs file
+ * @new_length: The new length of the data in the underlying file;
+ * everything between the prior end of the file and the
+ *              new end of the file will be filled with zeros.
+ *              new_length must be greater than the current length.
+ *
+ * Function for handling lseek-ing past the end of the file.
+ *
+ * This function does not support shrinking, only growing a file.
+ *
+ * Returns zero on success; non-zero otherwise.
+ */
+int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
+{
+ int rc = 0;
+ struct dentry *dentry = file->f_dentry;
+ struct inode *inode = dentry->d_inode;
+ pgoff_t old_end_page_index = 0;
+ pgoff_t index = old_end_page_index;
+ int old_end_pos_in_page = -1;
+ pgoff_t new_end_page_index;
+ int new_end_pos_in_page;
+ loff_t cur_length = i_size_read(inode);
+
+ if (cur_length != 0) {
+ index = old_end_page_index =
+ ((cur_length - 1) >> PAGE_CACHE_SHIFT);
+ old_end_pos_in_page = ((cur_length - 1) & ~PAGE_CACHE_MASK);
+ }
+ new_end_page_index = ((new_length - 1) >> PAGE_CACHE_SHIFT);
+ new_end_pos_in_page = ((new_length - 1) & ~PAGE_CACHE_MASK);
+ ecryptfs_printk(KERN_DEBUG, "old_end_page_index = [0x%.16x]; "
+ "old_end_pos_in_page = [%d]; "
+ "new_end_page_index = [0x%.16x]; "
+ "new_end_pos_in_page = [%d]\n",
+ old_end_page_index, old_end_pos_in_page,
+ new_end_page_index, new_end_pos_in_page);
+ if (old_end_page_index == new_end_page_index) {
+ /* Start and end are in the same page; we just need to
+ * set a portion of the existing page to zeros */
+ rc = write_zeros(file, index, (old_end_pos_in_page + 1),
+ (new_end_pos_in_page - old_end_pos_in_page));
+ if (rc)
+ ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
+ "index=[0x%.16x], "
+ "old_end_pos_in_page=[d], "
+ "(PAGE_CACHE_SIZE - new_end_pos_in_page"
+ "=[%d]"
+ ")=[d]) returned [%d]\n", file, index,
+ old_end_pos_in_page,
+ new_end_pos_in_page,
+ (PAGE_CACHE_SIZE - new_end_pos_in_page),
+ rc);
+ goto out;
+ }
+ /* Fill the remainder of the previous last page with zeros */
+ rc = write_zeros(file, index, (old_end_pos_in_page + 1),
+ ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page));
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
+ "index=[0x%.16x], old_end_pos_in_page=[d], "
+ "(PAGE_CACHE_SIZE - old_end_pos_in_page)=[d]) "
+ "returned [%d]\n", file, index,
+ old_end_pos_in_page,
+ (PAGE_CACHE_SIZE - old_end_pos_in_page), rc);
+ goto out;
+ }
+ index++;
+ while (index < new_end_page_index) {
+ /* Fill all intermediate pages with zeros */
+ rc = write_zeros(file, index, 0, PAGE_CACHE_SIZE);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
+ "index=[0x%.16x], "
+ "old_end_pos_in_page=[d], "
+ "(PAGE_CACHE_SIZE - new_end_pos_in_page"
+ "=[%d]"
+ ")=[d]) returned [%d]\n", file, index,
+ old_end_pos_in_page,
+ new_end_pos_in_page,
+ (PAGE_CACHE_SIZE - new_end_pos_in_page),
+ rc);
+ goto out;
+ }
+ index++;
+ }
+ /* Fill the portion at the beginning of the last new page with
+ * zeros */
+ rc = write_zeros(file, index, 0, (new_end_pos_in_page + 1));
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "write_zeros(file="
+ "[%p], index=[0x%.16x], 0, "
+ "new_end_pos_in_page=[%d]"
+ "returned [%d]\n", file, index,
+ new_end_pos_in_page, rc);
+ goto out;
+ }
+out:
+ return rc;
+}
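To make the index arithmetic above concrete, here is a worked example; the numbers are illustrative only and assume 4 KiB pages (PAGE_CACHE_SHIFT == 12), not anything mandated by the patch:

/* Growing a file from 5000 to 10000 bytes with 4096-byte pages:
 *   old_end_page_index  = (5000 - 1) >> 12              = 1
 *   old_end_pos_in_page = (5000 - 1) & ~PAGE_CACHE_MASK  = 903
 *   new_end_page_index  = (10000 - 1) >> 12             = 2
 *   new_end_pos_in_page = (10000 - 1) & ~PAGE_CACHE_MASK = 1807
 * so write_zeros() fills bytes 904..4095 of page 1, the intermediate-page
 * loop has nothing to do, and bytes 0..1807 of page 2 are then zeroed.
 */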
+
+/**
+ * ecryptfs_writepage
+ * @page: Page that is locked before this call is made
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct ecryptfs_page_crypt_context ctx;
+ int rc;
+
+ ctx.page = page;
+ ctx.mode = ECRYPTFS_WRITEPAGE_MODE;
+ ctx.param.wbc = wbc;
+ rc = ecryptfs_encrypt_page(&ctx);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error encrypting "
+ "page (upper index [0x%.16x])\n", page->index);
+ ClearPageUptodate(page);
+ goto out;
+ }
+ SetPageUptodate(page);
+ unlock_page(page);
+out:
+ return rc;
+}
+
+/**
+ * Reads the data from the lower file at index lower_page_index
+ * and copies that data into page.
+ *
+ * @param page Page to fill
+ * @param lower_page_index Index of the page in the lower file to get
+ */
+int ecryptfs_do_readpage(struct file *file, struct page *page,
+ pgoff_t lower_page_index)
+{
+ int rc;
+ struct dentry *dentry;
+ struct file *lower_file;
+ struct dentry *lower_dentry;
+ struct inode *inode;
+ struct inode *lower_inode;
+ char *page_data;
+ struct page *lower_page = NULL;
+ char *lower_page_data;
+ const struct address_space_operations *lower_a_ops;
+
+ dentry = file->f_dentry;
+ lower_file = ecryptfs_file_to_lower(file);
+ lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ inode = dentry->d_inode;
+ lower_inode = ecryptfs_inode_to_lower(inode);
+ lower_a_ops = lower_inode->i_mapping->a_ops;
+ lower_page = read_cache_page(lower_inode->i_mapping, lower_page_index,
+ (filler_t *)lower_a_ops->readpage,
+ (void *)lower_file);
+ if (IS_ERR(lower_page)) {
+ rc = PTR_ERR(lower_page);
+ lower_page = NULL;
+ ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
+ goto out;
+ }
+ wait_on_page_locked(lower_page);
+ page_data = (char *)kmap(page);
+ if (!page_data) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Error mapping page\n");
+ goto out;
+ }
+ lower_page_data = (char *)kmap(lower_page);
+ if (!lower_page_data) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_ERR, "Error mapping page\n");
+ kunmap(page);
+ goto out;
+ }
+ memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
+ kunmap(lower_page);
+ kunmap(page);
+ rc = 0;
+out:
+ if (likely(lower_page))
+ page_cache_release(lower_page);
+ if (rc == 0)
+ SetPageUptodate(page);
+ else
+ ClearPageUptodate(page);
+ return rc;
+}
+
+/**
+ * ecryptfs_readpage
+ * @file: This is an ecryptfs file
+ * @page: ecryptfs associated page to stick the read data into
+ *
+ * Read in a page, decrypting if necessary.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int ecryptfs_readpage(struct file *file, struct page *page)
+{
+ int rc = 0;
+ struct ecryptfs_crypt_stat *crypt_stat;
+
+ BUG_ON(!(file && file->f_dentry && file->f_dentry->d_inode));
+ crypt_stat =
+ &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
+ if (!crypt_stat
+ || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)
+ || ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
+ ecryptfs_printk(KERN_DEBUG,
+ "Passing through unencrypted page\n");
+ rc = ecryptfs_do_readpage(file, page, page->index);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error reading page; rc = "
+ "[%d]\n", rc);
+ goto out;
+ }
+ } else {
+ rc = ecryptfs_decrypt_page(file, page);
+ if (rc) {
+
+ ecryptfs_printk(KERN_ERR, "Error decrypting page; "
+ "rc = [%d]\n", rc);
+ goto out;
+ }
+ }
+ SetPageUptodate(page);
+out:
+ if (rc)
+ ClearPageUptodate(page);
+ ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+ page->index);
+ unlock_page(page);
+ return rc;
+}
+
+static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
+{
+ struct inode *inode = page->mapping->host;
+ int end_byte_in_page;
+ int rc = 0;
+ char *page_virt;
+
+ if ((i_size_read(inode) / PAGE_CACHE_SIZE) == page->index) {
+ end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
+ if (to > end_byte_in_page)
+ end_byte_in_page = to;
+ page_virt = kmap(page);
+ if (!page_virt) {
+ rc = -ENOMEM;
+ ecryptfs_printk(KERN_WARNING,
+ "Could not map page\n");
+ goto out;
+ }
+ memset((page_virt + end_byte_in_page), 0,
+ (PAGE_CACHE_SIZE - end_byte_in_page));
+ kunmap(page);
+ }
+out:
+ return rc;
+}
+
+static int ecryptfs_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ int rc = 0;
+
+ kmap(page);
+ if (from == 0 && to == PAGE_CACHE_SIZE)
+ goto out; /* If we are writing a full page, it will be
+ up to date. */
+ if (!PageUptodate(page))
+ rc = ecryptfs_do_readpage(file, page, page->index);
+out:
+ return rc;
+}
+
+int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
+ char **lower_virt,
+ struct inode *lower_inode,
+ unsigned long lower_page_index)
+{
+ int rc = 0;
+
+ (*lower_page) = grab_cache_page(lower_inode->i_mapping,
+ lower_page_index);
+ if (!(*lower_page)) {
+ ecryptfs_printk(KERN_ERR, "grab_cache_page for "
+ "lower_page_index = [0x%.16x] failed\n",
+ lower_page_index);
+ rc = -EINVAL;
+ goto out;
+ }
+ if (lower_virt)
+ (*lower_virt) = kmap((*lower_page));
+ else
+ kmap((*lower_page));
+out:
+ return rc;
+}
+
+int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
+ struct inode *lower_inode,
+ struct writeback_control *wbc)
+{
+ int rc = 0;
+
+ rc = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error calling lower writepage(); "
+ "rc = [%d]\n", rc);
+ goto out;
+ }
+ lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+ page_cache_release(lower_page);
+out:
+ return rc;
+}
+
+static void ecryptfs_unmap_and_release_lower_page(struct page *lower_page)
+{
+ kunmap(lower_page);
+ ecryptfs_printk(KERN_DEBUG, "Unlocking lower page with index = "
+ "[0x%.16x]\n", lower_page->index);
+ unlock_page(lower_page);
+ page_cache_release(lower_page);
+}
+
+/**
+ * ecryptfs_write_inode_size_to_header
+ *
+ * Writes the lower file size to the first 8 bytes of the header.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+int
+ecryptfs_write_inode_size_to_header(struct file *lower_file,
+ struct inode *lower_inode,
+ struct inode *inode)
+{
+ int rc = 0;
+ struct page *header_page;
+ char *header_virt;
+ const struct address_space_operations *lower_a_ops;
+ u64 file_size;
+
+ rc = ecryptfs_grab_and_map_lower_page(&header_page, &header_virt,
+ lower_inode, 0);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "grab_cache_page for header page "
+ "failed\n");
+ goto out;
+ }
+ lower_a_ops = lower_inode->i_mapping->a_ops;
+ rc = lower_a_ops->prepare_write(lower_file, header_page, 0, 8);
+ file_size = (u64)i_size_read(inode);
+ ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size);
+ file_size = cpu_to_be64(file_size);
+ memcpy(header_virt, &file_size, sizeof(u64));
+ rc = lower_a_ops->commit_write(lower_file, header_page, 0, 8);
+ if (rc < 0)
+ ecryptfs_printk(KERN_ERR, "Error commiting header page "
+ "write\n");
+ ecryptfs_unmap_and_release_lower_page(header_page);
+ lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+out:
+ return rc;
+}
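The header layout written above is simply the upper inode size stored as a big-endian u64 in the first eight bytes of the lower file. As a rough sketch of the inverse operation (reading that size back from a header page that is already mapped), something like the following would do; the function name is hypothetical and not part of this patch:

/* Illustrative sketch: recover the size stored by
 * ecryptfs_write_inode_size_to_header() from a mapped header page. */
static loff_t example_read_size_from_header(const char *header_virt)
{
	u64 file_size;

	memcpy(&file_size, header_virt, sizeof(u64));
	return (loff_t)be64_to_cpu(file_size);
}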
+
+int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
+ struct file *lower_file,
+ unsigned long lower_page_index, int byte_offset,
+ int region_bytes)
+{
+ int rc = 0;
+
+ rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, lower_inode,
+ lower_page_index);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error attempting to grab and map "
+ "lower page with index [0x%.16x]\n",
+ lower_page_index);
+ goto out;
+ }
+ rc = lower_inode->i_mapping->a_ops->prepare_write(lower_file,
+ (*lower_page),
+ byte_offset,
+ region_bytes);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "prepare_write for "
+ "lower_page_index = [0x%.16x] failed; rc = "
+ "[%d]\n", lower_page_index, rc);
+ }
+out:
+ if (rc && (*lower_page)) {
+ ecryptfs_unmap_and_release_lower_page(*lower_page);
+ (*lower_page) = NULL;
+ }
+ return rc;
+}
+
+/**
+ * ecryptfs_commit_lower_page
+ *
+ * Returns zero on success; non-zero on error
+ */
+int
+ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
+ struct file *lower_file, int byte_offset,
+ int region_size)
+{
+ int rc = 0;
+
+ rc = lower_inode->i_mapping->a_ops->commit_write(
+ lower_file, lower_page, byte_offset, region_size);
+ if (rc < 0) {
+ ecryptfs_printk(KERN_ERR,
+ "Error committing write; rc = [%d]\n", rc);
+ } else
+ rc = 0;
+ ecryptfs_unmap_and_release_lower_page(lower_page);
+ return rc;
+}
+
+/**
+ * ecryptfs_copy_page_to_lower
+ *
+ * Used for plaintext pass-through; no page index interpolation
+ * required.
+ */
+int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
+ struct file *lower_file)
+{
+ int rc = 0;
+ struct page *lower_page;
+
+ rc = ecryptfs_get_lower_page(&lower_page, lower_inode, lower_file,
+ page->index, 0, PAGE_CACHE_SIZE);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error attempting to get page "
+ "at index [0x%.16x]\n", page->index);
+ goto out;
+ }
+ /* TODO: aops */
+ memcpy((char *)page_address(lower_page), page_address(page),
+ PAGE_CACHE_SIZE);
+ rc = ecryptfs_commit_lower_page(lower_page, lower_inode, lower_file,
+ 0, PAGE_CACHE_SIZE);
+ if (rc)
+ ecryptfs_printk(KERN_ERR, "Error attempting to commit page "
+ "at index [0x%.16x]\n", page->index);
+out:
+ return rc;
+}
+
+static int
+process_new_file(struct ecryptfs_crypt_stat *crypt_stat,
+ struct file *file, struct inode *inode)
+{
+ struct page *header_page;
+ const struct address_space_operations *lower_a_ops;
+ struct inode *lower_inode;
+ struct file *lower_file;
+ char *header_virt;
+ int rc = 0;
+ int current_header_page = 0;
+ int header_pages;
+ int more_header_data_to_be_written = 1;
+
+ lower_inode = ecryptfs_inode_to_lower(inode);
+ lower_file = ecryptfs_file_to_lower(file);
+ lower_a_ops = lower_inode->i_mapping->a_ops;
+ header_pages = ((crypt_stat->header_extent_size
+ * crypt_stat->num_header_extents_at_front)
+ / PAGE_CACHE_SIZE);
+ BUG_ON(header_pages < 1);
+ while (current_header_page < header_pages) {
+ rc = ecryptfs_grab_and_map_lower_page(&header_page,
+ &header_virt,
+ lower_inode,
+ current_header_page);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "grab_cache_page for "
+ "header page [%d] failed; rc = [%d]\n",
+ current_header_page, rc);
+ goto out;
+ }
+ rc = lower_a_ops->prepare_write(lower_file, header_page, 0,
+ PAGE_CACHE_SIZE);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error preparing to write "
+ "header page out; rc = [%d]\n", rc);
+ goto out;
+ }
+ memset(header_virt, 0, PAGE_CACHE_SIZE);
+ if (more_header_data_to_be_written) {
+ rc = ecryptfs_write_headers_virt(header_virt,
+ crypt_stat,
+ file->f_dentry);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error "
+ "generating header; rc = "
+ "[%d]\n", rc);
+ rc = -EIO;
+ memset(header_virt, 0, PAGE_CACHE_SIZE);
+ ecryptfs_unmap_and_release_lower_page(
+ header_page);
+ goto out;
+ }
+ if (current_header_page == 0)
+ memset(header_virt, 0, 8);
+ more_header_data_to_be_written = 0;
+ }
+ rc = lower_a_ops->commit_write(lower_file, header_page, 0,
+ PAGE_CACHE_SIZE);
+ ecryptfs_unmap_and_release_lower_page(header_page);
+ if (rc < 0) {
+ ecryptfs_printk(KERN_ERR,
+ "Error commiting header page write; "
+ "rc = [%d]\n", rc);
+ break;
+ }
+ current_header_page++;
+ }
+ if (rc >= 0) {
+ rc = 0;
+ ecryptfs_printk(KERN_DEBUG, "lower_inode->i_blocks = "
+ "[0x%.16x]\n", lower_inode->i_blocks);
+ i_size_write(inode, 0);
+ lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+ }
+ ecryptfs_printk(KERN_DEBUG, "Clearing ECRYPTFS_NEW_FILE flag in "
+ "crypt_stat at memory location [%p]\n", crypt_stat);
+ ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
+out:
+ return rc;
+}
+
+/**
+ * ecryptfs_commit_write
+ * @file: The eCryptfs file object
+ * @page: The eCryptfs page
+ * @from: Ignored (we rotate the page IV on each write)
+ * @to: Ignored
+ *
+ * This is where we encrypt the data and pass the encrypted data to
+ * the lower filesystem. In OpenPGP-compatible mode, we operate on
+ * entire underlying packets.
+ */
+static int ecryptfs_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ struct ecryptfs_page_crypt_context ctx;
+ loff_t pos;
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct file *lower_file;
+ struct ecryptfs_crypt_stat *crypt_stat;
+ int rc;
+
+ inode = page->mapping->host;
+ lower_inode = ecryptfs_inode_to_lower(inode);
+ lower_file = ecryptfs_file_to_lower(file);
+ mutex_lock(&lower_inode->i_mutex);
+ crypt_stat =
+ &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
+ if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
+ ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in "
+ "crypt_stat at memory location [%p]\n", crypt_stat);
+ rc = process_new_file(crypt_stat, file, inode);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error processing new "
+ "file; rc = [%d]\n", rc);
+ goto out;
+ }
+ } else
+ ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
+ ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
+ "(page w/ index = [0x%.16x], to = [%d])\n", page->index,
+ to);
+ rc = fill_zeros_to_end_of_page(page, to);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
+ "zeros in page with index = [0x%.16x]\n",
+ page->index);
+ goto out;
+ }
+ ctx.page = page;
+ ctx.mode = ECRYPTFS_PREPARE_COMMIT_MODE;
+ ctx.param.lower_file = lower_file;
+ rc = ecryptfs_encrypt_page(&ctx);
+ if (rc) {
+ ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
+ "index [0x%.16x])\n", page->index);
+ goto out;
+ }
+ rc = 0;
+ inode->i_blocks = lower_inode->i_blocks;
+ pos = (page->index << PAGE_CACHE_SHIFT) + to;
+ if (pos > i_size_read(inode)) {
+ i_size_write(inode, pos);
+ ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
+ "[0x%.16x]\n", i_size_read(inode));
+ }
+ ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
+ lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+out:
+ kunmap(page); /* mapped in prior call (prepare_write) */
+ if (rc < 0)
+ ClearPageUptodate(page);
+ else
+ SetPageUptodate(page);
+ mutex_unlock(&lower_inode->i_mutex);
+ return rc;
+}
+
+/**
+ * write_zeros
+ * @file: The ecryptfs file
+ * @index: The index in which we are writing
+ * @start: The position after the last block of data
+ * @num_zeros: The number of zeros to write
+ *
+ * Write a specified number of zeros to a page.
+ *
+ * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE
+ */
+static
+int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros)
+{
+ int rc = 0;
+ struct page *tmp_page;
+
+ tmp_page = ecryptfs_get1page(file, index);
+ if (IS_ERR(tmp_page)) {
+ ecryptfs_printk(KERN_ERR, "Error getting page at index "
+ "[0x%.16x]\n", index);
+ rc = PTR_ERR(tmp_page);
+ goto out;
+ }
+ kmap(tmp_page);
+ rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros);
+ if (rc) {
+ ecryptfs_printk(KERN_ERR, "Error preparing to write zero's "
+ "to remainder of page at index [0x%.16x]\n",
+ index);
+ kunmap(tmp_page);
+ page_cache_release(tmp_page);
+ goto out;
+ }
+ memset(((char *)page_address(tmp_page) + start), 0, num_zeros);
+ rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros);
+ if (rc < 0) {
+ ecryptfs_printk(KERN_ERR, "Error attempting to write zero's "
+ "to remainder of page at index [0x%.16x]\n",
+ index);
+ kunmap(tmp_page);
+ page_cache_release(tmp_page);
+ goto out;
+ }
+ rc = 0;
+ kunmap(tmp_page);
+ page_cache_release(tmp_page);
+out:
+ return rc;
+}
+
+static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
+{
+ int rc = 0;
+ struct inode *inode;
+ struct inode *lower_inode;
+
+ inode = (struct inode *)mapping->host;
+ lower_inode = ecryptfs_inode_to_lower(inode);
+ if (lower_inode->i_mapping->a_ops->bmap)
+ rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping,
+ block);
+ return rc;
+}
+
+static void ecryptfs_sync_page(struct page *page)
+{
+ struct inode *inode;
+ struct inode *lower_inode;
+ struct page *lower_page;
+
+ inode = page->mapping->host;
+ lower_inode = ecryptfs_inode_to_lower(inode);
+ /* NOTE: Recently swapped with grab_cache_page(), since
+ * sync_page() just makes sure that pending I/O gets done. */
+ lower_page = find_lock_page(lower_inode->i_mapping, page->index);
+ if (!lower_page) {
+ ecryptfs_printk(KERN_DEBUG, "find_lock_page failed\n");
+ return;
+ }
+ lower_page->mapping->a_ops->sync_page(lower_page);
+ ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+ lower_page->index);
+ unlock_page(lower_page);
+ page_cache_release(lower_page);
+}
+
+struct address_space_operations ecryptfs_aops = {
+ .writepage = ecryptfs_writepage,
+ .readpage = ecryptfs_readpage,
+ .prepare_write = ecryptfs_prepare_write,
+ .commit_write = ecryptfs_commit_write,
+ .bmap = ecryptfs_bmap,
+ .sync_page = ecryptfs_sync_page,
+};
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
new file mode 100644
index 000000000000..c337c0410fb1
--- /dev/null
+++ b/fs/ecryptfs/super.c
@@ -0,0 +1,198 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ * Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/key.h>
+#include <linux/seq_file.h>
+#include <linux/crypto.h>
+#include "ecryptfs_kernel.h"
+
+struct kmem_cache *ecryptfs_inode_info_cache;
+
+/**
+ * ecryptfs_alloc_inode - allocate an ecryptfs inode
+ * @sb: Pointer to the ecryptfs super block
+ *
+ * Called to bring an inode into existence.
+ *
+ * Only handle allocation, setting up structures should be done in
+ * ecryptfs_read_inode. This is because the kernel, between now and
+ * then, will 0 out the private data pointer.
+ *
+ * Returns a pointer to a newly allocated inode, NULL otherwise
+ */
+static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
+{
+ struct ecryptfs_inode_info *ecryptfs_inode;
+ struct inode *inode = NULL;
+
+ ecryptfs_inode = kmem_cache_alloc(ecryptfs_inode_info_cache,
+ SLAB_KERNEL);
+ if (unlikely(!ecryptfs_inode))
+ goto out;
+ ecryptfs_init_crypt_stat(&ecryptfs_inode->crypt_stat);
+ inode = &ecryptfs_inode->vfs_inode;
+out:
+ return inode;
+}
+
+/**
+ * ecryptfs_destroy_inode
+ * @inode: The ecryptfs inode
+ *
+ * This is used during the final destruction of the inode.
+ * All allocation of memory related to the inode, including allocated
+ * memory in the crypt_stat struct, will be released here.
+ * There should be no chance that this deallocation will be missed.
+ */
+static void ecryptfs_destroy_inode(struct inode *inode)
+{
+ struct ecryptfs_inode_info *inode_info;
+
+ inode_info = ecryptfs_inode_to_private(inode);
+ ecryptfs_destruct_crypt_stat(&inode_info->crypt_stat);
+ kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+}
+
+/**
+ * ecryptfs_init_inode
+ * @inode: The ecryptfs inode
+ *
+ * Set up the ecryptfs inode.
+ */
+void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
+{
+ ecryptfs_set_inode_lower(inode, lower_inode);
+ inode->i_ino = lower_inode->i_ino;
+ inode->i_version++;
+ inode->i_op = &ecryptfs_main_iops;
+ inode->i_fop = &ecryptfs_main_fops;
+ inode->i_mapping->a_ops = &ecryptfs_aops;
+}
+
+/**
+ * ecryptfs_put_super
+ * @sb: Pointer to the ecryptfs super block
+ *
+ * Final actions when unmounting a file system.
+ * This will handle deallocation and release of our private data.
+ */
+static void ecryptfs_put_super(struct super_block *sb)
+{
+ struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+
+ ecryptfs_destruct_mount_crypt_stat(&sb_info->mount_crypt_stat);
+ kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
+ ecryptfs_set_superblock_private(sb, NULL);
+}
+
+/**
+ * ecryptfs_statfs
+ * @sb: The ecryptfs super block
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics. Currently, we let this pass right through
+ * to the lower filesystem and take no action ourselves.
+ */
+static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf);
+}
+
+/**
+ * ecryptfs_clear_inode
+ * @inode: The ecryptfs inode
+ *
+ * Called by iput() when the inode reference count reached zero
+ * and the inode is not hashed anywhere. Used to clear anything
+ * that needs to be, before the inode is completely destroyed and put
+ * on the inode free list. We use this to drop our reference to the
+ * lower inode.
+ */
+static void ecryptfs_clear_inode(struct inode *inode)
+{
+ iput(ecryptfs_inode_to_lower(inode));
+}
+
+/**
+ * ecryptfs_umount_begin
+ *
+ * Called in do_umount().
+ */
+static void ecryptfs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+ struct vfsmount *lower_mnt =
+ ecryptfs_dentry_to_lower_mnt(vfsmnt->mnt_sb->s_root);
+ struct super_block *lower_sb;
+
+ lower_sb = lower_mnt->mnt_sb;
+ mntput(lower_mnt);
+ if (lower_sb->s_op->umount_begin)
+ lower_sb->s_op->umount_begin(lower_mnt, flags);
+}
+
+/**
+ * ecryptfs_show_options
+ *
+ * Prints the directory we are currently mounted over.
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+ struct super_block *sb = mnt->mnt_sb;
+ struct dentry *lower_root_dentry = ecryptfs_dentry_to_lower(sb->s_root);
+ struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(sb->s_root);
+ char *tmp_page;
+ char *path;
+ int rc = 0;
+
+ tmp_page = (char *)__get_free_page(GFP_KERNEL);
+ if (!tmp_page) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ path = d_path(lower_root_dentry, lower_mnt, tmp_page, PAGE_SIZE);
+ if (IS_ERR(path)) {
+ rc = PTR_ERR(path);
+ goto out;
+ }
+ seq_printf(m, ",dir=%s", path);
+ free_page((unsigned long)tmp_page);
+out:
+ return rc;
+}
+
+struct super_operations ecryptfs_sops = {
+ .alloc_inode = ecryptfs_alloc_inode,
+ .destroy_inode = ecryptfs_destroy_inode,
+ .drop_inode = generic_delete_inode,
+ .put_super = ecryptfs_put_super,
+ .statfs = ecryptfs_statfs,
+ .remount_fs = NULL,
+ .clear_inode = ecryptfs_clear_inode,
+ .umount_begin = ecryptfs_umount_begin,
+ .show_options = ecryptfs_show_options
+};
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 557d5b614fae..ae228ec54e94 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -105,6 +105,8 @@
/* Maximum msec timeout value storeable in a long int */
#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
+#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
+
struct epoll_filefd {
struct file *file;
@@ -497,7 +499,7 @@ void eventpoll_release_file(struct file *file)
*/
asmlinkage long sys_epoll_create(int size)
{
- int error, fd;
+ int error, fd = -1;
struct eventpoll *ep;
struct inode *inode;
struct file *file;
@@ -640,7 +642,6 @@ eexit_1:
return error;
}
-#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
@@ -657,7 +658,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
current, epfd, events, maxevents, timeout));
/* The maximum number of event must be greater than zero */
- if (maxevents <= 0 || maxevents > MAX_EVENTS)
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
@@ -699,6 +700,55 @@ eexit_1:
}
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+ int maxevents, int timeout, const sigset_t __user *sigmask,
+ size_t sigsetsize)
+{
+ int error;
+ sigset_t ksigmask, sigsaved;
+
+ /*
+ * If the caller wants a certain signal mask to be set during the wait,
+ * we apply it here.
+ */
+ if (sigmask) {
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+ return -EFAULT;
+ sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+ }
+
+ error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+ /*
+ * If we changed the signal mask, we need to restore the original one.
+ * In case we've got a signal while waiting, we do not restore the
+ * signal mask yet, and we allow do_signal() to deliver the signal on
+ * the way back to userspace, before the signal mask is restored.
+ */
+ if (sigmask) {
+ if (error == -EINTR) {
+ memcpy(&current->saved_sigmask, &sigsaved,
+ sizeof(sigsaved));
+ set_thread_flag(TIF_RESTORE_SIGMASK);
+ } else
+ sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ }
+
+ return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
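From user space the new call behaves like pselect(2)/ppoll(2): swapping the signal mask and waiting are atomic with respect to signal delivery. A rough usage fragment, assuming a glibc-style wrapper with the prototype int epoll_pwait(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask):

/* Illustrative only: keep SIGINT blocked except while waiting. */
sigset_t blocked, wait_mask;
struct epoll_event events[64];
int n;

sigemptyset(&blocked);
sigaddset(&blocked, SIGINT);
sigprocmask(SIG_BLOCK, &blocked, &wait_mask); /* wait_mask = previous mask */
sigdelset(&wait_mask, SIGINT);                /* SIGINT deliverable only inside the wait */
n = epoll_pwait(epfd, events, 64, -1, &wait_mask);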
+
+
/*
* Creates the file descriptor to be used by the epoll interface.
*/
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 513cd421ac0b..d8b9abd95d07 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -364,7 +364,6 @@ static int parse_options (char * options,
{
char * p;
substring_t args[MAX_OPT_ARGS];
- unsigned long kind = EXT2_MOUNT_ERRORS_CONT;
int option;
if (!options)
@@ -404,13 +403,19 @@ static int parse_options (char * options,
/* *sb_block = match_int(&args[0]); */
break;
case Opt_err_panic:
- kind = EXT2_MOUNT_ERRORS_PANIC;
+ clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+ clear_opt (sbi->s_mount_opt, ERRORS_RO);
+ set_opt (sbi->s_mount_opt, ERRORS_PANIC);
break;
case Opt_err_ro:
- kind = EXT2_MOUNT_ERRORS_RO;
+ clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+ clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt (sbi->s_mount_opt, ERRORS_RO);
break;
case Opt_err_cont:
- kind = EXT2_MOUNT_ERRORS_CONT;
+ clear_opt (sbi->s_mount_opt, ERRORS_RO);
+ clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt (sbi->s_mount_opt, ERRORS_CONT);
break;
case Opt_nouid32:
set_opt (sbi->s_mount_opt, NO_UID32);
@@ -489,7 +494,6 @@ static int parse_options (char * options,
return 0;
}
}
- sbi->s_mount_opt |= kind;
return 1;
}
@@ -715,6 +719,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO)
set_opt(sbi->s_mount_opt, ERRORS_RO);
+ else
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8bfd56ef18ca..afc2d4f42d77 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1470,6 +1470,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
set_opt(sbi->s_mount_opt, ERRORS_RO);
+ else
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 000000000000..a6acb96ebeb9
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux ext4-filesystem routines.
+#
+
+obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+
+ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
+
+ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
+ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 000000000000..9e882546d91a
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,551 @@
+/*
+ * linux/fs/ext4/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *
+ext4_acl_from_disk(const void *value, size_t size)
+{
+ const char *end = (char *)value + size;
+ int n, count;
+ struct posix_acl *acl;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(ext4_acl_header))
+ return ERR_PTR(-EINVAL);
+ if (((ext4_acl_header *)value)->a_version !=
+ cpu_to_le32(EXT4_ACL_VERSION))
+ return ERR_PTR(-EINVAL);
+ value = (char *)value + sizeof(ext4_acl_header);
+ count = ext4_acl_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ for (n=0; n < count; n++) {
+ ext4_acl_entry *entry =
+ (ext4_acl_entry *)value;
+ if ((char *)value + sizeof(ext4_acl_entry_short) > end)
+ goto fail;
+ acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ value = (char *)value +
+ sizeof(ext4_acl_entry_short);
+ acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ value = (char *)value + sizeof(ext4_acl_entry);
+ if ((char *)value > end)
+ goto fail;
+ acl->a_entries[n].e_id =
+ le32_to_cpu(entry->e_id);
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ if (value != end)
+ goto fail;
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *
+ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+ ext4_acl_header *ext_acl;
+ char *e;
+ size_t n;
+
+ *size = ext4_acl_size(acl->a_count);
+ ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
+ sizeof(ext4_acl_entry), GFP_KERNEL);
+ if (!ext_acl)
+ return ERR_PTR(-ENOMEM);
+ ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
+ e = (char *)ext_acl + sizeof(ext4_acl_header);
+ for (n=0; n < acl->a_count; n++) {
+ ext4_acl_entry *entry = (ext4_acl_entry *)e;
+ entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
+ entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+ switch(acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ case ACL_GROUP:
+ entry->e_id =
+ cpu_to_le32(acl->a_entries[n].e_id);
+ e += sizeof(ext4_acl_entry);
+ break;
+
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ e += sizeof(ext4_acl_entry_short);
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ return (char *)ext_acl;
+
+fail:
+ kfree(ext_acl);
+ return ERR_PTR(-EINVAL);
+}
+
+static inline struct posix_acl *
+ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+ struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
+
+ spin_lock(&inode->i_lock);
+ if (*i_acl != EXT4_ACL_NOT_CACHED)
+ acl = posix_acl_dup(*i_acl);
+ spin_unlock(&inode->i_lock);
+
+ return acl;
+}
+
+static inline void
+ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
+ struct posix_acl *acl)
+{
+ spin_lock(&inode->i_lock);
+ if (*i_acl != EXT4_ACL_NOT_CACHED)
+ posix_acl_release(*i_acl);
+ *i_acl = posix_acl_dup(acl);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Inode operation get_posix_acl().
+ *
+ * inode->i_mutex: don't care
+ */
+static struct posix_acl *
+ext4_get_acl(struct inode *inode, int type)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int name_index;
+ char *value = NULL;
+ struct posix_acl *acl;
+ int retval;
+
+ if (!test_opt(inode->i_sb, POSIX_ACL))
+ return NULL;
+
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ acl = ext4_iget_acl(inode, &ei->i_acl);
+ if (acl != EXT4_ACL_NOT_CACHED)
+ return acl;
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ acl = ext4_iget_acl(inode, &ei->i_default_acl);
+ if (acl != EXT4_ACL_NOT_CACHED)
+ return acl;
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+ retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
+ if (retval > 0) {
+ value = kmalloc(retval, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ retval = ext4_xattr_get(inode, name_index, "", value, retval);
+ }
+ if (retval > 0)
+ acl = ext4_acl_from_disk(value, retval);
+ else if (retval == -ENODATA || retval == -ENOSYS)
+ acl = NULL;
+ else
+ acl = ERR_PTR(retval);
+ kfree(value);
+
+ if (!IS_ERR(acl)) {
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ ext4_iset_acl(inode, &ei->i_acl, acl);
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
+ break;
+ }
+ }
+ return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * inode->i_mutex: down unless called from ext4_new_inode
+ */
+static int
+ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+ struct posix_acl *acl)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int error;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ mode_t mode = inode->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+ if (error < 0)
+ return error;
+ else {
+ inode->i_mode = mode;
+ ext4_mark_inode_dirty(handle, inode);
+ if (error == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (acl) {
+ value = ext4_acl_to_disk(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ error = ext4_xattr_set_handle(handle, inode, name_index, "",
+ value, size, 0);
+
+ kfree(value);
+ if (!error) {
+ switch(type) {
+ case ACL_TYPE_ACCESS:
+ ext4_iset_acl(inode, &ei->i_acl, acl);
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
+ break;
+ }
+ }
+ return error;
+}
+
+static int
+ext4_check_acl(struct inode *inode, int mask)
+{
+ struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ int error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ return error;
+ }
+
+ return -EAGAIN;
+}
+
+int
+ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ return generic_permission(inode, mask, ext4_check_acl);
+}
+
+/*
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ int error = 0;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (test_opt(dir->i_sb, POSIX_ACL)) {
+ acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl)
+ inode->i_mode &= ~current->fs->umask;
+ }
+ if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
+ struct posix_acl *clone;
+ mode_t mode;
+
+ if (S_ISDIR(inode->i_mode)) {
+ error = ext4_set_acl(handle, inode,
+ ACL_TYPE_DEFAULT, acl);
+ if (error)
+ goto cleanup;
+ }
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ error = -ENOMEM;
+ if (!clone)
+ goto cleanup;
+
+ mode = inode->i_mode;
+ error = posix_acl_create_masq(clone, &mode);
+ if (error >= 0) {
+ inode->i_mode = mode;
+ if (error > 0) {
+ /* This is an extended ACL */
+ error = ext4_set_acl(handle, inode,
+ ACL_TYPE_ACCESS, clone);
+ }
+ }
+ posix_acl_release(clone);
+ }
+cleanup:
+ posix_acl_release(acl);
+ return error;
+}
+
+/*
+ * Does chmod for an inode that may have an Access Control List. The
+ * inode->i_mode field must be updated to the desired value by the caller
+ * before calling this function.
+ * Returns 0 on success, or a negative error number.
+ *
+ * We change the ACL rather than storing some ACL entries in the file
+ * mode permission bits (which would be more efficient), because that
+ * would break once additional permissions (like ACL_APPEND, ACL_DELETE
+ * for directories) are added. There are no more bits available in the
+ * file mode.
+ *
+ * inode->i_mutex: down
+ */
+int
+ext4_acl_chmod(struct inode *inode)
+{
+ struct posix_acl *acl, *clone;
+ int error;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+ if (!test_opt(inode->i_sb, POSIX_ACL))
+ return 0;
+ acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ posix_acl_release(acl);
+ if (!clone)
+ return -ENOMEM;
+ error = posix_acl_chmod_masq(clone, inode->i_mode);
+ if (!error) {
+ handle_t *handle;
+ int retries = 0;
+
+ retry:
+ handle = ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ ext4_std_error(inode->i_sb, error);
+ goto out;
+ }
+ error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
+ ext4_journal_stop(handle);
+ if (error == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ }
+out:
+ posix_acl_release(clone);
+ return error;
+}
+
+/*
+ * Extended attribute handlers
+ */
+static size_t
+ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
+ const char *name, size_t name_len)
+{
+ const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+ if (!test_opt(inode->i_sb, POSIX_ACL))
+ return 0;
+ if (list && size <= list_len)
+ memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+ return size;
+}
+
+static size_t
+ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
+ const char *name, size_t name_len)
+{
+ const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+ if (!test_opt(inode->i_sb, POSIX_ACL))
+ return 0;
+ if (list && size <= list_len)
+ memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+ return size;
+}
+
+static int
+ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int error;
+
+ if (!test_opt(inode->i_sb, POSIX_ACL))
+ return -EOPNOTSUPP;
+
+ acl = ext4_get_acl(inode, type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+ error = posix_acl_to_xattr(acl, buffer, size);
+ posix_acl_release(acl);
+
+ return error;
+}
+
+static int
+ext4_xattr_get_acl_access(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int
+ext4_xattr_get_acl_default(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int
+ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
+ size_t size)
+{
+ handle_t *handle;
+ struct posix_acl *acl;
+ int error, retries = 0;
+
+ if (!test_opt(inode->i_sb, POSIX_ACL))
+ return -EOPNOTSUPP;
+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EPERM;
+
+ if (value) {
+ acl = posix_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ else if (acl) {
+ error = posix_acl_valid(acl);
+ if (error)
+ goto release_and_out;
+ }
+ } else
+ acl = NULL;
+
+retry:
+ handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ error = ext4_set_acl(handle, inode, type, acl);
+ ext4_journal_stop(handle);
+ if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+release_and_out:
+ posix_acl_release(acl);
+ return error;
+}
+
+static int
+ext4_xattr_set_acl_access(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int
+ext4_xattr_set_acl_default(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ext4_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .list = ext4_xattr_list_acl_access,
+ .get = ext4_xattr_get_acl_access,
+ .set = ext4_xattr_set_acl_access,
+};
+
+struct xattr_handler ext4_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .list = ext4_xattr_list_acl_default,
+ .get = ext4_xattr_get_acl_default,
+ .set = ext4_xattr_set_acl_default,
+};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 000000000000..26a5c1abf147
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,81 @@
+/*
+ File: fs/ext4/acl.h
+
+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define EXT4_ACL_VERSION 0x0001
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+} ext4_acl_entry;
+
+typedef struct {
+ __le16 e_tag;
+ __le16 e_perm;
+} ext4_acl_entry_short;
+
+typedef struct {
+ __le32 a_version;
+} ext4_acl_header;
+
+static inline size_t ext4_acl_size(int count)
+{
+ if (count <= 4) {
+ return sizeof(ext4_acl_header) +
+ count * sizeof(ext4_acl_entry_short);
+ } else {
+ return sizeof(ext4_acl_header) +
+ 4 * sizeof(ext4_acl_entry_short) +
+ (count - 4) * sizeof(ext4_acl_entry);
+ }
+}
+
+static inline int ext4_acl_count(size_t size)
+{
+ ssize_t s;
+ size -= sizeof(ext4_acl_header);
+ s = size - 4 * sizeof(ext4_acl_entry_short);
+ if (s < 0) {
+ if (size % sizeof(ext4_acl_entry_short))
+ return -1;
+ return size / sizeof(ext4_acl_entry_short);
+ } else {
+ if (s % sizeof(ext4_acl_entry))
+ return -1;
+ return s / sizeof(ext4_acl_entry) + 4;
+ }
+}
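Taken together, the two helpers encode the on-disk ACL layout used below: entries whose tag carries no qualifier (ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK, ACL_OTHER) are stored in the short form without an e_id field, named ACL_USER/ACL_GROUP entries use the full form, and the sizing assumes at most four short entries. A worked example with the sizes implied by the typedefs above (illustrative only):

/* header = 4 bytes, short entry = 4 bytes, full entry = 8 bytes, so a
 * 6-entry ACL occupies
 *   ext4_acl_size(6)   = 4 + 4*4 + 2*8          = 36 bytes
 * and the reverse mapping gives
 *   ext4_acl_count(36) = (36 - 4 - 4*4) / 8 + 4 = 6 entries.
 */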
+
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+
+/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
+ if the ACL has not been cached */
+#define EXT4_ACL_NOT_CACHED ((void *)-1)
+
+/* acl.c */
+extern int ext4_permission (struct inode *, int, struct nameidata *);
+extern int ext4_acl_chmod (struct inode *);
+extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+
+#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#include <linux/sched.h>
+#define ext4_permission NULL
+
+static inline int
+ext4_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
+static inline int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 000000000000..5d45582f9517
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,1833 @@
+/*
+ * linux/fs/ext4/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+ unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ ext4_grpblk_t offset;
+
+ blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+ offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
+ if (offsetp)
+ *offsetp = offset;
+ if (blockgrpp)
+ *blockgrpp = blocknr;
+
+}
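A short worked example of the split computed above (the values are illustrative, not taken from the patch):

/* With s_first_data_block == 0 and EXT4_BLOCKS_PER_GROUP(sb) == 32768,
 * block 100000 maps to
 *   group  = 100000 / 32768 = 3    (left in blocknr by do_div)
 *   offset = 100000 % 32768 = 1696 (returned by do_div)
 */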
+
+/*
+ * The free blocks are managed by bitmaps. A file system contains several
+ * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block. Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block. The descriptors are loaded in memory
+ * when a file system is mounted (see ext4_read_super).
+ */
+
+
+#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
+
+/**
+ * ext4_get_group_desc() -- load group descriptor from disk
+ * @sb: super block
+ * @block_group: given block group
+ * @bh: pointer to the buffer head to store the block
+ * group descriptor
+ */
+struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+ unsigned int block_group,
+ struct buffer_head ** bh)
+{
+ unsigned long group_desc;
+ unsigned long offset;
+ struct ext4_group_desc * desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (block_group >= sbi->s_groups_count) {
+ ext4_error (sb, "ext4_get_group_desc",
+ "block_group >= groups_count - "
+ "block_group = %d, groups_count = %lu",
+ block_group, sbi->s_groups_count);
+
+ return NULL;
+ }
+ smp_rmb();
+
+ group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+ offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+ if (!sbi->s_group_desc[group_desc]) {
+ ext4_error (sb, "ext4_get_group_desc",
+ "Group descriptor not loaded - "
+ "block_group = %d, group_desc = %lu, desc = %lu",
+ block_group, group_desc, offset);
+ return NULL;
+ }
+
+ desc = (struct ext4_group_desc *)(
+ (__u8 *)sbi->s_group_desc[group_desc]->b_data +
+ offset * EXT4_DESC_SIZE(sb));
+ if (bh)
+ *bh = sbi->s_group_desc[group_desc];
+ return desc;
+}
+
+/**
+ * read_block_bitmap()
+ * @sb: super block
+ * @block_group: given block group
+ *
+ * Read the bitmap for a given block_group, reading into the specified
+ * slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+static struct buffer_head *
+read_block_bitmap(struct super_block *sb, unsigned int block_group)
+{
+ struct ext4_group_desc * desc;
+ struct buffer_head * bh = NULL;
+
+ desc = ext4_get_group_desc (sb, block_group, NULL);
+ if (!desc)
+ goto error_out;
+ bh = sb_bread(sb, ext4_block_bitmap(sb, desc));
+ if (!bh)
+ ext4_error (sb, "read_block_bitmap",
+ "Cannot read block bitmap - "
+ "block_group = %d, block_bitmap = %llu",
+ block_group,
+ ext4_block_bitmap(sb, desc));
+error_out:
+ return bh;
+}
+/*
+ * The reservation window structure operations
+ * --------------------------------------------
+ * Operations include:
+ * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
+ *
+ * We use a red-black tree to represent per-filesystem reservation
+ * windows.
+ *
+ */
+
+/**
+ * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
+ * @rb_root: root of per-filesystem reservation rb tree
+ * @verbose: verbose mode
+ * @fn: function which wishes to dump the reservation map
+ *
+ * If verbose is turned on, it will print the whole block reservation
+ * windows (start, end). Otherwise, it will only print out the "bad" windows,
+ * those windows that overlap with their immediate neighbors.
+ */
+#if 1
+static void __rsv_window_dump(struct rb_root *root, int verbose,
+ const char *fn)
+{
+ struct rb_node *n;
+ struct ext4_reserve_window_node *rsv, *prev;
+ int bad;
+
+restart:
+ n = rb_first(root);
+ bad = 0;
+ prev = NULL;
+
+ printk("Block Allocation Reservation Windows Map (%s):\n", fn);
+ while (n) {
+ rsv = list_entry(n, struct ext4_reserve_window_node, rsv_node);
+ if (verbose)
+ printk("reservation window 0x%p "
+ "start: %llu, end: %llu\n",
+ rsv, rsv->rsv_start, rsv->rsv_end);
+ if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
+ printk("Bad reservation %p (start >= end)\n",
+ rsv);
+ bad = 1;
+ }
+ if (prev && prev->rsv_end >= rsv->rsv_start) {
+ printk("Bad reservation %p (prev->end >= start)\n",
+ rsv);
+ bad = 1;
+ }
+ if (bad) {
+ if (!verbose) {
+ printk("Restarting reservation walk in verbose mode\n");
+ verbose = 1;
+ goto restart;
+ }
+ }
+ n = rb_next(n);
+ prev = rsv;
+ }
+ printk("Window map complete.\n");
+ if (bad)
+ BUG();
+}
+#define rsv_window_dump(root, verbose) \
+ __rsv_window_dump((root), (verbose), __FUNCTION__)
+#else
+#define rsv_window_dump(root, verbose) do {} while (0)
+#endif
+
+/**
+ * goal_in_my_reservation()
+ * @rsv: inode's reservation window
+ * @grp_goal: given goal block relative to the allocation block group
+ * @group: the current allocation block group
+ * @sb: filesystem super block
+ *
+ * Test if the given goal block (group relative) is within the file's
+ * own block reservation window range.
+ *
+ * If the reservation window is outside the goal allocation group, return 0;
+ * grp_goal (given goal block) could be -1, which means no specific
+ * goal block. In this case, always return 1.
+ * If the goal block is within the reservation window, return 1;
+ * otherwise, return 0;
+ */
+static int
+goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
+ unsigned int group, struct super_block * sb)
+{
+ ext4_fsblk_t group_first_block, group_last_block;
+
+ group_first_block = ext4_group_first_block_no(sb, group);
+ group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+ if ((rsv->_rsv_start > group_last_block) ||
+ (rsv->_rsv_end < group_first_block))
+ return 0;
+ if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+ || (grp_goal + group_first_block > rsv->_rsv_end)))
+ return 0;
+ return 1;
+}
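+
+/*
+ * Example (hypothetical layout): with 32768 blocks per group and
+ * s_first_data_block == 0, group 5 spans blocks 163840..196607, so a
+ * grp_goal of 100 refers to absolute block 163940. The checks above
+ * simply verify that the window overlaps the group at all and, when a
+ * goal is given, that this absolute block lies inside the window.
+ */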
+
+/**
+ * search_reserve_window()
+ * @rb_root: root of reservation tree
+ * @goal: target allocation block
+ *
+ * Find the reserved window which includes the goal, or the previous one
+ * if the goal is not in any window.
+ * Returns NULL if there are no windows or if all windows start after the goal.
+ */
+static struct ext4_reserve_window_node *
+search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
+{
+ struct rb_node *n = root->rb_node;
+ struct ext4_reserve_window_node *rsv;
+
+ if (!n)
+ return NULL;
+
+ do {
+ rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
+
+ if (goal < rsv->rsv_start)
+ n = n->rb_left;
+ else if (goal > rsv->rsv_end)
+ n = n->rb_right;
+ else
+ return rsv;
+ } while (n);
+ /*
+ * We've fallen off the end of the tree: the goal wasn't inside
+ * any particular node. OK, the previous node must be to one
+ * side of the interval containing the goal. If it's the RHS,
+ * we need to back up one.
+ */
+ if (rsv->rsv_start > goal) {
+ n = rb_prev(&rsv->rsv_node);
+ rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
+ }
+ return rsv;
+}
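+
+/*
+ * Example: with windows [100,199] and [300,399] in the tree, a goal of
+ * 350 returns the [300,399] node directly. A goal of 250 lies in
+ * neither window; the descent ends at one of the two nodes and, if
+ * that node starts after the goal, rb_prev() backs up one node, so
+ * [100,199] -- the window preceding the goal -- is returned either way.
+ */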
+
+/**
+ * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
+ * @sb: super block
+ * @rsv: reservation window to add
+ *
+ * Must be called with rsv_lock held.
+ */
+void ext4_rsv_window_add(struct super_block *sb,
+ struct ext4_reserve_window_node *rsv)
+{
+ struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
+ struct rb_node *node = &rsv->rsv_node;
+ ext4_fsblk_t start = rsv->rsv_start;
+
+ struct rb_node ** p = &root->rb_node;
+ struct rb_node * parent = NULL;
+ struct ext4_reserve_window_node *this;
+
+ while (*p)
+ {
+ parent = *p;
+ this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
+
+ if (start < this->rsv_start)
+ p = &(*p)->rb_left;
+ else if (start > this->rsv_end)
+ p = &(*p)->rb_right;
+ else {
+ rsv_window_dump(root, 1);
+ BUG();
+ }
+ }
+
+ rb_link_node(node, parent, p);
+ rb_insert_color(node, root);
+}
+
+/**
+ * rsv_window_remove() -- unlink a window from the reservation rb tree
+ * @sb: super block
+ * @rsv: reservation window to remove
+ *
+ * Mark the block reservation window as not allocated, and unlink it
+ * from the filesystem reservation window rb tree. Must be called with
+ * rsv_lock held.
+ */
+static void rsv_window_remove(struct super_block *sb,
+ struct ext4_reserve_window_node *rsv)
+{
+ rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+ rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+ rsv->rsv_alloc_hit = 0;
+ rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
+}
+
+/*
+ * rsv_is_empty() -- Check if the reservation window is allocated.
+ * @rsv: given reservation window to check
+ *
+ * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
+ */
+static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
+{
+ /* a valid reservation end block could not be 0 */
+ return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+}
+
+/**
+ * ext4_init_block_alloc_info()
+ * @inode: file inode structure
+ *
+ * Allocate and initialize the reservation window structure, and
+ * then link the window to the ext4 inode structure.
+ *
+ * The reservation window structure is only dynamically allocated
+ * and linked to the ext4 inode the first time the open file
+ * needs a new block. So, before every ext4_new_block(s) call, for
+ * regular files, we should check whether the reservation window
+ * structure exists or not. If it does not, this function is called.
+ * Failing to do so will result in block reservation being turned off
+ * for that open file.
+ *
+ * This function is called from ext4_get_blocks_handle(). It is also
+ * called when the reservation window size is set through ioctl before
+ * the file is opened for write (which needs block allocation).
+ *
+ * The caller must hold truncate_mutex before calling this function.
+ */
+void ext4_init_block_alloc_info(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
+ struct super_block *sb = inode->i_sb;
+
+ block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
+ if (block_i) {
+ struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
+
+ rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+ rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+
+ /*
+ * if filesystem is mounted with NORESERVATION, the goal
+ * reservation window size is set to zero to indicate
+ * block reservation is off
+ */
+ if (!test_opt(sb, RESERVATION))
+ rsv->rsv_goal_size = 0;
+ else
+ rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
+ rsv->rsv_alloc_hit = 0;
+ block_i->last_alloc_logical_block = 0;
+ block_i->last_alloc_physical_block = 0;
+ }
+ ei->i_block_alloc_info = block_i;
+}
+
+/**
+ * ext4_discard_reservation()
+ * @inode: inode
+ *
+ * Discard (free) the block reservation window on last file close,
+ * on truncate, or at last iput().
+ *
+ * It is called in three cases:
+ * ext4_release_file(): last writer closes the file
+ * ext4_clear_inode(): last iput(), when nobody links to this file.
+ * ext4_truncate(): when the block indirect map is about to change.
+ *
+ */
+void ext4_discard_reservation(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
+ struct ext4_reserve_window_node *rsv;
+ spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
+
+ if (!block_i)
+ return;
+
+ rsv = &block_i->rsv_window_node;
+ if (!rsv_is_empty(&rsv->rsv_window)) {
+ spin_lock(rsv_lock);
+ if (!rsv_is_empty(&rsv->rsv_window))
+ rsv_window_remove(inode->i_sb, rsv);
+ spin_unlock(rsv_lock);
+ }
+}
+
+/**
+ * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to free
+ * @count: number of blocks to free
+ * @pdquot_freed_blocks: pointer to the count of blocks freed against quota
+ */
+void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count,
+ unsigned long *pdquot_freed_blocks)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+ unsigned long block_group;
+ ext4_grpblk_t bit;
+ unsigned long i;
+ unsigned long overflow;
+ struct ext4_group_desc * desc;
+ struct ext4_super_block * es;
+ struct ext4_sb_info *sbi;
+ int err = 0, ret;
+ ext4_grpblk_t group_freed;
+
+ *pdquot_freed_blocks = 0;
+ sbi = EXT4_SB(sb);
+ es = sbi->s_es;
+ if (block < le32_to_cpu(es->s_first_data_block) ||
+ block + count < block ||
+ block + count > ext4_blocks_count(es)) {
+ ext4_error (sb, "ext4_free_blocks",
+ "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ goto error_return;
+ }
+
+ ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+
+do_more:
+ overflow = 0;
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ */
+ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+ overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+ count -= overflow;
+ }
+ brelse(bitmap_bh);
+ bitmap_bh = read_block_bitmap(sb, block_group);
+ if (!bitmap_bh)
+ goto error_return;
+ desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+ if (!desc)
+ goto error_return;
+
+ if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+ in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+ in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, desc),
+ sbi->s_itb_per_group))
+ ext4_error (sb, "ext4_free_blocks",
+ "Freeing blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+
+ /*
+ * We are about to start releasing blocks in the bitmap,
+ * so we need undo access.
+ */
+ /* @@@ check errors */
+ BUFFER_TRACE(bitmap_bh, "getting undo access");
+ err = ext4_journal_get_undo_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+
+ jbd_lock_bh_state(bitmap_bh);
+
+ for (i = 0, group_freed = 0; i < count; i++) {
+ /*
+ * An HJ special. This is expensive...
+ */
+#ifdef CONFIG_JBD_DEBUG
+ jbd_unlock_bh_state(bitmap_bh);
+ {
+ struct buffer_head *debug_bh;
+ debug_bh = sb_find_get_block(sb, block + i);
+ if (debug_bh) {
+ BUFFER_TRACE(debug_bh, "Deleted!");
+ if (!bh2jh(bitmap_bh)->b_committed_data)
+ BUFFER_TRACE(debug_bh,
+ "No commited data in bitmap");
+ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
+ __brelse(debug_bh);
+ }
+ }
+ jbd_lock_bh_state(bitmap_bh);
+#endif
+ if (need_resched()) {
+ jbd_unlock_bh_state(bitmap_bh);
+ cond_resched();
+ jbd_lock_bh_state(bitmap_bh);
+ }
+ /* @@@ This prevents newly-allocated data from being
+ * freed and then reallocated within the same
+ * transaction.
+ *
+ * Ideally we would want to allow that to happen, but to
+ * do so requires making jbd2_journal_forget() capable of
+ * revoking the queued write of a data block, which
+ * implies blocking on the journal lock. *forget()
+ * cannot block due to truncate races.
+ *
+ * Eventually we can fix this by making jbd2_journal_forget()
+ * return a status indicating whether or not it was able
+ * to revoke the buffer. On successful revoke, it is
+ * safe not to set the allocation bit in the committed
+ * bitmap, because we know that there is no outstanding
+ * activity on the buffer any more and so it is safe to
+ * reallocate it.
+ */
+ BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
+ J_ASSERT_BH(bitmap_bh,
+ bh2jh(bitmap_bh)->b_committed_data != NULL);
+ ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
+ bh2jh(bitmap_bh)->b_committed_data);
+
+ /*
+ * We clear the bit in the bitmap after setting the committed
+ * data bit, because this is the reverse order to that which
+ * the allocator uses.
+ */
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ bit + i, bitmap_bh->b_data)) {
+ jbd_unlock_bh_state(bitmap_bh);
+ ext4_error(sb, __FUNCTION__,
+ "bit already cleared for block %llu",
+ (ext4_fsblk_t)(block + i));
+ jbd_lock_bh_state(bitmap_bh);
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ group_freed++;
+ }
+ }
+ jbd_unlock_bh_state(bitmap_bh);
+
+ spin_lock(sb_bgl_lock(sbi, block_group));
+ desc->bg_free_blocks_count =
+ cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
+ group_freed);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_journal_dirty_metadata(handle, gd_bh);
+ if (!err) err = ret;
+ *pdquot_freed_blocks += group_freed;
+
+ if (overflow && !err) {
+ block += count;
+ count = overflow;
+ goto do_more;
+ }
+ sb->s_dirt = 1;
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return;
+}
+
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @block: start physical block to free
+ * @count: number of blocks to free
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count)
+{
+ struct super_block * sb;
+ unsigned long dquot_freed_blocks;
+
+ sb = inode->i_sb;
+ if (!sb) {
+ printk ("ext4_free_blocks: nonexistent device");
+ return;
+ }
+ ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+ if (dquot_freed_blocks)
+ DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+ return;
+}
+
+/**
+ * ext4_test_allocatable()
+ * @nr: given allocation block (group relative)
+ * @bh: buffer head containing the bitmap of the given block group
+ *
+ * For ext4 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy. This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes.
+ */
+static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
+{
+ int ret;
+ struct journal_head *jh = bh2jh(bh);
+
+ if (ext4_test_bit(nr, bh->b_data))
+ return 0;
+
+ jbd_lock_bh_state(bh);
+ if (!jh->b_committed_data)
+ ret = 1;
+ else
+ ret = !ext4_test_bit(nr, jh->b_committed_data);
+ jbd_unlock_bh_state(bh);
+ return ret;
+}
+
+/**
+ * bitmap_search_next_usable_block()
+ * @start: the starting block (group relative) of the search
+ * @bh: bufferhead contains the block group bitmap
+ * @maxblocks: the ending block (group relative) of the reservation
+ *
+ * The bitmap search --- search forward alternately through the actual
+ * bitmap on disk and the last-committed copy in journal, until we find a
+ * bit free in both bitmaps.
+ */
+static ext4_grpblk_t
+bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
+ ext4_grpblk_t maxblocks)
+{
+ ext4_grpblk_t next;
+ struct journal_head *jh = bh2jh(bh);
+
+ while (start < maxblocks) {
+ next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
+ if (next >= maxblocks)
+ return -1;
+ if (ext4_test_allocatable(next, bh))
+ return next;
+ jbd_lock_bh_state(bh);
+ if (jh->b_committed_data)
+ start = ext4_find_next_zero_bit(jh->b_committed_data,
+ maxblocks, next);
+ jbd_unlock_bh_state(bh);
+ }
+ return -1;
+}
+
+/**
+ * find_next_usable_block()
+ * @start: the starting block (group relative) to find next
+ * allocatable block in bitmap.
+ * @bh: bufferhead contains the block group bitmap
+ * @maxblocks: the ending block (group relative) for the search
+ *
+ * Find an allocatable block in a bitmap. We honor both the bitmap and
+ * its last-committed copy (if that exists), and perform the "most
+ * appropriate allocation" algorithm of looking for a free block near
+ * the initial goal; then for a free byte somewhere in the bitmap; then
+ * for any free bit in the bitmap.
+ */
+static ext4_grpblk_t
+find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
+ ext4_grpblk_t maxblocks)
+{
+ ext4_grpblk_t here, next;
+ char *p, *r;
+
+ if (start > 0) {
+ /*
+ * The goal was occupied; search forward for a free
+ * block within the next XX blocks.
+ *
+ * end_goal is more or less random, but it has to be
+ * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
+ * next 64-bit boundary is simple..
+ */
+ ext4_grpblk_t end_goal = (start + 63) & ~63;
+ if (end_goal > maxblocks)
+ end_goal = maxblocks;
+ here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
+ if (here < end_goal && ext4_test_allocatable(here, bh))
+ return here;
+ ext4_debug("Bit not found near goal\n");
+ }
+
+ here = start;
+ if (here < 0)
+ here = 0;
+
+ p = ((char *)bh->b_data) + (here >> 3);
+ r = memscan(p, 0, (maxblocks - here + 7) >> 3);
+ next = (r - ((char *)bh->b_data)) << 3;
+
+ if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
+ return next;
+
+ /*
+ * The bitmap search --- search forward alternately through the actual
+ * bitmap and the last-committed copy until we find a bit free in
+ * both
+ */
+ here = bitmap_search_next_usable_block(here, bh, maxblocks);
+ return here;
+}
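+
+/*
+ * Note on the end_goal rounding above: (start + 63) & ~63 rounds the
+ * group-relative goal up to the next multiple of 64, e.g. a start of
+ * 100 gives an end_goal of 128, so the near-goal scan is confined to a
+ * word or two of the bitmap before the code falls back to the
+ * byte-wise memscan() and the full bitmap search.
+ */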
+
+/**
+ * claim_block()
+ * @block: the free block (group relative) to allocate
+ * @bh: the buffer head containing the block group bitmap
+ *
+ * We think we can allocate this block in this bitmap. Try to set the bit.
+ * If that succeeds then check that nobody has allocated and then freed the
+ * block since we saw that it was not marked in b_committed_data. If it _was_
+ * allocated and freed then clear the bit in the bitmap again and return
+ * zero (failure).
+ */
+static inline int
+claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
+{
+ struct journal_head *jh = bh2jh(bh);
+ int ret;
+
+ if (ext4_set_bit_atomic(lock, block, bh->b_data))
+ return 0;
+ jbd_lock_bh_state(bh);
+ if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
+ ext4_clear_bit_atomic(lock, block, bh->b_data);
+ ret = 0;
+ } else {
+ ret = 1;
+ }
+ jbd_unlock_bh_state(bh);
+ return ret;
+}
+
+/**
+ * ext4_try_to_allocate()
+ * @sb: superblock
+ * @handle: handle to this transaction
+ * @group: given allocation block group
+ * @bitmap_bh: bufferhead holds the block bitmap
+ * @grp_goal: given target block within the group
+ * @count: target number of blocks to allocate
+ * @my_rsv: reservation window
+ *
+ * Attempt to allocate blocks within a given range. Set the range of allocation
+ * first, then find the first free bit(s) from the bitmap (within the range),
+ * and finally, allocate the blocks by claiming the found free bit(s) as allocated.
+ *
+ * To set the range of this allocation:
+ * if there is a reservation window, only try to allocate block(s) from the
+ * file's own reservation window;
+ * Otherwise, the allocation range starts from the given goal block and ends at
+ * the block group's last block.
+ *
+ * If we failed to allocate the desired block then we may end up crossing to a
+ * new bitmap. In that case we must release write access to the old one via
+ * ext4_journal_release_buffer(), else we'll run out of credits.
+ */
+static ext4_grpblk_t
+ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
+ struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
+ unsigned long *count, struct ext4_reserve_window *my_rsv)
+{
+ ext4_fsblk_t group_first_block;
+ ext4_grpblk_t start, end;
+ unsigned long num = 0;
+
+ /* we do allocation within the reservation window if we have a window */
+ if (my_rsv) {
+ group_first_block = ext4_group_first_block_no(sb, group);
+ if (my_rsv->_rsv_start >= group_first_block)
+ start = my_rsv->_rsv_start - group_first_block;
+ else
+ /* reservation window cross group boundary */
+ start = 0;
+ end = my_rsv->_rsv_end - group_first_block + 1;
+ if (end > EXT4_BLOCKS_PER_GROUP(sb))
+ /* reservation window crosses group boundary */
+ end = EXT4_BLOCKS_PER_GROUP(sb);
+ if ((start <= grp_goal) && (grp_goal < end))
+ start = grp_goal;
+ else
+ grp_goal = -1;
+ } else {
+ if (grp_goal > 0)
+ start = grp_goal;
+ else
+ start = 0;
+ end = EXT4_BLOCKS_PER_GROUP(sb);
+ }
+
+ BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
+
+repeat:
+ if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
+ grp_goal = find_next_usable_block(start, bitmap_bh, end);
+ if (grp_goal < 0)
+ goto fail_access;
+ if (!my_rsv) {
+ int i;
+
+ for (i = 0; i < 7 && grp_goal > start &&
+ ext4_test_allocatable(grp_goal - 1,
+ bitmap_bh);
+ i++, grp_goal--)
+ ;
+ }
+ }
+ start = grp_goal;
+
+ if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
+ grp_goal, bitmap_bh)) {
+ /*
+ * The block was allocated by another thread, or it was
+ * allocated and then freed by another thread
+ */
+ start++;
+ grp_goal++;
+ if (start >= end)
+ goto fail_access;
+ goto repeat;
+ }
+ num++;
+ grp_goal++;
+ while (num < *count && grp_goal < end
+ && ext4_test_allocatable(grp_goal, bitmap_bh)
+ && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
+ grp_goal, bitmap_bh)) {
+ num++;
+ grp_goal++;
+ }
+ *count = num;
+ return grp_goal - num;
+fail_access:
+ *count = num;
+ return -1;
+}
+
+/**
+ * find_next_reservable_window():
+ * find a reservable space within the given range.
+ * It does not allocate the reservation window for now:
+ * alloc_new_reservation() will do the work later.
+ *
+ * @search_head: the head of the searching list;
+ * This is not necessarily the list head of the whole filesystem
+ *
+ * We have both head and start_block to assist the search
+ * for the reservable space. The list starts from head,
+ * but we will shift to the place where start_block is,
+ * then start from there, when looking for a reservable space.
+ *
+ * @size: the target new reservation window size
+ *
+ * @group_first_block: the first block we consider to start
+ * the real search from
+ *
+ * @last_block:
+ * the maximum block number that our goal reservable space
+ * could start from. This is normally the last block in this
+ * group. The search ends when we find that the start of the
+ * next possible reservable space is beyond this boundary.
+ * This handles the case where the reservation window
+ * request crosses the group boundary.
+ *
+ * Basically we search the given range (start_block, last_block),
+ * rather than the whole per-filesystem reservation tree, to find
+ * a free region that is of the requested size and has not
+ * been reserved.
+ *
+ */
+static int find_next_reservable_window(
+ struct ext4_reserve_window_node *search_head,
+ struct ext4_reserve_window_node *my_rsv,
+ struct super_block * sb,
+ ext4_fsblk_t start_block,
+ ext4_fsblk_t last_block)
+{
+ struct rb_node *next;
+ struct ext4_reserve_window_node *rsv, *prev;
+ ext4_fsblk_t cur;
+ int size = my_rsv->rsv_goal_size;
+
+ /* TODO: make the start of the reservation window byte-aligned */
+ /* cur = *start_block & ~7;*/
+ cur = start_block;
+ rsv = search_head;
+ if (!rsv)
+ return -1;
+
+ while (1) {
+ if (cur <= rsv->rsv_end)
+ cur = rsv->rsv_end + 1;
+
+ /* TODO?
+ * In the case that we cannot find a reservable space of the
+ * expected size, we could, during the re-search, remember the
+ * largest reservable space we have seen and return that one
+ * instead.
+ *
+ * For now it fails if we cannot find a reservable space of the
+ * expected size (or more)...
+ */
+ if (cur > last_block)
+ return -1; /* fail */
+
+ prev = rsv;
+ next = rb_next(&rsv->rsv_node);
+ rsv = list_entry(next,struct ext4_reserve_window_node,rsv_node);
+
+ /*
+ * Reached the last reservation, we can just append to the
+ * previous one.
+ */
+ if (!next)
+ break;
+
+ if (cur + size <= rsv->rsv_start) {
+ /*
+ * Found a reservable space big enough. We could
+ * have a reservation across the group boundary here
+ */
+ break;
+ }
+ }
+ /*
+ * We come here either:
+ * when we reach the end of the whole list and there is empty
+ * reservable space after the last entry in the list, in which
+ * case we append the new window to the end of the list;
+ *
+ * or when we find a reservable space in the middle of the list,
+ * in which case we return the reservation window that we could
+ * append to. Either way we succeed.
+ */
+
+ if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
+ rsv_window_remove(sb, my_rsv);
+
+ /*
+ * Let's book the whole available window for now. We will check the
+ * disk bitmap later and then, if there are free blocks then we adjust
+ * the window size if it's larger than requested.
+ * Otherwise, we will remove this node from the tree the next time
+ * find_next_reservable_window() is called.
+ */
+ my_rsv->rsv_start = cur;
+ my_rsv->rsv_end = cur + size - 1;
+ my_rsv->rsv_alloc_hit = 0;
+
+ if (prev != my_rsv)
+ ext4_rsv_window_add(sb, my_rsv);
+
+ return 0;
+}
+
+/**
+ * alloc_new_reservation()--allocate a new reservation window
+ *
+ * To make a new reservation, we search part of the filesystem
+ * reservation list (the part of the list inside the group). We try to
+ * allocate a new reservation window near the allocation goal,
+ * or at the beginning of the group if there is no goal.
+ *
+ * We first find a reservable space after the goal, then from
+ * there, we check the bitmap for the first free block after
+ * it. If there is no free block until the end of the group, then the
+ * whole group is full and we fail. Otherwise, we check whether the free
+ * block is inside the expected reservable space; if so, we
+ * succeed.
+ * If the first free block is outside the reservable space, we
+ * start from that free block, search for the next available
+ * space, and go on.
+ *
+ * On success, a new reservation is found and inserted into the list.
+ * It contains at least one free block, and it does not overlap with other
+ * reservation windows.
+ *
+ * On failure, we failed to find a reservation window in this group.
+ *
+ * @rsv: the reservation
+ *
+ * @grp_goal: The goal (group-relative). It is where the search for a
+ * free reservable space should start from.
+ * If we have a grp_goal (grp_goal > 0), start from there;
+ * with no grp_goal (grp_goal = -1), start from the first block
+ * of the group.
+ *
+ * @sb: the super block
+ * @group: the group we are trying to allocate in
+ * @bitmap_bh: the block group block bitmap
+ *
+ */
+static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
+ ext4_grpblk_t grp_goal, struct super_block *sb,
+ unsigned int group, struct buffer_head *bitmap_bh)
+{
+ struct ext4_reserve_window_node *search_head;
+ ext4_fsblk_t group_first_block, group_end_block, start_block;
+ ext4_grpblk_t first_free_block;
+ struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
+ unsigned long size;
+ int ret;
+ spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
+
+ group_first_block = ext4_group_first_block_no(sb, group);
+ group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+ if (grp_goal < 0)
+ start_block = group_first_block;
+ else
+ start_block = grp_goal + group_first_block;
+
+ size = my_rsv->rsv_goal_size;
+
+ if (!rsv_is_empty(&my_rsv->rsv_window)) {
+ /*
+ * If the old reservation crosses the group boundary
+ * and the goal is inside the old reservation window,
+ * we come here when we just failed to allocate from
+ * the first part of the window. We still have another part
+ * that belongs to the next group. In this case, there is no
+ * point in discarding our window and trying to allocate a new one
+ * in this group (which will fail). We should
+ * keep the reservation window and simply move on.
+ *
+ * Maybe we could shift the start block of the reservation
+ * window to the first block of next group.
+ */
+
+ if ((my_rsv->rsv_start <= group_end_block) &&
+ (my_rsv->rsv_end > group_end_block) &&
+ (start_block >= my_rsv->rsv_start))
+ return -1;
+
+ if ((my_rsv->rsv_alloc_hit >
+ (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
+ /*
+ * if the previous allocation hit ratio is
+ * greater than 1/2, then we double the size of
+ * the reservation window the next time;
+ * otherwise we keep the same window size
+ */
+ size = size * 2;
+ if (size > EXT4_MAX_RESERVE_BLOCKS)
+ size = EXT4_MAX_RESERVE_BLOCKS;
+ my_rsv->rsv_goal_size= size;
+ }
+ }
+
+ spin_lock(rsv_lock);
+ /*
+ * shift the search start to the window near the goal block
+ */
+ search_head = search_reserve_window(fs_rsv_root, start_block);
+
+ /*
+ * find_next_reservable_window() simply finds a reservable window
+ * inside the given range(start_block, group_end_block).
+ *
+ * To make sure the reservation window has a free bit inside it, we
+ * need to check the bitmap after we found a reservable window.
+ */
+retry:
+ ret = find_next_reservable_window(search_head, my_rsv, sb,
+ start_block, group_end_block);
+
+ if (ret == -1) {
+ if (!rsv_is_empty(&my_rsv->rsv_window))
+ rsv_window_remove(sb, my_rsv);
+ spin_unlock(rsv_lock);
+ return -1;
+ }
+
+ /*
+ * On success, find_next_reservable_window() returns the
+ * reservation window where there is a reservable space after it.
+ * Before we reserve this reservable space, we need
+ * to make sure there is at least a free block inside this region.
+ *
+ * We search for the first free bit on the block bitmap and the copy of
+ * the last committed bitmap alternately, until we find an allocatable
+ * block. The search starts from the start block of the reservable space
+ * we just found.
+ */
+ spin_unlock(rsv_lock);
+ first_free_block = bitmap_search_next_usable_block(
+ my_rsv->rsv_start - group_first_block,
+ bitmap_bh, group_end_block - group_first_block + 1);
+
+ if (first_free_block < 0) {
+ /*
+ * no free block left on the bitmap, no point
+ * to reserve the space. return failed.
+ */
+ spin_lock(rsv_lock);
+ if (!rsv_is_empty(&my_rsv->rsv_window))
+ rsv_window_remove(sb, my_rsv);
+ spin_unlock(rsv_lock);
+ return -1; /* failed */
+ }
+
+ start_block = first_free_block + group_first_block;
+ /*
+ * check if the first free block is within the
+ * free space we just reserved
+ */
+ if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
+ return 0; /* success */
+ /*
+ * if the first free bit we found is out of the reservable space
+ * continue search for next reservable space,
+ * start from where the free block is,
+ * we also shift the list head to where we stopped last time
+ */
+ search_head = my_rsv;
+ spin_lock(rsv_lock);
+ goto retry;
+}
+
+/**
+ * try_to_extend_reservation()
+ * @my_rsv: given reservation window
+ * @sb: super block
+ * @size: the delta to extend
+ *
+ * Attempt to expand the reservation window so that it is large enough
+ * to hold the required number of free blocks.
+ *
+ * Since ext4_try_to_allocate() will always allocate blocks within
+ * the reservation window range, if the window size is too small,
+ * multiple blocks allocation has to stop at the end of the reservation
+ * window. To make this more efficient, given the total number of
+ * blocks needed and the current size of the window, we try to
+ * expand the reservation window size if necessary on a best-effort
+ * basis before ext4_new_blocks() tries to allocate blocks.
+ */
+static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
+ struct super_block *sb, int size)
+{
+ struct ext4_reserve_window_node *next_rsv;
+ struct rb_node *next;
+ spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
+
+ if (!spin_trylock(rsv_lock))
+ return;
+
+ next = rb_next(&my_rsv->rsv_node);
+
+ if (!next)
+ my_rsv->rsv_end += size;
+ else {
+ next_rsv = list_entry(next, struct ext4_reserve_window_node, rsv_node);
+
+ if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
+ my_rsv->rsv_end += size;
+ else
+ my_rsv->rsv_end = next_rsv->rsv_start - 1;
+ }
+ spin_unlock(rsv_lock);
+}
+
+/**
+ * ext4_try_to_allocate_with_rsv()
+ * @sb: superblock
+ * @handle: handle to this transaction
+ * @group: given allocation block group
+ * @bitmap_bh: bufferhead holds the block bitmap
+ * @grp_goal: given target block within the group
+ * @count: target number of blocks to allocate
+ * @my_rsv: reservation window
+ * @errp: pointer to store the error code
+ *
+ * This is the main function used to allocate a new block and its reservation
+ * window.
+ *
+ * Each time a new block allocation is needed, we first try to allocate from
+ * the inode's own reservation. If it does not have a reservation window, then,
+ * instead of looking for a free bit in the bitmap first and then looking up the
+ * reservation list to see whether that bit is inside somebody else's reservation
+ * window, we try to allocate a reservation window for the inode, starting from
+ * the goal. We then do the block allocation within that reservation window.
+ *
+ * This will avoid keeping on searching the reservation list again and
+ * again when somebody is looking for a free block (without
+ * reservation), and there are lots of free blocks, but they are all
+ * being reserved.
+ *
+ * We use a red-black tree for the per-filesystem reservation list.
+ *
+ */
+static ext4_grpblk_t
+ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
+ unsigned int group, struct buffer_head *bitmap_bh,
+ ext4_grpblk_t grp_goal,
+ struct ext4_reserve_window_node * my_rsv,
+ unsigned long *count, int *errp)
+{
+ ext4_fsblk_t group_first_block, group_last_block;
+ ext4_grpblk_t ret = 0;
+ int fatal;
+ unsigned long num = *count;
+
+ *errp = 0;
+
+ /*
+ * Make sure we use undo access for the bitmap, because it is critical
+ * that we do the frozen_data COW on bitmap buffers in all cases even
+ * if the buffer is in BJ_Forget state in the committing transaction.
+ */
+ BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+ fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
+ if (fatal) {
+ *errp = fatal;
+ return -1;
+ }
+
+ /*
+ * We don't deal with reservations when
+ * the filesystem is mounted without reservations,
+ * or the file is not a regular file,
+ * or the last attempt to allocate a block with reservations turned on failed.
+ */
+ if (my_rsv == NULL ) {
+ ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
+ grp_goal, count, NULL);
+ goto out;
+ }
+ /*
+ * grp_goal is a group relative block number (if there is a goal)
+ * 0 < grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
+ * first block is a filesystem wide block number
+ * first block is the block number of the first block in this group
+ */
+ group_first_block = ext4_group_first_block_no(sb, group);
+ group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+ /*
+ * Basically we will allocate a new block from inode's reservation
+ * window.
+ *
+ * We need to allocate a new reservation window, if:
+ * a) inode does not have a reservation window; or
+ * b) last attempt to allocate a block from existing reservation
+ * failed; or
+ * c) we come here with a goal and with a reservation window
+ *
+ * We do not need to allocate a new reservation window if we come here
+ * at the beginning with a goal and the goal is inside the window, or
+ * if we don't have a goal but already have a reservation window.
+ * In either case we can allocate directly from the reservation window.
+ */
+ while (1) {
+ if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
+ !goal_in_my_reservation(&my_rsv->rsv_window,
+ grp_goal, group, sb)) {
+ if (my_rsv->rsv_goal_size < *count)
+ my_rsv->rsv_goal_size = *count;
+ ret = alloc_new_reservation(my_rsv, grp_goal, sb,
+ group, bitmap_bh);
+ if (ret < 0)
+ break; /* failed */
+
+ if (!goal_in_my_reservation(&my_rsv->rsv_window,
+ grp_goal, group, sb))
+ grp_goal = -1;
+ } else if (grp_goal > 0 &&
+ (my_rsv->rsv_end-grp_goal+1) < *count)
+ try_to_extend_reservation(my_rsv, sb,
+ *count-my_rsv->rsv_end + grp_goal - 1);
+
+ if ((my_rsv->rsv_start > group_last_block) ||
+ (my_rsv->rsv_end < group_first_block)) {
+ rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
+ BUG();
+ }
+ ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
+ grp_goal, &num, &my_rsv->rsv_window);
+ if (ret >= 0) {
+ my_rsv->rsv_alloc_hit += num;
+ *count = num;
+ break; /* succeed */
+ }
+ num = *count;
+ }
+out:
+ if (ret >= 0) {
+ BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
+ "bitmap block");
+ fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ if (fatal) {
+ *errp = fatal;
+ return -1;
+ }
+ return ret;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+ ext4_journal_release_buffer(handle, bitmap_bh);
+ return ret;
+}
+
+/**
+ * ext4_has_free_blocks()
+ * @sbi: in-core super block structure.
+ *
+ * Check if filesystem has at least 1 free block available for allocation.
+ */
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+{
+ ext4_fsblk_t free_blocks, root_blocks;
+
+ free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+ if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+ return 0;
+ }
+ return 1;
+}
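+
+/*
+ * Example (hypothetical numbers): with 5% of a 1,000,000-block
+ * filesystem reserved (50,000 blocks), allocations by ordinary users
+ * start failing here once the free-block counter drops to 50,000 or
+ * below, while root (or the configured resuid/resgid owner) can still
+ * allocate from the reserved tail.
+ */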
+
+/**
+ * ext4_should_retry_alloc()
+ * @sb: super block
+ * @retries: number of attempts that have been made
+ *
+ * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
+ * it is profitable to retry the operation, this function will wait
+ * for the current or committing transaction to complete, and then
+ * return TRUE.
+ *
+ * If the total number of retries exceeds three, return FALSE.
+ */
+int ext4_should_retry_alloc(struct super_block *sb, int *retries)
+{
+ if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ return 0;
+
+ jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+
+ return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+}
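+
+/*
+ * A minimal usage sketch (hypothetical caller; the helper name is
+ * invented here): callers that see -ENOSPC typically loop like this,
+ * letting ext4_should_retry_alloc() decide whether waiting for the
+ * journal to commit is worthwhile.
+ */
+#if 0
+static int example_alloc_with_retry(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, ext4_fsblk_t *block)
+{
+ int err, retries = 0;
+
+retry:
+ *block = ext4_new_block(handle, inode, goal, &err);
+ if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ return err;
+}
+#endif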
+
+/**
+ * ext4_new_blocks() -- core block(s) allocation function
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: target number of blocks to allocate
+ * @errp: error code
+ *
+ * ext4_new_blocks() uses a goal block to assist allocation. It tries to
+ * allocate block(s) from the block group containing the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups without
+ * any specific goal block.
+ *
+ */
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, unsigned long *count, int *errp)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gdp_bh;
+ unsigned long group_no;
+ int goal_group;
+ ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
+ ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
+ ext4_fsblk_t ret_block; /* filesystem-wide allocated block */
+ int bgi; /* blockgroup iteration index */
+ int fatal = 0, err;
+ int performed_allocation = 0;
+ ext4_grpblk_t free_blocks; /* number of free blocks in a group */
+ struct super_block *sb;
+ struct ext4_group_desc *gdp;
+ struct ext4_super_block *es;
+ struct ext4_sb_info *sbi;
+ struct ext4_reserve_window_node *my_rsv = NULL;
+ struct ext4_block_alloc_info *block_i;
+ unsigned short windowsz = 0;
+#ifdef EXT4FS_DEBUG
+ static int goal_hits, goal_attempts;
+#endif
+ unsigned long ngroups;
+ unsigned long num = *count;
+
+ *errp = -ENOSPC;
+ sb = inode->i_sb;
+ if (!sb) {
+ printk("ext4_new_block: nonexistent device");
+ return 0;
+ }
+
+ /*
+ * Check quota for allocation of this block.
+ */
+ if (DQUOT_ALLOC_BLOCK(inode, num)) {
+ *errp = -EDQUOT;
+ return 0;
+ }
+
+ sbi = EXT4_SB(sb);
+ es = EXT4_SB(sb)->s_es;
+ ext4_debug("goal=%lu.\n", goal);
+ /*
+ * Allocate a block from reservation only when
+ * the filesystem is mounted with reservations (the default, -o reservation), and
+ * it's a regular file, and
+ * the desired window size is greater than 0 (One could use ioctl
+ * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
+ * reservation on that particular file)
+ */
+ block_i = EXT4_I(inode)->i_block_alloc_info;
+ if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
+ my_rsv = &block_i->rsv_window_node;
+
+ if (!ext4_has_free_blocks(sbi)) {
+ *errp = -ENOSPC;
+ goto out;
+ }
+
+ /*
+ * First, test whether the goal block is free.
+ */
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+ ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
+ goal_group = group_no;
+retry_alloc:
+ gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
+ if (!gdp)
+ goto io_error;
+
+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+ /*
+ * if there are not enough free blocks to make a new reservation,
+ * turn off reservations for this allocation
+ */
+ if (my_rsv && (free_blocks < windowsz)
+ && (rsv_is_empty(&my_rsv->rsv_window)))
+ my_rsv = NULL;
+
+ if (free_blocks > 0) {
+ bitmap_bh = read_block_bitmap(sb, group_no);
+ if (!bitmap_bh)
+ goto io_error;
+ grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
+ group_no, bitmap_bh, grp_target_blk,
+ my_rsv, &num, &fatal);
+ if (fatal)
+ goto out;
+ if (grp_alloc_blk >= 0)
+ goto allocated;
+ }
+
+ ngroups = EXT4_SB(sb)->s_groups_count;
+ smp_rmb();
+
+ /*
+ * Now search the rest of the groups. We assume that
+ * i and gdp correctly point to the last group visited.
+ */
+ for (bgi = 0; bgi < ngroups; bgi++) {
+ group_no++;
+ if (group_no >= ngroups)
+ group_no = 0;
+ gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
+ if (!gdp) {
+ *errp = -EIO;
+ goto out;
+ }
+ free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+ /*
+ * skip this group if the number of
+ * free blocks is less than half of the reservation
+ * window size.
+ */
+ if (free_blocks <= (windowsz/2))
+ continue;
+
+ brelse(bitmap_bh);
+ bitmap_bh = read_block_bitmap(sb, group_no);
+ if (!bitmap_bh)
+ goto io_error;
+ /*
+ * try to allocate block(s) from this group, without a goal(-1).
+ */
+ grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
+ group_no, bitmap_bh, -1, my_rsv,
+ &num, &fatal);
+ if (fatal)
+ goto out;
+ if (grp_alloc_blk >= 0)
+ goto allocated;
+ }
+ /*
+ * We may end up with a bogus earlier ENOSPC error because the
+ * filesystem is "full" of reservations, while there may in fact
+ * be free blocks available on disk.
+ * In this case, we just forget about the reservations
+ * and do the block allocation as if there were no reservations.
+ */
+ if (my_rsv) {
+ my_rsv = NULL;
+ group_no = goal_group;
+ goto retry_alloc;
+ }
+ /* No space left on the device */
+ *errp = -ENOSPC;
+ goto out;
+
+allocated:
+
+ ext4_debug("using block group %d(%d)\n",
+ group_no, gdp->bg_free_blocks_count);
+
+ BUFFER_TRACE(gdp_bh, "get_write_access");
+ fatal = ext4_journal_get_write_access(handle, gdp_bh);
+ if (fatal)
+ goto out;
+
+ ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
+
+ if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
+ in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
+ in_range(ret_block, ext4_inode_table(sb, gdp),
+ EXT4_SB(sb)->s_itb_per_group) ||
+ in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
+ EXT4_SB(sb)->s_itb_per_group))
+ ext4_error(sb, "ext4_new_block",
+ "Allocating block in system zone - "
+ "blocks from %llu, length %lu",
+ ret_block, num);
+
+ performed_allocation = 1;
+
+#ifdef CONFIG_JBD_DEBUG
+ {
+ struct buffer_head *debug_bh;
+
+ /* Record bitmap buffer state in the newly allocated block */
+ debug_bh = sb_find_get_block(sb, ret_block);
+ if (debug_bh) {
+ BUFFER_TRACE(debug_bh, "state when allocated");
+ BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
+ brelse(debug_bh);
+ }
+ }
+ jbd_lock_bh_state(bitmap_bh);
+ spin_lock(sb_bgl_lock(sbi, group_no));
+ if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
+ int i;
+
+ for (i = 0; i < num; i++) {
+ if (ext4_test_bit(grp_alloc_blk+i,
+ bh2jh(bitmap_bh)->b_committed_data)) {
+ printk("%s: block was unexpectedly set in "
+ "b_committed_data\n", __FUNCTION__);
+ }
+ }
+ }
+ ext4_debug("found bit %d\n", grp_alloc_blk);
+ spin_unlock(sb_bgl_lock(sbi, group_no));
+ jbd_unlock_bh_state(bitmap_bh);
+#endif
+
+ if (ret_block + num - 1 >= ext4_blocks_count(es)) {
+ ext4_error(sb, "ext4_new_block",
+ "block(%llu) >= blocks count(%llu) - "
+ "block_group = %lu, es == %p ", ret_block,
+ ext4_blocks_count(es), group_no, es);
+ goto out;
+ }
+
+ /*
+ * It is up to the caller to add the new buffer to a journal
+ * list of some description. We don't know in advance whether
+ * the caller wants to use it as metadata or data.
+ */
+ ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
+ ret_block, goal_hits, goal_attempts);
+
+ spin_lock(sb_bgl_lock(sbi, group_no));
+ gdp->bg_free_blocks_count =
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
+ spin_unlock(sb_bgl_lock(sbi, group_no));
+ percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+
+ BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
+ err = ext4_journal_dirty_metadata(handle, gdp_bh);
+ if (!fatal)
+ fatal = err;
+
+ sb->s_dirt = 1;
+ if (fatal)
+ goto out;
+
+ *errp = 0;
+ brelse(bitmap_bh);
+ DQUOT_FREE_BLOCK(inode, *count-num);
+ *count = num;
+ return ret_block;
+
+io_error:
+ *errp = -EIO;
+out:
+ if (fatal) {
+ *errp = fatal;
+ ext4_std_error(sb, fatal);
+ }
+ /*
+ * Undo the block allocation
+ */
+ if (!performed_allocation)
+ DQUOT_FREE_BLOCK(inode, *count);
+ brelse(bitmap_bh);
+ return 0;
+}
+
+ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+
+ return ext4_new_blocks(handle, inode, goal, &count, errp);
+}
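+
+/*
+ * A minimal usage sketch (hypothetical caller; the helper name is
+ * invented here) showing the in/out semantics of @count in
+ * ext4_new_blocks(): on success it is updated to the number of blocks
+ * actually allocated, which may be fewer than requested.
+ */
+#if 0
+static ext4_fsblk_t example_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal)
+{
+ unsigned long count = 8; /* ask for up to 8 contiguous blocks */
+ ext4_fsblk_t first;
+ int err;
+
+ first = ext4_new_blocks(handle, inode, goal, &count, &err);
+ if (err)
+ return 0; /* nothing allocated; err holds the reason */
+ printk(KERN_DEBUG "allocated %lu block(s) starting at %llu\n",
+ count, first);
+ return first;
+}
+#endif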
+
+/**
+ * ext4_count_free_blocks() -- count filesystem free blocks
+ * @sb: superblock
+ *
+ * Adds up the number of free blocks from each block group.
+ */
+ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
+{
+ ext4_fsblk_t desc_count;
+ struct ext4_group_desc *gdp;
+ int i;
+ unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
+#ifdef EXT4FS_DEBUG
+ struct ext4_super_block *es;
+ ext4_fsblk_t bitmap_count;
+ unsigned long x;
+ struct buffer_head *bitmap_bh = NULL;
+
+ es = EXT4_SB(sb)->s_es;
+ desc_count = 0;
+ bitmap_count = 0;
+ gdp = NULL;
+
+ smp_rmb();
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ brelse(bitmap_bh);
+ bitmap_bh = read_block_bitmap(sb, i);
+ if (bitmap_bh == NULL)
+ continue;
+
+ x = ext4_count_free(bitmap_bh, sb->s_blocksize);
+ printk("group %d: stored = %d, counted = %lu\n",
+ i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+ bitmap_count += x;
+ }
+ brelse(bitmap_bh);
+ printk("ext4_count_free_blocks: stored = %llu"
+ ", computed = %llu, %llu\n",
+ EXT4_FREE_BLOCKS_COUNT(es),
+ desc_count, bitmap_count);
+ return bitmap_count;
+#else
+ desc_count = 0;
+ smp_rmb();
+ for (i = 0; i < ngroups; i++) {
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return desc_count;
+#endif
+}
+
+static inline int
+block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
+{
+ ext4_grpblk_t offset;
+
+ ext4_get_group_no_and_offset(sb, block, NULL, &offset);
+ return ext4_test_bit (offset, map);
+}
+
+static inline int test_root(int a, int b)
+{
+ int num = b;
+
+ while (a > num)
+ num *= b;
+ return num == a;
+}
+
+static int ext4_group_sparse(int group)
+{
+ if (group <= 1)
+ return 1;
+ if (!(group & 1))
+ return 0;
+ return (test_root(group, 7) || test_root(group, 5) ||
+ test_root(group, 3));
+}
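+
+/*
+ * With the sparse_super feature (the common case), only groups 0, 1
+ * and powers of 3, 5 and 7 carry superblock/descriptor backups, e.g.
+ * groups 0, 1, 3, 5, 7, 9, 25, 27, 49, 81, 125, 243, ... on a large
+ * enough filesystem; every other group returns 0 here and in
+ * ext4_bg_has_super() below.
+ */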
+
+/**
+ * ext4_bg_has_super - number of blocks used by the superblock in group
+ * @sb: superblock for filesystem
+ * @group: group number to check
+ *
+ * Return the number of blocks used by the superblock (primary or backup)
+ * in this group. Currently this will be only 0 or 1.
+ */
+int ext4_bg_has_super(struct super_block *sb, int group)
+{
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
+ !ext4_group_sparse(group))
+ return 0;
+ return 1;
+}
+
+static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
+{
+ unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+ unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
+ unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
+
+ if (group == first || group == first + 1 || group == last)
+ return 1;
+ return 0;
+}
+
+static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
+{
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
+ !ext4_group_sparse(group))
+ return 0;
+ return EXT4_SB(sb)->s_gdb_count;
+}
+
+/**
+ * ext4_bg_num_gdb - number of blocks used by the group table in group
+ * @sb: superblock for filesystem
+ * @group: group number to check
+ *
+ * Return the number of blocks used by the group descriptor table
+ * (primary or backup) in this group. In the future there may be a
+ * different number of descriptor blocks in each group.
+ */
+unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
+{
+ unsigned long first_meta_bg =
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
+ metagroup < first_meta_bg)
+ return ext4_bg_num_gdb_nometa(sb,group);
+
+ return ext4_bg_num_gdb_meta(sb,group);
+
+}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 000000000000..11e93c169bcf
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,32 @@
+/*
+ * linux/fs/ext4/bitmap.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+
+#ifdef EXT4FS_DEBUG
+
+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
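+/*
+ * nibblemap[n] is the number of zero bits in the 4-bit value n, so a
+ * bitmap byte of 0xa5, for example, contributes nibblemap[0x5] +
+ * nibblemap[0xa] == 2 + 2 == 4 free (zero) bits to the running sum below.
+ */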
+
+unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
+{
+ unsigned int i;
+ unsigned long sum = 0;
+
+ if (!map)
+ return (0);
+ for (i = 0; i < numchars; i++)
+ sum += nibblemap[map->b_data[i] & 0xf] +
+ nibblemap[(map->b_data[i] >> 4) & 0xf];
+ return (sum);
+}
+
+#endif /* EXT4FS_DEBUG */
+
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 000000000000..f8595787a70e
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,518 @@
+/*
+ * linux/fs/ext4/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/dir.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4 directory handling functions
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Hash Tree Directory indexing (c) 2001 Daniel Phillips
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+static unsigned char ext4_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ext4_readdir(struct file *, void *, filldir_t);
+static int ext4_dx_readdir(struct file * filp,
+ void * dirent, filldir_t filldir);
+static int ext4_release_dir (struct inode * inode,
+ struct file * filp);
+
+const struct file_operations ext4_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = ext4_readdir, /* we take the BKL. needed? */
+ .ioctl = ext4_ioctl, /* BKL held */
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .fsync = ext4_sync_file, /* BKL held */
+#ifdef CONFIG_EXT4_INDEX
+ .release = ext4_release_dir,
+#endif
+};
+
+
+static unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+ (filetype >= EXT4_FT_MAX))
+ return DT_UNKNOWN;
+
+ return (ext4_filetype_table[filetype]);
+}
+
+
+int ext4_check_dir_entry (const char * function, struct inode * dir,
+ struct ext4_dir_entry_2 * de,
+ struct buffer_head * bh,
+ unsigned long offset)
+{
+ const char * error_msg = NULL;
+ const int rlen = le16_to_cpu(de->rec_len);
+
+ if (rlen < EXT4_DIR_REC_LEN(1))
+ error_msg = "rec_len is smaller than minimal";
+ else if (rlen % 4 != 0)
+ error_msg = "rec_len % 4 != 0";
+ else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+ error_msg = "rec_len is too small for name_len";
+ else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+ error_msg = "directory entry across blocks";
+ else if (le32_to_cpu(de->inode) >
+ le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+ error_msg = "inode out of bounds";
+
+ if (error_msg != NULL)
+ ext4_error (dir->i_sb, function,
+ "bad entry in directory #%lu: %s - "
+ "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error_msg, offset,
+ (unsigned long) le32_to_cpu(de->inode),
+ rlen, de->name_len);
+ return error_msg == NULL ? 1 : 0;
+}
+
+static int ext4_readdir(struct file * filp,
+ void * dirent, filldir_t filldir)
+{
+ int error = 0;
+ unsigned long offset;
+ int i, stored;
+ struct ext4_dir_entry_2 *de;
+ struct super_block *sb;
+ int err;
+ struct inode *inode = filp->f_dentry->d_inode;
+ int ret = 0;
+
+ sb = inode->i_sb;
+
+#ifdef CONFIG_EXT4_INDEX
+ if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+ ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+ err = ext4_dx_readdir(filp, dirent, filldir);
+ if (err != ERR_BAD_DX_DIR) {
+ ret = err;
+ goto out;
+ }
+ /*
+ * We don't set the inode dirty flag since it's not
+ * critical that it get flushed back to the disk.
+ */
+ EXT4_I(filp->f_dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+ }
+#endif
+ stored = 0;
+ offset = filp->f_pos & (sb->s_blocksize - 1);
+
+ while (!error && !stored && filp->f_pos < inode->i_size) {
+ unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ struct buffer_head map_bh;
+ struct buffer_head *bh = NULL;
+
+ map_bh.b_state = 0;
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ if (err > 0) {
+ page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
+ &filp->f_ra,
+ filp,
+ map_bh.b_blocknr >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits),
+ 1);
+ bh = ext4_bread(NULL, inode, blk, 0, &err);
+ }
+
+ /*
+ * We ignore I/O errors on directories so users have a chance
+ * of recovering data when there's a bad sector
+ */
+ if (!bh) {
+ ext4_error (sb, "ext4_readdir",
+ "directory #%lu contains a hole at offset %lu",
+ inode->i_ino, (unsigned long)filp->f_pos);
+ filp->f_pos += sb->s_blocksize - offset;
+ continue;
+ }
+
+revalidate:
+ /* If the dir block has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the block
+ * to make sure. */
+ if (filp->f_version != inode->i_version) {
+ for (i = 0; i < sb->s_blocksize && i < offset; ) {
+ de = (struct ext4_dir_entry_2 *)
+ (bh->b_data + i);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (le16_to_cpu(de->rec_len) <
+ EXT4_DIR_REC_LEN(1))
+ break;
+ i += le16_to_cpu(de->rec_len);
+ }
+ offset = i;
+ filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ | offset;
+ filp->f_version = inode->i_version;
+ }
+
+ while (!error && filp->f_pos < inode->i_size
+ && offset < sb->s_blocksize) {
+ de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
+ if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
+ bh, offset)) {
+ /*
+ * On error, skip the f_pos to the next block
+ */
+ filp->f_pos = (filp->f_pos |
+ (sb->s_blocksize - 1)) + 1;
+ brelse (bh);
+ ret = stored;
+ goto out;
+ }
+ offset += le16_to_cpu(de->rec_len);
+ if (le32_to_cpu(de->inode)) {
+ /* We might block in the next section
+ * if the data destination is
+ * currently swapped out. So, use a
+ * version stamp to detect whether or
+ * not the directory has been modified
+ * during the copy operation.
+ */
+ unsigned long version = filp->f_version;
+
+ error = filldir(dirent, de->name,
+ de->name_len,
+ filp->f_pos,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type));
+ if (error)
+ break;
+ if (version != filp->f_version)
+ goto revalidate;
+ stored ++;
+ }
+ filp->f_pos += le16_to_cpu(de->rec_len);
+ }
+ offset = 0;
+ brelse (bh);
+ }
+out:
+ return ret;
+}
+
+#ifdef CONFIG_EXT4_INDEX
+/*
+ * These functions convert from the major/minor hash to an f_pos
+ * value.
+ *
+ * Currently we only use the major hash number. This is unfortunate, but
+ * on 32-bit machines, the same VFS interface is used for lseek and
+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
+ * lseek/telldir/seekdir will blow out spectacularly, and from within
+ * the ext4 low-level routine, we don't know if we're being called by
+ * a 64-bit version of the system call or the 32-bit version of the
+ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
+ * cookie. Sigh.
+ */
+#define hash2pos(major, minor) (major >> 1)
+#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
+#define pos2min_hash(pos) (0)
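+
+/*
+ * A rough worked example of what the macros above imply (the hash values
+ * are made up for illustration): for major == 0x6a30f1d8, minor == 0x00c4,
+ *
+ *     pos = hash2pos(0x6a30f1d8, 0x00c4)   -> 0x351878ec
+ *     pos2maj_hash(pos)                    -> 0x6a30f1d8
+ *     pos2min_hash(pos)                    -> 0
+ *
+ * In general pos2maj_hash(hash2pos(major, minor)) == (major & ~1) and the
+ * minor hash is dropped, so a telldir()/seekdir() cookie only resumes at
+ * major hash granularity, but always fits in 31 bits for the 32-bit
+ * lseek/NFSv2 cases described above.
+ */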
+
+/*
+ * This structure holds the nodes of the red-black tree used to store
+ * the directory entry in hash order.
+ */
+struct fname {
+ __u32 hash;
+ __u32 minor_hash;
+ struct rb_node rb_hash;
+ struct fname *next;
+ __u32 inode;
+ __u8 name_len;
+ __u8 file_type;
+ char name[0];
+};
+
+/*
+ * This function implements a non-recursive way of freeing all of the
+ * nodes in the red-black tree.
+ */
+static void free_rb_tree_fname(struct rb_root *root)
+{
+ struct rb_node *n = root->rb_node;
+ struct rb_node *parent;
+ struct fname *fname;
+
+ while (n) {
+ /* Do the node's children first */
+ if ((n)->rb_left) {
+ n = n->rb_left;
+ continue;
+ }
+ if (n->rb_right) {
+ n = n->rb_right;
+ continue;
+ }
+ /*
+ * The node has no children; free it, and then zero
+ * out parent's link to it. Finally go to the
+ * beginning of the loop and try to free the parent
+ * node.
+ */
+ parent = rb_parent(n);
+ fname = rb_entry(n, struct fname, rb_hash);
+ while (fname) {
+ struct fname * old = fname;
+ fname = fname->next;
+ kfree (old);
+ }
+ if (!parent)
+ root->rb_node = NULL;
+ else if (parent->rb_left == n)
+ parent->rb_left = NULL;
+ else if (parent->rb_right == n)
+ parent->rb_right = NULL;
+ n = parent;
+ }
+ root->rb_node = NULL;
+}
+
+
+static struct dir_private_info *create_dir_info(loff_t pos)
+{
+ struct dir_private_info *p;
+
+ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ if (!p)
+ return NULL;
+ p->root.rb_node = NULL;
+ p->curr_node = NULL;
+ p->extra_fname = NULL;
+ p->last_pos = 0;
+ p->curr_hash = pos2maj_hash(pos);
+ p->curr_minor_hash = pos2min_hash(pos);
+ p->next_hash = 0;
+ return p;
+}
+
+void ext4_htree_free_dir_info(struct dir_private_info *p)
+{
+ free_rb_tree_fname(&p->root);
+ kfree(p);
+}
+
+/*
+ * Given a directory entry, enter it into the fname rb tree.
+ */
+int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+ __u32 minor_hash,
+ struct ext4_dir_entry_2 *dirent)
+{
+ struct rb_node **p, *parent = NULL;
+ struct fname * fname, *new_fn;
+ struct dir_private_info *info;
+ int len;
+
+ info = (struct dir_private_info *) dir_file->private_data;
+ p = &info->root.rb_node;
+
+ /* Create and allocate the fname structure */
+ len = sizeof(struct fname) + dirent->name_len + 1;
+ new_fn = kzalloc(len, GFP_KERNEL);
+ if (!new_fn)
+ return -ENOMEM;
+ new_fn->hash = hash;
+ new_fn->minor_hash = minor_hash;
+ new_fn->inode = le32_to_cpu(dirent->inode);
+ new_fn->name_len = dirent->name_len;
+ new_fn->file_type = dirent->file_type;
+ memcpy(new_fn->name, dirent->name, dirent->name_len);
+ new_fn->name[dirent->name_len] = 0;
+
+ while (*p) {
+ parent = *p;
+ fname = rb_entry(parent, struct fname, rb_hash);
+
+ /*
+ * If the hash and minor hash match up, then we put
+ * them on a linked list. This rarely happens...
+ */
+ if ((new_fn->hash == fname->hash) &&
+ (new_fn->minor_hash == fname->minor_hash)) {
+ new_fn->next = fname->next;
+ fname->next = new_fn;
+ return 0;
+ }
+
+ if (new_fn->hash < fname->hash)
+ p = &(*p)->rb_left;
+ else if (new_fn->hash > fname->hash)
+ p = &(*p)->rb_right;
+ else if (new_fn->minor_hash < fname->minor_hash)
+ p = &(*p)->rb_left;
+ else /* if (new_fn->minor_hash > fname->minor_hash) */
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&new_fn->rb_hash, parent, p);
+ rb_insert_color(&new_fn->rb_hash, &info->root);
+ return 0;
+}
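+
+/*
+ * Sketch of the collision handling above (the names are hypothetical): if
+ * "alpha" is stored first and "beta" later arrives with the same
+ * (hash, minor_hash) pair, "beta" does not get an rb node of its own; it
+ * is spliced in directly behind the head of the chain:
+ *
+ *     rb node ("alpha") -> "beta" -> <any older collisions>
+ *
+ * call_filldir() below walks this ->next chain, so colliding names are
+ * all returned, just not in strict insertion order.
+ */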
+
+
+
+/*
+ * This is a helper function for ext4_dx_readdir. It calls filldir
+ * for all entries on the fname linked list. (Normally there is only
+ * one entry on the linked list, unless there are 62 bit hash collisions.)
+ */
+static int call_filldir(struct file * filp, void * dirent,
+ filldir_t filldir, struct fname *fname)
+{
+ struct dir_private_info *info = filp->private_data;
+ loff_t curr_pos;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct super_block * sb;
+ int error;
+
+ sb = inode->i_sb;
+
+ if (!fname) {
+ printk("call_filldir: called with null fname?!?\n");
+ return 0;
+ }
+ curr_pos = hash2pos(fname->hash, fname->minor_hash);
+ while (fname) {
+ error = filldir(dirent, fname->name,
+ fname->name_len, curr_pos,
+ fname->inode,
+ get_dtype(sb, fname->file_type));
+ if (error) {
+ filp->f_pos = curr_pos;
+ info->extra_fname = fname->next;
+ return error;
+ }
+ fname = fname->next;
+ }
+ return 0;
+}
+
+static int ext4_dx_readdir(struct file * filp,
+ void * dirent, filldir_t filldir)
+{
+ struct dir_private_info *info = filp->private_data;
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct fname *fname;
+ int ret;
+
+ if (!info) {
+ info = create_dir_info(filp->f_pos);
+ if (!info)
+ return -ENOMEM;
+ filp->private_data = info;
+ }
+
+ if (filp->f_pos == EXT4_HTREE_EOF)
+ return 0; /* EOF */
+
+ /* Someone has messed with f_pos; reset the world */
+ if (info->last_pos != filp->f_pos) {
+ free_rb_tree_fname(&info->root);
+ info->curr_node = NULL;
+ info->extra_fname = NULL;
+ info->curr_hash = pos2maj_hash(filp->f_pos);
+ info->curr_minor_hash = pos2min_hash(filp->f_pos);
+ }
+
+ /*
+ * If there are any leftover names on the hash collision
+ * chain, return them first.
+ */
+ if (info->extra_fname &&
+ call_filldir(filp, dirent, filldir, info->extra_fname))
+ goto finished;
+
+ if (!info->curr_node)
+ info->curr_node = rb_first(&info->root);
+
+ while (1) {
+ /*
+ * Fill the rbtree if we have no more entries,
+ * or the inode has changed since we last read in the
+ * cached entries.
+ */
+ if ((!info->curr_node) ||
+ (filp->f_version != inode->i_version)) {
+ info->curr_node = NULL;
+ free_rb_tree_fname(&info->root);
+ filp->f_version = inode->i_version;
+ ret = ext4_htree_fill_tree(filp, info->curr_hash,
+ info->curr_minor_hash,
+ &info->next_hash);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ filp->f_pos = EXT4_HTREE_EOF;
+ break;
+ }
+ info->curr_node = rb_first(&info->root);
+ }
+
+ fname = rb_entry(info->curr_node, struct fname, rb_hash);
+ info->curr_hash = fname->hash;
+ info->curr_minor_hash = fname->minor_hash;
+ if (call_filldir(filp, dirent, filldir, fname))
+ break;
+
+ info->curr_node = rb_next(info->curr_node);
+ if (!info->curr_node) {
+ if (info->next_hash == ~0) {
+ filp->f_pos = EXT4_HTREE_EOF;
+ break;
+ }
+ info->curr_hash = info->next_hash;
+ info->curr_minor_hash = 0;
+ }
+ }
+finished:
+ info->last_pos = filp->f_pos;
+ return 0;
+}
+
+static int ext4_release_dir (struct inode * inode, struct file * filp)
+{
+ if (filp->private_data)
+ ext4_htree_free_dir_info(filp->private_data);
+
+ return 0;
+}
+
+#endif
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
new file mode 100644
index 000000000000..2608dce18f3e
--- /dev/null
+++ b/fs/ext4/extents.c
@@ -0,0 +1,2152 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * Architecture independence:
+ * Copyright (c) 2005, Bull S.A.
+ * Written by Pierre Peiffer <pierre.peiffer@bull.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
+ */
+
+/*
+ * Extents support for EXT4
+ *
+ * TODO:
+ * - ext4*_error() should be used in some situations
+ * - analyze all BUG()/BUG_ON(), use -EIO where appropriate
+ * - smart tree reduction
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/jbd.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/ext4_fs_extents.h>
+#include <asm/uaccess.h>
+
+
+/*
+ * ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+{
+ ext4_fsblk_t block;
+
+ block = le32_to_cpu(ex->ee_start);
+ block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+ return block;
+}
+
+/*
+ * idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
+{
+ ext4_fsblk_t block;
+
+ block = le32_to_cpu(ix->ei_leaf);
+ block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+ return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+{
+ ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+ ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
+{
+ ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+ ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+}
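+
+/*
+ * A quick worked example of the 48-bit split performed by the helpers
+ * above (the block number is made up): for pb == 0x123456789abc,
+ *
+ *     ee_start    = cpu_to_le32(pb & 0xffffffff)   = 0x56789abc
+ *     ee_start_hi = cpu_to_le16((pb >> 31) >> 1)   = 0x1234
+ *
+ * and ext_pblock() reassembles 0x56789abc | ((0x1234 << 31) << 1), giving
+ * back 0x123456789abc. The shift is written as (>> 31) >> 1 rather than
+ * >> 32, presumably so the expression stays well-defined when
+ * ext4_fsblk_t is only 32 bits wide.
+ */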
+
+static int ext4_ext_check_header(const char *function, struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ const char *error_msg = NULL;
+
+ if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
+ error_msg = "invalid magic";
+ goto corrupted;
+ }
+ if (unlikely(eh->eh_max == 0)) {
+ error_msg = "invalid eh_max";
+ goto corrupted;
+ }
+ if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
+ error_msg = "invalid eh_entries";
+ goto corrupted;
+ }
+ return 0;
+
+corrupted:
+ ext4_error(inode->i_sb, function,
+ "bad header in inode #%lu: %s - magic %x, "
+ "entries %u, max %u, depth %u",
+ inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+ le16_to_cpu(eh->eh_depth));
+
+ return -EIO;
+}
+
+static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+{
+ int err;
+
+ if (handle->h_buffer_credits > needed)
+ return handle;
+ if (!ext4_journal_extend(handle, needed))
+ return handle;
+ err = ext4_journal_restart(handle, needed);
+
+ return handle;
+}
+
+/*
+ * could return:
+ * - EROFS
+ * - ENOMEM
+ */
+static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ if (path->p_bh) {
+ /* path points to block */
+ return ext4_journal_get_write_access(handle, path->p_bh);
+ }
+ /* path points to leaf/index in inode body */
+ /* we use in-core data, no need to protect them */
+ return 0;
+}
+
+/*
+ * could return:
+ * - EROFS
+ * - ENOMEM
+ * - EIO
+ */
+static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int err;
+ if (path->p_bh) {
+ /* path points to block */
+ err = ext4_journal_dirty_metadata(handle, path->p_bh);
+ } else {
+ /* path points to leaf/index in inode body */
+ err = ext4_mark_inode_dirty(handle, inode);
+ }
+ return err;
+}
+
+static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_fsblk_t block)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ ext4_fsblk_t bg_start;
+ ext4_grpblk_t colour;
+ int depth;
+
+ if (path) {
+ struct ext4_extent *ex;
+ depth = path->p_depth;
+
+ /* try to predict block placement */
+ if ((ex = path[depth].p_ext))
+ return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+
+ /* it looks like index is empty;
+ * try to find starting block from index itself */
+ if (path[depth].p_bh)
+ return path[depth].p_bh->b_blocknr;
+ }
+
+ /* OK. use inode's group */
+ bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+ colour = (current->pid % 16) *
+ (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+ return bg_start + colour + block;
+}
+
+static ext4_fsblk_t
+ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex, int *err)
+{
+ ext4_fsblk_t goal, newblock;
+
+ goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
+ newblock = ext4_new_block(handle, inode, goal, err);
+ return newblock;
+}
+
+static inline int ext4_ext_space_block(struct inode *inode)
+{
+ int size;
+
+ size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent);
+#ifdef AGRESSIVE_TEST
+ if (size > 6)
+ size = 6;
+#endif
+ return size;
+}
+
+static inline int ext4_ext_space_block_idx(struct inode *inode)
+{
+ int size;
+
+ size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent_idx);
+#ifdef AGRESSIVE_TEST
+ if (size > 5)
+ size = 5;
+#endif
+ return size;
+}
+
+static inline int ext4_ext_space_root(struct inode *inode)
+{
+ int size;
+
+ size = sizeof(EXT4_I(inode)->i_data);
+ size -= sizeof(struct ext4_extent_header);
+ size /= sizeof(struct ext4_extent);
+#ifdef AGRESSIVE_TEST
+ if (size > 3)
+ size = 3;
+#endif
+ return size;
+}
+
+static inline int ext4_ext_space_root_idx(struct inode *inode)
+{
+ int size;
+
+ size = sizeof(EXT4_I(inode)->i_data);
+ size -= sizeof(struct ext4_extent_header);
+ size /= sizeof(struct ext4_extent_idx);
+#ifdef AGRESSIVE_TEST
+ if (size > 4)
+ size = 4;
+#endif
+ return size;
+}
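+
+/*
+ * Rough capacity arithmetic behind the four helpers above, assuming a
+ * 4 KiB block size and the 12-byte on-disk sizes of struct
+ * ext4_extent_header, struct ext4_extent and struct ext4_extent_idx:
+ *
+ *     per tree block: (4096 - 12) / 12 = 340 extents or indexes
+ *     in the inode:   (60 - 12) / 12   = 4   (i_data is 15 * 4 bytes)
+ *
+ * The 340-entries-per-block figure is where the "340^4" estimate in
+ * ext4_ext_calc_credits_for_insert() below comes from.
+ */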
+
+#ifdef EXT_DEBUG
+static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
+{
+ int k, l = path->p_depth;
+
+ ext_debug("path:");
+ for (k = 0; k <= l; k++, path++) {
+ if (path->p_idx) {
+ ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
+ idx_pblock(path->p_idx));
+ } else if (path->p_ext) {
+ ext_debug(" %d:%d:%llu ",
+ le32_to_cpu(path->p_ext->ee_block),
+ le16_to_cpu(path->p_ext->ee_len),
+ ext_pblock(path->p_ext));
+ } else
+ ext_debug(" []");
+ }
+ ext_debug("\n");
+}
+
+static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
+{
+ int depth = ext_depth(inode);
+ struct ext4_extent_header *eh;
+ struct ext4_extent *ex;
+ int i;
+
+ if (!path)
+ return;
+
+ eh = path[depth].p_hdr;
+ ex = EXT_FIRST_EXTENT(eh);
+
+ for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
+ ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+ le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ }
+ ext_debug("\n");
+}
+#else
+#define ext4_ext_show_path(inode,path)
+#define ext4_ext_show_leaf(inode,path)
+#endif
+
+static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+ int depth = path->p_depth;
+ int i;
+
+ for (i = 0; i <= depth; i++, path++)
+ if (path->p_bh) {
+ brelse(path->p_bh);
+ path->p_bh = NULL;
+ }
+}
+
+/*
+ * ext4_ext_binsearch_idx:
+ * binary search for the closest index of the given block
+ */
+static void
+ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
+{
+ struct ext4_extent_header *eh = path->p_hdr;
+ struct ext4_extent_idx *r, *l, *m;
+
+ BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
+ BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
+ BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
+
+ ext_debug("binsearch for %d(idx): ", block);
+
+ l = EXT_FIRST_INDEX(eh) + 1;
+ r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1;
+ while (l <= r) {
+ m = l + (r - l) / 2;
+ if (block < le32_to_cpu(m->ei_block))
+ r = m - 1;
+ else
+ l = m + 1;
+ ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block,
+ m, m->ei_block, r, r->ei_block);
+ }
+
+ path->p_idx = l - 1;
+ ext_debug(" -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
+ idx_pblock(path->p_idx));
+
+#ifdef CHECK_BINSEARCH
+ {
+ struct ext4_extent_idx *chix, *ix;
+ int k;
+
+ chix = ix = EXT_FIRST_INDEX(eh);
+ for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
+ if (k != 0 &&
+ le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
+ printk("k=%d, ix=0x%p, first=0x%p\n", k,
+ ix, EXT_FIRST_INDEX(eh));
+ printk("%u <= %u\n",
+ le32_to_cpu(ix->ei_block),
+ le32_to_cpu(ix[-1].ei_block));
+ }
+ BUG_ON(k && le32_to_cpu(ix->ei_block)
+ <= le32_to_cpu(ix[-1].ei_block));
+ if (block < le32_to_cpu(ix->ei_block))
+ break;
+ chix = ix;
+ }
+ BUG_ON(chix != path->p_idx);
+ }
+#endif
+
+}
+
+/*
+ * ext4_ext_binsearch:
+ * binary search for closest extent of the given block
+ */
+static void
+ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
+{
+ struct ext4_extent_header *eh = path->p_hdr;
+ struct ext4_extent *r, *l, *m;
+
+ BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
+ BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
+
+ if (eh->eh_entries == 0) {
+ /*
+ * this leaf is empty:
+ * we get such a leaf in split/add case
+ */
+ return;
+ }
+
+ ext_debug("binsearch for %d: ", block);
+
+ l = EXT_FIRST_EXTENT(eh) + 1;
+ r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1;
+
+ while (l <= r) {
+ m = l + (r - l) / 2;
+ if (block < le32_to_cpu(m->ee_block))
+ r = m - 1;
+ else
+ l = m + 1;
+ ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block,
+ m, m->ee_block, r, r->ee_block);
+ }
+
+ path->p_ext = l - 1;
+ ext_debug(" -> %d:%llu:%d ",
+ le32_to_cpu(path->p_ext->ee_block),
+ ext_pblock(path->p_ext),
+ le16_to_cpu(path->p_ext->ee_len));
+
+#ifdef CHECK_BINSEARCH
+ {
+ struct ext4_extent *chex, *ex;
+ int k;
+
+ chex = ex = EXT_FIRST_EXTENT(eh);
+ for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
+ BUG_ON(k && le32_to_cpu(ex->ee_block)
+ <= le32_to_cpu(ex[-1].ee_block));
+ if (block < le32_to_cpu(ex->ee_block))
+ break;
+ chex = ex;
+ }
+ BUG_ON(chex != path->p_ext);
+ }
+#endif
+
+}
+
+int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+{
+ struct ext4_extent_header *eh;
+
+ eh = ext_inode_hdr(inode);
+ eh->eh_depth = 0;
+ eh->eh_entries = 0;
+ eh->eh_magic = EXT4_EXT_MAGIC;
+ eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_ext_invalidate_cache(inode);
+ return 0;
+}
+
+struct ext4_ext_path *
+ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
+{
+ struct ext4_extent_header *eh;
+ struct buffer_head *bh;
+ short int depth, i, ppos = 0, alloc = 0;
+
+ eh = ext_inode_hdr(inode);
+ BUG_ON(eh == NULL);
+ if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+ return ERR_PTR(-EIO);
+
+ i = depth = ext_depth(inode);
+
+ /* account possible depth increase */
+ if (!path) {
+ path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 2),
+ GFP_NOFS);
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+ alloc = 1;
+ }
+ memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
+ path[0].p_hdr = eh;
+
+ /* walk through the tree */
+ while (i) {
+ ext_debug("depth %d: num %d, max %d\n",
+ ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+ ext4_ext_binsearch_idx(inode, path + ppos, block);
+ path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+ path[ppos].p_depth = i;
+ path[ppos].p_ext = NULL;
+
+ bh = sb_bread(inode->i_sb, path[ppos].p_block);
+ if (!bh)
+ goto err;
+
+ eh = ext_block_hdr(bh);
+ ppos++;
+ BUG_ON(ppos > depth);
+ path[ppos].p_bh = bh;
+ path[ppos].p_hdr = eh;
+ i--;
+
+ if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+ goto err;
+ }
+
+ path[ppos].p_depth = i;
+ path[ppos].p_hdr = eh;
+ path[ppos].p_ext = NULL;
+ path[ppos].p_idx = NULL;
+
+ if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+ goto err;
+
+ /* find extent */
+ ext4_ext_binsearch(inode, path + ppos, block);
+
+ ext4_ext_show_path(inode, path);
+
+ return path;
+
+err:
+ ext4_ext_drop_refs(path);
+ if (alloc)
+ kfree(path);
+ return ERR_PTR(-EIO);
+}
+
+/*
+ * ext4_ext_insert_index:
+ * insert new index [@logical;@ptr] into the block at @curp;
+ * check where to insert: before @curp or after @curp
+ */
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *curp,
+ int logical, ext4_fsblk_t ptr)
+{
+ struct ext4_extent_idx *ix;
+ int len, err;
+
+ if ((err = ext4_ext_get_access(handle, inode, curp)))
+ return err;
+
+ BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
+ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
+ if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
+ /* insert after */
+ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
+ len = (len - 1) * sizeof(struct ext4_extent_idx);
+ len = len < 0 ? 0 : len;
+ ext_debug("insert new index %d after: %d. "
+ "move %d from 0x%p to 0x%p\n",
+ logical, ptr, len,
+ (curp->p_idx + 1), (curp->p_idx + 2));
+ memmove(curp->p_idx + 2, curp->p_idx + 1, len);
+ }
+ ix = curp->p_idx + 1;
+ } else {
+ /* insert before */
+ len = len * sizeof(struct ext4_extent_idx);
+ len = len < 0 ? 0 : len;
+ ext_debug("insert new index %d before: %d. "
+ "move %d from 0x%p to 0x%p\n",
+ logical, ptr, len,
+ curp->p_idx, (curp->p_idx + 1));
+ memmove(curp->p_idx + 1, curp->p_idx, len);
+ ix = curp->p_idx;
+ }
+
+ ix->ei_block = cpu_to_le32(logical);
+ ext4_idx_store_pblock(ix, ptr);
+ curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
+
+ BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
+ > le16_to_cpu(curp->p_hdr->eh_max));
+ BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+
+ err = ext4_ext_dirty(handle, inode, curp);
+ ext4_std_error(inode->i_sb, err);
+
+ return err;
+}
+
+/*
+ * ext4_ext_split:
+ * inserts new subtree into the path, using free index entry
+ * at depth @at:
+ * - allocates all needed blocks (new leaf and all intermediate index blocks)
+ * - makes decision where to split
+ * - moves remaining extents and index entries (right to the split point)
+ * into the newly allocated blocks
+ * - initializes subtree
+ */
+static int ext4_ext_split(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int at)
+{
+ struct buffer_head *bh = NULL;
+ int depth = ext_depth(inode);
+ struct ext4_extent_header *neh;
+ struct ext4_extent_idx *fidx;
+ struct ext4_extent *ex;
+ int i = at, k, m, a;
+ ext4_fsblk_t newblock, oldblock;
+ __le32 border;
+ ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
+ int err = 0;
+
+ /* make decision: where to split? */
+ /* FIXME: now decision is simplest: at current extent */
+
+ /* if current leaf will be split, then we should use
+ * border from split point */
+ BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
+ border = path[depth].p_ext[1].ee_block;
+ ext_debug("leaf will be split."
+ " next leaf starts at %d\n",
+ le32_to_cpu(border));
+ } else {
+ border = newext->ee_block;
+ ext_debug("leaf will be added."
+ " next leaf starts at %d\n",
+ le32_to_cpu(border));
+ }
+
+ /*
+ * If an error occurs, we break processing and mark the filesystem
+ * read-only. The index won't be inserted and the tree will be left
+ * in a consistent state. The next mount will repair the buffers too.
+ */
+
+ /*
+ * Get array to track all allocated blocks.
+ * We need this to handle errors and free blocks
+ * upon them.
+ */
+ ablocks = kmalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
+ if (!ablocks)
+ return -ENOMEM;
+ memset(ablocks, 0, sizeof(ext4_fsblk_t) * depth);
+
+ /* allocate all needed blocks */
+ ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
+ for (a = 0; a < depth - at; a++) {
+ newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ if (newblock == 0)
+ goto cleanup;
+ ablocks[a] = newblock;
+ }
+
+ /* initialize new leaf */
+ newblock = ablocks[--a];
+ BUG_ON(newblock == 0);
+ bh = sb_getblk(inode->i_sb, newblock);
+ if (!bh) {
+ err = -EIO;
+ goto cleanup;
+ }
+ lock_buffer(bh);
+
+ if ((err = ext4_journal_get_create_access(handle, bh)))
+ goto cleanup;
+
+ neh = ext_block_hdr(bh);
+ neh->eh_entries = 0;
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_magic = EXT4_EXT_MAGIC;
+ neh->eh_depth = 0;
+ ex = EXT_FIRST_EXTENT(neh);
+
+ /* move remainder of path[depth] to the new leaf */
+ BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
+ /* start copy from next extent */
+ /* TODO: we could do it by single memmove */
+ m = 0;
+ path[depth].p_ext++;
+ while (path[depth].p_ext <=
+ EXT_MAX_EXTENT(path[depth].p_hdr)) {
+ ext_debug("move %d:%llu:%d in new leaf %llu\n",
+ le32_to_cpu(path[depth].p_ext->ee_block),
+ ext_pblock(path[depth].p_ext),
+ le16_to_cpu(path[depth].p_ext->ee_len),
+ newblock);
+ /*memmove(ex++, path[depth].p_ext++,
+ sizeof(struct ext4_extent));
+ neh->eh_entries++;*/
+ path[depth].p_ext++;
+ m++;
+ }
+ if (m) {
+ memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+ neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
+ }
+
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ if ((err = ext4_journal_dirty_metadata(handle, bh)))
+ goto cleanup;
+ brelse(bh);
+ bh = NULL;
+
+ /* correct old leaf */
+ if (m) {
+ if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+ goto cleanup;
+ path[depth].p_hdr->eh_entries =
+ cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
+ if ((err = ext4_ext_dirty(handle, inode, path + depth)))
+ goto cleanup;
+
+ }
+
+ /* create intermediate indexes */
+ k = depth - at - 1;
+ BUG_ON(k < 0);
+ if (k)
+ ext_debug("create %d intermediate indices\n", k);
+ /* insert new index into current index block */
+ /* current depth stored in i var */
+ i = depth - 1;
+ while (k--) {
+ oldblock = newblock;
+ newblock = ablocks[--a];
+ bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
+ if (!bh) {
+ err = -EIO;
+ goto cleanup;
+ }
+ lock_buffer(bh);
+
+ if ((err = ext4_journal_get_create_access(handle, bh)))
+ goto cleanup;
+
+ neh = ext_block_hdr(bh);
+ neh->eh_entries = cpu_to_le16(1);
+ neh->eh_magic = EXT4_EXT_MAGIC;
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ neh->eh_depth = cpu_to_le16(depth - i);
+ fidx = EXT_FIRST_INDEX(neh);
+ fidx->ei_block = border;
+ ext4_idx_store_pblock(fidx, oldblock);
+
+ ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
+ newblock, (unsigned long) le32_to_cpu(border),
+ oldblock);
+ /* copy indexes */
+ m = 0;
+ path[i].p_idx++;
+
+ ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+ EXT_MAX_INDEX(path[i].p_hdr));
+ BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
+ EXT_LAST_INDEX(path[i].p_hdr));
+ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
+ ext_debug("%d: move %d:%d in new index %llu\n", i,
+ le32_to_cpu(path[i].p_idx->ei_block),
+ idx_pblock(path[i].p_idx),
+ newblock);
+ /*memmove(++fidx, path[i].p_idx++,
+ sizeof(struct ext4_extent_idx));
+ neh->eh_entries++;
+ BUG_ON(neh->eh_entries > neh->eh_max);*/
+ path[i].p_idx++;
+ m++;
+ }
+ if (m) {
+ memmove(++fidx, path[i].p_idx - m,
+ sizeof(struct ext4_extent_idx) * m);
+ neh->eh_entries =
+ cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
+ }
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ if ((err = ext4_journal_dirty_metadata(handle, bh)))
+ goto cleanup;
+ brelse(bh);
+ bh = NULL;
+
+ /* correct old index */
+ if (m) {
+ err = ext4_ext_get_access(handle, inode, path + i);
+ if (err)
+ goto cleanup;
+ path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
+ err = ext4_ext_dirty(handle, inode, path + i);
+ if (err)
+ goto cleanup;
+ }
+
+ i--;
+ }
+
+ /* insert new index */
+ if (err)
+ goto cleanup;
+
+ err = ext4_ext_insert_index(handle, inode, path + at,
+ le32_to_cpu(border), newblock);
+
+cleanup:
+ if (bh) {
+ if (buffer_locked(bh))
+ unlock_buffer(bh);
+ brelse(bh);
+ }
+
+ if (err) {
+ /* free all allocated blocks in error case */
+ for (i = 0; i < depth; i++) {
+ if (!ablocks[i])
+ continue;
+ ext4_free_blocks(handle, inode, ablocks[i], 1);
+ }
+ }
+ kfree(ablocks);
+
+ return err;
+}
+
+/*
+ * ext4_ext_grow_indepth:
+ * implements tree growing procedure:
+ * - allocates new block
+ * - moves top-level data (index block or leaf) into the new block
+ * - initializes new top-level, creating index that points to the
+ * just created block
+ */
+static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
+{
+ struct ext4_ext_path *curp = path;
+ struct ext4_extent_header *neh;
+ struct ext4_extent_idx *fidx;
+ struct buffer_head *bh;
+ ext4_fsblk_t newblock;
+ int err = 0;
+
+ newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ if (newblock == 0)
+ return err;
+
+ bh = sb_getblk(inode->i_sb, newblock);
+ if (!bh) {
+ err = -EIO;
+ ext4_std_error(inode->i_sb, err);
+ return err;
+ }
+ lock_buffer(bh);
+
+ if ((err = ext4_journal_get_create_access(handle, bh))) {
+ unlock_buffer(bh);
+ goto out;
+ }
+
+ /* move top-level index/leaf into new block */
+ memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
+
+ /* set size of new block */
+ neh = ext_block_hdr(bh);
+ /* old root could have indexes or leaves
+ * so calculate eh_max the right way */
+ if (ext_depth(inode))
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+ else
+ neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+ neh->eh_magic = EXT4_EXT_MAGIC;
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ if ((err = ext4_journal_dirty_metadata(handle, bh)))
+ goto out;
+
+ /* create index in new top-level index: num,max,pointer */
+ if ((err = ext4_ext_get_access(handle, inode, curp)))
+ goto out;
+
+ curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
+ curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+ curp->p_hdr->eh_entries = cpu_to_le16(1);
+ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
+ /* FIXME: it works, but actually path[0] can be index */
+ curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
+ ext4_idx_store_pblock(curp->p_idx, newblock);
+
+ neh = ext_inode_hdr(inode);
+ fidx = EXT_FIRST_INDEX(neh);
+ ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
+ le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
+ le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
+
+ neh->eh_depth = cpu_to_le16(path->p_depth + 1);
+ err = ext4_ext_dirty(handle, inode, curp);
+out:
+ brelse(bh);
+
+ return err;
+}
+
+/*
+ * ext4_ext_create_new_leaf:
+ * finds empty index and adds new leaf.
+ * if no free index is found, then it requests in-depth growing.
+ */
+static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
+{
+ struct ext4_ext_path *curp;
+ int depth, i, err = 0;
+
+repeat:
+ i = depth = ext_depth(inode);
+
+ /* walk up the tree and look for a free index entry */
+ curp = path + depth;
+ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
+ i--;
+ curp--;
+ }
+
+ /* we use already allocated block for index block,
+ * so subsequent data blocks should be contiguous */
+ if (EXT_HAS_FREE_INDEX(curp)) {
+ /* if we found index with free entry, then use that
+ * entry: create all needed subtree and add new leaf */
+ err = ext4_ext_split(handle, inode, path, newext, i);
+
+ /* refill path */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode,
+ le32_to_cpu(newext->ee_block),
+ path);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
+ } else {
+ /* tree is full, time to grow in depth */
+ err = ext4_ext_grow_indepth(handle, inode, path, newext);
+ if (err)
+ goto out;
+
+ /* refill path */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode,
+ le32_to_cpu(newext->ee_block),
+ path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+
+ /*
+ * only first (depth 0 -> 1) produces free space;
+ * in all other cases we have to split the grown tree
+ */
+ depth = ext_depth(inode);
+ if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
+ /* now we need to split */
+ goto repeat;
+ }
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_next_allocated_block:
+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
+ * NOTE: it considers block number from index entry as
+ * allocated block. Thus, index entries have to be consistent
+ * with leaves.
+ */
+static unsigned long
+ext4_ext_next_allocated_block(struct ext4_ext_path *path)
+{
+ int depth;
+
+ BUG_ON(path == NULL);
+ depth = path->p_depth;
+
+ if (depth == 0 && path->p_ext == NULL)
+ return EXT_MAX_BLOCK;
+
+ while (depth >= 0) {
+ if (depth == path->p_depth) {
+ /* leaf */
+ if (path[depth].p_ext !=
+ EXT_LAST_EXTENT(path[depth].p_hdr))
+ return le32_to_cpu(path[depth].p_ext[1].ee_block);
+ } else {
+ /* index */
+ if (path[depth].p_idx !=
+ EXT_LAST_INDEX(path[depth].p_hdr))
+ return le32_to_cpu(path[depth].p_idx[1].ei_block);
+ }
+ depth--;
+ }
+
+ return EXT_MAX_BLOCK;
+}
+
+/*
+ * ext4_ext_next_leaf_block:
+ * returns first allocated block from next leaf or EXT_MAX_BLOCK
+ */
+static unsigned ext4_ext_next_leaf_block(struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int depth;
+
+ BUG_ON(path == NULL);
+ depth = path->p_depth;
+
+ /* zero-tree has no leaf blocks at all */
+ if (depth == 0)
+ return EXT_MAX_BLOCK;
+
+ /* go to index block */
+ depth--;
+
+ while (depth >= 0) {
+ if (path[depth].p_idx !=
+ EXT_LAST_INDEX(path[depth].p_hdr))
+ return le32_to_cpu(path[depth].p_idx[1].ei_block);
+ depth--;
+ }
+
+ return EXT_MAX_BLOCK;
+}
+
+/*
+ * ext4_ext_correct_indexes:
+ * if leaf gets modified and modified extent is first in the leaf,
+ * then we have to correct all indexes above.
+ * TODO: do we need to correct tree in all cases?
+ */
+int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent_header *eh;
+ int depth = ext_depth(inode);
+ struct ext4_extent *ex;
+ __le32 border;
+ int k, err = 0;
+
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ BUG_ON(ex == NULL);
+ BUG_ON(eh == NULL);
+
+ if (depth == 0) {
+ /* there is no tree at all */
+ return 0;
+ }
+
+ if (ex != EXT_FIRST_EXTENT(eh)) {
+ /* we correct tree if first leaf got modified only */
+ return 0;
+ }
+
+ /*
+ * TODO: we need correction if border is smaller than current one
+ */
+ k = depth - 1;
+ border = path[depth].p_ext->ee_block;
+ if ((err = ext4_ext_get_access(handle, inode, path + k)))
+ return err;
+ path[k].p_idx->ei_block = border;
+ if ((err = ext4_ext_dirty(handle, inode, path + k)))
+ return err;
+
+ while (k--) {
+ /* change all left-side indexes */
+ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
+ break;
+ if ((err = ext4_ext_get_access(handle, inode, path + k)))
+ break;
+ path[k].p_idx->ei_block = border;
+ if ((err = ext4_ext_dirty(handle, inode, path + k)))
+ break;
+ }
+
+ return err;
+}
+
+static int inline
+ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
+ struct ext4_extent *ex2)
+{
+ if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+ le32_to_cpu(ex2->ee_block))
+ return 0;
+
+ /*
+ * To allow future support for preallocated extents to be added
+ * as an RO_COMPAT feature, refuse to merge to extents if
+ * this can result in the top bit of ee_len being set.
+ */
+ if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+ return 0;
+#ifdef AGRESSIVE_TEST
+ if (le16_to_cpu(ex1->ee_len) >= 4)
+ return 0;
+#endif
+
+ if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+ return 1;
+ return 0;
+}
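+
+/*
+ * Illustration of the merge test above (the numbers are invented): an
+ * extent covering logical blocks [100, 108) at physical block 5000 and
+ * one covering [108, 112) at physical block 5008 can be merged, because
+ * 100 + 8 == 108 (logically contiguous), 5000 + 8 == 5008 (physically
+ * contiguous) and 8 + 4 stays below EXT_MAX_LEN. If the second extent
+ * started at physical block 6000 instead, or the combined length would
+ * set the top bit of ee_len, the two extents are left separate.
+ */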
+
+/*
+ * ext4_ext_insert_extent:
+ * tries to merge requested extent into the existing extent or
+ * inserts requested extent as new one into the tree,
+ * creating new leaf in the no-space case.
+ */
+int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
+{
+ struct ext4_extent_header * eh;
+ struct ext4_extent *ex, *fex;
+ struct ext4_extent *nearex; /* nearest extent */
+ struct ext4_ext_path *npath = NULL;
+ int depth, len, err, next;
+
+ BUG_ON(newext->ee_len == 0);
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ BUG_ON(path[depth].p_hdr == NULL);
+
+ /* try to insert block into found extent and return */
+ if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append %d block to %d:%d (from %llu)\n",
+ le16_to_cpu(newext->ee_len),
+ le32_to_cpu(ex->ee_block),
+ le16_to_cpu(ex->ee_len), ext_pblock(ex));
+ if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+ return err;
+ ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
+ + le16_to_cpu(newext->ee_len));
+ eh = path[depth].p_hdr;
+ nearex = ex;
+ goto merge;
+ }
+
+repeat:
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
+ goto has_space;
+
+ /* probably next leaf has space for us? */
+ fex = EXT_LAST_EXTENT(eh);
+ next = ext4_ext_next_leaf_block(inode, path);
+ if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
+ && next != EXT_MAX_BLOCK) {
+ ext_debug("next leaf block - %d\n", next);
+ BUG_ON(npath != NULL);
+ npath = ext4_ext_find_extent(inode, next, NULL);
+ if (IS_ERR(npath))
+ return PTR_ERR(npath);
+ BUG_ON(npath->p_depth != path->p_depth);
+ eh = npath[depth].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
+ ext_debug("next leaf isnt full(%d)\n",
+ le16_to_cpu(eh->eh_entries));
+ path = npath;
+ goto repeat;
+ }
+ ext_debug("next leaf has no free space(%d,%d)\n",
+ le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+ }
+
+ /*
+ * There is no free space in the found leaf.
+ * We're gonna add a new leaf in the tree.
+ */
+ err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+ if (err)
+ goto cleanup;
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+
+has_space:
+ nearex = path[depth].p_ext;
+
+ if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+ goto cleanup;
+
+ if (!nearex) {
+ /* there is no extent in this leaf, create first one */
+ ext_debug("first extent in the leaf: %d:%llu:%d\n",
+ le32_to_cpu(newext->ee_block),
+ ext_pblock(newext),
+ le16_to_cpu(newext->ee_len));
+ path[depth].p_ext = EXT_FIRST_EXTENT(eh);
+ } else if (le32_to_cpu(newext->ee_block)
+ > le32_to_cpu(nearex->ee_block)) {
+/* BUG_ON(newext->ee_block == nearex->ee_block); */
+ if (nearex != EXT_LAST_EXTENT(eh)) {
+ len = EXT_MAX_EXTENT(eh) - nearex;
+ len = (len - 1) * sizeof(struct ext4_extent);
+ len = len < 0 ? 0 : len;
+ ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+ "move %d from 0x%p to 0x%p\n",
+ le32_to_cpu(newext->ee_block),
+ ext_pblock(newext),
+ le16_to_cpu(newext->ee_len),
+ nearex, len, nearex + 1, nearex + 2);
+ memmove(nearex + 2, nearex + 1, len);
+ }
+ path[depth].p_ext = nearex + 1;
+ } else {
+ BUG_ON(newext->ee_block == nearex->ee_block);
+ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
+ len = len < 0 ? 0 : len;
+ ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+ "move %d from 0x%p to 0x%p\n",
+ le32_to_cpu(newext->ee_block),
+ ext_pblock(newext),
+ le16_to_cpu(newext->ee_len),
+ nearex, len, nearex + 1, nearex + 2);
+ memmove(nearex + 1, nearex, len);
+ path[depth].p_ext = nearex;
+ }
+
+ eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
+ nearex = path[depth].p_ext;
+ nearex->ee_block = newext->ee_block;
+ nearex->ee_start = newext->ee_start;
+ nearex->ee_start_hi = newext->ee_start_hi;
+ nearex->ee_len = newext->ee_len;
+
+merge:
+ /* try to merge extents to the right */
+ while (nearex < EXT_LAST_EXTENT(eh)) {
+ if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
+ break;
+ /* merge with next extent! */
+ nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
+ + le16_to_cpu(nearex[1].ee_len));
+ if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
+ len = (EXT_LAST_EXTENT(eh) - nearex - 1)
+ * sizeof(struct ext4_extent);
+ memmove(nearex + 1, nearex + 2, len);
+ }
+ eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
+ BUG_ON(eh->eh_entries == 0);
+ }
+
+ /* try to merge extents to the left */
+
+ /* time to correct all indexes above */
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto cleanup;
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+
+cleanup:
+ if (npath) {
+ ext4_ext_drop_refs(npath);
+ kfree(npath);
+ }
+ ext4_ext_tree_changed(inode);
+ ext4_ext_invalidate_cache(inode);
+ return err;
+}
+
+int ext4_ext_walk_space(struct inode *inode, unsigned long block,
+ unsigned long num, ext_prepare_callback func,
+ void *cbdata)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_ext_cache cbex;
+ struct ext4_extent *ex;
+ unsigned long next, start = 0, end = 0;
+ unsigned long last = block + num;
+ int depth, exists, err = 0;
+
+ BUG_ON(func == NULL);
+ BUG_ON(inode == NULL);
+
+ while (block < last && block != EXT_MAX_BLOCK) {
+ num = last - block;
+ /* find extent for this block */
+ path = ext4_ext_find_extent(inode, block, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ break;
+ }
+
+ depth = ext_depth(inode);
+ BUG_ON(path[depth].p_hdr == NULL);
+ ex = path[depth].p_ext;
+ next = ext4_ext_next_allocated_block(path);
+
+ exists = 0;
+ if (!ex) {
+ /* there is no extent yet, so try to allocate
+ * all requested space */
+ start = block;
+ end = block + num;
+ } else if (le32_to_cpu(ex->ee_block) > block) {
+ /* need to allocate space before found extent */
+ start = block;
+ end = le32_to_cpu(ex->ee_block);
+ if (block + num < end)
+ end = block + num;
+ } else if (block >=
+ le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+ /* need to allocate space after found extent */
+ start = block;
+ end = block + num;
+ if (end >= next)
+ end = next;
+ } else if (block >= le32_to_cpu(ex->ee_block)) {
+ /*
+ * some part of requested space is covered
+ * by found extent
+ */
+ start = block;
+ end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+ if (block + num < end)
+ end = block + num;
+ exists = 1;
+ } else {
+ BUG();
+ }
+ BUG_ON(end <= start);
+
+ if (!exists) {
+ cbex.ec_block = start;
+ cbex.ec_len = end - start;
+ cbex.ec_start = 0;
+ cbex.ec_type = EXT4_EXT_CACHE_GAP;
+ } else {
+ cbex.ec_block = le32_to_cpu(ex->ee_block);
+ cbex.ec_len = le16_to_cpu(ex->ee_len);
+ cbex.ec_start = ext_pblock(ex);
+ cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+ }
+
+ BUG_ON(cbex.ec_len == 0);
+ err = func(inode, path, &cbex, cbdata);
+ ext4_ext_drop_refs(path);
+
+ if (err < 0)
+ break;
+ if (err == EXT_REPEAT)
+ continue;
+ else if (err == EXT_BREAK) {
+ err = 0;
+ break;
+ }
+
+ if (ext_depth(inode) != depth) {
+ /* depth was changed. we have to realloc path */
+ kfree(path);
+ path = NULL;
+ }
+
+ block = cbex.ec_block + cbex.ec_len;
+ }
+
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+
+ return err;
+}
+
+static inline void
+ext4_ext_put_in_cache(struct inode *inode, __u32 block,
+ __u32 len, __u32 start, int type)
+{
+ struct ext4_ext_cache *cex;
+ BUG_ON(len == 0);
+ cex = &EXT4_I(inode)->i_cached_extent;
+ cex->ec_type = type;
+ cex->ec_block = block;
+ cex->ec_len = len;
+ cex->ec_start = start;
+}
+
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static inline void
+ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
+ unsigned long block)
+{
+ int depth = ext_depth(inode);
+ unsigned long lblock, len;
+ struct ext4_extent *ex;
+
+ ex = path[depth].p_ext;
+ if (ex == NULL) {
+ /* there is no extent yet, so gap is [0;-] */
+ lblock = 0;
+ len = EXT_MAX_BLOCK;
+ ext_debug("cache gap(whole file):");
+ } else if (block < le32_to_cpu(ex->ee_block)) {
+ lblock = block;
+ len = le32_to_cpu(ex->ee_block) - block;
+ ext_debug("cache gap(before): %lu [%lu:%lu]",
+ (unsigned long) block,
+ (unsigned long) le32_to_cpu(ex->ee_block),
+ (unsigned long) le16_to_cpu(ex->ee_len));
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + le16_to_cpu(ex->ee_len)) {
+ lblock = le32_to_cpu(ex->ee_block)
+ + le16_to_cpu(ex->ee_len);
+ len = ext4_ext_next_allocated_block(path);
+ ext_debug("cache gap(after): [%lu:%lu] %lu",
+ (unsigned long) le32_to_cpu(ex->ee_block),
+ (unsigned long) le16_to_cpu(ex->ee_len),
+ (unsigned long) block);
+ BUG_ON(len == lblock);
+ len = len - lblock;
+ } else {
+ lblock = len = 0;
+ BUG();
+ }
+
+ ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
+ ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+}
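+
+/*
+ * Example of the gap boundaries computed above (the layout is made up):
+ * with a single extent covering logical blocks [100, 108) and the next
+ * allocated block at 300,
+ *
+ *     a lookup of block 40 caches the gap [40, 100), len 60
+ *     a lookup of block 200 caches the gap [108, 300), len 192
+ *     with no extents at all, [0, EXT_MAX_BLOCK) is cached
+ *
+ * so repeated lookups of unallocated ranges can be answered from
+ * i_cached_extent without walking the tree again.
+ */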
+
+static inline int
+ext4_ext_in_cache(struct inode *inode, unsigned long block,
+ struct ext4_extent *ex)
+{
+ struct ext4_ext_cache *cex;
+
+ cex = &EXT4_I(inode)->i_cached_extent;
+
+ /* has cache valid data? */
+ if (cex->ec_type == EXT4_EXT_CACHE_NO)
+ return EXT4_EXT_CACHE_NO;
+
+ BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
+ cex->ec_type != EXT4_EXT_CACHE_EXTENT);
+ if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+ ex->ee_block = cpu_to_le32(cex->ec_block);
+ ext4_ext_store_pblock(ex, cex->ec_start);
+ ex->ee_len = cpu_to_le16(cex->ec_len);
+ ext_debug("%lu cached by %lu:%lu:%llu\n",
+ (unsigned long) block,
+ (unsigned long) cex->ec_block,
+ (unsigned long) cex->ec_len,
+ cex->ec_start);
+ return cex->ec_type;
+ }
+
+ /* not in cache */
+ return EXT4_EXT_CACHE_NO;
+}
+
+/*
+ * ext4_ext_rm_idx:
+ * removes index from the index block.
+ * It's used in truncate case only, thus all requests are for
+ * last index in the block only.
+ */
+int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct buffer_head *bh;
+ int err;
+ ext4_fsblk_t leaf;
+
+ /* free index block */
+ path--;
+ leaf = idx_pblock(path->p_idx);
+ BUG_ON(path->p_hdr->eh_entries == 0);
+ if ((err = ext4_ext_get_access(handle, inode, path)))
+ return err;
+ path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
+ if ((err = ext4_ext_dirty(handle, inode, path)))
+ return err;
+ ext_debug("index is empty, remove it, free block %llu\n", leaf);
+ bh = sb_find_get_block(inode->i_sb, leaf);
+ ext4_forget(handle, 1, inode, bh, leaf);
+ ext4_free_blocks(handle, inode, leaf, 1);
+ return err;
+}
+
+/*
+ * ext4_ext_calc_credits_for_insert:
+ * This routine returns max. credits that the extent tree can consume.
+ * It should be OK for low-performance paths like ->writepage().
+ * To allow many writing processes to fit into a single transaction,
+ * the caller should calculate credits under truncate_mutex and
+ * pass the actual path.
+ */
+int inline ext4_ext_calc_credits_for_insert(struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int depth, needed;
+
+ if (path) {
+ /* probably there is space in leaf? */
+ depth = ext_depth(inode);
+ if (le16_to_cpu(path[depth].p_hdr->eh_entries)
+ < le16_to_cpu(path[depth].p_hdr->eh_max))
+ return 1;
+ }
+
+ /*
+ * given 32-bit logical block (4294967296 blocks), max. tree
+ * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
+ * Let's also add one more level for imbalance.
+ */
+ depth = 5;
+
+ /* allocation of new data block(s) */
+ needed = 2;
+
+ /*
+ * tree can be full, so it would need to grow in depth:
+ * allocation + old root + new root
+ */
+ needed += 2 + 1 + 1;
+
+ /*
+ * Index split can happen, we would need:
+ * allocate intermediate indexes (bitmap + group)
+ * + change two blocks at each level, but root (already included)
+ */
+ needed = (depth * 2) + (depth * 2);
+
+ /* any allocation modifies superblock */
+ needed += 1;
+
+ return needed;
+}
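+
+/*
+ * Worked numbers for the estimate above, assuming 4 KiB blocks (about
+ * 340 entries per tree block, see ext4_ext_space_block()): the quoted
+ * 4 * 340^4 == 53453440000 does check out and comfortably exceeds the
+ * 2^32 == 4294967296 logical blocks a 32-bit ee_block can address, so
+ * four levels suffice and depth is padded to 5 for imbalance. With
+ * depth == 5 the function as written returns (5 * 2) + (5 * 2) + 1 == 21
+ * credits; note that the index-split line assigns to "needed" rather
+ * than adding, so the earlier data block and root terms do not
+ * contribute to that total.
+ */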
+
+static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_extent *ex,
+ unsigned long from, unsigned long to)
+{
+ struct buffer_head *bh;
+ int i;
+
+#ifdef EXTENTS_STATS
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned short ee_len = le16_to_cpu(ex->ee_len);
+ spin_lock(&sbi->s_ext_stats_lock);
+ sbi->s_ext_blocks += ee_len;
+ sbi->s_ext_extents++;
+ if (ee_len < sbi->s_ext_min)
+ sbi->s_ext_min = ee_len;
+ if (ee_len > sbi->s_ext_max)
+ sbi->s_ext_max = ee_len;
+ if (ext_depth(inode) > sbi->s_depth_max)
+ sbi->s_depth_max = ext_depth(inode);
+ spin_unlock(&sbi->s_ext_stats_lock);
+ }
+#endif
+ if (from >= le32_to_cpu(ex->ee_block)
+ && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ /* tail removal */
+ unsigned long num;
+ ext4_fsblk_t start;
+ num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
+ start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+ ext_debug("free last %lu blocks starting %llu\n", num, start);
+ for (i = 0; i < num; i++) {
+ bh = sb_find_get_block(inode->i_sb, start + i);
+ ext4_forget(handle, 0, inode, bh, start + i);
+ }
+ ext4_free_blocks(handle, inode, start, num);
+ } else if (from == le32_to_cpu(ex->ee_block)
+ && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+ printk("strange request: removal %lu-%lu from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ } else {
+ printk("strange request: removal(2) %lu-%lu from %u:%u\n",
+ from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+ }
+ return 0;
+}
+
+static int
+ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path, unsigned long start)
+{
+ int err = 0, correct_index = 0;
+ int depth = ext_depth(inode), credits;
+ struct ext4_extent_header *eh;
+ unsigned a, b, block, num;
+ unsigned long ex_ee_block;
+ unsigned short ex_ee_len;
+ struct ext4_extent *ex;
+
+ ext_debug("truncate since %lu in leaf\n", start);
+ if (!path[depth].p_hdr)
+ path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
+ eh = path[depth].p_hdr;
+ BUG_ON(eh == NULL);
+ BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
+ BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
+
+ /* find where to start removing */
+ ex = EXT_LAST_EXTENT(eh);
+
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = le16_to_cpu(ex->ee_len);
+
+ while (ex >= EXT_FIRST_EXTENT(eh) &&
+ ex_ee_block + ex_ee_len > start) {
+ ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+ path[depth].p_ext = ex;
+
+ a = ex_ee_block > start ? ex_ee_block : start;
+ b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
+ ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+
+ ext_debug(" border %u:%u\n", a, b);
+
+ if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
+ block = 0;
+ num = 0;
+ BUG();
+ } else if (a != ex_ee_block) {
+ /* remove tail of the extent */
+ block = ex_ee_block;
+ num = a - block;
+ } else if (b != ex_ee_block + ex_ee_len - 1) {
+ /* remove head of the extent */
+ block = a;
+ num = b - a;
+ /* there is no "make a hole" API yet */
+ BUG();
+ } else {
+ /* remove whole extent: excellent! */
+ block = ex_ee_block;
+ num = 0;
+ BUG_ON(a != ex_ee_block);
+ BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+ }
+
+ /* at present, extent can't cross block group: */
+ /* leaf + bitmap + group desc + sb + inode */
+ credits = 5;
+ if (ex == EXT_FIRST_EXTENT(eh)) {
+ correct_index = 1;
+ credits += (ext_depth(inode)) + 1;
+ }
+#ifdef CONFIG_QUOTA
+ credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+ handle = ext4_ext_journal_restart(handle, credits);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out;
+ }
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ err = ext4_remove_blocks(handle, inode, ex, a, b);
+ if (err)
+ goto out;
+
+ if (num == 0) {
+ /* this extent is removed; mark slot entirely unused */
+ ext4_ext_store_pblock(ex, 0);
+ eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
+ }
+
+ ex->ee_block = cpu_to_le32(block);
+ ex->ee_len = cpu_to_le16(num);
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ ext_debug("new extent: %u:%u:%llu\n", block, num,
+ ext_pblock(ex));
+ ex--;
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = le16_to_cpu(ex->ee_len);
+ }
+
+ if (correct_index && eh->eh_entries)
+ err = ext4_ext_correct_indexes(handle, inode, path);
+
+ /* if this leaf is free, then we should
+ * remove it from index block above */
+ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
+ err = ext4_ext_rm_idx(handle, inode, path + depth);
+
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_more_to_rm:
+ * returns 1 if current index has to be freed (even partial)
+ */
+static int inline
+ext4_ext_more_to_rm(struct ext4_ext_path *path)
+{
+ BUG_ON(path->p_idx == NULL);
+
+ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+ return 0;
+
+ /*
+ * if truncate on deeper level happened, it wasn't partial,
+ * so we have to consider current index for truncation
+ */
+ if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
+ return 0;
+ return 1;
+}
+
+int ext4_ext_remove_space(struct inode *inode, unsigned long start)
+{
+ struct super_block *sb = inode->i_sb;
+ int depth = ext_depth(inode);
+ struct ext4_ext_path *path;
+ handle_t *handle;
+ int i = 0, err = 0;
+
+ ext_debug("truncate since %lu\n", start);
+
+ /* probably first extent we're gonna free will be last in block */
+ handle = ext4_journal_start(inode, depth + 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ext4_ext_invalidate_cache(inode);
+
+ /*
+ * We start scanning from right side, freeing all the blocks
+ * after i_size and walking into the tree depth-wise.
+ */
+ path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
+ if (path == NULL) {
+ ext4_journal_stop(handle);
+ return -ENOMEM;
+ }
+ memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
+ path[0].p_hdr = ext_inode_hdr(inode);
+ if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
+ err = -EIO;
+ goto out;
+ }
+ path[0].p_depth = depth;
+
+ while (i >= 0 && err == 0) {
+ if (i == depth) {
+ /* this is leaf block */
+ err = ext4_ext_rm_leaf(handle, inode, path, start);
+ /* root level has p_bh == NULL, brelse() eats this */
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ continue;
+ }
+
+ /* this is index block */
+ if (!path[i].p_hdr) {
+ ext_debug("initialize header\n");
+ path[i].p_hdr = ext_block_hdr(path[i].p_bh);
+ if (ext4_ext_check_header(__FUNCTION__, inode,
+ path[i].p_hdr)) {
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
+ > le16_to_cpu(path[i].p_hdr->eh_max));
+ BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
+
+ if (!path[i].p_idx) {
+ /* this level hasn't been touched yet */
+ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
+ ext_debug("init index ptr: hdr 0x%p, num %d\n",
+ path[i].p_hdr,
+ le16_to_cpu(path[i].p_hdr->eh_entries));
+ } else {
+ /* we were already here, see at next index */
+ path[i].p_idx--;
+ }
+
+ ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
+ i, EXT_FIRST_INDEX(path[i].p_hdr),
+ path[i].p_idx);
+ if (ext4_ext_more_to_rm(path + i)) {
+ /* go to the next level */
+ ext_debug("move to level %d (block %llu)\n",
+ i + 1, idx_pblock(path[i].p_idx));
+ memset(path + i + 1, 0, sizeof(*path));
+ path[i+1].p_bh =
+ sb_bread(sb, idx_pblock(path[i].p_idx));
+ if (!path[i+1].p_bh) {
+ /* should we reset i_size? */
+ err = -EIO;
+ break;
+ }
+
+ /* save actual number of indexes since this
+ * number is changed at the next iteration */
+ path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
+ i++;
+ } else {
+ /* we finished processing this index, go up */
+ if (path[i].p_hdr->eh_entries == 0 && i > 0) {
+ /* index is empty, remove it;
+ * handle must already be prepared by
+ * ext4_ext_rm_leaf() above */
+ err = ext4_ext_rm_idx(handle, inode, path + i);
+ }
+ /* root level has p_bh == NULL, brelse() eats this */
+ brelse(path[i].p_bh);
+ path[i].p_bh = NULL;
+ i--;
+ ext_debug("return to level %d\n", i);
+ }
+ }
+
+ /* TODO: flexible tree reduction should be here */
+ if (path->p_hdr->eh_entries == 0) {
+ /*
+ * truncate to zero freed all the tree,
+ * so we need to correct eh_depth
+ */
+ err = ext4_ext_get_access(handle, inode, path);
+ if (err == 0) {
+ ext_inode_hdr(inode)->eh_depth = 0;
+ ext_inode_hdr(inode)->eh_max =
+ cpu_to_le16(ext4_ext_space_root(inode));
+ err = ext4_ext_dirty(handle, inode, path);
+ }
+ }
+out:
+ ext4_ext_tree_changed(inode);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ ext4_journal_stop(handle);
+
+ return err;
+}
+
+/*
+ * called at mount time
+ */
+void ext4_ext_init(struct super_block *sb)
+{
+ /*
+ * possible initialization would be here
+ */
+
+ if (test_opt(sb, EXTENTS)) {
+ printk("EXT4-fs: file extents enabled");
+#ifdef AGRESSIVE_TEST
+ printk(", agressive tests");
+#endif
+#ifdef CHECK_BINSEARCH
+ printk(", check binsearch");
+#endif
+#ifdef EXTENTS_STATS
+ printk(", stats");
+#endif
+ printk("\n");
+#ifdef EXTENTS_STATS
+ spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
+ EXT4_SB(sb)->s_ext_min = 1 << 30;
+ EXT4_SB(sb)->s_ext_max = 0;
+#endif
+ }
+}
+
+/*
+ * called at umount time
+ */
+void ext4_ext_release(struct super_block *sb)
+{
+ if (!test_opt(sb, EXTENTS))
+ return;
+
+#ifdef EXTENTS_STATS
+ if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
+ sbi->s_ext_blocks, sbi->s_ext_extents,
+ sbi->s_ext_blocks / sbi->s_ext_extents);
+ printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
+ sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
+ }
+#endif
+}
+
+int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t iblock,
+ unsigned long max_blocks, struct buffer_head *bh_result,
+ int create, int extend_disksize)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent newex, *ex;
+ ext4_fsblk_t goal, newblock;
+ int err = 0, depth;
+ unsigned long allocated = 0;
+
+ __clear_bit(BH_New, &bh_result->b_state);
+ ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
+ max_blocks, (unsigned) inode->i_ino);
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+
+ /* check in cache */
+ if ((goal = ext4_ext_in_cache(inode, iblock, &newex))) {
+ if (goal == EXT4_EXT_CACHE_GAP) {
+ if (!create) {
+ /* block isn't allocated yet and
+ * user doesn't want to allocate it */
+ goto out2;
+ }
+ /* we should allocate requested block */
+ } else if (goal == EXT4_EXT_CACHE_EXTENT) {
+ /* block is already allocated */
+ newblock = iblock
+ - le32_to_cpu(newex.ee_block)
+ + ext_pblock(&newex);
+ /* number of remaining blocks in the extent */
+ allocated = le16_to_cpu(newex.ee_len) -
+ (iblock - le32_to_cpu(newex.ee_block));
+ goto out;
+ } else {
+ BUG();
+ }
+ }
+
+ /* find extent for this block */
+ path = ext4_ext_find_extent(inode, iblock, NULL);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ goto out2;
+ }
+
+ depth = ext_depth(inode);
+
+ /*
+ * a consistent leaf must not be empty;
+ * this situation is possible, though, _during_ tree modification;
+ * this is why the assert can't be put in ext4_ext_find_extent()
+ */
+ BUG_ON(path[depth].p_ext == NULL && depth != 0);
+
+ if ((ex = path[depth].p_ext)) {
+ unsigned long ee_block = le32_to_cpu(ex->ee_block);
+ ext4_fsblk_t ee_start = ext_pblock(ex);
+ unsigned short ee_len = le16_to_cpu(ex->ee_len);
+
+ /*
+ * Allow future support for preallocated extents to be added
+ * as an RO_COMPAT feature:
+ * Uninitialized extents are treated as holes, except that
+ * we avoid (fail) allocating new blocks during a write.
+ */
+ if (ee_len > EXT_MAX_LEN)
+ goto out2;
+ /* if found extent covers block, simply return it */
+ if (iblock >= ee_block && iblock < ee_block + ee_len) {
+ newblock = iblock - ee_block + ee_start;
+ /* number of remaining blocks in the extent */
+ allocated = ee_len - (iblock - ee_block);
+ ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
+ ee_block, ee_len, newblock);
+ ext4_ext_put_in_cache(inode, ee_block, ee_len,
+ ee_start, EXT4_EXT_CACHE_EXTENT);
+ goto out;
+ }
+ }
+
+ /*
+ * the requested block isn't allocated yet;
+ * we must not try to create blocks if the create flag is zero
+ */
+ if (!create) {
+ /* put just found gap into cache to speed up
+ * subsequent requests */
+ ext4_ext_put_gap_in_cache(inode, path, iblock);
+ goto out2;
+ }
+ /*
+ * Okay, we need to do block allocation. Lazily initialize the block
+ * allocation info here if necessary.
+ */
+ if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
+ ext4_init_block_alloc_info(inode);
+
+ /* allocate new block */
+ goal = ext4_ext_find_goal(inode, path, iblock);
+ allocated = max_blocks;
+ newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
+ if (!newblock)
+ goto out2;
+ ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+ goal, newblock, allocated);
+
+ /* try to insert new extent into found leaf and return */
+ newex.ee_block = cpu_to_le32(iblock);
+ ext4_ext_store_pblock(&newex, newblock);
+ newex.ee_len = cpu_to_le16(allocated);
+ err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ if (err)
+ goto out2;
+
+ if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = inode->i_size;
+
+ /* previous routine could use block we allocated */
+ newblock = ext_pblock(&newex);
+ __set_bit(BH_New, &bh_result->b_state);
+
+ ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ EXT4_EXT_CACHE_EXTENT);
+out:
+ if (allocated > max_blocks)
+ allocated = max_blocks;
+ ext4_ext_show_leaf(inode, path);
+ __set_bit(BH_Mapped, &bh_result->b_state);
+ bh_result->b_bdev = inode->i_sb->s_bdev;
+ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+ return err ? err : allocated;
+}
+
+void ext4_ext_truncate(struct inode * inode, struct page *page)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct super_block *sb = inode->i_sb;
+ unsigned long last_block;
+ handle_t *handle;
+ int err = 0;
+
+ /*
+ * the first extent we are going to free will probably be the last in its block
+ */
+ err = ext4_writepage_trans_blocks(inode) + 3;
+ handle = ext4_journal_start(inode, err);
+ if (IS_ERR(handle)) {
+ if (page) {
+ clear_highpage(page);
+ flush_dcache_page(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return;
+ }
+
+ if (page)
+ ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ ext4_ext_invalidate_cache(inode);
+
+ /*
+ * TODO: optimization is possible here.
+ * Probably we need not scan at all,
+ * because page truncation is enough.
+ */
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
+
+ /* we have to know where to truncate from in the crash case */
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_mark_inode_dirty(handle, inode);
+
+ last_block = (inode->i_size + sb->s_blocksize - 1)
+ >> EXT4_BLOCK_SIZE_BITS(sb);
+ err = ext4_ext_remove_space(inode, last_block);
+
+ /* In a multi-transaction truncate, we only make the final
+ * transaction synchronous. */
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+
+out_stop:
+ /*
+ * If this was a simple ftruncate() and the file will remain alive,
+ * then we need to clear up the orphan record which we created above.
+ * However, if this was a real unlink then we were called by
+ * ext4_delete_inode(), and we allow that function to clean up the
+ * orphan info for us.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+ ext4_journal_stop(handle);
+}
+
+/*
+ * ext4_ext_writepage_trans_blocks:
+ * calculate max number of blocks we could modify
+ * in order to allocate new block for an inode
+ */
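+/*
+ * Worked example of the estimate below (illustrative, with assumed
+ * numbers): if a single insert needs 5 credits including the superblock
+ * and num = 3, the total is 5 * 3 - (3 - 1) = 13, since the superblock
+ * credit is shared across the num blocks.
+ */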
+int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
+{
+ int needed;
+
+ needed = ext4_ext_calc_credits_for_insert(inode, NULL);
+
+ /* caller wants to allocate num blocks, but note it includes sb */
+ needed = needed * num - (num - 1);
+
+#ifdef CONFIG_QUOTA
+ needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+ return needed;
+}
+
+EXPORT_SYMBOL(ext4_mark_inode_dirty);
+EXPORT_SYMBOL(ext4_ext_invalidate_cache);
+EXPORT_SYMBOL(ext4_ext_insert_extent);
+EXPORT_SYMBOL(ext4_ext_walk_space);
+EXPORT_SYMBOL(ext4_ext_find_goal);
+EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
+
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 000000000000..0b622c0624b7
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,139 @@
+/*
+ * linux/fs/ext4/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/file.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4 fs regular file handling primitives
+ *
+ * 64-bit file support on 64-bit platforms by Jakub Jelinek
+ * (jj@sunsite.ms.mff.cuni.cz)
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Called when an inode is released. Note that this is different
+ * from ext4_file_open: open gets called at every open, but release
+ * gets called only when /all/ the files are closed.
+ */
+static int ext4_release_file (struct inode * inode, struct file * filp)
+{
+ /* if we are the last writer on the inode, drop the block reservation */
+ if ((filp->f_mode & FMODE_WRITE) &&
+ (atomic_read(&inode->i_writecount) == 1))
+ {
+ mutex_lock(&EXT4_I(inode)->truncate_mutex);
+ ext4_discard_reservation(inode);
+ mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+ }
+ if (is_dx(inode) && filp->private_data)
+ ext4_htree_free_dir_info(filp->private_data);
+
+ return 0;
+}
+
+static ssize_t
+ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_dentry->d_inode;
+ ssize_t ret;
+ int err;
+
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+ /*
+ * Skip flushing if there was an error, or if nothing was written.
+ */
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * If the inode is IS_SYNC, or is O_SYNC and we are doing data
+ * journalling then we need to make sure that we force the transaction
+ * to disk to keep all metadata uptodate synchronously.
+ */
+ if (file->f_flags & O_SYNC) {
+ /*
+ * If we are non-data-journaled, then the dirty data has
+ * already been flushed to backing store by generic_osync_inode,
+ * and the inode has been flushed too if there have been any
+ * modifications other than mere timestamp updates.
+ *
+ * Open question --- do we care about flushing timestamps too
+ * if the inode is IS_SYNC?
+ */
+ if (!ext4_should_journal_data(inode))
+ return ret;
+
+ goto force_commit;
+ }
+
+ /*
+ * So we know that there has been no forced data flush. If the inode
+ * is marked IS_SYNC, we need to force one ourselves.
+ */
+ if (!IS_SYNC(inode))
+ return ret;
+
+ /*
+ * Open question #2 --- should we force data to disk here too? If we
+ * don't, the only impact is that data=writeback filesystems won't
+ * flush data to disk automatically on IS_SYNC, only metadata (but
+ * historically, that is what ext2 has done.)
+ */
+
+force_commit:
+ err = ext4_force_commit(inode->i_sb);
+ if (err)
+ return err;
+ return ret;
+}
+
+const struct file_operations ext4_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = ext4_file_write,
+ .ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .mmap = generic_file_mmap,
+ .open = generic_file_open,
+ .release = ext4_release_file,
+ .fsync = ext4_sync_file,
+ .sendfile = generic_file_sendfile,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+};
+
+struct inode_operations ext4_file_inode_operations = {
+ .truncate = ext4_truncate,
+ .setattr = ext4_setattr,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .permission = ext4_permission,
+};
+
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 000000000000..2a167d7131fa
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,88 @@
+/*
+ * linux/fs/ext4/fsync.c
+ *
+ * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com)
+ * from
+ * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4fs fsync primitive
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Removed unnecessary code duplication for little endian machines
+ * and excessive __inline__s.
+ * Andi Kleen, 1997
+ *
+ * Major simplifications and cleanup - we only need to do the metadata, because
+ * we can depend on generic_block_fdatasync() to sync the data blocks.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+
+/*
+ * akpm: A new design for ext4_sync_file().
+ *
+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+ * There cannot be a transaction open by this task.
+ * Another task could have dirtied this inode. Its data can be in any
+ * state in the journalling system.
+ *
+ * What we do is just kick off a commit and wait on it. This will snapshot the
+ * inode to disk.
+ */
+
+int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ int ret = 0;
+
+ J_ASSERT(ext4_journal_current_handle() == 0);
+
+ /*
+ * data=writeback:
+ * The caller's filemap_fdatawrite()/wait will sync the data.
+ * sync_inode() will sync the metadata
+ *
+ * data=ordered:
+ * The caller's filemap_fdatawrite() will write the data and
+ * sync_inode() will write the inode if it is dirty. Then the caller's
+ * filemap_fdatawait() will wait on the pages.
+ *
+ * data=journal:
+ * filemap_fdatawrite won't do anything (the buffers are clean).
+ * ext4_force_commit will write the file data into the journal and
+ * will wait on that.
+ * filemap_fdatawait() will encounter a ton of newly-dirtied pages
+ * (they were dirtied by commit). But that's OK - the blocks are
+ * safe in-journal, which is all fsync() needs to ensure.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ goto out;
+ }
+
+ /*
+ * The VFS has written the file data. If the inode is unaltered
+ * then we need not start a commit.
+ */
+ if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0, /* sys_fsync did this */
+ };
+ ret = sync_inode(inode, &wbc);
+ }
+out:
+ return ret;
+}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 000000000000..a67966385e06
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,152 @@
+/*
+ * linux/fs/ext4/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This file is released under the GPL v2.
+ *
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/sched.h>
+#include <linux/ext4_fs.h>
+#include <linux/cryptohash.h>
+
+#define DELTA 0x9E3779B9
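+/* TEA key-schedule constant: roughly 2^32 divided by the golden ratio. */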
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+ __u32 sum = 0;
+ __u32 b0 = buf[0], b1 = buf[1];
+ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
+ int n = 16;
+
+ do {
+ sum += DELTA;
+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+ } while(--n);
+
+ buf[0] += b0;
+ buf[1] += b1;
+}
+
+
+/* The old legacy hash */
+static __u32 dx_hack_hash (const char *name, int len)
+{
+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+ while (len--) {
+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+
+ if (hash & 0x80000000) hash -= 0x7fffffff;
+ hash1 = hash0;
+ hash0 = hash;
+ }
+ return (hash0 << 1);
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+ __u32 pad, val;
+ int i;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > num*4)
+ len = num * 4;
+ for (i=0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = msg[i] + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
+}
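+
+/*
+ * Worked example (illustrative): str2hashbuf("abc", 3, buf, 8) sets
+ * pad = 0x03030303, packs the three name bytes on top of the pad into
+ * buf[0] = 0x03616263, and fills buf[1]..buf[7] with the pad word.
+ */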
+
+/*
+ * Returns the hash of a filename. If len is 0 and name is NULL, then
+ * this function can be used to test whether or not a hash version is
+ * supported.
+ *
+ * The seed is a 4-longword (32-bit) "secret" which can be used to
+ * uniquify a hash. If the seed is all zeros, then some default seed
+ * may be used.
+ *
+ * A particular hash version specifies whether or not the seed is
+ * represented, and whether or not the returned hash is 32 bits or 64
+ * bits. 32 bit hashes will return 0 for the minor hash.
+ */
+int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
+{
+ __u32 hash;
+ __u32 minor_hash = 0;
+ const char *p;
+ int i;
+ __u32 in[8], buf[4];
+
+ /* Initialize the default seed for the hash checksum functions */
+ buf[0] = 0x67452301;
+ buf[1] = 0xefcdab89;
+ buf[2] = 0x98badcfe;
+ buf[3] = 0x10325476;
+
+ /* Check to see if the seed is all zeros */
+ if (hinfo->seed) {
+ for (i=0; i < 4; i++) {
+ if (hinfo->seed[i])
+ break;
+ }
+ if (i < 4)
+ memcpy(buf, hinfo->seed, sizeof(buf));
+ }
+
+ switch (hinfo->hash_version) {
+ case DX_HASH_LEGACY:
+ hash = dx_hack_hash(name, len);
+ break;
+ case DX_HASH_HALF_MD4:
+ p = name;
+ while (len > 0) {
+ str2hashbuf(p, len, in, 8);
+ half_md4_transform(buf, in);
+ len -= 32;
+ p += 32;
+ }
+ minor_hash = buf[2];
+ hash = buf[1];
+ break;
+ case DX_HASH_TEA:
+ p = name;
+ while (len > 0) {
+ str2hashbuf(p, len, in, 4);
+ TEA_transform(buf, in);
+ len -= 16;
+ p += 16;
+ }
+ hash = buf[0];
+ minor_hash = buf[1];
+ break;
+ default:
+ hinfo->hash = 0;
+ return -1;
+ }
+ hash = hash & ~1;
+ if (hash == (EXT4_HTREE_EOF << 1))
+ hash = (EXT4_HTREE_EOF-1) << 1;
+ hinfo->hash = hash;
+ hinfo->minor_hash = minor_hash;
+ return 0;
+}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 000000000000..c88b439ba5cd
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,772 @@
+/*
+ * linux/fs/ext4/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * BSD ufs-inspired inode and directory allocation by
+ * Stephen Tweedie (sct@redhat.com), 1993
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <asm/byteorder.h>
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * ialloc.c contains the inode allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps. A file system contains several
+ * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block. Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.
+ */
+
+
+/*
+ * Read the inode allocation bitmap for a given block_group, reading
+ * into the specified slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head of bitmap on success or NULL.
+ */
+static struct buffer_head *
+read_inode_bitmap(struct super_block * sb, unsigned long block_group)
+{
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh = NULL;
+
+ desc = ext4_get_group_desc(sb, block_group, NULL);
+ if (!desc)
+ goto error_out;
+
+ bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
+ if (!bh)
+ ext4_error(sb, "read_inode_bitmap",
+ "Cannot read inode bitmap - "
+ "block_group = %lu, inode_bitmap = %llu",
+ block_group, ext4_inode_bitmap(sb, desc));
+error_out:
+ return bh;
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * HOWEVER: we must make sure that we get no aliases,
+ * which means that we have to call "clear_inode()"
+ * _before_ we mark the inode not in use in the inode
+ * bitmaps. Otherwise a newly created file might use
+ * the same inode number (not actually the same pointer
+ * though), and then we'd have two inodes sharing the
+ * same inode number and space on the harddisk.
+ */
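+/*
+ * Illustrative note on the bitmap indexing used below: inode numbers
+ * start at 1, so with e.g. 8192 inodes per group, inode 10000 lives in
+ * block group (10000 - 1) / 8192 = 1, at bit (10000 - 1) % 8192 = 1807
+ * of that group's inode bitmap.
+ */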
+void ext4_free_inode (handle_t *handle, struct inode * inode)
+{
+ struct super_block * sb = inode->i_sb;
+ int is_directory;
+ unsigned long ino;
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *bh2;
+ unsigned long block_group;
+ unsigned long bit;
+ struct ext4_group_desc * gdp;
+ struct ext4_super_block * es;
+ struct ext4_sb_info *sbi;
+ int fatal = 0, err;
+
+ if (atomic_read(&inode->i_count) > 1) {
+ printk ("ext4_free_inode: inode has count=%d\n",
+ atomic_read(&inode->i_count));
+ return;
+ }
+ if (inode->i_nlink) {
+ printk ("ext4_free_inode: inode has nlink=%d\n",
+ inode->i_nlink);
+ return;
+ }
+ if (!sb) {
+ printk("ext4_free_inode: inode on nonexistent device\n");
+ return;
+ }
+ sbi = EXT4_SB(sb);
+
+ ino = inode->i_ino;
+ ext4_debug ("freeing inode %lu\n", ino);
+
+ /*
+ * Note: we must free any quota before locking the superblock,
+ * as writing the quota to disk may need the lock as well.
+ */
+ DQUOT_INIT(inode);
+ ext4_xattr_delete_inode(handle, inode);
+ DQUOT_FREE_INODE(inode);
+ DQUOT_DROP(inode);
+
+ is_directory = S_ISDIR(inode->i_mode);
+
+ /* Do this BEFORE marking the inode not in use or returning an error */
+ clear_inode (inode);
+
+ es = EXT4_SB(sb)->s_es;
+ if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+ ext4_error (sb, "ext4_free_inode",
+ "reserved or nonexistent inode %lu", ino);
+ goto error_return;
+ }
+ block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+ bitmap_bh = read_inode_bitmap(sb, block_group);
+ if (!bitmap_bh)
+ goto error_return;
+
+ BUFFER_TRACE(bitmap_bh, "get_write_access");
+ fatal = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (fatal)
+ goto error_return;
+
+ /* Ok, now we can actually update the inode bitmaps.. */
+ if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ bit, bitmap_bh->b_data))
+ ext4_error (sb, "ext4_free_inode",
+ "bit already cleared for inode %lu", ino);
+ else {
+ gdp = ext4_get_group_desc (sb, block_group, &bh2);
+
+ BUFFER_TRACE(bh2, "get_write_access");
+ fatal = ext4_journal_get_write_access(handle, bh2);
+ if (fatal) goto error_return;
+
+ if (gdp) {
+ spin_lock(sb_bgl_lock(sbi, block_group));
+ gdp->bg_free_inodes_count = cpu_to_le16(
+ le16_to_cpu(gdp->bg_free_inodes_count) + 1);
+ if (is_directory)
+ gdp->bg_used_dirs_count = cpu_to_le16(
+ le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+ spin_unlock(sb_bgl_lock(sbi, block_group));
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (is_directory)
+ percpu_counter_dec(&sbi->s_dirs_counter);
+
+ }
+ BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, bh2);
+ if (!fatal) fatal = err;
+ }
+ BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ if (!fatal)
+ fatal = err;
+ sb->s_dirt = 1;
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, fatal);
+}
+
+/*
+ * There are two policies for allocating an inode. If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+static int find_group_dir(struct super_block *sb, struct inode *parent)
+{
+ int ngroups = EXT4_SB(sb)->s_groups_count;
+ unsigned int freei, avefreei;
+ struct ext4_group_desc *desc, *best_desc = NULL;
+ struct buffer_head *bh;
+ int group, best_group = -1;
+
+ freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
+ avefreei = freei / ngroups;
+
+ for (group = 0; group < ngroups; group++) {
+ desc = ext4_get_group_desc (sb, group, &bh);
+ if (!desc || !desc->bg_free_inodes_count)
+ continue;
+ if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ continue;
+ if (!best_desc ||
+ (le16_to_cpu(desc->bg_free_blocks_count) >
+ le16_to_cpu(best_desc->bg_free_blocks_count))) {
+ best_group = group;
+ best_desc = desc;
+ }
+ }
+ return best_group;
+}
+
+/*
+ * Orlov's allocator for directories.
+ *
+ * We always try to spread first-level directories.
+ *
+ * If there are blockgroups with both free inodes and free blocks counts
+ * not worse than average we return one with smallest directory count.
+ * Otherwise we simply return a random group.
+ *
+ * The remaining rules are as follows:
+ *
+ * It's OK to put directory into a group unless
+ * it has too many directories already (max_dirs) or
+ * it has too few free inodes left (min_inodes) or
+ * it has too few free blocks left (min_blocks) or
+ * it's already running too large debt (max_debt).
+ * Parent's group is preferred; if it doesn't satisfy these
+ * conditions we search cyclically through the rest. If none
+ * of the groups look good we just look for a group with more
+ * free inodes than average (starting at parent's group).
+ *
+ * Debt is incremented each time we allocate a directory and decremented
+ * when we allocate an inode, within 0--255.
+ */
+
+#define INODE_COST 64
+#define BLOCK_COST 256
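+
+/*
+ * Illustrative example with assumed values: for a filesystem with
+ * 32768 blocks and 8192 inodes per group, and an average of 512 blocks
+ * per directory, max_debt = 32768 / max(512, BLOCK_COST) = 64; since
+ * 64 * INODE_COST = 4096 <= 8192 it is not reduced further, and it
+ * already falls within the 1..255 clamp.
+ */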
+
+static int find_group_orlov(struct super_block *sb, struct inode *parent)
+{
+ int parent_group = EXT4_I(parent)->i_block_group;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int ngroups = sbi->s_groups_count;
+ int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
+ unsigned int freei, avefreei;
+ ext4_fsblk_t freeb, avefreeb;
+ ext4_fsblk_t blocks_per_dir;
+ unsigned int ndirs;
+ int max_debt, max_dirs, min_inodes;
+ ext4_grpblk_t min_blocks;
+ int group = -1, i;
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+
+ freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+ avefreei = freei / ngroups;
+ freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ avefreeb = freeb;
+ do_div(avefreeb, ngroups);
+ ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+
+ if ((parent == sb->s_root->d_inode) ||
+ (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+ int best_ndir = inodes_per_group;
+ int best_group = -1;
+
+ get_random_bytes(&group, sizeof(group));
+ parent_group = (unsigned)group % ngroups;
+ for (i = 0; i < ngroups; i++) {
+ group = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc (sb, group, &bh);
+ if (!desc || !desc->bg_free_inodes_count)
+ continue;
+ if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+ continue;
+ if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+ continue;
+ if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+ continue;
+ best_group = group;
+ best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+ }
+ if (best_group >= 0)
+ return best_group;
+ goto fallback;
+ }
+
+ blocks_per_dir = ext4_blocks_count(es) - freeb;
+ do_div(blocks_per_dir, ndirs);
+
+ max_dirs = ndirs / ngroups + inodes_per_group / 16;
+ min_inodes = avefreei - inodes_per_group / 4;
+ min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
+
+ max_debt = EXT4_BLOCKS_PER_GROUP(sb);
+ max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
+ if (max_debt * INODE_COST > inodes_per_group)
+ max_debt = inodes_per_group / INODE_COST;
+ if (max_debt > 255)
+ max_debt = 255;
+ if (max_debt == 0)
+ max_debt = 1;
+
+ for (i = 0; i < ngroups; i++) {
+ group = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc (sb, group, &bh);
+ if (!desc || !desc->bg_free_inodes_count)
+ continue;
+ if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+ continue;
+ if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+ continue;
+ if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+ continue;
+ return group;
+ }
+
+fallback:
+ for (i = 0; i < ngroups; i++) {
+ group = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc (sb, group, &bh);
+ if (!desc || !desc->bg_free_inodes_count)
+ continue;
+ if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+ return group;
+ }
+
+ if (avefreei) {
+ /*
+ * The free-inodes counter is approximate, and for really small
+ * filesystems the above test can fail to find any blockgroups
+ */
+ avefreei = 0;
+ goto fallback;
+ }
+
+ return -1;
+}
+
+static int find_group_other(struct super_block *sb, struct inode *parent)
+{
+ int parent_group = EXT4_I(parent)->i_block_group;
+ int ngroups = EXT4_SB(sb)->s_groups_count;
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ int group, i;
+
+ /*
+ * Try to place the inode in its parent directory
+ */
+ group = parent_group;
+ desc = ext4_get_group_desc (sb, group, &bh);
+ if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+ le16_to_cpu(desc->bg_free_blocks_count))
+ return group;
+
+ /*
+ * We're going to place this inode in a different blockgroup from its
+ * parent. We want to cause files in a common directory to all land in
+ * the same blockgroup. But we want files which are in a different
+ * directory which shares a blockgroup with our parent to land in a
+ * different blockgroup.
+ *
+ * So add our directory's i_ino into the starting point for the hash.
+ */
+ group = (group + parent->i_ino) % ngroups;
+
+ /*
+ * Use a quadratic hash to find a group with a free inode and some free
+ * blocks.
+ */
+ for (i = 1; i < ngroups; i <<= 1) {
+ group += i;
+ if (group >= ngroups)
+ group -= ngroups;
+ desc = ext4_get_group_desc (sb, group, &bh);
+ if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+ le16_to_cpu(desc->bg_free_blocks_count))
+ return group;
+ }
+
+ /*
+ * That failed: try linear search for a free inode, even if that group
+ * has no free blocks.
+ */
+ group = parent_group;
+ for (i = 0; i < ngroups; i++) {
+ if (++group >= ngroups)
+ group = 0;
+ desc = ext4_get_group_desc (sb, group, &bh);
+ if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+ return group;
+ }
+
+ return -1;
+}
+
+/*
+ * There are two policies for allocating an inode. If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
+{
+ struct super_block *sb;
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *bh2;
+ int group;
+ unsigned long ino = 0;
+ struct inode * inode;
+ struct ext4_group_desc * gdp = NULL;
+ struct ext4_super_block * es;
+ struct ext4_inode_info *ei;
+ struct ext4_sb_info *sbi;
+ int err = 0;
+ struct inode *ret;
+ int i;
+
+ /* Cannot create files in a deleted directory */
+ if (!dir || !dir->i_nlink)
+ return ERR_PTR(-EPERM);
+
+ sb = dir->i_sb;
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ ei = EXT4_I(inode);
+
+ sbi = EXT4_SB(sb);
+ es = sbi->s_es;
+ if (S_ISDIR(mode)) {
+ if (test_opt (sb, OLDALLOC))
+ group = find_group_dir(sb, dir);
+ else
+ group = find_group_orlov(sb, dir);
+ } else
+ group = find_group_other(sb, dir);
+
+ err = -ENOSPC;
+ if (group == -1)
+ goto out;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ err = -EIO;
+
+ gdp = ext4_get_group_desc(sb, group, &bh2);
+ if (!gdp)
+ goto fail;
+
+ brelse(bitmap_bh);
+ bitmap_bh = read_inode_bitmap(sb, group);
+ if (!bitmap_bh)
+ goto fail;
+
+ ino = 0;
+
+repeat_in_this_group:
+ ino = ext4_find_next_zero_bit((unsigned long *)
+ bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+ if (ino < EXT4_INODES_PER_GROUP(sb)) {
+
+ BUFFER_TRACE(bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto fail;
+
+ if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
+ ino, bitmap_bh->b_data)) {
+ /* we won it */
+ BUFFER_TRACE(bitmap_bh,
+ "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle,
+ bitmap_bh);
+ if (err)
+ goto fail;
+ goto got;
+ }
+ /* we lost it */
+ jbd2_journal_release_buffer(handle, bitmap_bh);
+
+ if (++ino < EXT4_INODES_PER_GROUP(sb))
+ goto repeat_in_this_group;
+ }
+
+ /*
+ * This case is possible in a concurrent environment. It is very
+ * rare. We cannot repeat the find_group_xxx() call because
+ * that will simply return the same blockgroup, because the
+ * group descriptor metadata has not yet been updated.
+ * So we just go onto the next blockgroup.
+ */
+ if (++group == sbi->s_groups_count)
+ group = 0;
+ }
+ err = -ENOSPC;
+ goto out;
+
+got:
+ ino += group * EXT4_INODES_PER_GROUP(sb) + 1;
+ if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+ ext4_error (sb, "ext4_new_inode",
+ "reserved inode or inode > inodes count - "
+ "block_group = %d, inode=%lu", group, ino);
+ err = -EIO;
+ goto fail;
+ }
+
+ BUFFER_TRACE(bh2, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh2);
+ if (err) goto fail;
+ spin_lock(sb_bgl_lock(sbi, group));
+ gdp->bg_free_inodes_count =
+ cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+ if (S_ISDIR(mode)) {
+ gdp->bg_used_dirs_count =
+ cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+ }
+ spin_unlock(sb_bgl_lock(sbi, group));
+ BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, bh2);
+ if (err) goto fail;
+
+ percpu_counter_dec(&sbi->s_freeinodes_counter);
+ if (S_ISDIR(mode))
+ percpu_counter_inc(&sbi->s_dirs_counter);
+ sb->s_dirt = 1;
+
+ inode->i_uid = current->fsuid;
+ if (test_opt (sb, GRPID))
+ inode->i_gid = dir->i_gid;
+ else if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ inode->i_gid = current->fsgid;
+ inode->i_mode = mode;
+
+ inode->i_ino = ino;
+ /* This is the optimal IO size (for stat), not the fs block size */
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+
+ memset(ei->i_data, 0, sizeof(ei->i_data));
+ ei->i_dir_start_lookup = 0;
+ ei->i_disksize = 0;
+
+ ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL;
+ if (S_ISLNK(mode))
+ ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
+ /* dirsync only applies to directories */
+ if (!S_ISDIR(mode))
+ ei->i_flags &= ~EXT4_DIRSYNC_FL;
+#ifdef EXT4_FRAGMENTS
+ ei->i_faddr = 0;
+ ei->i_frag_no = 0;
+ ei->i_frag_size = 0;
+#endif
+ ei->i_file_acl = 0;
+ ei->i_dir_acl = 0;
+ ei->i_dtime = 0;
+ ei->i_block_alloc_info = NULL;
+ ei->i_block_group = group;
+
+ ext4_set_inode_flags(inode);
+ if (IS_DIRSYNC(inode))
+ handle->h_sync = 1;
+ insert_inode_hash(inode);
+ spin_lock(&sbi->s_next_gen_lock);
+ inode->i_generation = sbi->s_next_generation++;
+ spin_unlock(&sbi->s_next_gen_lock);
+
+ ei->i_state = EXT4_STATE_NEW;
+ ei->i_extra_isize =
+ (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
+ sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
+
+ ret = inode;
+ if(DQUOT_ALLOC_INODE(inode)) {
+ err = -EDQUOT;
+ goto fail_drop;
+ }
+
+ err = ext4_init_acl(handle, inode, dir);
+ if (err)
+ goto fail_free_drop;
+
+ err = ext4_init_security(handle,inode, dir);
+ if (err)
+ goto fail_free_drop;
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto fail_free_drop;
+ }
+ if (test_opt(sb, EXTENTS)) {
+ EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+ ext4_ext_tree_init(handle, inode);
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err) goto fail;
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ }
+ }
+
+ ext4_debug("allocating inode %lu\n", inode->i_ino);
+ goto really_out;
+fail:
+ ext4_std_error(sb, err);
+out:
+ iput(inode);
+ ret = ERR_PTR(err);
+really_out:
+ brelse(bitmap_bh);
+ return ret;
+
+fail_free_drop:
+ DQUOT_FREE_INODE(inode);
+
+fail_drop:
+ DQUOT_DROP(inode);
+ inode->i_flags |= S_NOQUOTA;
+ inode->i_nlink = 0;
+ iput(inode);
+ brelse(bitmap_bh);
+ return ERR_PTR(err);
+}
+
+/* Verify that we are loading a valid orphan from disk */
+struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
+{
+ unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+ unsigned long block_group;
+ int bit;
+ struct buffer_head *bitmap_bh = NULL;
+ struct inode *inode = NULL;
+
+ /* Error cases - e2fsck has already cleaned up for us */
+ if (ino > max_ino) {
+ ext4_warning(sb, __FUNCTION__,
+ "bad orphan ino %lu! e2fsck was run?", ino);
+ goto out;
+ }
+
+ block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+ bitmap_bh = read_inode_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ ext4_warning(sb, __FUNCTION__,
+ "inode bitmap error for orphan %lu", ino);
+ goto out;
+ }
+
+ /* Having the inode bit set should be a 100% indicator that this
+ * is a valid orphan (no e2fsck run on fs). Orphans also include
+ * inodes that were being truncated, so we can't check i_nlink==0.
+ */
+ if (!ext4_test_bit(bit, bitmap_bh->b_data) ||
+ !(inode = iget(sb, ino)) || is_bad_inode(inode) ||
+ NEXT_ORPHAN(inode) > max_ino) {
+ ext4_warning(sb, __FUNCTION__,
+ "bad orphan inode %lu! e2fsck was run?", ino);
+ printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+ bit, (unsigned long long)bitmap_bh->b_blocknr,
+ ext4_test_bit(bit, bitmap_bh->b_data));
+ printk(KERN_NOTICE "inode=%p\n", inode);
+ if (inode) {
+ printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ is_bad_inode(inode));
+ printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+ NEXT_ORPHAN(inode));
+ printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ }
+ /* Avoid freeing blocks if we got a bad deleted inode */
+ if (inode && inode->i_nlink == 0)
+ inode->i_blocks = 0;
+ iput(inode);
+ inode = NULL;
+ }
+out:
+ brelse(bitmap_bh);
+ return inode;
+}
+
+unsigned long ext4_count_free_inodes (struct super_block * sb)
+{
+ unsigned long desc_count;
+ struct ext4_group_desc *gdp;
+ int i;
+#ifdef EXT4FS_DEBUG
+ struct ext4_super_block *es;
+ unsigned long bitmap_count, x;
+ struct buffer_head *bitmap_bh = NULL;
+
+ es = EXT4_SB(sb)->s_es;
+ desc_count = 0;
+ bitmap_count = 0;
+ gdp = NULL;
+ for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ gdp = ext4_get_group_desc (sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ brelse(bitmap_bh);
+ bitmap_bh = read_inode_bitmap(sb, i);
+ if (!bitmap_bh)
+ continue;
+
+ x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
+ printk("group %d: stored = %d, counted = %lu\n",
+ i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+ bitmap_count += x;
+ }
+ brelse(bitmap_bh);
+ printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
+ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+ return desc_count;
+#else
+ desc_count = 0;
+ for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ gdp = ext4_get_group_desc (sb, i, NULL);
+ if (!gdp)
+ continue;
+ desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ cond_resched();
+ }
+ return desc_count;
+#endif
+}
+
+/* Called at mount-time, super-block is locked */
+unsigned long ext4_count_dirs (struct super_block * sb)
+{
+ unsigned long count = 0;
+ int i;
+
+ for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
+ if (!gdp)
+ continue;
+ count += le16_to_cpu(gdp->bg_used_dirs_count);
+ }
+ return count;
+}
+
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 000000000000..0a60ec5a16db
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,3233 @@
+/*
+ * linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Goal-directed block allocation by Stephen Tweedie
+ * (sct@redhat.com), 1993, 1998
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * 64-bit file support on 64-bit platforms by Jakub Jelinek
+ * (jj@sunsite.ms.mff.cuni.cz)
+ *
+ * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/jbd2.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static int ext4_inode_is_fast_symlink(struct inode *inode)
+{
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ (inode->i_sb->s_blocksize >> 9) : 0;
+
+ return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+}
+
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (e.g. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ */
+int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t blocknr)
+{
+ int err;
+
+ might_sleep();
+
+ BUFFER_TRACE(bh, "enter");
+
+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+ "data mode %lx\n",
+ bh, is_metadata, inode->i_mode,
+ test_opt(inode->i_sb, DATA_FLAGS));
+
+ /* Never use the revoke function if we are doing full data
+ * journaling: there is no need to, and a V1 superblock won't
+ * support it. Otherwise, only skip the revoke on un-journaled
+ * data blocks. */
+
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (!is_metadata && !ext4_should_journal_data(inode))) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call jbd2_journal_forget");
+ return ext4_journal_forget(handle, bh);
+ }
+ return 0;
+ }
+
+ /*
+ * data!=journal && (is_metadata || should_journal_data(inode))
+ */
+ BUFFER_TRACE(bh, "call ext4_journal_revoke");
+ err = ext4_journal_revoke(handle, blocknr, bh);
+ if (err)
+ ext4_abort(inode->i_sb, __FUNCTION__,
+ "error %d when attempting revoke", err);
+ BUFFER_TRACE(bh, "exit");
+ return err;
+}
+
+/*
+ * Work out how many blocks we need to proceed with the next chunk of a
+ * truncate transaction.
+ */
+static unsigned long blocks_for_truncate(struct inode *inode)
+{
+ unsigned long needed;
+
+ needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+
+ /* Give ourselves just enough room to cope with inodes in which
+ * i_blocks is corrupt: we've seen disk corruptions in the past
+ * which resulted in random data in an inode which looked enough
+ * like a regular file for ext4 to try to delete it. Things
+ * will go a bit crazy if that happens, but at least we should
+ * try not to panic the whole kernel. */
+ if (needed < 2)
+ needed = 2;
+
+ /* But we need to bound the transaction so we don't overflow the
+ * journal. */
+ if (needed > EXT4_MAX_TRANS_DATA)
+ needed = EXT4_MAX_TRANS_DATA;
+
+ return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge. So we need to
+ * be able to restart the transaction at a convenient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * start_transaction gets us a new handle for a truncate transaction,
+ * and extend_transaction tries to extend the existing one a bit. If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ */
+static handle_t *start_transaction(struct inode *inode)
+{
+ handle_t *result;
+
+ result = ext4_journal_start(inode, blocks_for_truncate(inode));
+ if (!IS_ERR(result))
+ return result;
+
+ ext4_std_error(inode->i_sb, PTR_ERR(result));
+ return result;
+}
+
+/*
+ * Try to extend this transaction for the purposes of truncation.
+ *
+ * Returns 0 if we managed to create more room. If we can't create more
+ * room, and the transaction must be restarted we return 1.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+ if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+ return 0;
+ if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
+ return 0;
+ return 1;
+}
+
+/*
+ * Restart the transaction associated with *handle. This does a commit,
+ * so before we call here everything must be consistently dirtied against
+ * this transaction.
+ */
+static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+{
+ jbd_debug(2, "restarting handle %p\n", handle);
+ return ext4_journal_restart(handle, blocks_for_truncate(inode));
+}
+
+/*
+ * Called at the last iput() if i_nlink is zero.
+ */
+void ext4_delete_inode (struct inode * inode)
+{
+ handle_t *handle;
+
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (is_bad_inode(inode))
+ goto no_delete;
+
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ /*
+ * If we're going to skip the normal cleanup, we still need to
+ * make sure that the in-core orphan linked list is properly
+ * cleaned up.
+ */
+ ext4_orphan_del(NULL, inode);
+ goto no_delete;
+ }
+
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+ inode->i_size = 0;
+ if (inode->i_blocks)
+ ext4_truncate(inode);
+ /*
+ * Kill off the orphan record which ext4_truncate created.
+ * AKPM: I think this can be inside the above `if'.
+ * Note that ext4_orphan_del() has to be able to cope with the
+ * deletion of a non-existent orphan - this is because we don't
+ * know if ext4_truncate() actually created an orphan record.
+ * (Well, we could do this if we need to, but heck - it works)
+ */
+ ext4_orphan_del(handle, inode);
+ EXT4_I(inode)->i_dtime = get_seconds();
+
+ /*
+ * One subtle ordering requirement: if anything has gone wrong
+ * (transaction abort, IO errors, whatever), then we can still
+ * do these next steps (the fs will already have been marked as
+ * having errors), but we can't free the inode if the mark_dirty
+ * fails.
+ */
+ if (ext4_mark_inode_dirty(handle, inode))
+ /* If that failed, just do the required in-core inode clear. */
+ clear_inode(inode);
+ else
+ ext4_free_inode(handle, inode);
+ ext4_journal_stop(handle);
+ return;
+no_delete:
+ clear_inode(inode); /* We must guarantee clearing of inode... */
+}
+
+typedef struct {
+ __le32 *p;
+ __le32 key;
+ struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+ p->key = *(p->p = v);
+ p->bh = bh;
+}
+
+static int verify_chain(Indirect *from, Indirect *to)
+{
+ while (from <= to && from->key == *from->p)
+ from++;
+ return (from > to);
+}
+
+/**
+ * ext4_block_to_path - parse the block number into array of offsets
+ * @inode: inode in question (we are only interested in its superblock)
+ * @i_block: block number to be parsed
+ * @offsets: array to store the offsets in
+ * @boundary: set this non-zero if the referred-to block is likely to be
+ * followed (on disk) by an indirect block.
+ *
+ * To store the locations of a file's data, ext4 uses a data structure common
+ * to UNIX filesystems - a tree of pointers anchored in the inode, with
+ * data blocks at leaves and indirect blocks in intermediate nodes.
+ * This function translates the block number into path in that tree -
+ * return value is the path length and @offsets[n] is the offset of
+ * pointer to (n+1)th node in the nth one. If @block is out of range
+ * (negative or too large) warning is printed and zero returned.
+ *
+ * Note: function doesn't find node addresses, so no IO is needed. All
+ * we need to know is the capacity of indirect blocks (taken from the
+ * inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
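+
+/*
+ * Worked example (assuming a 4 KB block size, i.e. 1024 block pointers
+ * per indirect block): logical block 100000 is beyond the 12 direct
+ * slots and the 1024 singly-indirect slots, so it resolves to the
+ * doubly-indirect tree with offsets { EXT4_DIND_BLOCK, 96, 660 } and a
+ * path length of 3, since (100000 - 12 - 1024) = 98964 = 96 * 1024 + 660.
+ */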
+
+static int ext4_block_to_path(struct inode *inode,
+ long i_block, int offsets[4], int *boundary)
+{
+ int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
+ const long direct_blocks = EXT4_NDIR_BLOCKS,
+ indirect_blocks = ptrs,
+ double_blocks = (1 << (ptrs_bits * 2));
+ int n = 0;
+ int final = 0;
+
+ if (i_block < 0) {
+ ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
+ } else if (i_block < direct_blocks) {
+ offsets[n++] = i_block;
+ final = direct_blocks;
+ } else if ( (i_block -= direct_blocks) < indirect_blocks) {
+ offsets[n++] = EXT4_IND_BLOCK;
+ offsets[n++] = i_block;
+ final = ptrs;
+ } else if ((i_block -= indirect_blocks) < double_blocks) {
+ offsets[n++] = EXT4_DIND_BLOCK;
+ offsets[n++] = i_block >> ptrs_bits;
+ offsets[n++] = i_block & (ptrs - 1);
+ final = ptrs;
+ } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+ offsets[n++] = EXT4_TIND_BLOCK;
+ offsets[n++] = i_block >> (ptrs_bits * 2);
+ offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+ offsets[n++] = i_block & (ptrs - 1);
+ final = ptrs;
+ } else {
+ ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
+ }
+ if (boundary)
+ *boundary = final - 1 - (i_block & (ptrs - 1));
+ return n;
+}
+
+/**
+ * ext4_get_branch - read the chain of indirect blocks leading to data
+ * @inode: inode in question
+ * @depth: depth of the chain (1 - direct pointer, etc.)
+ * @offsets: offsets of pointers in inode/indirect blocks
+ * @chain: place to store the result
+ * @err: here we store the error value
+ *
+ * Function fills the array of triples <key, p, bh> and returns %NULL
+ * if everything went OK or the pointer to the last filled triple
+ * (incomplete one) otherwise. Upon the return chain[i].key contains
+ * the number of (i+1)-th block in the chain (as it is stored in memory,
+ * i.e. little-endian 32-bit), chain[i].p contains the address of that
+ * number (it points into struct inode for i==0 and into the bh->b_data
+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ * block for i>0 and NULL for i==0. In other words, it holds the block
+ * numbers of the chain, addresses they were taken from (and where we can
+ * verify that chain did not change) and buffer_heads hosting these
+ * numbers.
+ *
+ * Function stops when it stumbles upon zero pointer (absent block)
+ * (pointer to last triple returned, *@err == 0)
+ * or when it gets an IO error reading an indirect block
+ * (ditto, *@err == -EIO)
+ * or when it notices that chain had been changed while it was reading
+ * (ditto, *@err == -EAGAIN)
+ * or when it reads all @depth-1 indirect blocks successfully and finds
+ * the whole chain, all way to the data (returns %NULL, *err == 0).
+ */
+static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
+ Indirect chain[4], int *err)
+{
+ struct super_block *sb = inode->i_sb;
+ Indirect *p = chain;
+ struct buffer_head *bh;
+
+ *err = 0;
+ /* i_data is not going away, no lock needed */
+ add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
+ if (!p->key)
+ goto no_block;
+ while (--depth) {
+ bh = sb_bread(sb, le32_to_cpu(p->key));
+ if (!bh)
+ goto failure;
+ /* Reader: pointers */
+ if (!verify_chain(chain, p))
+ goto changed;
+ add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+ /* Reader: end */
+ if (!p->key)
+ goto no_block;
+ }
+ return NULL;
+
+changed:
+ brelse(bh);
+ *err = -EAGAIN;
+ goto no_block;
+failure:
+ *err = -EIO;
+no_block:
+ return p;
+}
+
+/**
+ * ext4_find_near - find a place for allocation with sufficient locality
+ * @inode: owner
+ * @ind: descriptor of indirect block.
+ *
+ * This function returns the preferred place for block allocation.
+ * It is used when the heuristic for sequential allocation fails.
+ * Rules are:
+ * + if there is a block to the left of our position - allocate near it.
+ * + if pointer will live in indirect block - allocate near that block.
+ * + if pointer will live in inode - allocate in the same
+ * cylinder group.
+ *
+ * In the latter case we colour the starting block by the caller's PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group. The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ * Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
+ __le32 *p;
+ ext4_fsblk_t bg_start;
+ ext4_grpblk_t colour;
+
+ /* Try to find previous block */
+ for (p = ind->p - 1; p >= start; p--) {
+ if (*p)
+ return le32_to_cpu(*p);
+ }
+
+ /* No such thing, so let's try location of indirect block */
+ if (ind->bh)
+ return ind->bh->b_blocknr;
+
+ /*
+ * It is going to be referred to from the inode itself? OK, just put it
+ * into the same cylinder group then.
+ */
+ bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+ colour = (current->pid % 16) *
+ (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+ return bg_start + colour;
+}
+
+/**
+ * ext4_find_goal - find a preferred place for allocation.
+ * @inode: owner
+ * @block: block we want
+ * @chain: chain of indirect blocks
+ * @partial: pointer to the last triple within a chain
+ *
+ * Returns the preferred place for block allocation.
+ */
+
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
+ Indirect chain[4], Indirect *partial)
+{
+ struct ext4_block_alloc_info *block_i;
+
+ block_i = EXT4_I(inode)->i_block_alloc_info;
+
+ /*
+ * try the heuristic for sequential allocation,
+ * failing that at least try to get decent locality.
+ */
+ if (block_i && (block == block_i->last_alloc_logical_block + 1)
+ && (block_i->last_alloc_physical_block != 0)) {
+ return block_i->last_alloc_physical_block + 1;
+ }
+
+ return ext4_find_near(inode, partial);
+}
+
+/**
+ * ext4_blks_to_allocate: Look up the block map and count the number
+ * of direct blocks that need to be allocated for the given branch.
+ *
+ * @branch: chain of indirect blocks
+ * @k: number of blocks need for indirect blocks
+ * @blks: number of data blocks to be mapped.
+ * @blocks_to_boundary: the offset in the indirect block
+ *
+ * return the total number of blocks to be allocated, including the
+ * direct and indirect blocks.
+ */
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+ int blocks_to_boundary)
+{
+ unsigned long count = 0;
+
+ /*
+ * Simple case: the [t,d]indirect block(s) have not been allocated yet,
+ * so clearly the blocks on that path have not been allocated either
+ */
+ if (k > 0) {
+ /* right now we don't handle cross boundary allocation */
+ if (blks < blocks_to_boundary + 1)
+ count += blks;
+ else
+ count += blocks_to_boundary + 1;
+ return count;
+ }
+
+ count++;
+ while (count < blks && count <= blocks_to_boundary &&
+ le32_to_cpu(*(branch[0].p + count)) == 0) {
+ count++;
+ }
+ return count;
+}
+
+/**
+ * ext4_alloc_blocks: multiple allocate blocks needed for a branch
+ * @indirect_blks: the number of blocks that need to be allocated for
+ * indirect blocks
+ *
+ * @new_blocks: on return it will store the new block numbers for
+ * the indirect blocks(if needed) and the first direct block,
+ * @blks: on return it will store the total number of allocated
+ * direct blocks
+ */
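+
+/*
+ * Illustrative example with assumed numbers: for indirect_blks = 2 and
+ * *blks = 8, target starts at 10.  If the allocator returns a run of 6
+ * blocks, the first 2 become the indirect blocks, new_blocks[2] records
+ * the first direct block, and the function returns 4 allocated direct
+ * blocks.
+ */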
+static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
+{
+ int target, i;
+ unsigned long count = 0;
+ int index = 0;
+ ext4_fsblk_t current_block = 0;
+ int ret = 0;
+
+ /*
+ * Here we try to allocate the requested multiple blocks at once,
+ * on a best-effort basis.
+ * To build a branch, we should allocate blocks for
+ * the indirect blocks (if not allocated yet), and at least
+ * the first direct block of this branch. That's the
+ * minimum number of blocks we need to allocate (required)
+ */
+ target = blks + indirect_blks;
+
+ while (1) {
+ count = target;
+ /* allocating blocks for indirect blocks and direct blocks */
+ current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ if (*err)
+ goto failed_out;
+
+ target -= count;
+ /* allocate blocks for indirect blocks */
+ while (index < indirect_blks && count) {
+ new_blocks[index++] = current_block++;
+ count--;
+ }
+
+ if (count > 0)
+ break;
+ }
+
+ /* save the new block number for the first direct block */
+ new_blocks[index] = current_block;
+
+ /* total number of blocks allocated for direct blocks */
+ ret = count;
+ *err = 0;
+ return ret;
+failed_out:
+ for (i = 0; i < index; i++)
+ ext4_free_blocks(handle, inode, new_blocks[i], 1);
+ return ret;
+}
+
+/**
+ * ext4_alloc_branch - allocate and set up a chain of blocks.
+ * @inode: owner
+ * @indirect_blks: number of allocated indirect blocks
+ * @blks: number of allocated direct blocks
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
+ *
+ * This function allocates blocks, zeroes out all but the last one,
+ * links them into chain and (if we are synchronous) writes them to disk.
+ * In other words, it prepares a branch that can be spliced onto the
+ * inode. It stores the information about that chain in the branch[], in
+ * the same format as ext4_get_branch() would do. We are calling it after
+ * we had read the existing part of chain and partial points to the last
+ * triple of that (one with zero ->key). Upon the exit we have the same
+ * picture as after the successful ext4_get_block(), except that in one
+ * place chain is disconnected - *branch->p is still zero (we did not
+ * set the last link), but branch->key contains the number that should
+ * be placed into *branch->p to fill that gap.
+ *
+ * If allocation fails we free all blocks we've allocated (and forget
+ * their buffer_heads) and return the error value from the failed
+ * ext4_alloc_blocks() (normally -ENOSPC). Otherwise we set the chain
+ * as described above and return 0.
+ */
+static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+ int indirect_blks, int *blks, ext4_fsblk_t goal,
+ int *offsets, Indirect *branch)
+{
+ int blocksize = inode->i_sb->s_blocksize;
+ int i, n = 0;
+ int err = 0;
+ struct buffer_head *bh;
+ int num;
+ ext4_fsblk_t new_blocks[4];
+ ext4_fsblk_t current_block;
+
+ num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ *blks, new_blocks, &err);
+ if (err)
+ return err;
+
+ branch[0].key = cpu_to_le32(new_blocks[0]);
+ /*
+ * metadata blocks and data blocks are allocated.
+ */
+ for (n = 1; n <= indirect_blks; n++) {
+ /*
+ * Get buffer_head for parent block, zero it out
+ * and set the pointer to new one, then send
+ * parent to disk.
+ */
+ bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ branch[n].bh = bh;
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ err = ext4_journal_get_create_access(handle, bh);
+ if (err) {
+ unlock_buffer(bh);
+ brelse(bh);
+ goto failed;
+ }
+
+ memset(bh->b_data, 0, blocksize);
+ branch[n].p = (__le32 *) bh->b_data + offsets[n];
+ branch[n].key = cpu_to_le32(new_blocks[n]);
+ *branch[n].p = branch[n].key;
+ if (n == indirect_blks) {
+ current_block = new_blocks[n];
+ /*
+ * End of chain, update the last new metablock of
+ * the chain to point to the new allocated
+ * data blocks numbers
+ */
+ for (i = 1; i < num; i++)
+ *(branch[n].p + i) = cpu_to_le32(++current_block);
+ }
+ BUFFER_TRACE(bh, "marking uptodate");
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, bh);
+ if (err)
+ goto failed;
+ }
+ *blks = num;
+ return err;
+failed:
+ /* Allocation failed, free what we already allocated */
+ for (i = 1; i <= n; i++) {
+ BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
+ ext4_journal_forget(handle, branch[i].bh);
+ }
+ for (i = 0; i < indirect_blks; i++)
+ ext4_free_blocks(handle, inode, new_blocks[i], 1);
+
+ ext4_free_blocks(handle, inode, new_blocks[i], num);
+
+ return err;
+}
+
+/**
+ * ext4_splice_branch - splice the allocated branch onto inode.
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @chain: chain of indirect blocks (with a missing link - see
+ * ext4_alloc_branch)
+ * @where: location of missing link
+ * @num: number of indirect blocks we are adding
+ * @blks: number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+ long block, Indirect *where, int num, int blks)
+{
+ int i;
+ int err = 0;
+ struct ext4_block_alloc_info *block_i;
+ ext4_fsblk_t current_block;
+
+ block_i = EXT4_I(inode)->i_block_alloc_info;
+ /*
+ * If we're splicing into a [td]indirect block (as opposed to the
+ * inode) then we need to get write access to the [td]indirect block
+ * before the splice.
+ */
+ if (where->bh) {
+ BUFFER_TRACE(where->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, where->bh);
+ if (err)
+ goto err_out;
+ }
+ /* That's it */
+
+ *where->p = where->key;
+
+ /*
+ * Update the host buffer_head or inode to point to the just-allocated
+ * direct blocks
+ */
+ if (num == 0 && blks > 1) {
+ current_block = le32_to_cpu(where->key) + 1;
+ for (i = 1; i < blks; i++)
+ *(where->p + i) = cpu_to_le32(current_block++);
+ }
+
+ /*
+ * Update the most recently allocated logical & physical block
+ * in i_block_alloc_info, to help find the proper goal block for the
+ * next allocation.
+ */
+ if (block_i) {
+ block_i->last_alloc_logical_block = block + blks - 1;
+ block_i->last_alloc_physical_block =
+ le32_to_cpu(where[num].key) + blks - 1;
+ }
+
+ /* We are done with atomic stuff, now do the rest of housekeeping */
+
+ inode->i_ctime = CURRENT_TIME_SEC;
+ ext4_mark_inode_dirty(handle, inode);
+
+ /* had we spliced it onto indirect block? */
+ if (where->bh) {
+ /*
+ * If we spliced it onto an indirect block, we haven't
+ * altered the inode. Note however that if it is being spliced
+ * onto an indirect block at the very end of the file (the
+ * file is growing) then we *will* alter the inode to reflect
+ * the new i_size. But that is not done here - it is done in
+ * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
+ */
+ jbd_debug(5, "splicing indirect only\n");
+ BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, where->bh);
+ if (err)
+ goto err_out;
+ } else {
+ /*
+ * OK, we spliced it into the inode itself on a direct block.
+ * Inode was dirtied above.
+ */
+ jbd_debug(5, "splicing direct\n");
+ }
+ return err;
+
+err_out:
+ for (i = 1; i <= num; i++) {
+ BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
+ ext4_journal_forget(handle, where[i].bh);
+ ext4_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
+ }
+ ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+
+ return err;
+}
+
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * The BKL may not be held on entry here. Be sure to take it early.
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ */
+int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+ sector_t iblock, unsigned long maxblocks,
+ struct buffer_head *bh_result,
+ int create, int extend_disksize)
+{
+ int err = -EIO;
+ int offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+ ext4_fsblk_t goal;
+ int indirect_blks;
+ int blocks_to_boundary = 0;
+ int depth;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int count = 0;
+ ext4_fsblk_t first_block = 0;
+
+
+ J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+ J_ASSERT(handle != NULL || create == 0);
+ depth = ext4_block_to_path(inode, iblock, offsets, &blocks_to_boundary);
+
+ if (depth == 0)
+ goto out;
+
+ partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+
+ /* Simplest case - block found, no allocation needed */
+ if (!partial) {
+ first_block = le32_to_cpu(chain[depth - 1].key);
+ clear_buffer_new(bh_result);
+ count++;
+ /*map more blocks*/
+ while (count < maxblocks && count <= blocks_to_boundary) {
+ ext4_fsblk_t blk;
+
+ if (!verify_chain(chain, partial)) {
+ /*
+ * Indirect block might be removed by
+ * truncate while we were reading it.
+ * Handling of that case: forget what we've
+ * got now. Flag the err as EAGAIN, so it
+ * will reread.
+ */
+ err = -EAGAIN;
+ count = 0;
+ break;
+ }
+ blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+ if (blk == first_block + count)
+ count++;
+ else
+ break;
+ }
+ if (err != -EAGAIN)
+ goto got_it;
+ }
+
+ /* Next simple case - plain lookup or failed read of indirect block */
+ if (!create || err == -EIO)
+ goto cleanup;
+
+ mutex_lock(&ei->truncate_mutex);
+
+ /*
+ * If the indirect block is missing while we are reading
+ * the chain (ext4_get_branch() returns -EAGAIN), or
+ * if the chain has been changed after we grabbed the semaphore
+ * (either because another process truncated this branch, or
+ * another get_block allocated this branch), re-grab the chain to see
+ * if the requested block has been allocated or not.
+ *
+ * Since we already block the truncate/other get_block
+ * at this point, we will have the current copy of the chain when we
+ * splice the branch into the tree.
+ */
+ if (err == -EAGAIN || !verify_chain(chain, partial)) {
+ while (partial > chain) {
+ brelse(partial->bh);
+ partial--;
+ }
+ partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+ if (!partial) {
+ count++;
+ mutex_unlock(&ei->truncate_mutex);
+ if (err)
+ goto cleanup;
+ clear_buffer_new(bh_result);
+ goto got_it;
+ }
+ }
+
+ /*
+ * Okay, we need to do block allocation. Lazily initialize the block
+ * allocation info here if necessary
+ */
+ if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
+ ext4_init_block_alloc_info(inode);
+
+ goal = ext4_find_goal(inode, iblock, chain, partial);
+
+ /* the number of blocks we need to allocate for [d,t]indirect blocks */
+ indirect_blks = (chain + depth) - partial - 1;
+
+ /*
+ * Next look up the indirect map to count the total number of
+ * direct blocks to allocate for this branch.
+ */
+ count = ext4_blks_to_allocate(partial, indirect_blks,
+ maxblocks, blocks_to_boundary);
+ /*
+ * Block out ext4_truncate while we alter the tree
+ */
+ err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
+ offsets + (partial - chain), partial);
+
+ /*
+ * The ext4_splice_branch call will free and forget any buffers
+ * on the new chain if there is a failure, but that risks using
+ * up transaction credits, especially for bitmaps where the
+ * credits cannot be returned. Can we handle this somehow? We
+ * may need to return -EAGAIN upwards in the worst case. --sct
+ */
+ if (!err)
+ err = ext4_splice_branch(handle, inode, iblock,
+ partial, indirect_blks, count);
+ /*
+ * i_disksize growing is protected by truncate_mutex. Don't forget to
+ * protect it if you're about to implement concurrent
+ * ext4_get_block() -bzzz
+ */
+ if (!err && extend_disksize && inode->i_size > ei->i_disksize)
+ ei->i_disksize = inode->i_size;
+ mutex_unlock(&ei->truncate_mutex);
+ if (err)
+ goto cleanup;
+
+ set_buffer_new(bh_result);
+got_it:
+ map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+ if (count > blocks_to_boundary)
+ set_buffer_boundary(bh_result);
+ err = count;
+ /* Clean up and exit */
+ partial = chain + depth - 1; /* the whole chain */
+cleanup:
+ while (partial > chain) {
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+ BUFFER_TRACE(bh_result, "returned");
+out:
+ return err;
+}
+
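+/*
+ * Credits reserved for one direct-I/O get_block pass: the usual
+ * transaction reserve plus some slack for bitmap and group descriptor
+ * updates (the extra 32 is a heuristic, not an exact bound). If a
+ * handle still runs low, ext4_get_block() below extends or restarts it.
+ */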
+#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
+
+static int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = journal_current_handle();
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+ if (!create)
+ goto get_block; /* A read */
+
+ if (max_blocks == 1)
+ goto get_block; /* A single block get */
+
+ if (handle->h_transaction->t_state == T_LOCKED) {
+ /*
+ * Huge direct-io writes can hold off commits for long
+ * periods of time. Let this commit run.
+ */
+ ext4_journal_stop(handle);
+ handle = ext4_journal_start(inode, DIO_CREDITS);
+ if (IS_ERR(handle))
+ ret = PTR_ERR(handle);
+ goto get_block;
+ }
+
+ if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
+ /*
+ * Getting low on buffer credits...
+ */
+ ret = ext4_journal_extend(handle, DIO_CREDITS);
+ if (ret > 0) {
+ /*
+ * Couldn't extend the transaction. Start a new one.
+ */
+ ret = ext4_journal_restart(handle, DIO_CREDITS);
+ }
+ }
+
+get_block:
+ if (ret == 0) {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock,
+ max_blocks, bh_result, create, 0);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ }
+ return ret;
+}
+
+/*
+ * `handle' can be NULL if create is zero
+ */
+struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
+ long block, int create, int *errp)
+{
+ struct buffer_head dummy;
+ int fatal = 0, err;
+
+ J_ASSERT(handle != NULL || create == 0);
+
+ dummy.b_state = 0;
+ dummy.b_blocknr = -1000;
+ buffer_trace_init(&dummy.b_history);
+ err = ext4_get_blocks_wrap(handle, inode, block, 1,
+ &dummy, create, 1);
+ /*
+ * ext4_get_blocks_handle() returns number of blocks
+ * mapped. 0 in case of a HOLE.
+ */
+ if (err > 0) {
+ if (err > 1)
+ WARN_ON(1);
+ err = 0;
+ }
+ *errp = err;
+ if (!err && buffer_mapped(&dummy)) {
+ struct buffer_head *bh;
+ bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+ if (!bh) {
+ *errp = -EIO;
+ goto err;
+ }
+ if (buffer_new(&dummy)) {
+ J_ASSERT(create != 0);
+ J_ASSERT(handle != 0);
+
+ /*
+ * Now that we do not always journal data, we should
+ * keep in mind whether this should always journal the
+ * new buffer as metadata. For now, regular file
+ * writes use ext4_get_block instead, so it's not a
+ * problem.
+ */
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ fatal = ext4_journal_get_create_access(handle, bh);
+ if (!fatal && !buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, bh);
+ if (!fatal)
+ fatal = err;
+ } else {
+ BUFFER_TRACE(bh, "not a new buffer");
+ }
+ if (fatal) {
+ *errp = fatal;
+ brelse(bh);
+ bh = NULL;
+ }
+ return bh;
+ }
+err:
+ return NULL;
+}
+
+struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
+ int block, int create, int *err)
+{
+ struct buffer_head * bh;
+
+ bh = ext4_getblk(handle, inode, block, create, err);
+ if (!bh)
+ return bh;
+ if (buffer_uptodate(bh))
+ return bh;
+ ll_rw_block(READ_META, 1, &bh);
+ wait_on_buffer(bh);
+ if (buffer_uptodate(bh))
+ return bh;
+ put_bh(bh);
+ *err = -EIO;
+ return NULL;
+}
+
+static int walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
+{
+ struct buffer_head *bh;
+ unsigned block_start, block_end;
+ unsigned blocksize = head->b_size;
+ int err, ret = 0;
+ struct buffer_head *next;
+
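+ /*
+ * Walk every buffer_head attached to the page: for buffers that
+ * overlap [from, to) apply *fn; for buffers outside that range only
+ * record (via *partial) whether one of them is not uptodate, so the
+ * caller knows the page as a whole cannot be marked uptodate.
+ */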
+ for (bh = head, block_start = 0;
+ ret == 0 && (bh != head || !block_start);
+ block_start = block_end, bh = next)
+ {
+ next = bh->b_this_page;
+ block_end = block_start + blocksize;
+ if (block_end <= from || block_start >= to) {
+ if (partial && !buffer_uptodate(bh))
+ *partial = 1;
+ continue;
+ }
+ err = (*fn)(handle, bh);
+ if (!ret)
+ ret = err;
+ }
+ return ret;
+}
+
+/*
+ * To preserve ordering, it is essential that the hole instantiation and
+ * the data write be encapsulated in a single transaction. We cannot
+ * close off a transaction and start a new one between the ext4_get_block()
+ * and the commit_write(). So doing the jbd2_journal_start at the start of
+ * prepare_write() is the right place.
+ *
+ * Also, this function can nest inside ext4_writepage() ->
+ * block_write_full_page(). In that case, we *know* that ext4_writepage()
+ * has generated enough buffer credits to do the whole page. So we won't
+ * block on the journal in that case, which is good, because the caller may
+ * be PF_MEMALLOC.
+ *
+ * By accident, ext4 can be reentered when a transaction is open via
+ * quota file writes. If we were to commit the transaction while thus
+ * reentered, there can be a deadlock - we would be holding a quota
+ * lock, and the commit would never complete if another thread had a
+ * transaction open and was blocking on the quota lock - a ranking
+ * violation.
+ *
+ * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
+ * will _not_ run commit under these circumstances because handle->h_ref
+ * is elevated. We'll still have enough credits for the tiny quotafile
+ * write.
+ */
+static int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
+{
+ if (!buffer_mapped(bh) || buffer_freed(bh))
+ return 0;
+ return ext4_journal_get_write_access(handle, bh);
+}
+
+static int ext4_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+ handle_t *handle;
+ int retries = 0;
+
+retry:
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_prepare_write(page, from, to, ext4_get_block);
+ else
+ ret = block_prepare_write(page, from, to, ext4_get_block);
+ if (ret)
+ goto prepare_write_failed;
+
+ if (ext4_should_journal_data(inode)) {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL, do_journal_get_write_access);
+ }
+prepare_write_failed:
+ if (ret)
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ return ret;
+}
+
+int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+{
+ int err = jbd2_journal_dirty_data(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
+ bh, handle, err);
+ return err;
+}
+
+/* For commit_write() in data=journal mode */
+static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+{
+ if (!buffer_mapped(bh) || buffer_freed(bh))
+ return 0;
+ set_buffer_uptodate(bh);
+ return ext4_journal_dirty_metadata(handle, bh);
+}
+
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * buffers are managed internally.
+ */
+static int ext4_ordered_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = page->mapping->host;
+ int ret = 0, ret2;
+
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL, ext4_journal_dirty_data);
+
+ if (ret == 0) {
+ /*
+ * generic_commit_write() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+ loff_t new_i_size;
+
+ new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ if (new_i_size > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = new_i_size;
+ ret = generic_commit_write(file, page, from, to);
+ }
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ return ret;
+}
+
+static int ext4_writeback_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = page->mapping->host;
+ int ret = 0, ret2;
+ loff_t new_i_size;
+
+ new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ if (new_i_size > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = new_i_size;
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_commit_write(file, page, from, to);
+ else
+ ret = generic_commit_write(file, page, from, to);
+
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ return ret;
+}
+
+static int ext4_journalled_commit_write(struct file *file,
+ struct page *page, unsigned from, unsigned to)
+{
+ handle_t *handle = ext4_journal_current_handle();
+ struct inode *inode = page->mapping->host;
+ int ret = 0, ret2;
+ int partial = 0;
+ loff_t pos;
+
+ /*
+ * Here we duplicate the generic_commit_write() functionality
+ */
+ pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+ ret = walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, commit_write_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ if (inode->i_size > EXT4_I(inode)->i_disksize) {
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ if (!ret)
+ ret = ret2;
+ }
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+ return ret;
+}
+
+/*
+ * bmap() is special. It gets used by applications such as lilo and by
+ * the swapper to find the on-disk block of a specific piece of data.
+ *
+ * Naturally, this is dangerous if the block concerned is still in the
+ * journal. If somebody makes a swapfile on an ext4 data-journaling
+ * filesystem and enables swap, then they may get a nasty shock when the
+ * data getting swapped to that swapfile suddenly gets overwritten by
+ * the original zeros written out previously to the journal and
+ * awaiting writeback in the kernel's buffer cache.
+ *
+ * So, if we see any bmap calls here on a modified, data-journaled file,
+ * take extra steps to flush any blocks which might be in the cache.
+ */
+static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+{
+ struct inode *inode = mapping->host;
+ journal_t *journal;
+ int err;
+
+ if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ /*
+ * This is a REALLY heavyweight approach, but the use of
+ * bmap on dirty files is expected to be extremely rare:
+ * only if we run lilo or swapon on a freshly made file
+ * do we expect this to happen.
+ *
+ * (bmap requires CAP_SYS_RAWIO so this does not
+ * represent an unprivileged user DOS attack --- we'd be
+ * in trouble if mortal users could trigger this path at
+ * will.)
+ *
+ * NB. EXT4_STATE_JDATA is not set on files other than
+ * regular files. If somebody wants to bmap a directory
+ * or symlink and gets confused because the buffer
+ * hasn't yet been flushed to disk, they deserve
+ * everything they get.
+ */
+
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+ journal = EXT4_JOURNAL(inode);
+ jbd2_journal_lock_updates(journal);
+ err = jbd2_journal_flush(journal);
+ jbd2_journal_unlock_updates(journal);
+
+ if (err)
+ return 0;
+ }
+
+ return generic_block_bmap(mapping, block, ext4_get_block);
+}
+
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+ get_bh(bh);
+ return 0;
+}
+
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+ put_bh(bh);
+ return 0;
+}
+
+static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+ if (buffer_mapped(bh))
+ return ext4_journal_dirty_data(handle, bh);
+ return 0;
+}
+
+/*
+ * Note that we always start a transaction even if we're not journalling
+ * data. This is to preserve ordering: any hole instantiation within
+ * __block_write_full_page -> ext4_get_block() should be journalled
+ * along with the data so we don't crash and then get metadata which
+ * refers to old data.
+ *
+ * In all journalling modes block_write_full_page() will start the I/O.
+ *
+ * Problem:
+ *
+ * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ * ext4_writepage()
+ *
+ * Similar for:
+ *
+ * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+ *
+ * Same applies to ext4_get_block(). We will deadlock on various things like
+ * lock_journal and i_truncate_mutex.
+ *
+ * Setting PF_MEMALLOC here doesn't work - too many internal memory
+ * allocations fail.
+ *
+ * 16May01: If we're reentered then journal_current_handle() will be
+ * non-zero. We simply *return*.
+ *
+ * 1 July 2001: @@@ FIXME:
+ * In journalled data mode, a data buffer may be metadata against the
+ * current transaction. But the same file is part of a shared mapping
+ * and someone does a writepage() on it.
+ *
+ * We will move the buffer onto the async_data list, but *after* it has
+ * been dirtied. So there's a small window where we have dirty data on
+ * BJ_Metadata.
+ *
+ * Note that this only applies to the last partial page in the file. The
+ * bit which block_write_full_page() uses prepare/commit for. (That's
+ * broken code anyway: it's wrong for msync()).
+ *
+ * It's a rare case: affects the final partial page, for journalled data
+ * where the file is subject to both write() and writepage() in the same
+ * transaction. To fix it we'll need a custom block_write_full_page().
+ * We'll probably need that anyway for journalling writepage() output.
+ *
+ * We don't honour synchronous mounts for writepage(). That would be
+ * disastrous. Any write() or metadata operation will sync the fs for
+ * us.
+ *
+ * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
+ * we don't need to open a transaction here.
+ */
+static int ext4_ordered_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct buffer_head *page_bufs;
+ handle_t *handle = NULL;
+ int ret = 0;
+ int err;
+
+ J_ASSERT(PageLocked(page));
+
+ /*
+ * We give up here if we're reentered, because it might be for a
+ * different filesystem.
+ */
+ if (ext4_journal_current_handle())
+ goto out_fail;
+
+ handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_fail;
+ }
+
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ }
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bget_one);
+
+ ret = block_write_full_page(page, ext4_get_block, wbc);
+
+ /*
+ * The page can become unlocked at any point now, and
+ * truncate can then come in and change things. So we
+ * can't touch *page from now on. But *page_bufs is
+ * safe due to elevated refcount.
+ */
+
+ /*
+ * And attach them to the current transaction. But only if
+ * block_write_full_page() succeeded. Otherwise they are unmapped,
+ * and generally junk.
+ */
+ if (ret == 0) {
+ err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ NULL, jbd2_journal_dirty_data_fn);
+ if (!ret)
+ ret = err;
+ }
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+
+out_fail:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return ret;
+}
+
+static int ext4_writeback_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ handle_t *handle = NULL;
+ int ret = 0;
+ int err;
+
+ if (ext4_journal_current_handle())
+ goto out_fail;
+
+ handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_fail;
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_get_block, wbc);
+ else
+ ret = block_write_full_page(page, ext4_get_block, wbc);
+
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+
+out_fail:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return ret;
+}
+
+static int ext4_journalled_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ handle_t *handle = NULL;
+ int ret = 0;
+ int err;
+
+ if (ext4_journal_current_handle())
+ goto no_write;
+
+ handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto no_write;
+ }
+
+ if (!page_has_buffers(page) || PageChecked(page)) {
+ /*
+ * It's mmapped pagecache. Add buffers and journal it. There
+ * doesn't seem much point in redirtying the page here.
+ */
+ ClearPageChecked(page);
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_get_block);
+ if (ret != 0) {
+ ext4_journal_stop(handle);
+ goto out_unlock;
+ }
+ ret = walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+
+ err = walk_page_buffers(handle, page_buffers(page), 0,
+ PAGE_CACHE_SIZE, NULL, commit_write_fn);
+ if (ret == 0)
+ ret = err;
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ unlock_page(page);
+ } else {
+ /*
+ * It may be a page full of checkpoint-mode buffers. We don't
+ * really know unless we go poke around in the buffer_heads.
+ * But block_write_full_page will do the right thing.
+ */
+ ret = block_write_full_page(page, ext4_get_block, wbc);
+ }
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+out:
+ return ret;
+
+no_write:
+ redirty_page_for_writepage(wbc, page);
+out_unlock:
+ unlock_page(page);
+ goto out;
+}
+
+static int ext4_readpage(struct file *file, struct page *page)
+{
+ return mpage_readpage(page, ext4_get_block);
+}
+
+static int
+ext4_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset)
+{
+ journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+ /*
+ * If it's a full truncate we just forget about the pending dirtying
+ */
+ if (offset == 0)
+ ClearPageChecked(page);
+
+ jbd2_journal_invalidatepage(journal, page, offset);
+}
+
+static int ext4_releasepage(struct page *page, gfp_t wait)
+{
+ journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+ WARN_ON(PageChecked(page));
+ if (!page_has_buffers(page))
+ return 0;
+ return jbd2_journal_try_to_free_buffers(journal, page, wait);
+}
+
+/*
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * If the O_DIRECT write is instantiating holes inside i_size and the machine
+ * crashes then stale disk data _may_ be exposed inside the file.
+ */
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ handle_t *handle = NULL;
+ ssize_t ret;
+ int orphan = 0;
+ size_t count = iov_length(iov, nr_segs);
+
+ if (rw == WRITE) {
+ loff_t final_size = offset + count;
+
+ handle = ext4_journal_start(inode, DIO_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ if (final_size > inode->i_size) {
+ ret = ext4_orphan_add(handle, inode);
+ if (ret)
+ goto out_stop;
+ orphan = 1;
+ ei->i_disksize = inode->i_size;
+ }
+ }
+
+ ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block, NULL);
+
+ /*
+ * Reacquire the handle: ext4_get_block() can restart the transaction
+ */
+ handle = journal_current_handle();
+
+out_stop:
+ if (handle) {
+ int err;
+
+ if (orphan && inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+ if (orphan && ret > 0) {
+ loff_t end = offset + ret;
+ if (end > inode->i_size) {
+ ei->i_disksize = end;
+ i_size_write(inode, end);
+ /*
+ * We're going to return a positive `ret'
+ * here due to non-zero-length I/O, so there's
+ * no way of reporting error returns from
+ * ext4_mark_inode_dirty() to userspace. So
+ * ignore it.
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ err = ext4_journal_stop(handle);
+ if (ret == 0)
+ ret = err;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Pages can be marked dirty completely asynchronously from ext4's journalling
+ * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
+ * much here because ->set_page_dirty is called under VFS locks. The page is
+ * not necessarily locked.
+ *
+ * We cannot just dirty the page and leave attached buffers clean, because the
+ * buffers' dirty state is "definitive". We cannot just set the buffers dirty
+ * or jbddirty because all the journalling code will explode.
+ *
+ * So what we do is to mark the page "pending dirty" and next time writepage
+ * is called, propagate that into the buffers appropriately.
+ */
+static int ext4_journalled_set_page_dirty(struct page *page)
+{
+ SetPageChecked(page);
+ return __set_page_dirty_nobuffers(page);
+}
+
+static const struct address_space_operations ext4_ordered_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_ordered_writepage,
+ .sync_page = block_sync_page,
+ .prepare_write = ext4_prepare_write,
+ .commit_write = ext4_ordered_commit_write,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
+static const struct address_space_operations ext4_writeback_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_writeback_writepage,
+ .sync_page = block_sync_page,
+ .prepare_write = ext4_prepare_write,
+ .commit_write = ext4_writeback_commit_write,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
+static const struct address_space_operations ext4_journalled_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_journalled_writepage,
+ .sync_page = block_sync_page,
+ .prepare_write = ext4_prepare_write,
+ .commit_write = ext4_journalled_commit_write,
+ .set_page_dirty = ext4_journalled_set_page_dirty,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+};
+
+void ext4_set_aops(struct inode *inode)
+{
+ if (ext4_should_order_data(inode))
+ inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode))
+ inode->i_mapping->a_ops = &ext4_writeback_aops;
+ else
+ inode->i_mapping->a_ops = &ext4_journalled_aops;
+}
+
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+int ext4_block_truncate_page(handle_t *handle, struct page *page,
+ struct address_space *mapping, loff_t from)
+{
+ ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize, iblock, length, pos;
+ struct inode *inode = mapping->host;
+ struct buffer_head *bh;
+ int err = 0;
+ void *kaddr;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+ /*
+ * For "nobh" option, we can only work if we don't need to
+ * read-in the page - otherwise we create buffers to do the IO.
+ */
+ if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
+ ext4_should_writeback_data(inode) && PageUptodate(page)) {
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, length);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ set_page_dirty(page);
+ goto unlock;
+ }
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ /* Find the buffer that contains "offset" */
+ bh = page_buffers(page);
+ pos = blocksize;
+ while (offset >= pos) {
+ bh = bh->b_this_page;
+ iblock++;
+ pos += blocksize;
+ }
+
+ err = 0;
+ if (buffer_freed(bh)) {
+ BUFFER_TRACE(bh, "freed: skip");
+ goto unlock;
+ }
+
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "unmapped");
+ ext4_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
+ if (!buffer_mapped(bh)) {
+ BUFFER_TRACE(bh, "still unmapped");
+ goto unlock;
+ }
+ }
+
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt. */
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+
+ if (ext4_should_journal_data(inode)) {
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto unlock;
+ }
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, length);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ BUFFER_TRACE(bh, "zeroed end of block");
+
+ err = 0;
+ if (ext4_should_journal_data(inode)) {
+ err = ext4_journal_dirty_metadata(handle, bh);
+ } else {
+ if (ext4_should_order_data(inode))
+ err = ext4_journal_dirty_data(handle, bh);
+ mark_buffer_dirty(bh);
+ }
+
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
+/*
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+ while (p < q)
+ if (*p++)
+ return 0;
+ return 1;
+}
+
+/**
+ * ext4_find_shared - find the indirect blocks for partial truncation.
+ * @inode: inode in question
+ * @depth: depth of the affected branch
+ * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
+ * @chain: place to store the pointers to partial indirect blocks
+ * @top: place to the (detached) top of branch
+ *
+ * This is a helper function used by ext4_truncate().
+ *
+ * When we do truncate() we may have to clean the ends of several
+ * indirect blocks but leave the blocks themselves alive. Block is
+ * partially truncated if some data below the new i_size is referred to
+ * from it (and it is on the path to the first completely truncated
+ * data block, indeed). We have to free the top of that path along
+ * with everything to the right of the path. Since no allocation
+ * past the truncation point is possible until ext4_truncate()
+ * finishes, we may safely do the latter, but top of branch may
+ * require special attention - pageout below the truncation point
+ * might try to populate it.
+ *
+ * We atomically detach the top of branch from the tree, store the
+ * block number of its root in *@top, pointers to buffer_heads of
+ * partially truncated blocks - in @chain[].bh and pointers to
+ * their last elements that should not be removed - in
+ * @chain[].p. Return value is the pointer to last filled element
+ * of @chain.
+ *
+ * The work left to the caller is the actual freeing of subtrees:
+ * a) free the subtree starting from *@top
+ * b) free the subtrees whose roots are stored in
+ * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ * c) free the subtrees growing from the inode past the @chain[0].
+ * (no partially truncated stuff there). */
+
+static Indirect *ext4_find_shared(struct inode *inode, int depth,
+ int offsets[4], Indirect chain[4], __le32 *top)
+{
+ Indirect *partial, *p;
+ int k, err;
+
+ *top = 0;
+ /* Make k index the deepest non-null offset + 1 */
+ for (k = depth; k > 1 && !offsets[k-1]; k--)
+ ;
+ partial = ext4_get_branch(inode, k, offsets, chain, &err);
+ /* Writer: pointers */
+ if (!partial)
+ partial = chain + k-1;
+ /*
+ * If the branch acquired continuation since we've looked at it -
+ * fine, it should all survive and (new) top doesn't belong to us.
+ */
+ if (!partial->key && *partial->p)
+ /* Writer: end */
+ goto no_top;
+ for (p = partial; p > chain && all_zeroes((__le32 *)p->bh->b_data, p->p); p--)
+ ;
+ /*
+ * OK, we've found the last block that must survive. The rest of our
+ * branch should be detached before unlocking. However, if that rest
+ * of branch is all ours and does not grow immediately from the inode
+ * it's easier to cheat and just decrement partial->p.
+ */
+ if (p == chain + k - 1 && p > chain) {
+ p->p--;
+ } else {
+ *top = *p->p;
+ /* Nope, don't do this in ext4. Must leave the tree intact */
+#if 0
+ *p->p = 0;
+#endif
+ }
+ /* Writer: end */
+
+ while (partial > p) {
+ brelse(partial->bh);
+ partial--;
+ }
+no_top:
+ return partial;
+}
+
+/*
+ * Zero a number of block pointers in either an inode or an indirect block.
+ * If we restart the transaction we must again get write access to the
+ * indirect block for further modification.
+ *
+ * We release `count' blocks on disk, but (last - first) may be greater
+ * than `count' because there can be holes in there.
+ */
+static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block_to_free,
+ unsigned long count, __le32 *first, __le32 *last)
+{
+ __le32 *p;
+ if (try_to_extend_transaction(handle, inode)) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+ ext4_journal_dirty_metadata(handle, bh);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_test_restart(handle, inode);
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ ext4_journal_get_write_access(handle, bh);
+ }
+ }
+
+ /*
+ * Any buffers which are on the journal will be in memory. We find
+ * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
+ * on them. We've already detached each block from the file, so
+ * bforget() in jbd2_journal_forget() should be safe.
+ *
+ * AKPM: turn on bforget in jbd2_journal_forget()!!!
+ */
+ for (p = first; p < last; p++) {
+ u32 nr = le32_to_cpu(*p);
+ if (nr) {
+ struct buffer_head *bh;
+
+ *p = 0;
+ bh = sb_find_get_block(inode->i_sb, nr);
+ ext4_forget(handle, 0, inode, bh, nr);
+ }
+ }
+
+ ext4_free_blocks(handle, inode, block_to_free, count);
+}
+
+/**
+ * ext4_free_data - free a list of data blocks
+ * @handle: handle for this transaction
+ * @inode: inode we are dealing with
+ * @this_bh: indirect buffer_head which contains *@first and *@last
+ * @first: array of block numbers
+ * @last: points immediately past the end of array
+ *
+ * We are freeing all blocks referred to from that array (numbers are stored as
+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+ *
+ * We accumulate contiguous runs of blocks to free. Conveniently, if these
+ * blocks are contiguous then releasing them at one time will only affect one
+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+ * actually use a lot of journal space.
+ *
+ * @this_bh will be %NULL if @first and @last point into the inode's direct
+ * block pointers.
+ */
+static void ext4_free_data(handle_t *handle, struct inode *inode,
+ struct buffer_head *this_bh,
+ __le32 *first, __le32 *last)
+{
+ ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
+ unsigned long count = 0; /* Number of blocks in the run */
+ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
+ corresponding to
+ block_to_free */
+ ext4_fsblk_t nr; /* Current block # */
+ __le32 *p; /* Pointer into inode/ind
+ for current block */
+ int err;
+
+ if (this_bh) { /* For indirect block */
+ BUFFER_TRACE(this_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, this_bh);
+ /* Important: if we can't update the indirect pointers
+ * to the blocks, we can't free them. */
+ if (err)
+ return;
+ }
+
+ for (p = first; p < last; p++) {
+ nr = le32_to_cpu(*p);
+ if (nr) {
+ /* accumulate blocks to free if they're contiguous */
+ if (count == 0) {
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ } else if (nr == block_to_free + count) {
+ count++;
+ } else {
+ ext4_clear_blocks(handle, inode, this_bh,
+ block_to_free,
+ count, block_to_free_p, p);
+ block_to_free = nr;
+ block_to_free_p = p;
+ count = 1;
+ }
+ }
+ }
+
+ if (count > 0)
+ ext4_clear_blocks(handle, inode, this_bh, block_to_free,
+ count, block_to_free_p, p);
+
+ if (this_bh) {
+ BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
+ ext4_journal_dirty_metadata(handle, this_bh);
+ }
+}
+
+/**
+ * ext4_free_branches - free an array of branches
+ * @handle: JBD handle for this transaction
+ * @inode: inode we are dealing with
+ * @parent_bh: the buffer_head which contains *@first and *@last
+ * @first: array of block numbers
+ * @last: pointer immediately past the end of array
+ * @depth: depth of the branches to free
+ *
+ * We are freeing all blocks referred to from these branches (numbers are
+ * stored as little-endian 32-bit) and updating @inode->i_blocks
+ * appropriately.
+ */
+static void ext4_free_branches(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh,
+ __le32 *first, __le32 *last, int depth)
+{
+ ext4_fsblk_t nr;
+ __le32 *p;
+
+ if (is_handle_aborted(handle))
+ return;
+
+ if (depth--) {
+ struct buffer_head *bh;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ p = last;
+ while (--p >= first) {
+ nr = le32_to_cpu(*p);
+ if (!nr)
+ continue; /* A hole */
+
+ /* Go read the buffer for the next level down */
+ bh = sb_bread(inode->i_sb, nr);
+
+ /*
+ * A read failure? Report error and clear slot
+ * (should be rare).
+ */
+ if (!bh) {
+ ext4_error(inode->i_sb, "ext4_free_branches",
+ "Read failure, inode=%lu, block=%llu",
+ inode->i_ino, nr);
+ continue;
+ }
+
+ /* This zaps the entire block. Bottom up. */
+ BUFFER_TRACE(bh, "free child branches");
+ ext4_free_branches(handle, inode, bh,
+ (__le32*)bh->b_data,
+ (__le32*)bh->b_data + addr_per_block,
+ depth);
+
+ /*
+ * We've probably journalled the indirect block several
+ * times during the truncate. But it's no longer
+ * needed and we now drop it from the transaction via
+ * jbd2_journal_revoke().
+ *
+ * That's easy if it's exclusively part of this
+ * transaction. But if it's part of the committing
+ * transaction then jbd2_journal_forget() will simply
+ * brelse() it. That means that if the underlying
+ * block is reallocated in ext4_get_block(),
+ * unmap_underlying_metadata() will find this block
+ * and will try to get rid of it. damn, damn.
+ *
+ * If this block has already been committed to the
+ * journal, a revoke record will be written. And
+ * revoke records must be emitted *before* clearing
+ * this block's bit in the bitmaps.
+ */
+ ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+
+ /*
+ * Everything below this pointer has been
+ * released. Now let this top-of-subtree go.
+ *
+ * We want the freeing of this indirect block to be
+ * atomic in the journal with the updating of the
+ * bitmap block which owns it. So make some room in
+ * the journal.
+ *
+ * We zero the parent pointer *after* freeing its
+ * pointee in the bitmaps, so if extend_transaction()
+ * for some reason fails to put the bitmap changes and
+ * the release into the same transaction, recovery
+ * will merely complain about releasing a free block,
+ * rather than leaking blocks.
+ */
+ if (is_handle_aborted(handle))
+ return;
+ if (try_to_extend_transaction(handle, inode)) {
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_test_restart(handle, inode);
+ }
+
+ ext4_free_blocks(handle, inode, nr, 1);
+
+ if (parent_bh) {
+ /*
+ * The block which we have just freed is
+ * pointed to by an indirect block: journal it
+ */
+ BUFFER_TRACE(parent_bh, "get_write_access");
+ if (!ext4_journal_get_write_access(handle,
+ parent_bh)){
+ *p = 0;
+ BUFFER_TRACE(parent_bh,
+ "call ext4_journal_dirty_metadata");
+ ext4_journal_dirty_metadata(handle,
+ parent_bh);
+ }
+ }
+ }
+ } else {
+ /* We have reached the bottom of the tree. */
+ BUFFER_TRACE(parent_bh, "free data blocks");
+ ext4_free_data(handle, inode, parent_bh, first, last);
+ }
+}
+
+/*
+ * ext4_truncate()
+ *
+ * We block out ext4_get_block() block instantiations across the entire
+ * transaction, and VFS/VM ensures that ext4_truncate() cannot run
+ * simultaneously on behalf of the same inode.
+ *
+ * As we work through the truncate and commit bits of it to the journal there
+ * is one core, guiding principle: the file's tree must always be consistent on
+ * disk. We must be able to restart the truncate after a crash.
+ *
+ * The file's tree may be transiently inconsistent in memory (although it
+ * probably isn't), but whenever we close off and commit a journal transaction,
+ * the contents of (the filesystem + the journal) must be consistent and
+ * restartable. It's pretty simple, really: bottom up, right to left (although
+ * left-to-right works OK too).
+ *
+ * Note that at recovery time, journal replay occurs *before* the restart of
+ * truncate against the orphan inode list.
+ *
+ * The committed inode has the new, desired i_size (which is the same as
+ * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see
+ * that this inode's truncate did not complete and it will again call
+ * ext4_truncate() to have another go. So there will be instantiated blocks
+ * to the right of the truncation point in a crashed ext4 filesystem. But
+ * that's fine - as long as they are linked from the inode, the post-crash
+ * ext4_truncate() run will find them and release them.
+ */
+void ext4_truncate(struct inode *inode)
+{
+ handle_t *handle;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ __le32 *i_data = ei->i_data;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ int offsets[4];
+ Indirect chain[4];
+ Indirect *partial;
+ __le32 nr = 0;
+ int n;
+ long last_block;
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ struct page *page;
+
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
+ if (ext4_inode_is_fast_symlink(inode))
+ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+
+ /*
+ * We have to lock the EOF page here, because lock_page() nests
+ * outside jbd2_journal_start().
+ */
+ if ((inode->i_size & (blocksize - 1)) == 0) {
+ /* Block boundary? Nothing to do */
+ page = NULL;
+ } else {
+ page = grab_cache_page(mapping,
+ inode->i_size >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return;
+ }
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_truncate(inode, page);
+
+ handle = start_transaction(inode);
+ if (IS_ERR(handle)) {
+ if (page) {
+ clear_highpage(page);
+ flush_dcache_page(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return; /* AKPM: return what? */
+ }
+
+ last_block = (inode->i_size + blocksize-1)
+ >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+
+ if (page)
+ ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+
+ n = ext4_block_to_path(inode, last_block, offsets, NULL);
+ if (n == 0)
+ goto out_stop; /* error */
+
+ /*
+ * OK. This truncate is going to happen. We add the inode to the
+ * orphan list, so that if this truncate spans multiple transactions,
+ * and we crash, we will resume the truncate when the filesystem
+ * recovers. It also marks the inode dirty, to catch the new size.
+ *
+ * Implication: the file must always be in a sane, consistent
+ * truncatable state while each transaction commits.
+ */
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
+
+ /*
+ * The orphan list entry will now protect us from any crash which
+ * occurs before the truncate completes, so it is now safe to propagate
+ * the new, shorter inode size (held for now in i_size) into the
+ * on-disk inode. We do this via i_disksize, which is the value which
+ * ext4 *really* writes onto the disk inode.
+ */
+ ei->i_disksize = inode->i_size;
+
+ /*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ mutex_lock(&ei->truncate_mutex);
+
+ if (n == 1) { /* direct blocks */
+ ext4_free_data(handle, inode, NULL, i_data+offsets[0],
+ i_data + EXT4_NDIR_BLOCKS);
+ goto do_indirects;
+ }
+
+ partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+ /* Kill the top of shared branch (not detached) */
+ if (nr) {
+ if (partial == chain) {
+ /* Shared branch grows from the inode */
+ ext4_free_branches(handle, inode, NULL,
+ &nr, &nr+1, (chain+n-1) - partial);
+ *partial->p = 0;
+ /*
+ * We mark the inode dirty prior to restart,
+ * and prior to stop. No need for it here.
+ */
+ } else {
+ /* Shared branch grows from an indirect block */
+ BUFFER_TRACE(partial->bh, "get_write_access");
+ ext4_free_branches(handle, inode, partial->bh,
+ partial->p,
+ partial->p+1, (chain+n-1) - partial);
+ }
+ }
+ /* Clear the ends of indirect blocks on the shared branch */
+ while (partial > chain) {
+ ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
+ (__le32*)partial->bh->b_data+addr_per_block,
+ (chain+n-1) - partial);
+ BUFFER_TRACE(partial->bh, "call brelse");
+ brelse(partial->bh);
+ partial--;
+ }
+do_indirects:
+ /* Kill the remaining (whole) subtrees */
+ switch (offsets[0]) {
+ default:
+ nr = i_data[EXT4_IND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+ i_data[EXT4_IND_BLOCK] = 0;
+ }
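+ /* fall through */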
+ case EXT4_IND_BLOCK:
+ nr = i_data[EXT4_DIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+ i_data[EXT4_DIND_BLOCK] = 0;
+ }
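+ /* fall through */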
+ case EXT4_DIND_BLOCK:
+ nr = i_data[EXT4_TIND_BLOCK];
+ if (nr) {
+ ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+ i_data[EXT4_TIND_BLOCK] = 0;
+ }
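+ /* fall through */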
+ case EXT4_TIND_BLOCK:
+ ;
+ }
+
+ ext4_discard_reservation(inode);
+
+ mutex_unlock(&ei->truncate_mutex);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ ext4_mark_inode_dirty(handle, inode);
+
+ /*
+ * In a multi-transaction truncate, we only make the final transaction
+ * synchronous
+ */
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+out_stop:
+ /*
+ * If this was a simple ftruncate(), and the file will remain alive
+ * then we need to clear up the orphan record which we created above.
+ * However, if this was a real unlink then we were called by
+ * ext4_delete_inode(), and we allow that function to clean up the
+ * orphan info for us.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ ext4_journal_stop(handle);
+}
+
+static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
+ unsigned long ino, struct ext4_iloc *iloc)
+{
+ unsigned long desc, group_desc, block_group;
+ unsigned long offset;
+ ext4_fsblk_t block;
+ struct buffer_head *bh;
+ struct ext4_group_desc * gdp;
+
+ if (!ext4_valid_inum(sb, ino)) {
+ /*
+ * This error is already checked for in namei.c unless we are
+ * looking at an NFS filehandle, in which case no error
+ * report is needed
+ */
+ return 0;
+ }
+
+ block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ if (block_group >= EXT4_SB(sb)->s_groups_count) {
+ ext4_error(sb,"ext4_get_inode_block","group >= groups count");
+ return 0;
+ }
+ smp_rmb();
+ group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+ desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+ bh = EXT4_SB(sb)->s_group_desc[group_desc];
+ if (!bh) {
+ ext4_error (sb, "ext4_get_inode_block",
+ "Descriptor not loaded");
+ return 0;
+ }
+
+ gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
+ desc * EXT4_DESC_SIZE(sb));
+ /*
+ * Figure out the offset within the block group inode table
+ */
+ offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
+ EXT4_INODE_SIZE(sb);
+ block = ext4_inode_table(sb, gdp) +
+ (offset >> EXT4_BLOCK_SIZE_BITS(sb));
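+ /*
+ * For example (hypothetical geometry): with 128-byte inodes and 4K
+ * blocks, an inode at index 100 within its group sits at byte offset
+ * 12800 in the inode table, i.e. block 3 of the table, byte 512
+ * within that block.
+ */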
+
+ iloc->block_group = block_group;
+ iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
+ return block;
+}
+
+/*
+ * ext4_get_inode_loc returns with an extra refcount against the inode's
+ * underlying buffer_head on success. If 'in_mem' is true, we have all
+ * data in memory that is needed to recreate the on-disk version of this
+ * inode.
+ */
+static int __ext4_get_inode_loc(struct inode *inode,
+ struct ext4_iloc *iloc, int in_mem)
+{
+ ext4_fsblk_t block;
+ struct buffer_head *bh;
+
+ block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
+ if (!block)
+ return -EIO;
+
+ bh = sb_getblk(inode->i_sb, block);
+ if (!bh) {
+ ext4_error (inode->i_sb, "ext4_get_inode_loc",
+ "unable to read inode block - "
+ "inode=%lu, block=%llu",
+ inode->i_ino, block);
+ return -EIO;
+ }
+ if (!buffer_uptodate(bh)) {
+ lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ /* someone brought it uptodate while we waited */
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+
+ /*
+ * If we have all of the inode's information in memory and this
+ * is the only valid inode in the block, we need not read the
+ * block.
+ */
+ if (in_mem) {
+ struct buffer_head *bitmap_bh;
+ struct ext4_group_desc *desc;
+ int inodes_per_buffer;
+ int inode_offset, i;
+ int block_group;
+ int start;
+
+ block_group = (inode->i_ino - 1) /
+ EXT4_INODES_PER_GROUP(inode->i_sb);
+ inodes_per_buffer = bh->b_size /
+ EXT4_INODE_SIZE(inode->i_sb);
+ inode_offset = ((inode->i_ino - 1) %
+ EXT4_INODES_PER_GROUP(inode->i_sb));
+ start = inode_offset & ~(inodes_per_buffer - 1);
+
+ /* Is the inode bitmap in cache? */
+ desc = ext4_get_group_desc(inode->i_sb,
+ block_group, NULL);
+ if (!desc)
+ goto make_io;
+
+ bitmap_bh = sb_getblk(inode->i_sb,
+ ext4_inode_bitmap(inode->i_sb, desc));
+ if (!bitmap_bh)
+ goto make_io;
+
+ /*
+ * If the inode bitmap isn't in cache then the
+ * optimisation may end up performing two reads instead
+ * of one, so skip it.
+ */
+ if (!buffer_uptodate(bitmap_bh)) {
+ brelse(bitmap_bh);
+ goto make_io;
+ }
+ for (i = start; i < start + inodes_per_buffer; i++) {
+ if (i == inode_offset)
+ continue;
+ if (ext4_test_bit(i, bitmap_bh->b_data))
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i == start + inodes_per_buffer) {
+ /* all other inodes are free, so skip I/O */
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ goto has_buffer;
+ }
+ }
+
+make_io:
+ /*
+ * There are other valid inodes in the buffer, this inode
+ * has in-inode xattrs, or we don't have this inode in memory.
+ * Read the block from disk.
+ */
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ_META, bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ ext4_error(inode->i_sb, "ext4_get_inode_loc",
+ "unable to read inode block - "
+ "inode=%lu, block=%llu",
+ inode->i_ino, block);
+ brelse(bh);
+ return -EIO;
+ }
+ }
+has_buffer:
+ iloc->bh = bh;
+ return 0;
+}
+
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+{
+ /* We have all inode data except xattrs in memory here. */
+ return __ext4_get_inode_loc(inode, iloc,
+ !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+}
+
+void ext4_set_inode_flags(struct inode *inode)
+{
+ unsigned int flags = EXT4_I(inode)->i_flags;
+
+ inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ if (flags & EXT4_SYNC_FL)
+ inode->i_flags |= S_SYNC;
+ if (flags & EXT4_APPEND_FL)
+ inode->i_flags |= S_APPEND;
+ if (flags & EXT4_IMMUTABLE_FL)
+ inode->i_flags |= S_IMMUTABLE;
+ if (flags & EXT4_NOATIME_FL)
+ inode->i_flags |= S_NOATIME;
+ if (flags & EXT4_DIRSYNC_FL)
+ inode->i_flags |= S_DIRSYNC;
+}
+
+void ext4_read_inode(struct inode * inode)
+{
+ struct ext4_iloc iloc;
+ struct ext4_inode *raw_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct buffer_head *bh;
+ int block;
+
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+ ei->i_acl = EXT4_ACL_NOT_CACHED;
+ ei->i_default_acl = EXT4_ACL_NOT_CACHED;
+#endif
+ ei->i_block_alloc_info = NULL;
+
+ if (__ext4_get_inode_loc(inode, &iloc, 0))
+ goto bad_inode;
+ bh = iloc.bh;
+ raw_inode = ext4_raw_inode(&iloc);
+ inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ if(!(test_opt (inode->i_sb, NO_UID32))) {
+ inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ }
+ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ inode->i_size = le32_to_cpu(raw_inode->i_size);
+ inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
+ inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
+ inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
+ inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+
+ ei->i_state = 0;
+ ei->i_dir_start_lookup = 0;
+ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+ /* We now have enough fields to check if the inode was active or not.
+ * This is needed because nfsd might try to access dead inodes;
+ * the test is the same one that e2fsck uses.
+ * NeilBrown 1999oct15
+ */
+ if (inode->i_nlink == 0) {
+ if (inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ /* this inode is deleted */
+ brelse (bh);
+ goto bad_inode;
+ }
+ /* The only unlinked inodes we let through here have
+ * valid i_mode and are being read by the orphan
+ * recovery code: that's fine, we're about to complete
+ * the process of deleting those. */
+ }
+ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+ ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+#ifdef EXT4_FRAGMENTS
+ ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
+ ei->i_frag_no = raw_inode->i_frag;
+ ei->i_frag_size = raw_inode->i_fsize;
+#endif
+ ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_HURD))
+ ei->i_file_acl |=
+ ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+ if (!S_ISREG(inode->i_mode)) {
+ ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+ } else {
+ inode->i_size |=
+ ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+ }
+ ei->i_disksize = inode->i_size;
+ inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+ ei->i_block_group = iloc.block_group;
+ /*
+ * NOTE! The in-memory inode i_data array is in little-endian order
+ * even on big-endian machines: we do NOT byteswap the block numbers!
+ */
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ ei->i_data[block] = raw_inode->i_block[block];
+ INIT_LIST_HEAD(&ei->i_orphan);
+
+ if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
+ EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ /*
+ * When mke2fs creates big inodes it does not zero out
+ * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
+ * so ignore those first few inodes.
+ */
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb))
+ goto bad_inode;
+ if (ei->i_extra_isize == 0) {
+ /* The extra space is currently unused. Use it. */
+ ei->i_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+ } else {
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE +
+ ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+ ei->i_state |= EXT4_STATE_XATTR;
+ }
+ } else
+ ei->i_extra_isize = 0;
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &ext4_file_inode_operations;
+ inode->i_fop = &ext4_file_operations;
+ ext4_set_aops(inode);
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &ext4_dir_inode_operations;
+ inode->i_fop = &ext4_dir_operations;
+ } else if (S_ISLNK(inode->i_mode)) {
+ if (ext4_inode_is_fast_symlink(inode))
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ else {
+ inode->i_op = &ext4_symlink_inode_operations;
+ ext4_set_aops(inode);
+ }
+ } else {
+ inode->i_op = &ext4_special_inode_operations;
+ if (raw_inode->i_block[0])
+ init_special_inode(inode, inode->i_mode,
+ old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
+ else
+ init_special_inode(inode, inode->i_mode,
+ new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ }
+ brelse (iloc.bh);
+ ext4_set_inode_flags(inode);
+ return;
+
+bad_inode:
+ make_bad_inode(inode);
+ return;
+}
+
+/*
+ * Post the struct inode info into an on-disk inode location in the
+ * buffer-cache. This gobbles the caller's reference to the
+ * buffer_head in the inode location struct.
+ *
+ * The caller must have write access to iloc->bh.
+ */
+static int ext4_do_update_inode(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct buffer_head *bh = iloc->bh;
+ int err = 0, rc, block;
+
+ /* For fields not tracked in the in-memory inode,
+ * initialise them to zero for new inodes. */
+ if (ei->i_state & EXT4_STATE_NEW)
+ memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+
+ raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ if(!(test_opt(inode->i_sb, NO_UID32))) {
+ raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
+ raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+/*
+ * Fix up interoperability with old kernels. Otherwise, old inodes get
+ * re-used with the upper 16 bits of the uid/gid intact
+ */
+ if(!ei->i_dtime) {
+ raw_inode->i_uid_high =
+ cpu_to_le16(high_16_bits(inode->i_uid));
+ raw_inode->i_gid_high =
+ cpu_to_le16(high_16_bits(inode->i_gid));
+ } else {
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ }
+ } else {
+ raw_inode->i_uid_low =
+ cpu_to_le16(fs_high2lowuid(inode->i_uid));
+ raw_inode->i_gid_low =
+ cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ raw_inode->i_uid_high = 0;
+ raw_inode->i_gid_high = 0;
+ }
+ raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+ raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+ raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+ raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+ raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+ raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+#ifdef EXT4_FRAGMENTS
+ raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
+ raw_inode->i_frag = ei->i_frag_no;
+ raw_inode->i_fsize = ei->i_frag_size;
+#endif
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_HURD))
+ raw_inode->i_file_acl_high =
+ cpu_to_le16(ei->i_file_acl >> 32);
+ raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+ if (!S_ISREG(inode->i_mode)) {
+ raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
+ } else {
+ raw_inode->i_size_high =
+ cpu_to_le32(ei->i_disksize >> 32);
+ if (ei->i_disksize > 0x7fffffffULL) {
+ struct super_block *sb = inode->i_sb;
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+ EXT4_SB(sb)->s_es->s_rev_level ==
+ cpu_to_le32(EXT4_GOOD_OLD_REV)) {
+ /* If this is the first large file
+ * created, add a flag to the superblock.
+ */
+ err = ext4_journal_get_write_access(handle,
+ EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_brelse;
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ sb->s_dirt = 1;
+ handle->h_sync = 1;
+ err = ext4_journal_dirty_metadata(handle,
+ EXT4_SB(sb)->s_sbh);
+ }
+ }
+ }
+ raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ raw_inode->i_block[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ raw_inode->i_block[1] = 0;
+ } else {
+ raw_inode->i_block[0] = 0;
+ raw_inode->i_block[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ raw_inode->i_block[2] = 0;
+ }
+ } else for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
+
+ if (ei->i_extra_isize)
+ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+
+ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+ rc = ext4_journal_dirty_metadata(handle, bh);
+ if (!err)
+ err = rc;
+ ei->i_state &= ~EXT4_STATE_NEW;
+
+out_brelse:
+ brelse (bh);
+ ext4_std_error(inode->i_sb, err);
+ return err;
+}
+
+/*
+ * ext4_write_inode()
+ *
+ * We are called from a few places:
+ *
+ * - Within generic_file_write() for O_SYNC files.
+ * Here, there will be no transaction running. We wait for any running
+ * transaction to commit.
+ *
+ * - Within sys_sync(), kupdate and such.
+ * We wait on commit, if told to.
+ *
+ * - Within prune_icache() (PF_MEMALLOC == true)
+ * Here we simply return. We can't afford to block kswapd on the
+ * journal commit.
+ *
+ * In all cases it is actually safe for us to return without doing anything,
+ * because the inode has been copied into a raw inode buffer in
+ * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
+ * knfsd.
+ *
+ * Note that we are absolutely dependent upon all inode dirtiers doing the
+ * right thing: they *must* call mark_inode_dirty() after dirtying info in
+ * which we are interested.
+ *
+ * It would be a bug for them to not do this. The code:
+ *
+ * mark_inode_dirty(inode)
+ * stuff();
+ * inode->i_size = expr;
+ *
+ * is in error because a kswapd-driven write_inode() could occur while
+ * `stuff()' is running, and the new i_size will be lost. Plus the inode
+ * will no longer be on the superblock's dirty inode list.
+ */
+int ext4_write_inode(struct inode *inode, int wait)
+{
+ if (current->flags & PF_MEMALLOC)
+ return 0;
+
+ if (ext4_journal_current_handle()) {
+ jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+ dump_stack();
+ return -EIO;
+ }
+
+ if (!wait)
+ return 0;
+
+ return ext4_force_commit(inode->i_sb);
+}
+
+/*
+ * ext4_setattr()
+ *
+ * Called from notify_change.
+ *
+ * We want to trap VFS attempts to truncate the file as soon as
+ * possible. In particular, we want to make sure that when the VFS
+ * shrinks i_size, we put the inode on the orphan list and modify
+ * i_disksize immediately, so that during the subsequent flushing of
+ * dirty pages and freeing of disk blocks, we can guarantee that any
+ * commit will leave the blocks being flushed in an unused state on
+ * disk. (On recovery, the inode will get truncated and the blocks will
+ * be freed, so we have a strong guarantee that no future commit will
+ * leave these blocks visible to the user.)
+ *
+ * Called with inode->sem down.
+ */
+int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ int error, rc = 0;
+ const unsigned int ia_valid = attr->ia_valid;
+
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+ (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ handle_t *handle;
+
+ /* (user+group)*(old+new) structure, inode write (sb,
+ * inode block, ? - but truncate inode update has it) */
+ handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
+ EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto err_out;
+ }
+ error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+ if (error) {
+ ext4_journal_stop(handle);
+ return error;
+ }
+ /* Update corresponding info in inode so that everything is in
+ * one transaction */
+ if (attr->ia_valid & ATTR_UID)
+ inode->i_uid = attr->ia_uid;
+ if (attr->ia_valid & ATTR_GID)
+ inode->i_gid = attr->ia_gid;
+ error = ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ if (S_ISREG(inode->i_mode) &&
+ attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto err_out;
+ }
+
+ error = ext4_orphan_add(handle, inode);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
+ ext4_journal_stop(handle);
+ }
+
+ rc = inode_setattr(inode, attr);
+
+ /* If inode_setattr's call to ext4_truncate failed to get a
+ * transaction handle at all, we need to clean up the in-core
+ * orphan list manually. */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
+ if (!rc && (ia_valid & ATTR_MODE))
+ rc = ext4_acl_chmod(inode);
+
+err_out:
+ ext4_std_error(inode->i_sb, error);
+ if (!error)
+ error = rc;
+ return error;
+}
+
+
+/*
+ * How many blocks doth make a writepage()?
+ *
+ * With N blocks per page, it may be:
+ * N data blocks
+ * 2 indirect blocks
+ * 2 dindirect
+ * 1 tindirect
+ * N+5 bitmap blocks (from the above)
+ * N+5 group descriptor summary blocks
+ * 1 inode block
+ * 1 superblock.
+ * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quota files
+ *
+ * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
+ *
+ * With ordered or writeback data it's the same, less the N data blocks.
+ *
+ * If the inode's direct blocks can hold an integral number of pages then a
+ * page cannot straddle two indirect blocks, and we can only touch one indirect
+ * and dindirect block, and the "5" above becomes "3".
+ *
+ * This still overestimates under most circumstances. If we were to pass the
+ * start and end offsets in here as well we could do block_to_path() on each
+ * block and work out the exact number of indirects which are touched. Pah.
+ */
+
+int ext4_writepage_trans_blocks(struct inode *inode)
+{
+ int bpp = ext4_journal_blocks_per_page(inode);
+ int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
+ int ret;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_writepage_trans_blocks(inode, bpp);
+
+ if (ext4_should_journal_data(inode))
+ ret = 3 * (bpp + indirects) + 2;
+ else
+ ret = 2 * (bpp + indirects) + 2;
+
+#ifdef CONFIG_QUOTA
+ /* We know that structure was already allocated during DQUOT_INIT so
+ * we will be updating only the data blocks + inodes */
+ ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+ return ret;
+}
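+
+/*
+ * Illustrative numbers for the formula above (a sketch only, assuming
+ * 4KB blocks on 4KB pages, so bpp == 1, and EXT4_NDIR_BLOCKS == 12):
+ *
+ *	12 % 1 == 0, so indirects == 3
+ *	data=journal:            3 * (1 + 3) + 2 == 14 credits
+ *	data=ordered/writeback:  2 * (1 + 3) + 2 == 10 credits
+ *	plus 2 * EXT4_QUOTA_TRANS_BLOCKS when quota is enabled.
+ */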
+
+/*
+ * The caller must have previously called ext4_reserve_inode_write().
+ * Given this, we know that the caller already has write access to iloc->bh.
+ */
+int ext4_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode, struct ext4_iloc *iloc)
+{
+ int err = 0;
+
+ /* the do_update_inode consumes one bh->b_count */
+ get_bh(iloc->bh);
+
+ /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
+ err = ext4_do_update_inode(handle, inode, iloc);
+ put_bh(iloc->bh);
+ return err;
+}
+
+/*
+ * On success, we end up with an outstanding reference count against
+ * iloc->bh. This _must_ be cleaned up later.
+ */
+
+int
+ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ int err = 0;
+ if (handle) {
+ err = ext4_get_inode_loc(inode, iloc);
+ if (!err) {
+ BUFFER_TRACE(iloc->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err) {
+ brelse(iloc->bh);
+ iloc->bh = NULL;
+ }
+ }
+ }
+ ext4_std_error(inode->i_sb, err);
+ return err;
+}
+
+/*
+ * What we do here is to mark the in-core inode as clean with respect to inode
+ * dirtiness (it may still be data-dirty).
+ * This means that the in-core inode may be reaped by prune_icache
+ * without having to perform any I/O. This is a very good thing,
+ * because *any* task may call prune_icache - even ones which
+ * have a transaction open against a different journal.
+ *
+ * Is this cheating? Not really. Sure, we haven't written the
+ * inode out, but prune_icache isn't a user-visible syncing function.
+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+ * we start and wait on commits.
+ *
+ * Is this efficient/effective? Well, we're being nice to the system
+ * by cleaning up our inodes proactively so they can be reaped
+ * without I/O. But we are potentially leaving up to five seconds'
+ * worth of inodes floating about which prune_icache wants us to
+ * write out. One way to fix that would be to get prune_icache()
+ * to do a write_super() to free up some memory. It has the desired
+ * effect.
+ */
+int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+{
+ struct ext4_iloc iloc;
+ int err;
+
+ might_sleep();
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (!err)
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ return err;
+}
+
+/*
+ * ext4_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We're really interested in the case where a file is being extended.
+ * i_size has been changed by generic_commit_write() and we thus need
+ * to include the updated inode in the current transaction.
+ *
+ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * are allocated to the file.
+ *
+ * If the inode is marked synchronous, we don't honour that here - doing
+ * so would cause a commit on atime updates, which we don't bother doing.
+ * We handle synchronous inodes at the highest possible level.
+ */
+void ext4_dirty_inode(struct inode *inode)
+{
+ handle_t *current_handle = ext4_journal_current_handle();
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, 2);
+ if (IS_ERR(handle))
+ goto out;
+ if (current_handle &&
+ current_handle->h_transaction != handle->h_transaction) {
+ /* This task has a transaction open against a different fs */
+ printk(KERN_EMERG "%s: transactions do not match!\n",
+ __FUNCTION__);
+ } else {
+ jbd_debug(5, "marking dirty. outer handle=%p\n",
+ current_handle);
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ ext4_journal_stop(handle);
+out:
+ return;
+}
+
+#if 0
+/*
+ * Bind an inode's backing buffer_head into this transaction, to prevent
+ * it from being flushed to disk early. Unlike
+ * ext4_reserve_inode_write, this leaves behind no bh reference and
+ * returns no iloc structure, so the caller needs to repeat the iloc
+ * lookup to mark the inode dirty later.
+ */
+static int ext4_pin_inode(handle_t *handle, struct inode *inode)
+{
+ struct ext4_iloc iloc;
+
+ int err = 0;
+ if (handle) {
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (!err) {
+ BUFFER_TRACE(iloc.bh, "get_write_access");
+ err = jbd2_journal_get_write_access(handle, iloc.bh);
+ if (!err)
+ err = ext4_journal_dirty_metadata(handle,
+ iloc.bh);
+ brelse(iloc.bh);
+ }
+ }
+ ext4_std_error(inode->i_sb, err);
+ return err;
+}
+#endif
+
+int ext4_change_inode_journal_flag(struct inode *inode, int val)
+{
+ journal_t *journal;
+ handle_t *handle;
+ int err;
+
+ /*
+ * We have to be very careful here: changing a data block's
+ * journaling status dynamically is dangerous. If we write a
+ * data block to the journal, change the status and then delete
+ * that block, we risk forgetting to revoke the old log record
+ * from the journal and so a subsequent replay can corrupt data.
+ * So, first we make sure that the journal is empty and that
+ * nobody is changing anything.
+ */
+
+ journal = EXT4_JOURNAL(inode);
+ if (is_journal_aborted(journal) || IS_RDONLY(inode))
+ return -EROFS;
+
+ jbd2_journal_lock_updates(journal);
+ jbd2_journal_flush(journal);
+
+ /*
+ * OK, there are no updates running now, and all cached data is
+ * synced to disk. We are now in a completely consistent state
+ * which doesn't have anything in the journal, and we know that
+ * no filesystem updates are running, so it is safe to modify
+ * the inode's in-core data-journaling state flag now.
+ */
+
+ if (val)
+ EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+ else
+ EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+ ext4_set_aops(inode);
+
+ jbd2_journal_unlock_updates(journal);
+
+ /* Finally we can mark the inode as dirty. */
+
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_mark_inode_dirty(handle, inode);
+ handle->h_sync = 1;
+ ext4_journal_stop(handle);
+ ext4_std_error(inode->i_sb, err);
+
+ return err;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 000000000000..22a737c306c7
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,306 @@
+/*
+ * linux/fs/ext4/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/capability.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+
+int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned int flags;
+ unsigned short rsv_window_size;
+
+ ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
+ case EXT4_IOC_GETFLAGS:
+ flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *) arg);
+ case EXT4_IOC_SETFLAGS: {
+ handle_t *handle = NULL;
+ int err;
+ struct ext4_iloc iloc;
+ unsigned int oldflags;
+ unsigned int jflag;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EACCES;
+
+ if (get_user(flags, (int __user *) arg))
+ return -EFAULT;
+
+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~EXT4_DIRSYNC_FL;
+
+ mutex_lock(&inode->i_mutex);
+ oldflags = ei->i_flags;
+
+ /* The JOURNAL_DATA flag is modifiable only by root */
+ jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+ /*
+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ * the relevant capability.
+ *
+ * This test looks nicer. Thanks to Pauline Middelink
+ */
+ if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
+ return -EPERM;
+ }
+ }
+
+ /*
+ * The JOURNAL_DATA flag can only be changed by
+ * the relevant capability.
+ */
+ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+ if (!capable(CAP_SYS_RESOURCE)) {
+ mutex_unlock(&inode->i_mutex);
+ return -EPERM;
+ }
+ }
+
+
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ mutex_unlock(&inode->i_mutex);
+ return PTR_ERR(handle);
+ }
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto flags_err;
+
+ flags = flags & EXT4_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
+ ei->i_flags = flags;
+
+ ext4_set_inode_flags(inode);
+ inode->i_ctime = CURRENT_TIME_SEC;
+
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+ ext4_journal_stop(handle);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ return err;
+ }
+
+ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+ err = ext4_change_inode_journal_flag(inode, jflag);
+ mutex_unlock(&inode->i_mutex);
+ return err;
+ }
+ case EXT4_IOC_GETVERSION:
+ case EXT4_IOC_GETVERSION_OLD:
+ return put_user(inode->i_generation, (int __user *) arg);
+ case EXT4_IOC_SETVERSION:
+ case EXT4_IOC_SETVERSION_OLD: {
+ handle_t *handle;
+ struct ext4_iloc iloc;
+ __u32 generation;
+ int err;
+
+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EPERM;
+ if (IS_RDONLY(inode))
+ return -EROFS;
+ if (get_user(generation, (int __user *) arg))
+ return -EFAULT;
+
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err == 0) {
+ inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_generation = generation;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ }
+ ext4_journal_stop(handle);
+ return err;
+ }
+#ifdef CONFIG_JBD_DEBUG
+ case EXT4_IOC_WAIT_FOR_READONLY:
+ /*
+ * This is racy - by the time we're woken up and running,
+ * the superblock could be released. And the module could
+ * have been unloaded. So sue me.
+ *
+ * Returns 1 if it slept, else zero.
+ */
+ {
+ struct super_block *sb = inode->i_sb;
+ DECLARE_WAITQUEUE(wait, current);
+ int ret = 0;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
+ if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
+ schedule();
+ ret = 1;
+ }
+ remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
+ return ret;
+ }
+#endif
+ case EXT4_IOC_GETRSVSZ:
+ if (test_opt(inode->i_sb, RESERVATION)
+ && S_ISREG(inode->i_mode)
+ && ei->i_block_alloc_info) {
+ rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
+ return put_user(rsv_window_size, (int __user *)arg);
+ }
+ return -ENOTTY;
+ case EXT4_IOC_SETRSVSZ: {
+
+ if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
+ return -ENOTTY;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ return -EACCES;
+
+ if (get_user(rsv_window_size, (int __user *)arg))
+ return -EFAULT;
+
+ if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
+ rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
+
+ /*
+ * need to allocate reservation structure for this inode
+ * before setting the window size
+ */
+ mutex_lock(&ei->truncate_mutex);
+ if (!ei->i_block_alloc_info)
+ ext4_init_block_alloc_info(inode);
+
+ if (ei->i_block_alloc_info){
+ struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
+ rsv->rsv_goal_size = rsv_window_size;
+ }
+ mutex_unlock(&ei->truncate_mutex);
+ return 0;
+ }
+ case EXT4_IOC_GROUP_EXTEND: {
+ ext4_fsblk_t n_blocks_count;
+ struct super_block *sb = inode->i_sb;
+ int err;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (get_user(n_blocks_count, (__u32 __user *)arg))
+ return -EFAULT;
+
+ err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+ return err;
+ }
+ case EXT4_IOC_GROUP_ADD: {
+ struct ext4_new_group_data input;
+ struct super_block *sb = inode->i_sb;
+ int err;
+
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
+ sizeof(input)))
+ return -EFAULT;
+
+ err = ext4_group_add(sb, &input);
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+ return err;
+ }
+
+ default:
+ return -ENOTTY;
+ }
+}
+
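+/*
+ * Illustrative (hypothetical) userspace usage of the flags ioctls above;
+ * the file name and the lack of error handling are placeholders:
+ *
+ *	int fd = open("somefile", O_RDONLY);
+ *	int flags;
+ *
+ *	if (ioctl(fd, EXT4_IOC_GETFLAGS, &flags) == 0) {
+ *		flags |= EXT4_SYNC_FL;
+ *		ioctl(fd, EXT4_IOC_SETFLAGS, &flags);
+ *	}
+ */
+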
+#ifdef CONFIG_COMPAT
+long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ int ret;
+
+ /* These are just misnamed; they actually get/put an int from/to user space */
+ switch (cmd) {
+ case EXT4_IOC32_GETFLAGS:
+ cmd = EXT4_IOC_GETFLAGS;
+ break;
+ case EXT4_IOC32_SETFLAGS:
+ cmd = EXT4_IOC_SETFLAGS;
+ break;
+ case EXT4_IOC32_GETVERSION:
+ cmd = EXT4_IOC_GETVERSION;
+ break;
+ case EXT4_IOC32_SETVERSION:
+ cmd = EXT4_IOC_SETVERSION;
+ break;
+ case EXT4_IOC32_GROUP_EXTEND:
+ cmd = EXT4_IOC_GROUP_EXTEND;
+ break;
+ case EXT4_IOC32_GETVERSION_OLD:
+ cmd = EXT4_IOC_GETVERSION_OLD;
+ break;
+ case EXT4_IOC32_SETVERSION_OLD:
+ cmd = EXT4_IOC_SETVERSION_OLD;
+ break;
+#ifdef CONFIG_JBD_DEBUG
+ case EXT4_IOC32_WAIT_FOR_READONLY:
+ cmd = EXT4_IOC_WAIT_FOR_READONLY;
+ break;
+#endif
+ case EXT4_IOC32_GETRSVSZ:
+ cmd = EXT4_IOC_GETRSVSZ;
+ break;
+ case EXT4_IOC32_SETRSVSZ:
+ cmd = EXT4_IOC_SETRSVSZ;
+ break;
+ case EXT4_IOC_GROUP_ADD:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ lock_kernel();
+ ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+ unlock_kernel();
+ return ret;
+}
+#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 000000000000..8b1bd03d20f5
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2395 @@
+/*
+ * linux/fs/ext4/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/namei.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * Directory entry file type support and forward compatibility hooks
+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ * Hash Tree Directory indexing (c)
+ * Daniel Phillips, 2001
+ * Hash Tree Directory indexing porting
+ * Christopher Li, 2002
+ * Hash Tree Directory indexing cleanup
+ * Theodore Ts'o, 2002
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/jbd2.h>
+#include <linux/time.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/smp_lock.h>
+
+#include "namei.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * define how far ahead to read directories while searching them.
+ */
+#define NAMEI_RA_CHUNKS 2
+#define NAMEI_RA_BLOCKS 4
+#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static struct buffer_head *ext4_append(handle_t *handle,
+ struct inode *inode,
+ u32 *block, int *err)
+{
+ struct buffer_head *bh;
+
+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+ if ((bh = ext4_bread(handle, inode, *block, 1, err))) {
+ inode->i_size += inode->i_sb->s_blocksize;
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ ext4_journal_get_write_access(handle,bh);
+ }
+ return bh;
+}
+
+#ifndef assert
+#define assert(test) J_ASSERT(test)
+#endif
+
+#ifndef swap
+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+#endif
+
+#ifdef DX_DEBUG
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+struct fake_dirent
+{
+ __le32 inode;
+ __le16 rec_len;
+ u8 name_len;
+ u8 file_type;
+};
+
+struct dx_countlimit
+{
+ __le16 limit;
+ __le16 count;
+};
+
+struct dx_entry
+{
+ __le32 hash;
+ __le32 block;
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero. Therefore, the
+ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
+ */
+
+struct dx_root
+{
+ struct fake_dirent dot;
+ char dot_name[4];
+ struct fake_dirent dotdot;
+ char dotdot_name[4];
+ struct dx_root_info
+ {
+ __le32 reserved_zero;
+ u8 hash_version;
+ u8 info_length; /* 8 */
+ u8 indirect_levels;
+ u8 unused_flags;
+ }
+ info;
+ struct dx_entry entries[0];
+};
+
+struct dx_node
+{
+ struct fake_dirent fake;
+ struct dx_entry entries[0];
+};
+
+
+struct dx_frame
+{
+ struct buffer_head *bh;
+ struct dx_entry *entries;
+ struct dx_entry *at;
+};
+
+struct dx_map_entry
+{
+ u32 hash;
+ u32 offs;
+};
+
+#ifdef CONFIG_EXT4_INDEX
+static inline unsigned dx_get_block (struct dx_entry *entry);
+static void dx_set_block (struct dx_entry *entry, unsigned value);
+static inline unsigned dx_get_hash (struct dx_entry *entry);
+static void dx_set_hash (struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count (struct dx_entry *entries);
+static unsigned dx_get_limit (struct dx_entry *entries);
+static void dx_set_count (struct dx_entry *entries, unsigned value);
+static void dx_set_limit (struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit (struct inode *dir);
+static struct dx_frame *dx_probe(struct dentry *dentry,
+ struct inode *dir,
+ struct dx_hash_info *hinfo,
+ struct dx_frame *frame,
+ int *err);
+static void dx_release (struct dx_frame *frames);
+static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
+ struct dx_map_entry *offsets, int count);
+static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash);
+static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+ struct ext4_dir_entry_2 **res_dir, int *err);
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+
+/*
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ */
+
+static inline unsigned dx_get_block (struct dx_entry *entry)
+{
+ return le32_to_cpu(entry->block) & 0x00ffffff;
+}
+
+static inline void dx_set_block (struct dx_entry *entry, unsigned value)
+{
+ entry->block = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_hash (struct dx_entry *entry)
+{
+ return le32_to_cpu(entry->hash);
+}
+
+static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+{
+ entry->hash = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_count (struct dx_entry *entries)
+{
+ return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+}
+
+static inline unsigned dx_get_limit (struct dx_entry *entries)
+{
+ return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+}
+
+static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+{
+ ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+}
+
+static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+{
+ ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+}
+
+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+{
+ unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
+ EXT4_DIR_REC_LEN(2) - infosize;
+ return 0? 20: entry_space / sizeof(struct dx_entry);
+}
+
+static inline unsigned dx_node_limit (struct inode *dir)
+{
+ unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+ return 0? 22: entry_space / sizeof(struct dx_entry);
+}
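+
+/*
+ * Worked example for the two limits above (a sketch only, assuming a
+ * 4KB block size and EXT4_DIR_REC_LEN rounding 8 + name_len up to a
+ * multiple of 4): the root block loses 12 + 12 bytes to the "." and
+ * ".." fake dirents plus 8 bytes of dx_root_info, giving
+ * (4096 - 32) / sizeof(struct dx_entry) == 508 entries, while an
+ * interior node only loses an 8-byte fake dirent, giving
+ * (4096 - 8) / 8 == 511 entries.
+ */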
+
+/*
+ * Debug
+ */
+#ifdef DX_DEBUG
+static void dx_show_index (char * label, struct dx_entry *entries)
+{
+ int i, n = dx_get_count (entries);
+ printk("%s index ", label);
+ for (i = 0; i < n; i++) {
+ printk("%x->%u ", i? dx_get_hash(entries + i) :
+ 0, dx_get_block(entries + i));
+ }
+ printk("\n");
+}
+
+struct stats
+{
+ unsigned names;
+ unsigned space;
+ unsigned bcount;
+};
+
+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
+ int size, int show_names)
+{
+ unsigned names = 0, space = 0;
+ char *base = (char *) de;
+ struct dx_hash_info h = *hinfo;
+
+ printk("names: ");
+ while ((char *) de < base + size)
+ {
+ if (de->inode)
+ {
+ if (show_names)
+ {
+ int len = de->name_len;
+ char *name = de->name;
+ while (len--) printk("%c", *name++);
+ ext4fs_dirhash(de->name, de->name_len, &h);
+ printk(":%x.%u ", h.hash,
+ ((char *) de - base));
+ }
+ space += EXT4_DIR_REC_LEN(de->name_len);
+ names++;
+ }
+ de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+ }
+ printk("(%i)\n", names);
+ return (struct stats) { names, space, 1 };
+}
+
+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+ struct dx_entry *entries, int levels)
+{
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+ unsigned bcount = 0;
+ struct buffer_head *bh;
+ int err;
+ printk("%i indexed blocks...\n", count);
+ for (i = 0; i < count; i++, entries++)
+ {
+ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
+ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
+ struct stats stats;
+ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
+ if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
+ stats = levels?
+ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
+ dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
+ names += stats.names;
+ space += stats.space;
+ bcount += stats.bcount;
+ brelse (bh);
+ }
+ if (bcount)
+ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
+ names, space/bcount,(space/bcount)*100/blocksize);
+ return (struct stats) { names, space, bcount};
+}
+#endif /* DX_DEBUG */
+
+/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * error in the directory index, and the caller should fall back to
+ * searching the directory normally. The callers of dx_probe **MUST**
+ * check for this error code, and make sure it never gets reflected
+ * back to userspace.
+ */
+static struct dx_frame *
+dx_probe(struct dentry *dentry, struct inode *dir,
+ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+{
+ unsigned count, indirect;
+ struct dx_entry *at, *entries, *p, *q, *m;
+ struct dx_root *root;
+ struct buffer_head *bh;
+ struct dx_frame *frame = frame_in;
+ u32 hash;
+
+ frame->bh = NULL;
+ if (dentry)
+ dir = dentry->d_parent->d_inode;
+ if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+ goto fail;
+ root = (struct dx_root *) bh->b_data;
+ if (root->info.hash_version != DX_HASH_TEA &&
+ root->info.hash_version != DX_HASH_HALF_MD4 &&
+ root->info.hash_version != DX_HASH_LEGACY) {
+ ext4_warning(dir->i_sb, __FUNCTION__,
+ "Unrecognised inode hash code %d",
+ root->info.hash_version);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+ hinfo->hash_version = root->info.hash_version;
+ hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ if (dentry)
+ ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+ hash = hinfo->hash;
+
+ if (root->info.unused_flags & 1) {
+ ext4_warning(dir->i_sb, __FUNCTION__,
+ "Unimplemented inode hash flags: %#06x",
+ root->info.unused_flags);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
+ if ((indirect = root->info.indirect_levels) > 1) {
+ ext4_warning(dir->i_sb, __FUNCTION__,
+ "Unimplemented inode hash depth: %#06x",
+ root->info.indirect_levels);
+ brelse(bh);
+ *err = ERR_BAD_DX_DIR;
+ goto fail;
+ }
+
+ entries = (struct dx_entry *) (((char *)&root->info) +
+ root->info.info_length);
+ assert(dx_get_limit(entries) == dx_root_limit(dir,
+ root->info.info_length));
+ dxtrace (printk("Look up %x", hash));
+ while (1)
+ {
+ count = dx_get_count(entries);
+ assert (count && count <= dx_get_limit(entries));
+ p = entries + 1;
+ q = entries + count - 1;
+ while (p <= q)
+ {
+ m = p + (q - p)/2;
+ dxtrace(printk("."));
+ if (dx_get_hash(m) > hash)
+ q = m - 1;
+ else
+ p = m + 1;
+ }
+
+ if (0) // linear search cross check
+ {
+ unsigned n = count - 1;
+ at = entries;
+ while (n--)
+ {
+ dxtrace(printk(","));
+ if (dx_get_hash(++at) > hash)
+ {
+ at--;
+ break;
+ }
+ }
+ assert (at == p - 1);
+ }
+
+ at = p - 1;
+ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+ frame->bh = bh;
+ frame->entries = entries;
+ frame->at = at;
+ if (!indirect--) return frame;
+ if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+ goto fail2;
+ at = entries = ((struct dx_node *) bh->b_data)->entries;
+ assert (dx_get_limit(entries) == dx_node_limit (dir));
+ frame++;
+ }
+fail2:
+ while (frame >= frame_in) {
+ brelse(frame->bh);
+ frame--;
+ }
+fail:
+ return NULL;
+}
+
+static void dx_release (struct dx_frame *frames)
+{
+ if (frames[0].bh == NULL)
+ return;
+
+ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+ brelse(frames[1].bh);
+ brelse(frames[0].bh);
+}
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * needs to continue.  Whether or not the search is necessary is
+ * controlled by the hash parameter. If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value. This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not. If there is an error reading one of the
+ * index blocks, it will return a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+ struct dx_frame *frame,
+ struct dx_frame *frames,
+ __u32 *start_hash)
+{
+ struct dx_frame *p;
+ struct buffer_head *bh;
+ int err, num_frames = 0;
+ __u32 bhash;
+
+ p = frame;
+ /*
+ * Find the next leaf page by incrementing the frame pointer.
+ * If we run out of entries in the interior node, loop around and
+ * increment pointer in the parent node. When we break out of
+ * this loop, num_frames indicates the number of interior
+ * nodes that need to be read.
+ */
+ while (1) {
+ if (++(p->at) < p->entries + dx_get_count(p->entries))
+ break;
+ if (p == frames)
+ return 0;
+ num_frames++;
+ p--;
+ }
+
+ /*
+ * If the hash is 1, then continue only if the next page has a
+ * continuation hash of any value. This is used for readdir
+ * handling. Otherwise, check to see if the hash matches the
+ * desired continuation hash.  If it doesn't, return since
+ * there's no point in reading the successive index pages.
+ */
+ bhash = dx_get_hash(p->at);
+ if (start_hash)
+ *start_hash = bhash;
+ if ((hash & 1) == 0) {
+ if ((bhash & ~1) != hash)
+ return 0;
+ }
+ /*
+ * If the hash is HASH_NB_ALWAYS, we always go to the next
+ * block so no check is necessary
+ */
+ while (num_frames--) {
+ if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
+ 0, &err)))
+ return err; /* Failure */
+ p++;
+ brelse (p->bh);
+ p->bh = bh;
+ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+ }
+ return 1;
+}
+
+
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
+}
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory block. It returns the number of directory entries loaded
+ * into the tree. If there is an error it is returned in err.
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+ struct inode *dir, int block,
+ struct dx_hash_info *hinfo,
+ __u32 start_hash, __u32 start_minor_hash)
+{
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de, *top;
+ int err, count = 0;
+
+ dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
+ if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
+ return err;
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ top = (struct ext4_dir_entry_2 *) ((char *) de +
+ dir->i_sb->s_blocksize -
+ EXT4_DIR_REC_LEN(0));
+ for (; de < top; de = ext4_next_entry(de)) {
+ ext4fs_dirhash(de->name, de->name_len, hinfo);
+ if ((hinfo->hash < start_hash) ||
+ ((hinfo->hash == start_hash) &&
+ (hinfo->minor_hash < start_minor_hash)))
+ continue;
+ if (de->inode == 0)
+ continue;
+ if ((err = ext4_htree_store_dirent(dir_file,
+ hinfo->hash, hinfo->minor_hash, de)) != 0) {
+ brelse(bh);
+ return err;
+ }
+ count++;
+ }
+ brelse(bh);
+ return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory. We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ __u32 start_minor_hash, __u32 *next_hash)
+{
+ struct dx_hash_info hinfo;
+ struct ext4_dir_entry_2 *de;
+ struct dx_frame frames[2], *frame;
+ struct inode *dir;
+ int block, err;
+ int count = 0;
+ int ret;
+ __u32 hashval;
+
+ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+ start_minor_hash));
+ dir = dir_file->f_dentry->d_inode;
+ if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+ start_hash, start_minor_hash);
+ *next_hash = ~0;
+ return count;
+ }
+ hinfo.hash = start_hash;
+ hinfo.minor_hash = 0;
+ frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+
+ /* Add '.' and '..' from the htree header */
+ if (!start_hash && !start_minor_hash) {
+ de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+ if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+ goto errout;
+ count++;
+ }
+ if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
+ de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+ de = ext4_next_entry(de);
+ if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
+ goto errout;
+ count++;
+ }
+
+ while (1) {
+ block = dx_get_block(frame->at);
+ ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+ start_hash, start_minor_hash);
+ if (ret < 0) {
+ err = ret;
+ goto errout;
+ }
+ count += ret;
+ hashval = ~0;
+ ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
+ frame, frames, &hashval);
+ *next_hash = hashval;
+ if (ret < 0) {
+ err = ret;
+ goto errout;
+ }
+ /*
+ * Stop if: (a) there are no more entries, or
+ * (b) we have inserted at least one entry and the
+ * next hash value is not a continuation
+ */
+ if ((ret == 0) ||
+ (count && ((hashval & 1) == 0)))
+ break;
+ }
+ dx_release(frames);
+ dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+ count, *next_hash));
+ return count;
+errout:
+ dx_release(frames);
+ return (err);
+}
+
+
+/*
+ * Directory block splitting, compacting
+ */
+
+static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
+ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+{
+ int count = 0;
+ char *base = (char *) de;
+ struct dx_hash_info h = *hinfo;
+
+ while ((char *) de < base + size)
+ {
+ if (de->name_len && de->inode) {
+ ext4fs_dirhash(de->name, de->name_len, &h);
+ map_tail--;
+ map_tail->hash = h.hash;
+ map_tail->offs = (u32) ((char *) de - base);
+ count++;
+ cond_resched();
+ }
+ /* XXX: do we need to check rec_len == 0 case? -Chris */
+ de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+ }
+ return count;
+}
+
+static void dx_sort_map (struct dx_map_entry *map, unsigned count)
+{
+ struct dx_map_entry *p, *q, *top = map + count - 1;
+ int more;
+ /* Combsort until bubble sort doesn't suck */
+ while (count > 2) {
+ count = count*10/13;
+ if (count - 9 < 2) /* 9, 10 -> 11 */
+ count = 11;
+ for (p = top, q = p - count; q >= map; p--, q--)
+ if (p->hash < q->hash)
+ swap(*p, *q);
+ }
+ /* Garden variety bubble sort */
+ do {
+ more = 0;
+ q = top;
+ while (q-- > map) {
+ if (q[1].hash >= q[0].hash)
+ continue;
+ swap(*(q+1), *q);
+ more = 1;
+ }
+ } while(more);
+}
+
+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
+{
+ struct dx_entry *entries = frame->entries;
+ struct dx_entry *old = frame->at, *new = old + 1;
+ int count = dx_get_count(entries);
+
+ assert(count < dx_get_limit(entries));
+ assert(old < entries + count);
+ memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
+ dx_set_hash(new, hash);
+ dx_set_block(new, block);
+ dx_set_count(entries, count + 1);
+}
+#endif
+
+
+static void ext4_update_dx_flag(struct inode *inode)
+{
+ if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX))
+ EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+}
+
+/*
+ * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
+ *
+ * `len <= EXT4_NAME_LEN' is guaranteed by caller.
+ * `de != NULL' is guaranteed by caller.
+ */
+static inline int ext4_match (int len, const char * const name,
+ struct ext4_dir_entry_2 * de)
+{
+ if (len != de->name_len)
+ return 0;
+ if (!de->inode)
+ return 0;
+ return !memcmp(name, de->name, len);
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static inline int search_dirblock(struct buffer_head * bh,
+ struct inode *dir,
+ struct dentry *dentry,
+ unsigned long offset,
+ struct ext4_dir_entry_2 ** res_dir)
+{
+ struct ext4_dir_entry_2 * de;
+ char * dlimit;
+ int de_len;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ dlimit = bh->b_data + dir->i_sb->s_blocksize;
+ while ((char *) de < dlimit) {
+ /* this code is executed quadratically often */
+ /* do minimal checking `by hand' */
+
+ if ((char *) de + namelen <= dlimit &&
+ ext4_match (namelen, name, de)) {
+ /* found a match - just to be sure, do a full check */
+ if (!ext4_check_dir_entry("ext4_find_entry",
+ dir, de, bh, offset))
+ return -1;
+ *res_dir = de;
+ return 1;
+ }
+ /* prevent looping on a bad block */
+ de_len = le16_to_cpu(de->rec_len);
+ if (de_len <= 0)
+ return -1;
+ offset += de_len;
+ de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+ }
+ return 0;
+}
+
+
+/*
+ * ext4_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the cache buffer in which the entry was found, and the entry
+ * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * entry - you'll have to do that yourself if you want to.
+ *
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
+static struct buffer_head * ext4_find_entry (struct dentry *dentry,
+ struct ext4_dir_entry_2 ** res_dir)
+{
+ struct super_block * sb;
+ struct buffer_head * bh_use[NAMEI_RA_SIZE];
+ struct buffer_head * bh, *ret = NULL;
+ unsigned long start, block, b;
+ int ra_max = 0; /* Number of bh's in the readahead
+ buffer, bh_use[] */
+ int ra_ptr = 0; /* Current index into readahead
+ buffer */
+ int num = 0;
+ int nblocks, i, err;
+ struct inode *dir = dentry->d_parent->d_inode;
+ int namelen;
+ const u8 *name;
+ unsigned blocksize;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+ blocksize = sb->s_blocksize;
+ namelen = dentry->d_name.len;
+ name = dentry->d_name.name;
+ if (namelen > EXT4_NAME_LEN)
+ return NULL;
+#ifdef CONFIG_EXT4_INDEX
+ if (is_dx(dir)) {
+ bh = ext4_dx_find_entry(dentry, res_dir, &err);
+ /*
+ * On success, or if the error was file not found,
+ * return. Otherwise, fall back to doing a search the
+ * old fashioned way.
+ */
+ if (bh || (err != ERR_BAD_DX_DIR))
+ return bh;
+ dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
+ }
+#endif
+ nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+ start = EXT4_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+ start = 0;
+ block = start;
+restart:
+ do {
+ /*
+ * We deal with the read-ahead logic here.
+ */
+ if (ra_ptr >= ra_max) {
+ /* Refill the readahead buffer */
+ ra_ptr = 0;
+ b = block;
+ for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+ /*
+ * Terminate if we reach the end of the
+ * directory and must wrap, or if our
+ * search has finished at this block.
+ */
+ if (b >= nblocks || (num && block == start)) {
+ bh_use[ra_max] = NULL;
+ break;
+ }
+ num++;
+ bh = ext4_getblk(NULL, dir, b++, 0, &err);
+ bh_use[ra_max] = bh;
+ if (bh)
+ ll_rw_block(READ_META, 1, &bh);
+ }
+ }
+ if ((bh = bh_use[ra_ptr++]) == NULL)
+ goto next;
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ /* read error, skip block & hope for the best */
+ ext4_error(sb, __FUNCTION__, "reading directory #%lu "
+ "offset %lu", dir->i_ino, block);
+ brelse(bh);
+ goto next;
+ }
+ i = search_dirblock(bh, dir, dentry,
+ block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
+ if (i == 1) {
+ EXT4_I(dir)->i_dir_start_lookup = block;
+ ret = bh;
+ goto cleanup_and_exit;
+ } else {
+ brelse(bh);
+ if (i < 0)
+ goto cleanup_and_exit;
+ }
+ next:
+ if (++block >= nblocks)
+ block = 0;
+ } while (block != start);
+
+ /*
+ * If the directory has grown while we were searching, then
+ * search the last part of the directory before giving up.
+ */
+ block = nblocks;
+ nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+ if (block < nblocks) {
+ start = 0;
+ goto restart;
+ }
+
+cleanup_and_exit:
+ /* Clean up the read-ahead blocks */
+ for (; ra_ptr < ra_max; ra_ptr++)
+ brelse (bh_use[ra_ptr]);
+ return ret;
+}
+
+#ifdef CONFIG_EXT4_INDEX
+static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+ struct ext4_dir_entry_2 **res_dir, int *err)
+{
+ struct super_block * sb;
+ struct dx_hash_info hinfo;
+ u32 hash;
+ struct dx_frame frames[2], *frame;
+ struct ext4_dir_entry_2 *de, *top;
+ struct buffer_head *bh;
+ unsigned long block;
+ int retval;
+ int namelen = dentry->d_name.len;
+ const u8 *name = dentry->d_name.name;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ sb = dir->i_sb;
+ /* NFS may look up ".." - look at dx_root directory block */
+ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+ if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+ return NULL;
+ } else {
+ frame = frames;
+ frame->bh = NULL; /* for dx_release() */
+ frame->at = (struct dx_entry *)frames; /* hack for zero entry*/
+ dx_set_block(frame->at, 0); /* dx_root block is 0 */
+ }
+ hash = hinfo.hash;
+ do {
+ block = dx_get_block(frame->at);
+ if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+ goto errout;
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
+ EXT4_DIR_REC_LEN(0));
+ for (; de < top; de = ext4_next_entry(de))
+ if (ext4_match (namelen, name, de)) {
+ if (!ext4_check_dir_entry("ext4_find_entry",
+ dir, de, bh,
+ (block<<EXT4_BLOCK_SIZE_BITS(sb))
+ +((char *)de - bh->b_data))) {
+ brelse (bh);
+ goto errout;
+ }
+ *res_dir = de;
+ dx_release (frames);
+ return bh;
+ }
+ brelse (bh);
+ /* Check to see if we should continue to search */
+ retval = ext4_htree_next_block(dir, hash, frame,
+ frames, NULL);
+ if (retval < 0) {
+ ext4_warning(sb, __FUNCTION__,
+ "error reading index page in directory #%lu",
+ dir->i_ino);
+ *err = retval;
+ goto errout;
+ }
+ } while (retval == 1);
+
+ *err = -ENOENT;
+errout:
+ dxtrace(printk("%s not found\n", name));
+ dx_release (frames);
+ return NULL;
+}
+#endif
+
+static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode * inode;
+ struct ext4_dir_entry_2 * de;
+ struct buffer_head * bh;
+
+ if (dentry->d_name.len > EXT4_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ bh = ext4_find_entry(dentry, &de);
+ inode = NULL;
+ if (bh) {
+ unsigned long ino = le32_to_cpu(de->inode);
+ brelse (bh);
+ if (!ext4_valid_inum(dir->i_sb, ino)) {
+ ext4_error(dir->i_sb, "ext4_lookup",
+ "bad inode number: %lu", ino);
+ inode = NULL;
+ } else
+ inode = iget(dir->i_sb, ino);
+
+ if (!inode)
+ return ERR_PTR(-EACCES);
+ }
+ return d_splice_alias(inode, dentry);
+}
+
+
+struct dentry *ext4_get_parent(struct dentry *child)
+{
+ unsigned long ino;
+ struct dentry *parent;
+ struct inode *inode;
+ struct dentry dotdot;
+ struct ext4_dir_entry_2 * de;
+ struct buffer_head *bh;
+
+ dotdot.d_name.name = "..";
+ dotdot.d_name.len = 2;
+ dotdot.d_parent = child; /* confusing, isn't it! */
+
+ bh = ext4_find_entry(&dotdot, &de);
+ inode = NULL;
+ if (!bh)
+ return ERR_PTR(-ENOENT);
+ ino = le32_to_cpu(de->inode);
+ brelse(bh);
+
+ if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
+ ext4_error(child->d_inode->i_sb, "ext4_get_parent",
+ "bad inode number: %lu", ino);
+ inode = NULL;
+ } else
+ inode = iget(child->d_inode->i_sb, ino);
+
+ if (!inode)
+ return ERR_PTR(-EACCES);
+
+ parent = d_alloc_anon(inode);
+ if (!parent) {
+ iput(inode);
+ parent = ERR_PTR(-ENOMEM);
+ }
+ return parent;
+}
+
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+ struct ext4_dir_entry_2 *de,
+ umode_t mode) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+#ifdef CONFIG_EXT4_INDEX
+static struct ext4_dir_entry_2 *
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+{
+ unsigned rec_len = 0;
+
+ while (count--) {
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+ rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ memcpy (to, de, rec_len);
+ ((struct ext4_dir_entry_2 *) to)->rec_len =
+ cpu_to_le16(rec_len);
+ de->inode = 0;
+ map++;
+ to += rec_len;
+ }
+ return (struct ext4_dir_entry_2 *) (to - rec_len);
+}
+
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+{
+ struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
+ unsigned rec_len = 0;
+
+ prev = to = de;
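+ /* Compact live entries toward the front of the block, dropping deleted ones. */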
+ while ((char*)de < base + size) {
+ next = (struct ext4_dir_entry_2 *) ((char *) de +
+ le16_to_cpu(de->rec_len));
+ if (de->inode && de->name_len) {
+ rec_len = EXT4_DIR_REC_LEN(de->name_len);
+ if (de > to)
+ memmove(to, de, rec_len);
+ to->rec_len = cpu_to_le16(rec_len);
+ prev = to;
+ to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
+ }
+ de = next;
+ }
+ return prev;
+}
+
+static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+ struct buffer_head **bh,struct dx_frame *frame,
+ struct dx_hash_info *hinfo, int *error)
+{
+ unsigned blocksize = dir->i_sb->s_blocksize;
+ unsigned count, continued;
+ struct buffer_head *bh2;
+ u32 newblock;
+ u32 hash2;
+ struct dx_map_entry *map;
+ char *data1 = (*bh)->b_data, *data2;
+ unsigned split;
+ struct ext4_dir_entry_2 *de = NULL, *de2;
+ int err;
+
+ bh2 = ext4_append (handle, dir, &newblock, error);
+ if (!(bh2)) {
+ brelse(*bh);
+ *bh = NULL;
+ goto errout;
+ }
+
+ BUFFER_TRACE(*bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, *bh);
+ if (err) {
+ journal_error:
+ brelse(*bh);
+ brelse(bh2);
+ *bh = NULL;
+ ext4_std_error(dir->i_sb, err);
+ goto errout;
+ }
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
+
+ data2 = bh2->b_data;
+
+ /* create map in the end of data2 block */
+ map = (struct dx_map_entry *) (data2 + blocksize);
+ count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
+ blocksize, hinfo, map);
+ map -= count;
+ split = count/2; // need to adjust to actual middle
+ dx_sort_map (map, count);
+ hash2 = map[split].hash;
+ continued = hash2 == map[split - 1].hash;
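+ /*
+ * If entries with the same hash straddle the split point, add one to
+ * the hash stored in the new index entry to mark it as a continuation.
+ */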
+ dxtrace(printk("Split block %i at %x, %i/%i\n",
+ dx_get_block(frame->at), hash2, split, count-split));
+
+ /* Fancy dance to stay within two buffers */
+ de2 = dx_move_dirents(data1, data2, map + split, count - split);
+ de = dx_pack_dirents(data1,blocksize);
+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+ dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
+ dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
+
+ /* Which block gets the new entry? */
+ if (hinfo->hash >= hash2)
+ {
+ swap(*bh, bh2);
+ de = de2;
+ }
+ dx_insert_block (frame, hash2 + continued, newblock);
+ err = ext4_journal_dirty_metadata (handle, bh2);
+ if (err)
+ goto journal_error;
+ err = ext4_journal_dirty_metadata (handle, frame->bh);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
+ dxtrace(dx_show_index ("frame", frame->entries));
+errout:
+ return de;
+}
+#endif
+
+
+/*
+ * Add a new entry into a directory (leaf) block. If de is non-NULL,
+ * it points to a directory entry which is guaranteed to be large
+ * enough for the new directory entry. If de is NULL, then
+ * add_dirent_to_buf will search the directory block for
+ * space. It returns -ENOSPC if no space is available, -EIO if the
+ * block fails its consistency check, and -EEXIST if the entry
+ * already exists.
+ *
+ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
+ * all other cases bh is released.
+ */
+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+ struct inode *inode, struct ext4_dir_entry_2 *de,
+ struct buffer_head * bh)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned long offset = 0;
+ unsigned short reclen;
+ int nlen, rlen, err;
+ char *top;
+
+ reclen = EXT4_DIR_REC_LEN(namelen);
+ if (!de) {
+ de = (struct ext4_dir_entry_2 *)bh->b_data;
+ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+ while ((char *) de <= top) {
+ if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
+ bh, offset)) {
+ brelse (bh);
+ return -EIO;
+ }
+ if (ext4_match (namelen, name, de)) {
+ brelse (bh);
+ return -EEXIST;
+ }
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = le16_to_cpu(de->rec_len);
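+ /*
+ * A deleted entry (inode == 0) can be reused whole; a live entry
+ * can only give up the slack beyond its own record length.
+ */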
+ if ((de->inode? rlen - nlen: rlen) >= reclen)
+ break;
+ de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+ offset += rlen;
+ }
+ if ((char *) de > top)
+ return -ENOSPC;
+ }
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ ext4_std_error(dir->i_sb, err);
+ brelse(bh);
+ return err;
+ }
+
+ /* By now the buffer is marked for journaling */
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = le16_to_cpu(de->rec_len);
+ if (de->inode) {
+ struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
+ de1->rec_len = cpu_to_le16(rlen - nlen);
+ de->rec_len = cpu_to_le16(nlen);
+ de = de1;
+ }
+ de->file_type = EXT4_FT_UNKNOWN;
+ if (inode) {
+ de->inode = cpu_to_le32(inode->i_ino);
+ ext4_set_de_type(dir->i_sb, de, inode->i_mode);
+ } else
+ de->inode = 0;
+ de->name_len = namelen;
+ memcpy (de->name, name, namelen);
+ /*
+ * XXX shouldn't update any times until successful
+ * completion of syscall, but too many callers depend
+ * on this.
+ *
+ * XXX similarly, too many callers depend on
+ * ext4_new_inode() setting the times, but error
+ * recovery deletes the inode, so the worst that can
+ * happen is that the times are slightly out of date
+ * and/or different from the directory change time.
+ */
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+ ext4_update_dx_flag(dir);
+ dir->i_version++;
+ ext4_mark_inode_dirty(handle, dir);
+ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+ err = ext4_journal_dirty_metadata(handle, bh);
+ if (err)
+ ext4_std_error(dir->i_sb, err);
+ brelse(bh);
+ return 0;
+}
+
+#ifdef CONFIG_EXT4_INDEX
+/*
+ * This converts a one block unindexed directory to a 3 block indexed
+ * directory, and adds the dentry to the indexed directory.
+ */
+static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+ struct inode *inode, struct buffer_head *bh)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct buffer_head *bh2;
+ struct dx_root *root;
+ struct dx_frame frames[2], *frame;
+ struct dx_entry *entries;
+ struct ext4_dir_entry_2 *de, *de2;
+ char *data1, *top;
+ unsigned len;
+ int retval;
+ unsigned blocksize;
+ struct dx_hash_info hinfo;
+ u32 block;
+ struct fake_dirent *fde;
+
+ blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
+ retval = ext4_journal_get_write_access(handle, bh);
+ if (retval) {
+ ext4_std_error(dir->i_sb, retval);
+ brelse(bh);
+ return retval;
+ }
+ root = (struct dx_root *) bh->b_data;
+
+ bh2 = ext4_append (handle, dir, &block, &retval);
+ if (!(bh2)) {
+ brelse(bh);
+ return retval;
+ }
+ EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+ data1 = bh2->b_data;
+
+ /* The 0th block becomes the root, move the dirents out */
+ fde = &root->dotdot;
+ de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
+ memcpy (data1, de, len);
+ de = (struct ext4_dir_entry_2 *) data1;
+ top = data1 + len;
+ while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
+ de = de2;
+ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+ /* Initialize the root; the dot dirents already exist */
+ de = (struct ext4_dir_entry_2 *) (&root->dotdot);
+ de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
+ memset (&root->info, 0, sizeof(root->info));
+ root->info.info_length = sizeof(root->info);
+ root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ entries = root->entries;
+ dx_set_block (entries, 1);
+ dx_set_count (entries, 1);
+ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
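+ /*
+ * Block 0 is now the dx_root holding only ".", ".." and the index;
+ * the old entries were copied to block 1, which do_split() below
+ * splits into blocks 1 and 2.
+ */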
+
+ /* Initialize as for dx_probe */
+ hinfo.hash_version = root->info.hash_version;
+ hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+ ext4fs_dirhash(name, namelen, &hinfo);
+ frame = frames;
+ frame->entries = entries;
+ frame->at = entries;
+ frame->bh = bh;
+ bh = bh2;
+ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+ dx_release (frames);
+ if (!(de))
+ return retval;
+
+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
+}
+#endif
+
+/*
+ * ext4_add_entry()
+ *
+ * adds a file entry to the specified directory, using the same
+ * semantics as ext4_find_entry(). It returns 0 on success or a
+ * negative error code on failure.
+ *
+ * NOTE!! The inode part of 'de' is left at 0 - which means you
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ unsigned long offset;
+ struct buffer_head * bh;
+ struct ext4_dir_entry_2 *de;
+ struct super_block * sb;
+ int retval;
+#ifdef CONFIG_EXT4_INDEX
+ int dx_fallback=0;
+#endif
+ unsigned blocksize;
+ u32 block, blocks;
+
+ sb = dir->i_sb;
+ blocksize = sb->s_blocksize;
+ if (!dentry->d_name.len)
+ return -EINVAL;
+#ifdef CONFIG_EXT4_INDEX
+ if (is_dx(dir)) {
+ retval = ext4_dx_add_entry(handle, dentry, inode);
+ if (!retval || (retval != ERR_BAD_DX_DIR))
+ return retval;
+ EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+ dx_fallback++;
+ ext4_mark_inode_dirty(handle, dir);
+ }
+#endif
+ blocks = dir->i_size >> sb->s_blocksize_bits;
+ for (block = 0, offset = 0; block < blocks; block++) {
+ bh = ext4_bread(handle, dir, block, 0, &retval);
+ if(!bh)
+ return retval;
+ retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (retval != -ENOSPC)
+ return retval;
+
+#ifdef CONFIG_EXT4_INDEX
+ if (blocks == 1 && !dx_fallback &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
+ return make_indexed_dir(handle, dentry, inode, bh);
+#endif
+ brelse(bh);
+ }
+ bh = ext4_append(handle, dir, &block, &retval);
+ if (!bh)
+ return retval;
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ de->inode = 0;
+ de->rec_len = cpu_to_le16(blocksize);
+ return add_dirent_to_buf(handle, dentry, inode, de, bh);
+}
+
+#ifdef CONFIG_EXT4_INDEX
+/*
+ * Returns 0 for success, or a negative error value
+ */
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ struct dx_frame frames[2], *frame;
+ struct dx_entry *entries, *at;
+ struct dx_hash_info hinfo;
+ struct buffer_head * bh;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct super_block * sb = dir->i_sb;
+ struct ext4_dir_entry_2 *de;
+ int err;
+
+ frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+ if (!frame)
+ return err;
+ entries = frame->entries;
+ at = frame->at;
+
+ if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+ goto cleanup;
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto journal_error;
+
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+ if (err != -ENOSPC) {
+ bh = NULL;
+ goto cleanup;
+ }
+
+ /* Block full, should compress but for now just split */
+ dxtrace(printk("using %u of %u node entries\n",
+ dx_get_count(entries), dx_get_limit(entries)));
+ /* Need to split index? */
+ if (dx_get_count(entries) == dx_get_limit(entries)) {
+ u32 newblock;
+ unsigned icount = dx_get_count(entries);
+ int levels = frame - frames;
+ struct dx_entry *entries2;
+ struct dx_node *node2;
+ struct buffer_head *bh2;
+
+ if (levels && (dx_get_count(frames->entries) ==
+ dx_get_limit(frames->entries))) {
+ ext4_warning(sb, __FUNCTION__,
+ "Directory index full!");
+ err = -ENOSPC;
+ goto cleanup;
+ }
+ bh2 = ext4_append (handle, dir, &newblock, &err);
+ if (!(bh2))
+ goto cleanup;
+ node2 = (struct dx_node *)(bh2->b_data);
+ entries2 = node2->entries;
+ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+ node2->fake.inode = 0;
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+ goto journal_error;
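+ /*
+ * Two cases: if a second level already exists, split the full
+ * index node in two; otherwise push the root entries down into
+ * a new node and grow the tree to two levels.
+ */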
+ if (levels) {
+ unsigned icount1 = icount/2, icount2 = icount - icount1;
+ unsigned hash2 = dx_get_hash(entries + icount1);
+ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+
+ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+ err = ext4_journal_get_write_access(handle,
+ frames[0].bh);
+ if (err)
+ goto journal_error;
+
+ memcpy ((char *) entries2, (char *) (entries + icount1),
+ icount2 * sizeof(struct dx_entry));
+ dx_set_count (entries, icount1);
+ dx_set_count (entries2, icount2);
+ dx_set_limit (entries2, dx_node_limit(dir));
+
+ /* Which index block gets the new entry? */
+ if (at - entries >= icount1) {
+ frame->at = at = at - entries - icount1 + entries2;
+ frame->entries = entries = entries2;
+ swap(frame->bh, bh2);
+ }
+ dx_insert_block (frames + 0, hash2, newblock);
+ dxtrace(dx_show_index ("node", frames[1].entries));
+ dxtrace(dx_show_index ("node",
+ ((struct dx_node *) bh2->b_data)->entries));
+ err = ext4_journal_dirty_metadata(handle, bh2);
+ if (err)
+ goto journal_error;
+ brelse (bh2);
+ } else {
+ dxtrace(printk("Creating second level index...\n"));
+ memcpy((char *) entries2, (char *) entries,
+ icount * sizeof(struct dx_entry));
+ dx_set_limit(entries2, dx_node_limit(dir));
+
+ /* Set up root */
+ dx_set_count(entries, 1);
+ dx_set_block(entries + 0, newblock);
+ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+
+ /* Add new access path frame */
+ frame = frames + 1;
+ frame->at = at = at - entries + entries2;
+ frame->entries = entries = entries2;
+ frame->bh = bh2;
+ err = ext4_journal_get_write_access(handle,
+ frame->bh);
+ if (err)
+ goto journal_error;
+ }
+ ext4_journal_dirty_metadata(handle, frames[0].bh);
+ }
+ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+ if (!de)
+ goto cleanup;
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ bh = NULL;
+ goto cleanup;
+
+journal_error:
+ ext4_std_error(dir->i_sb, err);
+cleanup:
+ if (bh)
+ brelse(bh);
+ dx_release(frames);
+ return err;
+}
+#endif
+
+/*
+ * ext4_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+static int ext4_delete_entry (handle_t *handle,
+ struct inode * dir,
+ struct ext4_dir_entry_2 * de_del,
+ struct buffer_head * bh)
+{
+ struct ext4_dir_entry_2 * de, * pde;
+ int i;
+
+ i = 0;
+ pde = NULL;
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ while (i < bh->b_size) {
+ if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
+ return -EIO;
+ if (de == de_del) {
+ BUFFER_TRACE(bh, "get_write_access");
+ ext4_journal_get_write_access(handle, bh);
+ if (pde)
+ pde->rec_len =
+ cpu_to_le16(le16_to_cpu(pde->rec_len) +
+ le16_to_cpu(de->rec_len));
+ else
+ de->inode = 0;
+ dir->i_version++;
+ BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+ ext4_journal_dirty_metadata(handle, bh);
+ return 0;
+ }
+ i += le16_to_cpu(de->rec_len);
+ pde = de;
+ de = (struct ext4_dir_entry_2 *)
+ ((char *) de + le16_to_cpu(de->rec_len));
+ }
+ return -ENOENT;
+}
+
+/*
+ * ext4_mark_inode_dirty is somewhat expensive, so unlike ext2 we
+ * do not perform it in these functions. We perform it at the call site,
+ * if it is needed.
+ */
+static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+ inc_nlink(inode);
+}
+
+static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+ drop_nlink(inode);
+}
+
+static int ext4_add_nondir(handle_t *handle,
+ struct dentry *dentry, struct inode *inode)
+{
+ int err = ext4_add_entry(handle, dentry, inode);
+ if (!err) {
+ ext4_mark_inode_dirty(handle, inode);
+ d_instantiate(dentry, inode);
+ return 0;
+ }
+ ext4_dec_count(handle, inode);
+ iput(inode);
+ return err;
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
+ struct nameidata *nd)
+{
+ handle_t *handle;
+ struct inode * inode;
+ int err, retries = 0;
+
+retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+ inode = ext4_new_inode (handle, dir, mode);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext4_file_inode_operations;
+ inode->i_fop = &ext4_file_operations;
+ ext4_set_aops(inode);
+ err = ext4_add_nondir(handle, dentry, inode);
+ }
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+}
+
+static int ext4_mknod (struct inode * dir, struct dentry *dentry,
+ int mode, dev_t rdev)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, retries = 0;
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+ inode = ext4_new_inode (handle, dir, mode);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ init_special_inode(inode, inode->i_mode, rdev);
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ inode->i_op = &ext4_special_inode_operations;
+#endif
+ err = ext4_add_nondir(handle, dentry, inode);
+ }
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+}
+
+static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+{
+ handle_t *handle;
+ struct inode * inode;
+ struct buffer_head * dir_block;
+ struct ext4_dir_entry_2 * de;
+ int err, retries = 0;
+
+ if (dir->i_nlink >= EXT4_LINK_MAX)
+ return -EMLINK;
+
+retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+ inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+
+ inode->i_op = &ext4_dir_inode_operations;
+ inode->i_fop = &ext4_dir_operations;
+ inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ dir_block = ext4_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+ drop_nlink(inode); /* is this nlink == 0? */
+ ext4_mark_inode_dirty(handle, inode);
+ iput (inode);
+ goto out_stop;
+ }
+ BUFFER_TRACE(dir_block, "get_write_access");
+ ext4_journal_get_write_access(handle, dir_block);
+ de = (struct ext4_dir_entry_2 *) dir_block->b_data;
+ de->inode = cpu_to_le32(inode->i_ino);
+ de->name_len = 1;
+ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
+ strcpy (de->name, ".");
+ ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+ de = (struct ext4_dir_entry_2 *)
+ ((char *) de + le16_to_cpu(de->rec_len));
+ de->inode = cpu_to_le32(dir->i_ino);
+ de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
+ de->name_len = 2;
+ strcpy (de->name, "..");
+ ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+ inode->i_nlink = 2;
+ BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
+ ext4_journal_dirty_metadata(handle, dir_block);
+ brelse (dir_block);
+ ext4_mark_inode_dirty(handle, inode);
+ err = ext4_add_entry (handle, dentry, inode);
+ if (err) {
+ inode->i_nlink = 0;
+ ext4_mark_inode_dirty(handle, inode);
+ iput (inode);
+ goto out_stop;
+ }
+ inc_nlink(dir);
+ ext4_update_dx_flag(dir);
+ ext4_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+out_stop:
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+static int empty_dir (struct inode * inode)
+{
+ unsigned long offset;
+ struct buffer_head * bh;
+ struct ext4_dir_entry_2 * de, * de1;
+ struct super_block * sb;
+ int err = 0;
+
+ sb = inode->i_sb;
+ if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
+ !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+ if (err)
+ ext4_error(inode->i_sb, __FUNCTION__,
+ "error %d reading directory #%lu offset 0",
+ err, inode->i_ino);
+ else
+ ext4_warning(inode->i_sb, __FUNCTION__,
+ "bad directory (dir #%lu) - no data block",
+ inode->i_ino);
+ return 1;
+ }
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ de1 = (struct ext4_dir_entry_2 *)
+ ((char *) de + le16_to_cpu(de->rec_len));
+ if (le32_to_cpu(de->inode) != inode->i_ino ||
+ !le32_to_cpu(de1->inode) ||
+ strcmp (".", de->name) ||
+ strcmp ("..", de1->name)) {
+ ext4_warning (inode->i_sb, "empty_dir",
+ "bad directory (dir #%lu) - no `.' or `..'",
+ inode->i_ino);
+ brelse (bh);
+ return 1;
+ }
+ offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+ de = (struct ext4_dir_entry_2 *)
+ ((char *) de1 + le16_to_cpu(de1->rec_len));
+ while (offset < inode->i_size ) {
+ if (!bh ||
+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ err = 0;
+ brelse (bh);
+ bh = ext4_bread (NULL, inode,
+ offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+ if (!bh) {
+ if (err)
+ ext4_error(sb, __FUNCTION__,
+ "error %d reading directory"
+ " #%lu offset %lu",
+ err, inode->i_ino, offset);
+ offset += sb->s_blocksize;
+ continue;
+ }
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ }
+ if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
+ de = (struct ext4_dir_entry_2 *)(bh->b_data +
+ sb->s_blocksize);
+ offset = (offset | (sb->s_blocksize - 1)) + 1;
+ continue;
+ }
+ if (le32_to_cpu(de->inode)) {
+ brelse (bh);
+ return 0;
+ }
+ offset += le16_to_cpu(de->rec_len);
+ de = (struct ext4_dir_entry_2 *)
+ ((char *) de + le16_to_cpu(de->rec_len));
+ }
+ brelse (bh);
+ return 1;
+}
+
+/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_iloc iloc;
+ int err = 0, rc;
+
+ lock_super(sb);
+ if (!list_empty(&EXT4_I(inode)->i_orphan))
+ goto out_unlock;
+
+ /* Orphan handling is only valid for files with data blocks
+ * being truncated, or files being unlinked. */
+
+ /* @@@ FIXME: Observation from aviro:
+ * I think I can trigger J_ASSERT in ext4_orphan_add(). We block
+ * here (on lock_super()), so we can race with ext4_link(), which
+ * might bump ->i_nlink for, say, a character device: not a regular
+ * file, not a directory, not a symlink, and ->i_nlink > 0.
+ */
+ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_unlock;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_unlock;
+
+ /* Insert this inode at the head of the on-disk orphan list... */
+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+ EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+
+ /* Only add to the head of the in-memory list if all the
+ * previous operations succeeded. If the orphan_add is going to
+ * fail (possibly taking the journal offline), we can't risk
+ * leaving the inode on the orphan list: stray orphan-list
+ * entries can cause panics at unmount time.
+ *
+ * This is safe: on error we're going to ignore the orphan list
+ * anyway on the next recovery. */
+ if (!err)
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+
+ jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+ jbd_debug(4, "orphan inode %lu will point to %d\n",
+ inode->i_ino, NEXT_ORPHAN(inode));
+out_unlock:
+ unlock_super(sb);
+ ext4_std_error(inode->i_sb, err);
+ return err;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+ struct list_head *prev;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi;
+ unsigned long ino_next;
+ struct ext4_iloc iloc;
+ int err = 0;
+
+ lock_super(inode->i_sb);
+ if (list_empty(&ei->i_orphan)) {
+ unlock_super(inode->i_sb);
+ return 0;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+ prev = ei->i_orphan.prev;
+ sbi = EXT4_SB(inode->i_sb);
+
+ jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+ list_del_init(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+ * disk, but we still need to remove the inode from the linked
+ * list in memory. */
+ if (!handle)
+ goto out;
+
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_err;
+
+ if (prev == &sbi->s_orphan) {
+ jbd_debug(4, "superblock will point to %lu\n", ino_next);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto out_brelse;
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+ } else {
+ struct ext4_iloc iloc2;
+ struct inode *i_prev =
+ &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+ jbd_debug(4, "orphan inode %lu will point to %lu\n",
+ i_prev->i_ino, ino_next);
+ err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(i_prev) = ino_next;
+ err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ }
+ if (err)
+ goto out_brelse;
+ NEXT_ORPHAN(inode) = 0;
+ err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+out_err:
+ ext4_std_error(inode->i_sb, err);
+out:
+ unlock_super(inode->i_sb);
+ return err;
+
+out_brelse:
+ brelse(iloc.bh);
+ goto out_err;
+}
+
+static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+{
+ int retval;
+ struct inode * inode;
+ struct buffer_head * bh;
+ struct ext4_dir_entry_2 * de;
+ handle_t *handle;
+
+ /* Initialize quotas before so that eventual writes go in
+ * separate transaction */
+ DQUOT_INIT(dentry->d_inode);
+ handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ retval = -ENOENT;
+ bh = ext4_find_entry (dentry, &de);
+ if (!bh)
+ goto end_rmdir;
+
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+ inode = dentry->d_inode;
+
+ retval = -EIO;
+ if (le32_to_cpu(de->inode) != inode->i_ino)
+ goto end_rmdir;
+
+ retval = -ENOTEMPTY;
+ if (!empty_dir (inode))
+ goto end_rmdir;
+
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto end_rmdir;
+ if (inode->i_nlink != 2)
+ ext4_warning (inode->i_sb, "ext4_rmdir",
+ "empty directory has nlink!=2 (%d)",
+ inode->i_nlink);
+ inode->i_version++;
+ clear_nlink(inode);
+ /* There's no need to set i_disksize: the fact that i_nlink is
+ * zero will ensure that the right thing happens during any
+ * recovery. */
+ inode->i_size = 0;
+ ext4_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext4_mark_inode_dirty(handle, inode);
+ drop_nlink(dir);
+ ext4_update_dx_flag(dir);
+ ext4_mark_inode_dirty(handle, dir);
+
+end_rmdir:
+ ext4_journal_stop(handle);
+ brelse (bh);
+ return retval;
+}
+
+static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+{
+ int retval;
+ struct inode * inode;
+ struct buffer_head * bh;
+ struct ext4_dir_entry_2 * de;
+ handle_t *handle;
+
+ /* Initialize quotas before so that eventual writes go
+ * in separate transaction */
+ DQUOT_INIT(dentry->d_inode);
+ handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+ retval = -ENOENT;
+ bh = ext4_find_entry (dentry, &de);
+ if (!bh)
+ goto end_unlink;
+
+ inode = dentry->d_inode;
+
+ retval = -EIO;
+ if (le32_to_cpu(de->inode) != inode->i_ino)
+ goto end_unlink;
+
+ if (!inode->i_nlink) {
+ ext4_warning (inode->i_sb, "ext4_unlink",
+ "Deleting nonexistent file (%lu), %d",
+ inode->i_ino, inode->i_nlink);
+ inode->i_nlink = 1;
+ }
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ if (retval)
+ goto end_unlink;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+ ext4_update_dx_flag(dir);
+ ext4_mark_inode_dirty(handle, dir);
+ drop_nlink(inode);
+ if (!inode->i_nlink)
+ ext4_orphan_add(handle, inode);
+ inode->i_ctime = dir->i_ctime;
+ ext4_mark_inode_dirty(handle, inode);
+ retval = 0;
+
+end_unlink:
+ ext4_journal_stop(handle);
+ brelse (bh);
+ return retval;
+}
+
+static int ext4_symlink (struct inode * dir,
+ struct dentry *dentry, const char * symname)
+{
+ handle_t *handle;
+ struct inode * inode;
+ int l, err, retries = 0;
+
+ l = strlen(symname)+1;
+ if (l > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
+
+retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
+ 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+ inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+
+ if (l > sizeof (EXT4_I(inode)->i_data)) {
+ inode->i_op = &ext4_symlink_inode_operations;
+ ext4_set_aops(inode);
+ /*
+ * page_symlink() calls into ext4_prepare/commit_write.
+ * We have a transaction open. All is sweetness. It also sets
+ * i_size in generic_commit_write().
+ */
+ err = __page_symlink(inode, symname, l,
+ mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ if (err) {
+ ext4_dec_count(handle, inode);
+ ext4_mark_inode_dirty(handle, inode);
+ iput (inode);
+ goto out_stop;
+ }
+ } else {
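+ /* Fast symlink: the target fits inside the inode's i_data array. */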
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+ inode->i_size = l-1;
+ }
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ err = ext4_add_nondir(handle, dentry, inode);
+out_stop:
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+}
+
+static int ext4_link (struct dentry * old_dentry,
+ struct inode * dir, struct dentry *dentry)
+{
+ handle_t *handle;
+ struct inode *inode = old_dentry->d_inode;
+ int err, retries = 0;
+
+ if (inode->i_nlink >= EXT4_LINK_MAX)
+ return -EMLINK;
+
+retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+ inode->i_ctime = CURRENT_TIME_SEC;
+ ext4_inc_count(handle, inode);
+ atomic_inc(&inode->i_count);
+
+ err = ext4_add_nondir(handle, dentry, inode);
+ ext4_journal_stop(handle);
+ if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+ goto retry;
+ return err;
+}
+
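+/* "." is the first entry in a directory block; ".." immediately follows it. */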
+#define PARENT_INO(buffer) \
+ ((struct ext4_dir_entry_2 *) ((char *) buffer + \
+ le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
+
+/*
+ * Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ */
+static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
+ struct inode * new_dir,struct dentry *new_dentry)
+{
+ handle_t *handle;
+ struct inode * old_inode, * new_inode;
+ struct buffer_head * old_bh, * new_bh, * dir_bh;
+ struct ext4_dir_entry_2 * old_de, * new_de;
+ int retval;
+
+ old_bh = new_bh = dir_bh = NULL;
+
+ /* Initialize quotas before so that eventual writes go
+ * in separate transaction */
+ if (new_dentry->d_inode)
+ DQUOT_INIT(new_dentry->d_inode);
+ handle = ext4_journal_start(old_dir, 2 *
+ EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ handle->h_sync = 1;
+
+ old_bh = ext4_find_entry (old_dentry, &old_de);
+ /*
+ * The check of the inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+ * and merrily kill the link to whatever was created under the
+ * same name. Goodbye sticky bit ;-<
+ */
+ old_inode = old_dentry->d_inode;
+ retval = -ENOENT;
+ if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+ goto end_rename;
+
+ new_inode = new_dentry->d_inode;
+ new_bh = ext4_find_entry (new_dentry, &new_de);
+ if (new_bh) {
+ if (!new_inode) {
+ brelse (new_bh);
+ new_bh = NULL;
+ }
+ }
+ if (S_ISDIR(old_inode->i_mode)) {
+ if (new_inode) {
+ retval = -ENOTEMPTY;
+ if (!empty_dir (new_inode))
+ goto end_rename;
+ }
+ retval = -EIO;
+ dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+ if (!dir_bh)
+ goto end_rename;
+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ goto end_rename;
+ retval = -EMLINK;
+ if (!new_inode && new_dir!=old_dir &&
+ new_dir->i_nlink >= EXT4_LINK_MAX)
+ goto end_rename;
+ }
+ if (!new_bh) {
+ retval = ext4_add_entry (handle, new_dentry, old_inode);
+ if (retval)
+ goto end_rename;
+ } else {
+ BUFFER_TRACE(new_bh, "get write access");
+ ext4_journal_get_write_access(handle, new_bh);
+ new_de->inode = cpu_to_le32(old_inode->i_ino);
+ if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+ EXT4_FEATURE_INCOMPAT_FILETYPE))
+ new_de->file_type = old_de->file_type;
+ new_dir->i_version++;
+ BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
+ ext4_journal_dirty_metadata(handle, new_bh);
+ brelse(new_bh);
+ new_bh = NULL;
+ }
+
+ /*
+ * Like most other Unix systems, set the ctime for inodes on a
+ * rename.
+ */
+ old_inode->i_ctime = CURRENT_TIME_SEC;
+ ext4_mark_inode_dirty(handle, old_inode);
+
+ /*
+ * ok, that's it
+ */
+ if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
+ old_de->name_len != old_dentry->d_name.len ||
+ strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
+ (retval = ext4_delete_entry(handle, old_dir,
+ old_de, old_bh)) == -ENOENT) {
+ /* old_de could have moved from under us during htree split, so
+ * make sure that we are deleting the right entry. We might
+ * also be pointing to a stale entry in the unused part of
+ * old_bh so just checking inum and the name isn't enough. */
+ struct buffer_head *old_bh2;
+ struct ext4_dir_entry_2 *old_de2;
+
+ old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+ if (old_bh2) {
+ retval = ext4_delete_entry(handle, old_dir,
+ old_de2, old_bh2);
+ brelse(old_bh2);
+ }
+ }
+ if (retval) {
+ ext4_warning(old_dir->i_sb, "ext4_rename",
+ "Deleting old file (%lu), %d, error=%d",
+ old_dir->i_ino, old_dir->i_nlink, retval);
+ }
+
+ if (new_inode) {
+ drop_nlink(new_inode);
+ new_inode->i_ctime = CURRENT_TIME_SEC;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+ ext4_update_dx_flag(old_dir);
+ if (dir_bh) {
+ BUFFER_TRACE(dir_bh, "get_write_access");
+ ext4_journal_get_write_access(handle, dir_bh);
+ PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+ BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
+ ext4_journal_dirty_metadata(handle, dir_bh);
+ drop_nlink(old_dir);
+ if (new_inode) {
+ drop_nlink(new_inode);
+ } else {
+ inc_nlink(new_dir);
+ ext4_update_dx_flag(new_dir);
+ ext4_mark_inode_dirty(handle, new_dir);
+ }
+ }
+ ext4_mark_inode_dirty(handle, old_dir);
+ if (new_inode) {
+ ext4_mark_inode_dirty(handle, new_inode);
+ if (!new_inode->i_nlink)
+ ext4_orphan_add(handle, new_inode);
+ }
+ retval = 0;
+
+end_rename:
+ brelse (dir_bh);
+ brelse (old_bh);
+ brelse (new_bh);
+ ext4_journal_stop(handle);
+ return retval;
+}
+
+/*
+ * directories can handle most operations...
+ */
+struct inode_operations ext4_dir_inode_operations = {
+ .create = ext4_create,
+ .lookup = ext4_lookup,
+ .link = ext4_link,
+ .unlink = ext4_unlink,
+ .symlink = ext4_symlink,
+ .mkdir = ext4_mkdir,
+ .rmdir = ext4_rmdir,
+ .mknod = ext4_mknod,
+ .rename = ext4_rename,
+ .setattr = ext4_setattr,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .permission = ext4_permission,
+};
+
+struct inode_operations ext4_special_inode_operations = {
+ .setattr = ext4_setattr,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .permission = ext4_permission,
+};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
new file mode 100644
index 000000000000..5e4dfff36a00
--- /dev/null
+++ b/fs/ext4/namei.h
@@ -0,0 +1,8 @@
+/* linux/fs/ext4/namei.h
+ *
+ * Copyright (C) 2005 Simtec Electronics
+ * Ben Dooks <ben@simtec.co.uk>
+ *
+*/
+
+extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 000000000000..1e9578052cd3
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1045 @@
+/*
+ * linux/fs/ext4/resize.c
+ *
+ * Support for resizing an ext4 filesystem while it is mounted.
+ *
+ * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
+ *
+ * This could probably be made into a module, because it is not often used.
+ */
+
+
+#define EXT4FS_DEBUG
+
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+
+#define outside(b, first, last) ((b) < (first) || (b) >= (last))
+#define inside(b, first, last) ((b) >= (first) && (b) < (last))
+
+static int verify_group_input(struct super_block *sb,
+ struct ext4_new_group_data *input)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_fsblk_t start = ext4_blocks_count(es);
+ ext4_fsblk_t end = start + input->blocks_count;
+ unsigned group = input->group;
+ ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
+ unsigned overhead = ext4_bg_has_super(sb, group) ?
+ (1 + ext4_bg_num_gdb(sb, group) +
+ le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+ ext4_fsblk_t metaend = start + overhead;
+ struct buffer_head *bh = NULL;
+ ext4_grpblk_t free_blocks_count, offset;
+ int err = -EINVAL;
+
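+ /* Usable blocks: total minus sb/GDT overhead, the two bitmaps and the inode table. */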
+ input->free_blocks_count = free_blocks_count =
+ input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
+ "(%d free, %u reserved)\n",
+ ext4_bg_has_super(sb, input->group) ? "normal" :
+ "no-super", input->group, input->blocks_count,
+ free_blocks_count, input->reserved_blocks);
+
+ ext4_get_group_no_and_offset(sb, start, NULL, &offset);
+ if (group != sbi->s_groups_count)
+ ext4_warning(sb, __FUNCTION__,
+ "Cannot add at group %u (only %lu groups)",
+ input->group, sbi->s_groups_count);
+ else if (offset != 0)
+ ext4_warning(sb, __FUNCTION__, "Last group not full");
+ else if (input->reserved_blocks > input->blocks_count / 5)
+ ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
+ input->reserved_blocks);
+ else if (free_blocks_count < 0)
+ ext4_warning(sb, __FUNCTION__, "Bad blocks count %u",
+ input->blocks_count);
+ else if (!(bh = sb_bread(sb, end - 1)))
+ ext4_warning(sb, __FUNCTION__,
+ "Cannot read last block (%llu)",
+ end - 1);
+ else if (outside(input->block_bitmap, start, end))
+ ext4_warning(sb, __FUNCTION__,
+ "Block bitmap not in group (block %llu)",
+ input->block_bitmap);
+ else if (outside(input->inode_bitmap, start, end))
+ ext4_warning(sb, __FUNCTION__,
+ "Inode bitmap not in group (block %llu)",
+ input->inode_bitmap);
+ else if (outside(input->inode_table, start, end) ||
+ outside(itend - 1, start, end))
+ ext4_warning(sb, __FUNCTION__,
+ "Inode table not in group (blocks %llu-%llu)",
+ input->inode_table, itend - 1);
+ else if (input->inode_bitmap == input->block_bitmap)
+ ext4_warning(sb, __FUNCTION__,
+ "Block bitmap same as inode bitmap (%llu)",
+ input->block_bitmap);
+ else if (inside(input->block_bitmap, input->inode_table, itend))
+ ext4_warning(sb, __FUNCTION__,
+ "Block bitmap (%llu) in inode table (%llu-%llu)",
+ input->block_bitmap, input->inode_table, itend-1);
+ else if (inside(input->inode_bitmap, input->inode_table, itend))
+ ext4_warning(sb, __FUNCTION__,
+ "Inode bitmap (%llu) in inode table (%llu-%llu)",
+ input->inode_bitmap, input->inode_table, itend-1);
+ else if (inside(input->block_bitmap, start, metaend))
+ ext4_warning(sb, __FUNCTION__,
+ "Block bitmap (%llu) in GDT table"
+ " (%llu-%llu)",
+ input->block_bitmap, start, metaend - 1);
+ else if (inside(input->inode_bitmap, start, metaend))
+ ext4_warning(sb, __FUNCTION__,
+ "Inode bitmap (%llu) in GDT table"
+ " (%llu-%llu)",
+ input->inode_bitmap, start, metaend - 1);
+ else if (inside(input->inode_table, start, metaend) ||
+ inside(itend - 1, start, metaend))
+ ext4_warning(sb, __FUNCTION__,
+ "Inode table (%llu-%llu) overlaps"
+ "GDT table (%llu-%llu)",
+ input->inode_table, itend - 1, start, metaend - 1);
+ else
+ err = 0;
+ brelse(bh);
+
+ return err;
+}
+
+static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t blk)
+{
+ struct buffer_head *bh;
+ int err;
+
+ bh = sb_getblk(sb, blk);
+ if (!bh)
+ return ERR_PTR(-EIO);
+ if ((err = ext4_journal_get_write_access(handle, bh))) {
+ brelse(bh);
+ bh = ERR_PTR(err);
+ } else {
+ lock_buffer(bh);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ }
+
+ return bh;
+}
+
+/*
+ * To avoid calling the atomic setbit hundreds or thousands of times, we only
+ * need to use it within a single byte (to ensure we get endianness right).
+ * We can use memset for the rest of the bitmap as there are no other users.
+ */
+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+{
+ int i;
+
+ if (start_bit >= end_bit)
+ return;
+
+ ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
+ for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
+ ext4_set_bit(i, bitmap);
+ if (i < end_bit)
+ memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new group.
+ * This doesn't need to be part of the main transaction, since we are only
+ * changing blocks outside the actual filesystem. We still do journaling to
+ * ensure the recovery is correct in case of a failure just after resize.
+ * If any part of this fails, we simply abort the resize.
+ */
+static int setup_new_group_blocks(struct super_block *sb,
+ struct ext4_new_group_data *input)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
+ int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
+ le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
+ unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
+ struct buffer_head *bh;
+ handle_t *handle;
+ ext4_fsblk_t block;
+ ext4_grpblk_t bit;
+ int i;
+ int err = 0, err2;
+
+ handle = ext4_journal_start_sb(sb, reserved_gdb + gdblocks +
+ 2 + sbi->s_itb_per_group);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ lock_super(sb);
+ if (input->group != sbi->s_groups_count) {
+ err = -EBUSY;
+ goto exit_journal;
+ }
+
+ if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
+ err = PTR_ERR(bh);
+ goto exit_journal;
+ }
+
+ if (ext4_bg_has_super(sb, input->group)) {
+ ext4_debug("mark backup superblock %#04lx (+0)\n", start);
+ ext4_set_bit(0, bh->b_data);
+ }
+
+ /* Copy all of the GDT blocks into the backup in this group */
+ for (i = 0, bit = 1, block = start + 1;
+ i < gdblocks; i++, block++, bit++) {
+ struct buffer_head *gdb;
+
+ ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
+
+ gdb = sb_getblk(sb, block);
+ if (!gdb) {
+ err = -EIO;
+ goto exit_bh;
+ }
+ if ((err = ext4_journal_get_write_access(handle, gdb))) {
+ brelse(gdb);
+ goto exit_bh;
+ }
+ lock_buffer(bh);
+ memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
+ set_buffer_uptodate(gdb);
+ unlock_buffer(bh);
+ ext4_journal_dirty_metadata(handle, gdb);
+ ext4_set_bit(bit, bh->b_data);
+ brelse(gdb);
+ }
+
+ /* Zero out all of the reserved backup group descriptor table blocks */
+ for (i = 0, bit = gdblocks + 1, block = start + bit;
+ i < reserved_gdb; i++, block++, bit++) {
+ struct buffer_head *gdb;
+
+ ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
+
+ if (IS_ERR(gdb = bclean(handle, sb, block))) {
+ err = PTR_ERR(gdb);
+ goto exit_bh;
+ }
+ ext4_journal_dirty_metadata(handle, gdb);
+ ext4_set_bit(bit, bh->b_data);
+ brelse(gdb);
+ }
+ ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
+ input->block_bitmap - start);
+ ext4_set_bit(input->block_bitmap - start, bh->b_data);
+ ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
+ input->inode_bitmap - start);
+ ext4_set_bit(input->inode_bitmap - start, bh->b_data);
+
+ /* Zero out all of the inode table blocks */
+ for (i = 0, block = input->inode_table, bit = block - start;
+ i < sbi->s_itb_per_group; i++, bit++, block++) {
+ struct buffer_head *it;
+
+ ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
+ if (IS_ERR(it = bclean(handle, sb, block))) {
+ err = PTR_ERR(it);
+ goto exit_bh;
+ }
+ ext4_journal_dirty_metadata(handle, it);
+ brelse(it);
+ ext4_set_bit(bit, bh->b_data);
+ }
+ mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
+ bh->b_data);
+ ext4_journal_dirty_metadata(handle, bh);
+ brelse(bh);
+
+ /* Mark unused entries in inode bitmap used */
+ ext4_debug("clear inode bitmap %#04x (+%ld)\n",
+ input->inode_bitmap, input->inode_bitmap - start);
+ if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
+ err = PTR_ERR(bh);
+ goto exit_journal;
+ }
+
+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+ bh->b_data);
+ ext4_journal_dirty_metadata(handle, bh);
+exit_bh:
+ brelse(bh);
+
+exit_journal:
+ unlock_super(sb);
+ if ((err2 = ext4_journal_stop(handle)) && !err)
+ err = err2;
+
+ return err;
+}
+
+
+/*
+ * Iterate through the groups which hold BACKUP superblock/GDT copies in an
+ * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before
+ * calling this for the first time. In a sparse filesystem it will be the
+ * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
+ * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
+ */
+static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
+ unsigned *five, unsigned *seven)
+{
+ unsigned *min = three;
+ int mult = 3;
+ unsigned ret;
+
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ ret = *min;
+ *min += 1;
+ return ret;
+ }
+
+ if (*five < *min) {
+ min = five;
+ mult = 5;
+ }
+ if (*seven < *min) {
+ min = seven;
+ mult = 7;
+ }
+
+ ret = *min;
+ *min *= mult;
+
+ return ret;
+}
+
+/*
+ * Check that all of the backup GDT blocks are held in the primary GDT block.
+ * It is assumed that they are stored in group order. Returns the number of
+ * groups in the current filesystem that have backups, or a negative error code.
+ */
+static int verify_reserved_gdb(struct super_block *sb,
+ struct buffer_head *primary)
+{
+ const ext4_fsblk_t blk = primary->b_blocknr;
+ const unsigned long end = EXT4_SB(sb)->s_groups_count;
+ unsigned three = 1;
+ unsigned five = 5;
+ unsigned seven = 7;
+ unsigned grp;
+ __le32 *p = (__le32 *)primary->b_data;
+ int gdbackups = 0;
+
+ while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
+ if (le32_to_cpu(*p++) !=
+ grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
+ ext4_warning(sb, __FUNCTION__,
+ "reserved GDT %llu"
+ " missing grp %d (%llu)",
+ blk, grp,
+ grp *
+ (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+ blk);
+ return -EINVAL;
+ }
+ if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
+ return -EFBIG;
+ }
+
+ return gdbackups;
+}
+
+/*
+ * Called when we need to bring a reserved group descriptor table block into
+ * use from the resize inode. The primary copy of the new GDT block currently
+ * is an indirect block (under the double indirect block in the resize inode).
+ * The new backup GDT blocks will be stored as leaf blocks in this indirect
+ * block, in group order. Even though we know all the block numbers we need,
+ * we check to ensure that the resize inode has actually reserved these blocks.
+ *
+ * We don't need to update the block bitmaps, because the blocks are still in use.
+ *
+ * We get all of the error cases out of the way, so that we are sure to not
+ * fail once we start modifying the data on disk, because JBD has no rollback.
+ */
+static int add_new_gdb(handle_t *handle, struct inode *inode,
+ struct ext4_new_group_data *input,
+ struct buffer_head **primary)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
+ ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
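+ /* the primary copy of GDT block gdb_num sits right after the superblock */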
+ struct buffer_head **o_group_desc, **n_group_desc;
+ struct buffer_head *dind;
+ int gdbackups;
+ struct ext4_iloc iloc;
+ __le32 *data;
+ int err;
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG
+ "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
+ gdb_num);
+
+ /*
+ * If we are not using the primary superblock/GDT copy don't resize,
+ * because the user tools have no way of handling this. Probably a
+ * bad time to do it anyway.
+ */
+ if (EXT4_SB(sb)->s_sbh->b_blocknr !=
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+ ext4_warning(sb, __FUNCTION__,
+ "won't resize using backup superblock at %llu",
+ (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+ return -EPERM;
+ }
+
+ *primary = sb_bread(sb, gdblock);
+ if (!*primary)
+ return -EIO;
+
+ if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
+ err = gdbackups;
+ goto exit_bh;
+ }
+
+ data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
+ dind = sb_bread(sb, le32_to_cpu(*data));
+ if (!dind) {
+ err = -EIO;
+ goto exit_bh;
+ }
+
+ data = (__le32 *)dind->b_data;
+ if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
+ ext4_warning(sb, __FUNCTION__,
+ "new group %u GDT block %llu not reserved",
+ input->group, gdblock);
+ err = -EINVAL;
+ goto exit_dind;
+ }
+
+ if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
+ goto exit_dind;
+
+ if ((err = ext4_journal_get_write_access(handle, *primary)))
+ goto exit_sbh;
+
+ if ((err = ext4_journal_get_write_access(handle, dind)))
+ goto exit_primary;
+
+ /* ext4_reserve_inode_write() gets a reference on the iloc */
+ if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+ goto exit_dindj;
+
+ n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (!n_group_desc) {
+ err = -ENOMEM;
+ ext4_warning (sb, __FUNCTION__,
+ "not enough memory for %lu groups", gdb_num + 1);
+ goto exit_inode;
+ }
+
+ /*
+ * Finally, we have all of the possible failures behind us...
+ *
+ * Remove new GDT block from inode double-indirect block and clear out
+ * the new GDT block for use (which also "frees" the backup GDT blocks
+ * from the reserved inode). We don't need to change the bitmaps for
+ * these blocks, because they are marked as in-use from being in the
+ * reserved inode, and will become GDT blocks (primary and backup).
+ */
+ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
+ ext4_journal_dirty_metadata(handle, dind);
+ brelse(dind);
+ inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
+ ext4_mark_iloc_dirty(handle, inode, &iloc);
+ memset((*primary)->b_data, 0, sb->s_blocksize);
+ ext4_journal_dirty_metadata(handle, *primary);
+
+ o_group_desc = EXT4_SB(sb)->s_group_desc;
+ memcpy(n_group_desc, o_group_desc,
+ EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ n_group_desc[gdb_num] = *primary;
+ EXT4_SB(sb)->s_group_desc = n_group_desc;
+ EXT4_SB(sb)->s_gdb_count++;
+ kfree(o_group_desc);
+
+ es->s_reserved_gdt_blocks =
+ cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
+ ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+
+ return 0;
+
+exit_inode:
+ //ext4_journal_release_buffer(handle, iloc.bh);
+ brelse(iloc.bh);
+exit_dindj:
+ //ext4_journal_release_buffer(handle, dind);
+exit_primary:
+ //ext4_journal_release_buffer(handle, *primary);
+exit_sbh:
+ //ext4_journal_release_buffer(handle, *primary);
+exit_dind:
+ brelse(dind);
+exit_bh:
+ brelse(*primary);
+
+ ext4_debug("leaving with error %d\n", err);
+ return err;
+}
+
+/*
+ * Called when we are adding a new group which has a backup copy of each of
+ * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
+ * We need to add these reserved backup GDT blocks to the resize inode, so
+ * that they are kept for future resizing and not allocated to files.
+ *
+ * Each reserved backup GDT block will go into a different indirect block.
+ * The indirect blocks are actually the primary reserved GDT blocks,
+ * so we know in advance what their block numbers are. We only get the
+ * double-indirect block to verify it is pointing to the primary reserved
+ * GDT blocks so we don't overwrite a data block by accident. The reserved
+ * backup GDT blocks are stored in their reserved primary GDT block.
+ */
+static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
+ struct ext4_new_group_data *input)
+{
+ struct super_block *sb = inode->i_sb;
+ int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+ struct buffer_head **primary;
+ struct buffer_head *dind;
+ struct ext4_iloc iloc;
+ ext4_fsblk_t blk;
+ __le32 *data, *end;
+ int gdbackups = 0;
+ int res, i;
+ int err;
+
+ primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
+ if (!primary)
+ return -ENOMEM;
+
+ data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
+ dind = sb_bread(sb, le32_to_cpu(*data));
+ if (!dind) {
+ err = -EIO;
+ goto exit_free;
+ }
+
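+ /* The first reserved GDT block follows the superblock and the
+ * in-use GDT blocks; walk those block numbers in parallel with
+ * the dindirect entries. */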
+ blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
+ data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
+ end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
+
+ /* Get each reserved primary GDT block and verify it holds backups */
+ for (res = 0; res < reserved_gdb; res++, blk++) {
+ if (le32_to_cpu(*data) != blk) {
+ ext4_warning(sb, __FUNCTION__,
+ "reserved block %llu"
+ " not at offset %ld",
+ blk,
+ (long)(data - (__le32 *)dind->b_data));
+ err = -EINVAL;
+ goto exit_bh;
+ }
+ primary[res] = sb_bread(sb, blk);
+ if (!primary[res]) {
+ err = -EIO;
+ goto exit_bh;
+ }
+ if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+ brelse(primary[res]);
+ err = gdbackups;
+ goto exit_bh;
+ }
+ if (++data >= end)
+ data = (__le32 *)dind->b_data;
+ }
+
+ for (i = 0; i < reserved_gdb; i++) {
+ if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
+ /*
+ int j;
+ for (j = 0; j < i; j++)
+ ext4_journal_release_buffer(handle, primary[j]);
+ */
+ goto exit_bh;
+ }
+ }
+
+ if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+ goto exit_bh;
+
+ /*
+ * Finally we can add each of the reserved backup GDT blocks from
+ * the new group to its reserved primary GDT block.
+ */
+ blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
+ for (i = 0; i < reserved_gdb; i++) {
+ int err2;
+ data = (__le32 *)primary[i]->b_data;
+ /* printk("reserving backup %lu[%u] = %lu\n",
+ primary[i]->b_blocknr, gdbackups,
+ blk + primary[i]->b_blocknr); */
+ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
+ err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+ if (!err)
+ err = err2;
+ }
+ inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
+ ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+exit_bh:
+ while (--res >= 0)
+ brelse(primary[res]);
+ brelse(dind);
+
+exit_free:
+ kfree(primary);
+
+ return err;
+}
+
+/*
+ * Update the backup copies of the ext4 metadata. These don't need to be part
+ * of the main resize transaction, because e2fsck will re-write them if there
+ * is a problem (basically only OOM will cause a problem). However, we
+ * _should_ update the backups if possible, in case the primary gets trashed
+ * for some reason and we need to run e2fsck from a backup superblock. The
+ * important part is that the new block and inode counts are in the backup
+ * superblocks, and the location of the new group metadata in the GDT backups.
+ *
+ * We do not need lock_super() for this, because these blocks are not
+ * otherwise touched by the filesystem code when it is mounted. We don't
+ * need to worry about sbi->s_groups_count changing after we have sampled
+ * it into "last", because the worst that can happen is that we do not
+ * copy the full number of backups at this time. The resize which changed
+ * s_groups_count will back up the remainder again.
+ */
+static void update_backups(struct super_block *sb,
+ int blk_off, char *data, int size)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ const unsigned long last = sbi->s_groups_count;
+ const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
+ unsigned three = 1;
+ unsigned five = 5;
+ unsigned seven = 7;
+ unsigned group;
+ int rest = sb->s_blocksize - size;
+ handle_t *handle;
+ int err = 0, err2;
+
+ handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+ if (IS_ERR(handle)) {
+ group = 1;
+ err = PTR_ERR(handle);
+ goto exit_err;
+ }
+
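+ /* Visit each group that carries a superblock/GDT backup: with
+ * sparse_super these are group 1 and the powers of 3, 5 and 7. */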
+ while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+ struct buffer_head *bh;
+
+ /* Out of journal space, and can't get more - abort - so sad */
+ if (handle->h_buffer_credits == 0 &&
+ ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
+ (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+ break;
+
+ bh = sb_getblk(sb, group * bpg + blk_off);
+ if (!bh) {
+ err = -EIO;
+ break;
+ }
+ ext4_debug("update metadata backup %#04lx\n",
+ (unsigned long)bh->b_blocknr);
+ if ((err = ext4_journal_get_write_access(handle, bh)))
+ break;
+ lock_buffer(bh);
+ memcpy(bh->b_data, data, size);
+ if (rest)
+ memset(bh->b_data + size, 0, rest);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ ext4_journal_dirty_metadata(handle, bh);
+ brelse(bh);
+ }
+ if ((err2 = ext4_journal_stop(handle)) && !err)
+ err = err2;
+
+ /*
+ * Ugh! We need to have e2fsck write the backup copies. It is too
+ * late to revert the resize; we shouldn't fail just because of
+ * the backup copies (they are only needed in case of corruption).
+ *
+ * However, if we got here we have a journal problem too, so we
+ * can't really start a transaction to mark the superblock.
+ * Chicken out and just set the flag in the hope it will be written
+ * to disk, and if not, we will simply wait until the next fsck.
+ */
+exit_err:
+ if (err) {
+ ext4_warning(sb, __FUNCTION__,
+ "can't update backup for group %d (err %d), "
+ "forcing fsck on next reboot", group, err);
+ sbi->s_mount_state &= ~EXT4_VALID_FS;
+ sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+ mark_buffer_dirty(sbi->s_sbh);
+ }
+}
+
+/* Add group descriptor data to an existing or new group descriptor block.
+ * Ensure we handle all possible error conditions _before_ we start modifying
+ * the filesystem, because we cannot abort the transaction and not have it
+ * write the data to disk.
+ *
+ * If we are on a GDT block boundary, we need to get the reserved GDT block.
+ * Otherwise, we may need to add backup GDT blocks for a sparse group.
+ *
+ * We only need to hold the superblock lock while we are actually adding
+ * in the new group's counts to the superblock. Prior to that we have
+ * not really "added" the group at all. We re-check that we are still
+ * adding in the last group in case things have changed since verifying.
+ */
+int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
+ le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+ struct buffer_head *primary = NULL;
+ struct ext4_group_desc *gdp;
+ struct inode *inode = NULL;
+ handle_t *handle;
+ int gdb_off, gdb_num;
+ int err, err2;
+
+ gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
+ gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
+
+ if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ ext4_warning(sb, __FUNCTION__,
+ "Can't resize non-sparse filesystem further");
+ return -EPERM;
+ }
+
+ if (ext4_blocks_count(es) + input->blocks_count <
+ ext4_blocks_count(es)) {
+ ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n");
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
+ le32_to_cpu(es->s_inodes_count)) {
+ ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n");
+ return -EINVAL;
+ }
+
+ if (reserved_gdb || gdb_off == 0) {
+ if (!EXT4_HAS_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_COMPAT_RESIZE_INODE)){
+ ext4_warning(sb, __FUNCTION__,
+ "No reserved GDT blocks, can't resize");
+ return -EPERM;
+ }
+ inode = iget(sb, EXT4_RESIZE_INO);
+ if (!inode || is_bad_inode(inode)) {
+ ext4_warning(sb, __FUNCTION__,
+ "Error opening resize inode");
+ iput(inode);
+ return -ENOENT;
+ }
+ }
+
+ if ((err = verify_group_input(sb, input)))
+ goto exit_put;
+
+ if ((err = setup_new_group_blocks(sb, input)))
+ goto exit_put;
+
+ /*
+ * We will always be modifying at least the superblock and a GDT
+ * block. If we are adding a group past the last current GDT block,
+ * we will also modify the inode and the dindirect block. If we
+ * are adding a group with superblock/GDT backups we will also
+ * modify each of the reserved GDT dindirect blocks.
+ */
+ handle = ext4_journal_start_sb(sb,
+ ext4_bg_has_super(sb, input->group) ?
+ 3 + reserved_gdb : 4);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto exit_put;
+ }
+
+ lock_super(sb);
+ if (input->group != sbi->s_groups_count) {
+ ext4_warning(sb, __FUNCTION__,
+ "multiple resizers run on filesystem!");
+ err = -EBUSY;
+ goto exit_journal;
+ }
+
+ if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
+ goto exit_journal;
+
+ /*
+ * We will only either add reserved group blocks to a backup group
+ * or remove reserved blocks for the first group in a new group block.
+ * Doing both would mean more complex code, and sane people don't
+ * use non-sparse filesystems anymore. This is already checked above.
+ */
+ if (gdb_off) {
+ primary = sbi->s_group_desc[gdb_num];
+ if ((err = ext4_journal_get_write_access(handle, primary)))
+ goto exit_journal;
+
+ if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
+ (err = reserve_backup_gdb(handle, inode, input)))
+ goto exit_journal;
+ } else if ((err = add_new_gdb(handle, inode, input, &primary)))
+ goto exit_journal;
+
+ /*
+ * OK, now we've set up the new group. Time to make it active.
+ *
+ * Current kernels don't lock all allocations via lock_super(),
+ * so we have to be safe wrt. concurrent accesses to the group
+ * data. So we need to be careful to set all of the relevant
+ * group descriptor data etc. *before* we enable the group.
+ *
+ * The key field here is sbi->s_groups_count: as long as
+ * that retains its old value, nobody is going to access the new
+ * group.
+ *
+ * So first we update all the descriptor metadata for the new
+ * group; then we update the total disk blocks count; then we
+ * update the groups count to enable the group; then finally we
+ * update the free space counts so that the system can start
+ * using the new disk blocks.
+ */
+
+ /* Update group descriptor block for new group */
+ gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
+
+ ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
+ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
+ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
+ gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
+ gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+
+ /*
+ * Make the new blocks and inodes valid next. We do this before
+ * increasing the group count so that once the group is enabled,
+ * all of its blocks and inodes are already valid.
+ *
+ * We always allocate group-by-group, then block-by-block or
+ * inode-by-inode within a group, so enabling these
+ * blocks/inodes before the group is live won't actually let us
+ * allocate the new space yet.
+ */
+ ext4_blocks_count_set(es, ext4_blocks_count(es) +
+ input->blocks_count);
+ es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
+ EXT4_INODES_PER_GROUP(sb));
+
+ /*
+ * We need to protect s_groups_count against other CPUs seeing
+ * inconsistent state in the superblock.
+ *
+ * The precise rules we use are:
+ *
+ * * Writers of s_groups_count *must* hold lock_super
+ * AND
+ * * Writers must perform a smp_wmb() after updating all dependent
+ * data and before modifying the groups count
+ *
+ * * Readers must hold lock_super() over the access
+ * OR
+ * * Readers must perform an smp_rmb() after reading the groups count
+ * and before reading any dependent data.
+ *
+ * NB. These rules can be relaxed when checking the group count
+ * while freeing data, as we can only allocate from a block
+ * group after serialising against the group count, and we can
+ * only then free after serialising in turn against that
+ * allocation.
+ */
+ smp_wmb();
+
+ /* Update the global fs size fields */
+ sbi->s_groups_count++;
+
+ ext4_journal_dirty_metadata(handle, primary);
+
+ /* Update the reserved block counts only once the new group is
+ * active. */
+ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+ input->reserved_blocks);
+
+ /* Update the free space counts */
+ percpu_counter_mod(&sbi->s_freeblocks_counter,
+ input->free_blocks_count);
+ percpu_counter_mod(&sbi->s_freeinodes_counter,
+ EXT4_INODES_PER_GROUP(sb));
+
+ ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+ sb->s_dirt = 1;
+
+exit_journal:
+ unlock_super(sb);
+ if ((err2 = ext4_journal_stop(handle)) && !err)
+ err = err2;
+ if (!err) {
+ update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+ sizeof(struct ext4_super_block));
+ update_backups(sb, primary->b_blocknr, primary->b_data,
+ primary->b_size);
+ }
+exit_put:
+ iput(inode);
+ return err;
+} /* ext4_group_add */
+
+/* Extend the filesystem to the new number of blocks specified. This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
+ * for emergencies (because it has no dependencies on reserved blocks).
+ *
+ * If we _really_ wanted, we could use default values to call ext4_group_add()
+ * to allow the "remount" trick to work for arbitrary resizing, assuming enough
+ * GDT blocks are reserved to grow to the desired size.
+ */
+int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
+ ext4_fsblk_t n_blocks_count)
+{
+ ext4_fsblk_t o_blocks_count;
+ unsigned long o_groups_count;
+ ext4_grpblk_t last;
+ ext4_grpblk_t add;
+ struct buffer_head * bh;
+ handle_t *handle;
+ int err;
+ unsigned long freed_blocks;
+
+ /* We don't need to worry about locking wrt other resizers just
+ * yet: we're going to revalidate es->s_blocks_count after
+ * taking lock_super() below. */
+ o_blocks_count = ext4_blocks_count(es);
+ o_groups_count = EXT4_SB(sb)->s_groups_count;
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n",
+ o_blocks_count, n_blocks_count);
+
+ if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+ return 0;
+
+ if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ printk(KERN_ERR "EXT4-fs: filesystem on %s:"
+ " too large to resize to %llu blocks safely\n",
+ sb->s_id, n_blocks_count);
+ if (sizeof(sector_t) < 8)
+ ext4_warning(sb, __FUNCTION__,
+ "CONFIG_LBD not enabled\n");
+ return -EINVAL;
+ }
+
+ if (n_blocks_count < o_blocks_count) {
+ ext4_warning(sb, __FUNCTION__,
+ "can't shrink FS - resize aborted");
+ return -EBUSY;
+ }
+
+ /* Handle the remaining blocks in the last group only. */
+ ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+
+ if (last == 0) {
+ ext4_warning(sb, __FUNCTION__,
+ "need to use ext2online to resize further");
+ return -EPERM;
+ }
+
+ add = EXT4_BLOCKS_PER_GROUP(sb) - last;
+
+ if (o_blocks_count + add < o_blocks_count) {
+ ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
+ return -EINVAL;
+ }
+
+ if (o_blocks_count + add > n_blocks_count)
+ add = n_blocks_count - o_blocks_count;
+
+ if (o_blocks_count + add < n_blocks_count)
+ ext4_warning(sb, __FUNCTION__,
+ "will only finish group (%llu"
+ " blocks, %u new)",
+ o_blocks_count + add, add);
+
+ /* See if the device is actually as big as what was requested */
+ bh = sb_bread(sb, o_blocks_count + add -1);
+ if (!bh) {
+ ext4_warning(sb, __FUNCTION__,
+ "can't read last block, resize aborted");
+ return -ENOSPC;
+ }
+ brelse(bh);
+
+ /* We will update the superblock, one block bitmap, and
+ * one group descriptor via ext4_free_blocks().
+ */
+ handle = ext4_journal_start_sb(sb, 3);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ ext4_warning(sb, __FUNCTION__, "error %d on journal start",err);
+ goto exit_put;
+ }
+
+ lock_super(sb);
+ if (o_blocks_count != ext4_blocks_count(es)) {
+ ext4_warning(sb, __FUNCTION__,
+ "multiple resizers run on filesystem!");
+ unlock_super(sb);
+ err = -EBUSY;
+ goto exit_put;
+ }
+
+ if ((err = ext4_journal_get_write_access(handle,
+ EXT4_SB(sb)->s_sbh))) {
+ ext4_warning(sb, __FUNCTION__,
+ "error %d on journal write access", err);
+ unlock_super(sb);
+ ext4_journal_stop(handle);
+ goto exit_put;
+ }
+ ext4_blocks_count_set(es, o_blocks_count + add);
+ ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ sb->s_dirt = 1;
+ unlock_super(sb);
+ ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+ ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+ ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+ o_blocks_count + add);
+ if ((err = ext4_journal_stop(handle)))
+ goto exit_put;
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
+ ext4_blocks_count(es));
+ update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
+ sizeof(struct ext4_super_block));
+exit_put:
+ return err;
+} /* ext4_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 000000000000..b4b022aa2bc2
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,2829 @@
+/*
+ * linux/fs/ext4/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/inode.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "xattr.h"
+#include "acl.h"
+#include "namei.h"
+
+static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
+ unsigned long journal_devnum);
+static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
+ unsigned int);
+static void ext4_commit_super (struct super_block * sb,
+ struct ext4_super_block * es,
+ int sync);
+static void ext4_mark_recovery_complete(struct super_block * sb,
+ struct ext4_super_block * es);
+static void ext4_clear_journal_err(struct super_block * sb,
+ struct ext4_super_block * es);
+static int ext4_sync_fs(struct super_block *sb, int wait);
+static const char *ext4_decode_error(struct super_block * sb, int errno,
+ char nbuf[16]);
+static int ext4_remount (struct super_block * sb, int * flags, char * data);
+static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
+static void ext4_unlockfs(struct super_block *sb);
+static void ext4_write_super (struct super_block * sb);
+static void ext4_write_super_lockfs(struct super_block *sb);
+
+
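+/*
+ * Group descriptor accessors. When the descriptor size allows 64-bit
+ * block numbers (EXT4_DESC_SIZE() >= EXT4_MIN_DESC_SIZE_64BIT), the
+ * upper 32 bits live in the corresponding *_hi field.
+ */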
+ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le32_to_cpu(bg->bg_block_bitmap) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+}
+
+ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le32_to_cpu(bg->bg_inode_bitmap) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+}
+
+ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+ struct ext4_group_desc *bg)
+{
+ return le32_to_cpu(bg->bg_inode_table) |
+ (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+}
+
+void ext4_block_bitmap_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+ bg->bg_block_bitmap = cpu_to_le32((u32)blk);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+void ext4_inode_bitmap_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+ bg->bg_inode_bitmap = cpu_to_le32((u32)blk);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+void ext4_inode_table_set(struct super_block *sb,
+ struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+ bg->bg_inode_table = cpu_to_le32((u32)blk);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+ bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ *
+ * The only special thing we need to do here is to make sure that all
+ * journal_end calls result in the superblock being marked dirty, so
+ * that sync() will call the filesystem's write_super callback if
+ * appropriate.
+ */
+handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
+{
+ journal_t *journal;
+
+ if (sb->s_flags & MS_RDONLY)
+ return ERR_PTR(-EROFS);
+
+ /* Special case here: if the journal has aborted behind our
+ * backs (eg. EIO in the commit thread), then we still need to
+ * take the FS itself readonly cleanly. */
+ journal = EXT4_SB(sb)->s_journal;
+ if (is_journal_aborted(journal)) {
+ ext4_abort(sb, __FUNCTION__,
+ "Detected aborted journal");
+ return ERR_PTR(-EROFS);
+ }
+
+ return jbd2_journal_start(journal, nblocks);
+}
+
+/*
+ * The only special thing we need to do here is to make sure that all
+ * jbd2_journal_stop calls result in the superblock being marked dirty, so
+ * that sync() will call the filesystem's write_super callback if
+ * appropriate.
+ */
+int __ext4_journal_stop(const char *where, handle_t *handle)
+{
+ struct super_block *sb;
+ int err;
+ int rc;
+
+ sb = handle->h_transaction->t_journal->j_private;
+ err = handle->h_err;
+ rc = jbd2_journal_stop(handle);
+
+ if (!err)
+ err = rc;
+ if (err)
+ __ext4_std_error(sb, where, err);
+ return err;
+}
+
+void ext4_journal_abort_handle(const char *caller, const char *err_fn,
+ struct buffer_head *bh, handle_t *handle, int err)
+{
+ char nbuf[16];
+ const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+ if (bh)
+ BUFFER_TRACE(bh, "abort");
+
+ if (!handle->h_err)
+ handle->h_err = err;
+
+ if (is_handle_aborted(handle))
+ return;
+
+ printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
+ caller, errstr, err_fn);
+
+ jbd2_journal_abort_handle(handle);
+}
+
+/* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+ * On ext2, we can store the error state of the filesystem in the
+ * superblock. That is not possible on ext4, because we may have other
+ * write ordering constraints on the superblock which prevent us from
+ * writing it out straight away; and given that the journal is about to
+ * be aborted, we can't rely on the current, or future, transactions to
+ * write out the superblock safely.
+ *
+ * We'll just use the jbd2_journal_abort() error code to record an error in
+ * the journal instead. On recovery, the journal will complain about
+ * that error until we've noted it down and cleared it.
+ */
+
+static void ext4_handle_error(struct super_block *sb)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ if (!test_opt (sb, ERRORS_CONT)) {
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+
+ EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+ if (journal)
+ jbd2_journal_abort(journal, -EIO);
+ }
+ if (test_opt (sb, ERRORS_RO)) {
+ printk (KERN_CRIT "Remounting filesystem read-only\n");
+ sb->s_flags |= MS_RDONLY;
+ }
+ ext4_commit_super(sb, es, 1);
+ if (test_opt(sb, ERRORS_PANIC))
+ panic("EXT4-fs (device %s): panic forced after error\n",
+ sb->s_id);
+}
+
+void ext4_error (struct super_block * sb, const char * function,
+ const char * fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+
+ ext4_handle_error(sb);
+}
+
+static const char *ext4_decode_error(struct super_block * sb, int errno,
+ char nbuf[16])
+{
+ char *errstr = NULL;
+
+ switch (errno) {
+ case -EIO:
+ errstr = "IO failure";
+ break;
+ case -ENOMEM:
+ errstr = "Out of memory";
+ break;
+ case -EROFS:
+ if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+ errstr = "Journal has aborted";
+ else
+ errstr = "Readonly filesystem";
+ break;
+ default:
+ /* If the caller passed in an extra buffer for unknown
+ * errors, textualise them now. Else we just return
+ * NULL. */
+ if (nbuf) {
+ /* Check for truncated error codes... */
+ if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+ errstr = nbuf;
+ }
+ break;
+ }
+
+ return errstr;
+}
+
+/* __ext4_std_error decodes expected errors from journaling functions
+ * automatically and invokes the appropriate error response. */
+
+void __ext4_std_error (struct super_block * sb, const char * function,
+ int errno)
+{
+ char nbuf[16];
+ const char *errstr;
+
+ /* Special case: if the error is EROFS, and we're not already
+ * inside a transaction, then there's really no point in logging
+ * an error. */
+ if (errno == -EROFS && journal_current_handle() == NULL &&
+ (sb->s_flags & MS_RDONLY))
+ return;
+
+ errstr = ext4_decode_error(sb, errno, nbuf);
+ printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
+ sb->s_id, function, errstr);
+
+ ext4_handle_error(sb);
+}
+
+/*
+ * ext4_abort is a much stronger failure handler than ext4_error. The
+ * abort function may be used to deal with unrecoverable failures such
+ * as journal IO errors or ENOMEM at a critical moment in log management.
+ *
+ * We unconditionally force the filesystem into an ABORT|READONLY state,
+ * unless the error response on the fs has been set to panic in which
+ * case we take the easy way out and panic immediately.
+ */
+
+void ext4_abort (struct super_block * sb, const char * function,
+ const char * fmt, ...)
+{
+ va_list args;
+
+ printk (KERN_CRIT "ext4_abort called.\n");
+
+ va_start(args, fmt);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+
+ if (test_opt(sb, ERRORS_PANIC))
+ panic("EXT4-fs panic from previous error\n");
+
+ if (sb->s_flags & MS_RDONLY)
+ return;
+
+ printk(KERN_CRIT "Remounting filesystem read-only\n");
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ sb->s_flags |= MS_RDONLY;
+ EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+}
+
+void ext4_warning (struct super_block * sb, const char * function,
+ const char * fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
+ sb->s_id, function);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+}
+
+void ext4_update_dynamic_rev(struct super_block *sb)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
+ return;
+
+ ext4_warning(sb, __FUNCTION__,
+ "updating to rev %d because of new feature flag, "
+ "running e2fsck is recommended",
+ EXT4_DYNAMIC_REV);
+
+ es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
+ es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
+ es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
+ /* leave es->s_feature_*compat flags alone */
+ /* es->s_uuid will be set by e2fsck if empty */
+
+ /*
+ * The rest of the superblock fields should be zero, and if not it
+ * means they are likely already in use, so leave them alone. We
+ * can leave it up to e2fsck to clean up any inconsistencies there.
+ */
+}
+
+/*
+ * Open the external journal device
+ */
+static struct block_device *ext4_blkdev_get(dev_t dev)
+{
+ struct block_device *bdev;
+ char b[BDEVNAME_SIZE];
+
+ bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ if (IS_ERR(bdev))
+ goto fail;
+ return bdev;
+
+fail:
+ printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+ __bdevname(dev, b), PTR_ERR(bdev));
+ return NULL;
+}
+
+/*
+ * Release the journal device
+ */
+static int ext4_blkdev_put(struct block_device *bdev)
+{
+ bd_release(bdev);
+ return blkdev_put(bdev);
+}
+
+static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+{
+ struct block_device *bdev;
+ int ret = -ENODEV;
+
+ bdev = sbi->journal_bdev;
+ if (bdev) {
+ ret = ext4_blkdev_put(bdev);
+ sbi->journal_bdev = NULL;
+ }
+ return ret;
+}
+
+static inline struct inode *orphan_list_entry(struct list_head *l)
+{
+ return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
+}
+
+static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
+{
+ struct list_head *l;
+
+ printk(KERN_ERR "sb orphan head is %d\n",
+ le32_to_cpu(sbi->s_es->s_last_orphan));
+
+ printk(KERN_ERR "sb_info orphan list:\n");
+ list_for_each(l, &sbi->s_orphan) {
+ struct inode *inode = orphan_list_entry(l);
+ printk(KERN_ERR " "
+ "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
+ inode->i_sb->s_id, inode->i_ino, inode,
+ inode->i_mode, inode->i_nlink,
+ NEXT_ORPHAN(inode));
+ }
+}
+
+static void ext4_put_super (struct super_block * sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int i;
+
+ ext4_ext_release(sb);
+ ext4_xattr_put_super(sb);
+ jbd2_journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+ BUFFER_TRACE(sbi->s_sbh, "marking dirty");
+ mark_buffer_dirty(sbi->s_sbh);
+ ext4_commit_super(sb, es, 1);
+ }
+
+ for (i = 0; i < sbi->s_gdb_count; i++)
+ brelse(sbi->s_group_desc[i]);
+ kfree(sbi->s_group_desc);
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ brelse(sbi->s_sbh);
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kfree(sbi->s_qf_names[i]);
+#endif
+
+ /* Debugging code just in case the in-memory inode orphan list
+ * isn't empty. The on-disk one can be non-empty if we've
+ * detected an error and taken the fs readonly, but the
+ * in-memory list had better be clean by this point. */
+ if (!list_empty(&sbi->s_orphan))
+ dump_orphan_list(sb, sbi);
+ J_ASSERT(list_empty(&sbi->s_orphan));
+
+ invalidate_bdev(sb->s_bdev, 0);
+ if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * be hotswapped, and it breaks the `ro-after' testing code.
+ */
+ sync_blockdev(sbi->journal_bdev);
+ invalidate_bdev(sbi->journal_bdev, 0);
+ ext4_blkdev_remove(sbi);
+ }
+ sb->s_fs_info = NULL;
+ kfree(sbi);
+ return;
+}
+
+static kmem_cache_t *ext4_inode_cachep;
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+static struct inode *ext4_alloc_inode(struct super_block *sb)
+{
+ struct ext4_inode_info *ei;
+
+ ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
+ if (!ei)
+ return NULL;
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+ ei->i_acl = EXT4_ACL_NOT_CACHED;
+ ei->i_default_acl = EXT4_ACL_NOT_CACHED;
+#endif
+ ei->i_block_alloc_info = NULL;
+ ei->vfs_inode.i_version = 1;
+ memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+ return &ei->vfs_inode;
+}
+
+static void ext4_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR) {
+ INIT_LIST_HEAD(&ei->i_orphan);
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ init_rwsem(&ei->xattr_sem);
+#endif
+ mutex_init(&ei->truncate_mutex);
+ inode_init_once(&ei->vfs_inode);
+ }
+}
+
+static int init_inodecache(void)
+{
+ ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
+ sizeof(struct ext4_inode_info),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ init_once, NULL);
+ if (ext4_inode_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+static void destroy_inodecache(void)
+{
+ kmem_cache_destroy(ext4_inode_cachep);
+}
+
+static void ext4_clear_inode(struct inode *inode)
+{
+ struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+ if (EXT4_I(inode)->i_acl &&
+ EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
+ posix_acl_release(EXT4_I(inode)->i_acl);
+ EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
+ }
+ if (EXT4_I(inode)->i_default_acl &&
+ EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
+ posix_acl_release(EXT4_I(inode)->i_default_acl);
+ EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
+ }
+#endif
+ ext4_discard_reservation(inode);
+ EXT4_I(inode)->i_block_alloc_info = NULL;
+ if (unlikely(rsv))
+ kfree(rsv);
+}
+
+static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sbi->s_jquota_fmt)
+ seq_printf(seq, ",jqfmt=%s",
+ (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+
+ if (sbi->s_qf_names[USRQUOTA])
+ seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+ if (sbi->s_qf_names[GRPQUOTA])
+ seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
+ seq_puts(seq, ",usrquota");
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
+ seq_puts(seq, ",grpquota");
+#endif
+}
+
+static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct super_block *sb = vfs->mnt_sb;
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ seq_puts(seq, ",data=journal");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ seq_puts(seq, ",data=ordered");
+ else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ seq_puts(seq, ",data=writeback");
+
+ ext4_show_quota_options(seq, sb);
+
+ return 0;
+}
+
+
+static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
+{
+ __u32 *objp = vobjp;
+ unsigned long ino = objp[0];
+ __u32 generation = objp[1];
+ struct inode *inode;
+ struct dentry *result;
+
+ if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+ return ERR_PTR(-ESTALE);
+ if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+ return ERR_PTR(-ESTALE);
+
+ /* iget isn't really right if the inode is currently unallocated!!
+ *
+ * ext4_read_inode will return a bad_inode if the inode had been
+ * deleted, so we should be safe.
+ *
+ * Currently we don't know the generation for parent directory, so
+ * a generation of 0 means "accept any"
+ */
+ inode = iget(sb, ino);
+ if (inode == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (is_bad_inode(inode) ||
+ (generation && inode->i_generation != generation)) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ /* now to find a dentry.
+ * If possible, get a well-connected one
+ */
+ result = d_alloc_anon(inode);
+ if (!result) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ return result;
+}
+
+#ifdef CONFIG_QUOTA
+#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
+#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+
+static int ext4_dquot_initialize(struct inode *inode, int type);
+static int ext4_dquot_drop(struct inode *inode);
+static int ext4_write_dquot(struct dquot *dquot);
+static int ext4_acquire_dquot(struct dquot *dquot);
+static int ext4_release_dquot(struct dquot *dquot);
+static int ext4_mark_dquot_dirty(struct dquot *dquot);
+static int ext4_write_info(struct super_block *sb, int type);
+static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path);
+static int ext4_quota_on_mount(struct super_block *sb, int type);
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off);
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+
+static struct dquot_operations ext4_quota_operations = {
+ .initialize = ext4_dquot_initialize,
+ .drop = ext4_dquot_drop,
+ .alloc_space = dquot_alloc_space,
+ .alloc_inode = dquot_alloc_inode,
+ .free_space = dquot_free_space,
+ .free_inode = dquot_free_inode,
+ .transfer = dquot_transfer,
+ .write_dquot = ext4_write_dquot,
+ .acquire_dquot = ext4_acquire_dquot,
+ .release_dquot = ext4_release_dquot,
+ .mark_dirty = ext4_mark_dquot_dirty,
+ .write_info = ext4_write_info
+};
+
+static struct quotactl_ops ext4_qctl_operations = {
+ .quota_on = ext4_quota_on,
+ .quota_off = vfs_quota_off,
+ .quota_sync = vfs_quota_sync,
+ .get_info = vfs_get_dqinfo,
+ .set_info = vfs_set_dqinfo,
+ .get_dqblk = vfs_get_dqblk,
+ .set_dqblk = vfs_set_dqblk
+};
+#endif
+
+static struct super_operations ext4_sops = {
+ .alloc_inode = ext4_alloc_inode,
+ .destroy_inode = ext4_destroy_inode,
+ .read_inode = ext4_read_inode,
+ .write_inode = ext4_write_inode,
+ .dirty_inode = ext4_dirty_inode,
+ .delete_inode = ext4_delete_inode,
+ .put_super = ext4_put_super,
+ .write_super = ext4_write_super,
+ .sync_fs = ext4_sync_fs,
+ .write_super_lockfs = ext4_write_super_lockfs,
+ .unlockfs = ext4_unlockfs,
+ .statfs = ext4_statfs,
+ .remount_fs = ext4_remount,
+ .clear_inode = ext4_clear_inode,
+ .show_options = ext4_show_options,
+#ifdef CONFIG_QUOTA
+ .quota_read = ext4_quota_read,
+ .quota_write = ext4_quota_write,
+#endif
+};
+
+static struct export_operations ext4_export_ops = {
+ .get_parent = ext4_get_parent,
+ .get_dentry = ext4_get_dentry,
+};
+
+enum {
+ Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
+ Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+ Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+ Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+ Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+ Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
+ Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+ Opt_grpquota, Opt_extents,
+};
+
+static match_table_t tokens = {
+ {Opt_bsd_df, "bsddf"},
+ {Opt_minix_df, "minixdf"},
+ {Opt_grpid, "grpid"},
+ {Opt_grpid, "bsdgroups"},
+ {Opt_nogrpid, "nogrpid"},
+ {Opt_nogrpid, "sysvgroups"},
+ {Opt_resgid, "resgid=%u"},
+ {Opt_resuid, "resuid=%u"},
+ {Opt_sb, "sb=%u"},
+ {Opt_err_cont, "errors=continue"},
+ {Opt_err_panic, "errors=panic"},
+ {Opt_err_ro, "errors=remount-ro"},
+ {Opt_nouid32, "nouid32"},
+ {Opt_nocheck, "nocheck"},
+ {Opt_nocheck, "check=none"},
+ {Opt_debug, "debug"},
+ {Opt_oldalloc, "oldalloc"},
+ {Opt_orlov, "orlov"},
+ {Opt_user_xattr, "user_xattr"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_reservation, "reservation"},
+ {Opt_noreservation, "noreservation"},
+ {Opt_noload, "noload"},
+ {Opt_nobh, "nobh"},
+ {Opt_bh, "bh"},
+ {Opt_commit, "commit=%u"},
+ {Opt_journal_update, "journal=update"},
+ {Opt_journal_inum, "journal=%u"},
+ {Opt_journal_dev, "journal_dev=%u"},
+ {Opt_abort, "abort"},
+ {Opt_data_journal, "data=journal"},
+ {Opt_data_ordered, "data=ordered"},
+ {Opt_data_writeback, "data=writeback"},
+ {Opt_offusrjquota, "usrjquota="},
+ {Opt_usrjquota, "usrjquota=%s"},
+ {Opt_offgrpjquota, "grpjquota="},
+ {Opt_grpjquota, "grpjquota=%s"},
+ {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
+ {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+ {Opt_grpquota, "grpquota"},
+ {Opt_noquota, "noquota"},
+ {Opt_quota, "quota"},
+ {Opt_usrquota, "usrquota"},
+ {Opt_barrier, "barrier=%u"},
+ {Opt_extents, "extents"},
+ {Opt_err, NULL},
+ {Opt_resize, "resize"},
+};
+
+static ext4_fsblk_t get_sb_block(void **data)
+{
+ ext4_fsblk_t sb_block;
+ char *options = (char *) *data;
+
+ if (!options || strncmp(options, "sb=", 3) != 0)
+ return 1; /* Default location */
+ options += 3;
+ /*todo: use simple_strtoll with >32bit ext4 */
+ sb_block = simple_strtoul(options, &options, 0);
+ if (*options && *options != ',') {
+ printk("EXT4-fs: Invalid sb specification: %s\n",
+ (char *) *data);
+ return 1;
+ }
+ if (*options == ',')
+ options++;
+ *data = (void *) options;
+ return sb_block;
+}
+
+static int parse_options (char *options, struct super_block *sb,
+ unsigned int *inum, unsigned long *journal_devnum,
+ ext4_fsblk_t *n_blocks_count, int is_remount)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ char * p;
+ substring_t args[MAX_OPT_ARGS];
+ int data_opt = 0;
+ int option;
+#ifdef CONFIG_QUOTA
+ int qtype;
+ char *qname;
+#endif
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep (&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_bsd_df:
+ clear_opt (sbi->s_mount_opt, MINIX_DF);
+ break;
+ case Opt_minix_df:
+ set_opt (sbi->s_mount_opt, MINIX_DF);
+ break;
+ case Opt_grpid:
+ set_opt (sbi->s_mount_opt, GRPID);
+ break;
+ case Opt_nogrpid:
+ clear_opt (sbi->s_mount_opt, GRPID);
+ break;
+ case Opt_resuid:
+ if (match_int(&args[0], &option))
+ return 0;
+ sbi->s_resuid = option;
+ break;
+ case Opt_resgid:
+ if (match_int(&args[0], &option))
+ return 0;
+ sbi->s_resgid = option;
+ break;
+ case Opt_sb:
+ /* handled by get_sb_block() instead of here */
+ /* *sb_block = match_int(&args[0]); */
+ break;
+ case Opt_err_panic:
+ clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+ clear_opt (sbi->s_mount_opt, ERRORS_RO);
+ set_opt (sbi->s_mount_opt, ERRORS_PANIC);
+ break;
+ case Opt_err_ro:
+ clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+ clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt (sbi->s_mount_opt, ERRORS_RO);
+ break;
+ case Opt_err_cont:
+ clear_opt (sbi->s_mount_opt, ERRORS_RO);
+ clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt (sbi->s_mount_opt, ERRORS_CONT);
+ break;
+ case Opt_nouid32:
+ set_opt (sbi->s_mount_opt, NO_UID32);
+ break;
+ case Opt_nocheck:
+ clear_opt (sbi->s_mount_opt, CHECK);
+ break;
+ case Opt_debug:
+ set_opt (sbi->s_mount_opt, DEBUG);
+ break;
+ case Opt_oldalloc:
+ set_opt (sbi->s_mount_opt, OLDALLOC);
+ break;
+ case Opt_orlov:
+ clear_opt (sbi->s_mount_opt, OLDALLOC);
+ break;
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ case Opt_user_xattr:
+ set_opt (sbi->s_mount_opt, XATTR_USER);
+ break;
+ case Opt_nouser_xattr:
+ clear_opt (sbi->s_mount_opt, XATTR_USER);
+ break;
+#else
+ case Opt_user_xattr:
+ case Opt_nouser_xattr:
+ printk("EXT4 (no)user_xattr options not supported\n");
+ break;
+#endif
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+ case Opt_acl:
+ set_opt(sbi->s_mount_opt, POSIX_ACL);
+ break;
+ case Opt_noacl:
+ clear_opt(sbi->s_mount_opt, POSIX_ACL);
+ break;
+#else
+ case Opt_acl:
+ case Opt_noacl:
+ printk("EXT4 (no)acl options not supported\n");
+ break;
+#endif
+ case Opt_reservation:
+ set_opt(sbi->s_mount_opt, RESERVATION);
+ break;
+ case Opt_noreservation:
+ clear_opt(sbi->s_mount_opt, RESERVATION);
+ break;
+ case Opt_journal_update:
+ /* @@@ FIXME */
+ /* Eventually we will want to be able to create
+ a journal file here. For now, only allow the
+ user to specify an existing inode to be the
+ journal file. */
+ if (is_remount) {
+ printk(KERN_ERR "EXT4-fs: cannot specify "
+ "journal on remount\n");
+ return 0;
+ }
+ set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
+ break;
+ case Opt_journal_inum:
+ if (is_remount) {
+ printk(KERN_ERR "EXT4-fs: cannot specify "
+ "journal on remount\n");
+ return 0;
+ }
+ if (match_int(&args[0], &option))
+ return 0;
+ *inum = option;
+ break;
+ case Opt_journal_dev:
+ if (is_remount) {
+ printk(KERN_ERR "EXT4-fs: cannot specify "
+ "journal on remount\n");
+ return 0;
+ }
+ if (match_int(&args[0], &option))
+ return 0;
+ *journal_devnum = option;
+ break;
+ case Opt_noload:
+ set_opt (sbi->s_mount_opt, NOLOAD);
+ break;
+ case Opt_commit:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0)
+ return 0;
+ if (option == 0)
+ option = JBD_DEFAULT_MAX_COMMIT_AGE;
+ sbi->s_commit_interval = HZ * option;
+ break;
+ case Opt_data_journal:
+ data_opt = EXT4_MOUNT_JOURNAL_DATA;
+ goto datacheck;
+ case Opt_data_ordered:
+ data_opt = EXT4_MOUNT_ORDERED_DATA;
+ goto datacheck;
+ case Opt_data_writeback:
+ data_opt = EXT4_MOUNT_WRITEBACK_DATA;
+ datacheck:
+ if (is_remount) {
+ if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
+ != data_opt) {
+ printk(KERN_ERR
+ "EXT4-fs: cannot change data "
+ "mode on remount\n");
+ return 0;
+ }
+ } else {
+ sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
+ sbi->s_mount_opt |= data_opt;
+ }
+ break;
+#ifdef CONFIG_QUOTA
+ case Opt_usrjquota:
+ qtype = USRQUOTA;
+ goto set_qf_name;
+ case Opt_grpjquota:
+ qtype = GRPQUOTA;
+set_qf_name:
+ if (sb_any_quota_enabled(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: Cannot change journalled "
+ "quota options when quota turned on.\n");
+ return 0;
+ }
+ qname = match_strdup(&args[0]);
+ if (!qname) {
+ printk(KERN_ERR
+ "EXT4-fs: not enough memory for "
+ "storing quotafile name.\n");
+ return 0;
+ }
+ if (sbi->s_qf_names[qtype] &&
+ strcmp(sbi->s_qf_names[qtype], qname)) {
+ printk(KERN_ERR
+ "EXT4-fs: %s quota file already "
+ "specified.\n", QTYPE2NAME(qtype));
+ kfree(qname);
+ return 0;
+ }
+ sbi->s_qf_names[qtype] = qname;
+ if (strchr(sbi->s_qf_names[qtype], '/')) {
+ printk(KERN_ERR
+ "EXT4-fs: quotafile must be on "
+ "filesystem root.\n");
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ return 0;
+ }
+ set_opt(sbi->s_mount_opt, QUOTA);
+ break;
+ case Opt_offusrjquota:
+ qtype = USRQUOTA;
+ goto clear_qf_name;
+ case Opt_offgrpjquota:
+ qtype = GRPQUOTA;
+clear_qf_name:
+ if (sb_any_quota_enabled(sb)) {
+ printk(KERN_ERR "EXT4-fs: Cannot change "
+ "journalled quota options when "
+ "quota turned on.\n");
+ return 0;
+ }
+ /*
+ * The space will be released later when all options
+ * are confirmed to be correct
+ */
+ sbi->s_qf_names[qtype] = NULL;
+ break;
+ case Opt_jqfmt_vfsold:
+ sbi->s_jquota_fmt = QFMT_VFS_OLD;
+ break;
+ case Opt_jqfmt_vfsv0:
+ sbi->s_jquota_fmt = QFMT_VFS_V0;
+ break;
+ case Opt_quota:
+ case Opt_usrquota:
+ set_opt(sbi->s_mount_opt, QUOTA);
+ set_opt(sbi->s_mount_opt, USRQUOTA);
+ break;
+ case Opt_grpquota:
+ set_opt(sbi->s_mount_opt, QUOTA);
+ set_opt(sbi->s_mount_opt, GRPQUOTA);
+ break;
+ case Opt_noquota:
+ if (sb_any_quota_enabled(sb)) {
+ printk(KERN_ERR "EXT4-fs: Cannot change quota "
+ "options when quota turned on.\n");
+ return 0;
+ }
+ clear_opt(sbi->s_mount_opt, QUOTA);
+ clear_opt(sbi->s_mount_opt, USRQUOTA);
+ clear_opt(sbi->s_mount_opt, GRPQUOTA);
+ break;
+#else
+ case Opt_quota:
+ case Opt_usrquota:
+ case Opt_grpquota:
+ case Opt_usrjquota:
+ case Opt_grpjquota:
+ case Opt_offusrjquota:
+ case Opt_offgrpjquota:
+ case Opt_jqfmt_vfsold:
+ case Opt_jqfmt_vfsv0:
+ printk(KERN_ERR
+ "EXT4-fs: journalled quota options not "
+ "supported.\n");
+ break;
+ case Opt_noquota:
+ break;
+#endif
+ case Opt_abort:
+ set_opt(sbi->s_mount_opt, ABORT);
+ break;
+ case Opt_barrier:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option)
+ set_opt(sbi->s_mount_opt, BARRIER);
+ else
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
+ case Opt_ignore:
+ break;
+ case Opt_resize:
+ if (!is_remount) {
+ printk("EXT4-fs: resize option only available "
+ "for remount\n");
+ return 0;
+ }
+ if (match_int(&args[0], &option) != 0)
+ return 0;
+ *n_blocks_count = option;
+ break;
+ case Opt_nobh:
+ set_opt(sbi->s_mount_opt, NOBH);
+ break;
+ case Opt_bh:
+ clear_opt(sbi->s_mount_opt, NOBH);
+ break;
+ case Opt_extents:
+ set_opt (sbi->s_mount_opt, EXTENTS);
+ break;
+ default:
+ printk (KERN_ERR
+ "EXT4-fs: Unrecognized mount option \"%s\" "
+ "or missing value\n", p);
+ return 0;
+ }
+ }
+#ifdef CONFIG_QUOTA
+ if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+ if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
+ sbi->s_qf_names[USRQUOTA])
+ clear_opt(sbi->s_mount_opt, USRQUOTA);
+
+ if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
+ sbi->s_qf_names[GRPQUOTA])
+ clear_opt(sbi->s_mount_opt, GRPQUOTA);
+
+ if ((sbi->s_qf_names[USRQUOTA] &&
+ (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
+ (sbi->s_qf_names[GRPQUOTA] &&
+ (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
+ printk(KERN_ERR "EXT4-fs: old and new quota "
+ "format mixing.\n");
+ return 0;
+ }
+
+ if (!sbi->s_jquota_fmt) {
+ printk(KERN_ERR "EXT4-fs: journalled quota format "
+ "not specified.\n");
+ return 0;
+ }
+ } else {
+ if (sbi->s_jquota_fmt) {
+ printk(KERN_ERR "EXT4-fs: journalled quota format "
+ "specified with no journalling "
+ "enabled.\n");
+ return 0;
+ }
+ }
+#endif
+ return 1;
+}
+
+static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
+ int read_only)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int res = 0;
+
+ if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
+ printk (KERN_ERR "EXT4-fs warning: revision level too high, "
+ "forcing read-only mode\n");
+ res = MS_RDONLY;
+ }
+ if (read_only)
+ return res;
+ if (!(sbi->s_mount_state & EXT4_VALID_FS))
+ printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
+ "running e2fsck is recommended\n");
+ else if ((sbi->s_mount_state & EXT4_ERROR_FS))
+ printk (KERN_WARNING
+ "EXT4-fs warning: mounting fs with errors, "
+ "running e2fsck is recommended\n");
+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+ le16_to_cpu(es->s_mnt_count) >=
+ (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+ printk (KERN_WARNING
+ "EXT4-fs warning: maximal mount count reached, "
+ "running e2fsck is recommended\n");
+ else if (le32_to_cpu(es->s_checkinterval) &&
+ (le32_to_cpu(es->s_lastcheck) +
+ le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+ printk (KERN_WARNING
+ "EXT4-fs warning: checktime reached, "
+ "running e2fsck is recommended\n");
+#if 0
+ /* @@@ We _will_ want to clear the valid bit if we find
+ * inconsistencies, to force a fsck at reboot. But for
+ * a plain journaled filesystem we can keep it set as
+ * valid forever! :)
+ */
+ es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS);
+#endif
+ if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+ es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
+ es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
+ es->s_mtime = cpu_to_le32(get_seconds());
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+
+ ext4_commit_super(sb, es, 1);
+ if (test_opt(sb, DEBUG))
+ printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
+ "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+ sb->s_blocksize,
+ sbi->s_groups_count,
+ EXT4_BLOCKS_PER_GROUP(sb),
+ EXT4_INODES_PER_GROUP(sb),
+ sbi->s_mount_opt);
+
+ printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
+ if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
+ char b[BDEVNAME_SIZE];
+
+ printk("external journal on %s\n",
+ bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
+ } else {
+ printk("internal journal\n");
+ }
+ return res;
+}
+
+/* Called at mount-time, super-block is locked */
+static int ext4_check_descriptors (struct super_block * sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+ ext4_fsblk_t last_block;
+ ext4_fsblk_t block_bitmap;
+ ext4_fsblk_t inode_bitmap;
+ ext4_fsblk_t inode_table;
+ struct ext4_group_desc * gdp = NULL;
+ int desc_block = 0;
+ int i;
+
+ ext4_debug ("Checking group descriptors");
+
+ for (i = 0; i < sbi->s_groups_count; i++)
+ {
+ if (i == sbi->s_groups_count - 1)
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ else
+ last_block = first_block +
+ (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+ if ((i % EXT4_DESC_PER_BLOCK(sb)) == 0)
+ gdp = (struct ext4_group_desc *)
+ sbi->s_group_desc[desc_block++]->b_data;
+ block_bitmap = ext4_block_bitmap(sb, gdp);
+ if (block_bitmap < first_block || block_bitmap > last_block)
+ {
+ ext4_error (sb, "ext4_check_descriptors",
+ "Block bitmap for group %d"
+ " not in group (block %llu)!",
+ i, block_bitmap);
+ return 0;
+ }
+ inode_bitmap = ext4_inode_bitmap(sb, gdp);
+ if (inode_bitmap < first_block || inode_bitmap > last_block)
+ {
+ ext4_error (sb, "ext4_check_descriptors",
+ "Inode bitmap for group %d"
+ " not in group (block %llu)!",
+ i, inode_bitmap);
+ return 0;
+ }
+ inode_table = ext4_inode_table(sb, gdp);
+ if (inode_table < first_block ||
+ inode_table + sbi->s_itb_per_group > last_block)
+ {
+ ext4_error (sb, "ext4_check_descriptors",
+ "Inode table for group %d"
+ " not in group (block %llu)!",
+ i, inode_table);
+ return 0;
+ }
+ first_block += EXT4_BLOCKS_PER_GROUP(sb);
+ gdp = (struct ext4_group_desc *)
+ ((__u8 *)gdp + EXT4_DESC_SIZE(sb));
+ }
+
+ ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
+ sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
+ return 1;
+}
+
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash. We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode. The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode(). The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+static void ext4_orphan_cleanup (struct super_block * sb,
+ struct ext4_super_block * es)
+{
+ unsigned int s_flags = sb->s_flags;
+ int nr_orphans = 0, nr_truncates = 0;
+#ifdef CONFIG_QUOTA
+ int i;
+#endif
+ if (!es->s_last_orphan) {
+ jbd_debug(4, "no orphan inodes to clean up\n");
+ return;
+ }
+
+ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+ if (es->s_last_orphan)
+ jbd_debug(1, "Errors on filesystem, "
+ "clearing orphan list.\n");
+ es->s_last_orphan = 0;
+ jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ return;
+ }
+
+ if (s_flags & MS_RDONLY) {
+ printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
+ sb->s_id);
+ sb->s_flags &= ~MS_RDONLY;
+ }
+#ifdef CONFIG_QUOTA
+ /* Needed for iput() to work correctly and not trash data */
+ sb->s_flags |= MS_ACTIVE;
+ /* Turn on quotas so that they are updated correctly */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (EXT4_SB(sb)->s_qf_names[i]) {
+ int ret = ext4_quota_on_mount(sb, i);
+ if (ret < 0)
+ printk(KERN_ERR
+ "EXT4-fs: Cannot turn on journalled "
+ "quota: error %d\n", ret);
+ }
+ }
+#endif
+
+ while (es->s_last_orphan) {
+ struct inode *inode;
+
+ if (!(inode =
+ ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+ es->s_last_orphan = 0;
+ break;
+ }
+
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ DQUOT_INIT(inode);
+ if (inode->i_nlink) {
+ printk(KERN_DEBUG
+ "%s: truncating inode %lu to %Ld bytes\n",
+ __FUNCTION__, inode->i_ino, inode->i_size);
+ jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+ inode->i_ino, inode->i_size);
+ ext4_truncate(inode);
+ nr_truncates++;
+ } else {
+ printk(KERN_DEBUG
+ "%s: deleting unreferenced inode %lu\n",
+ __FUNCTION__, inode->i_ino);
+ jbd_debug(2, "deleting unreferenced inode %lu\n",
+ inode->i_ino);
+ nr_orphans++;
+ }
+ iput(inode); /* The delete magic happens here! */
+ }
+
+#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+
+ if (nr_orphans)
+ printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
+ sb->s_id, PLURAL(nr_orphans));
+ if (nr_truncates)
+ printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
+ sb->s_id, PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+ /* Turn quotas off */
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (sb_dqopt(sb)->files[i])
+ vfs_quota_off(sb, i);
+ }
+#endif
+ sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+}
+
+#define log2(n) ffz(~(n))
+
+/*
+ * Maximal file size. There is a direct, and {,double-,triple-}indirect
+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
+ * We need to be 1 filesystem block less than the 2^32 sector limit.
+ */
+static loff_t ext4_max_size(int bits)
+{
+ loff_t res = EXT4_NDIR_BLOCKS;
+ /* This constant is calculated to be the largest file size for a
+ * dense, 4k-blocksize file such that the total number of
+ * sectors in the file, including data and all indirect blocks,
+ * does not exceed 2^32. */
+ const loff_t upper_limit = 0x1ff7fffd000LL;
+
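+ /* Each indirect block holds 2^(bits-2) block pointers (1024 for a
+ * 4k block size), giving the single, double and triple indirect
+ * contributions added below. */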
+ res += 1LL << (bits-2);
+ res += 1LL << (2*(bits-2));
+ res += 1LL << (3*(bits-2));
+ res <<= bits;
+ if (res > upper_limit)
+ res = upper_limit;
+ return res;
+}
+
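+/*
+ * descriptor_loc: return the block holding group descriptor block 'nr'.
+ * Without META_BG all descriptor blocks simply follow the primary
+ * superblock; with META_BG, descriptor block 'nr' lives at the start of
+ * the first group of its meta block group, offset by one block if that
+ * group also carries a superblock backup.
+ */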
+static ext4_fsblk_t descriptor_loc(struct super_block *sb,
+ ext4_fsblk_t logical_sb_block, int nr)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ unsigned long bg, first_meta_bg;
+ int has_super = 0;
+
+ first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+ nr < first_meta_bg)
+ return logical_sb_block + nr + 1;
+ bg = sbi->s_desc_per_block * nr;
+ if (ext4_bg_has_super(sb, bg))
+ has_super = 1;
+ return (has_super + ext4_group_first_block_no(sb, bg));
+}
+
+
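+/*
+ * ext4_fill_super: read and validate the on-disk superblock, load the
+ * group descriptors, attach the journal (replaying it if needed), then
+ * look up the root inode and run orphan cleanup before the mount is
+ * declared complete.
+ */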
+static int ext4_fill_super (struct super_block *sb, void *data, int silent)
+{
+ struct buffer_head * bh;
+ struct ext4_super_block *es = NULL;
+ struct ext4_sb_info *sbi;
+ ext4_fsblk_t block;
+ ext4_fsblk_t sb_block = get_sb_block(&data);
+ ext4_fsblk_t logical_sb_block;
+ unsigned long offset = 0;
+ unsigned int journal_inum = 0;
+ unsigned long journal_devnum = 0;
+ unsigned long def_mount_opts;
+ struct inode *root;
+ int blocksize;
+ int hblock;
+ int db_count;
+ int i;
+ int needs_recovery;
+ __le32 features;
+ __u64 blocks_count;
+
+ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+ sb->s_fs_info = sbi;
+ sbi->s_mount_opt = 0;
+ sbi->s_resuid = EXT4_DEF_RESUID;
+ sbi->s_resgid = EXT4_DEF_RESGID;
+
+ unlock_kernel();
+
+ blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+ if (!blocksize) {
+ printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+ goto out_fail;
+ }
+
+ /*
+ * The ext4 superblock will not be buffer aligned for other than 1kB
+ * block sizes. We need to calculate the offset from buffer start.
+ */
+ if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+ logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ } else {
+ logical_sb_block = sb_block;
+ }
+
+ if (!(bh = sb_bread(sb, logical_sb_block))) {
+ printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
+ goto out_fail;
+ }
+ /*
+ * Note: s_es must be initialized as soon as possible because
+	 * some ext4 macros depend on its value
+ */
+ es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ sbi->s_es = es;
+ sb->s_magic = le16_to_cpu(es->s_magic);
+ if (sb->s_magic != EXT4_SUPER_MAGIC)
+ goto cantfind_ext4;
+
+ /* Set defaults before we parse the mount options */
+ def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+ if (def_mount_opts & EXT4_DEFM_DEBUG)
+ set_opt(sbi->s_mount_opt, DEBUG);
+ if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+ set_opt(sbi->s_mount_opt, GRPID);
+ if (def_mount_opts & EXT4_DEFM_UID16)
+ set_opt(sbi->s_mount_opt, NO_UID32);
+ if (def_mount_opts & EXT4_DEFM_XATTR_USER)
+ set_opt(sbi->s_mount_opt, XATTR_USER);
+ if (def_mount_opts & EXT4_DEFM_ACL)
+ set_opt(sbi->s_mount_opt, POSIX_ACL);
+ if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
+ sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+ else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
+ sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+ else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
+ sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+
+ if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+ set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+ else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
+ set_opt(sbi->s_mount_opt, ERRORS_RO);
+ else
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
+
+ sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
+ sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+
+ set_opt(sbi->s_mount_opt, RESERVATION);
+
+ if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
+ NULL, 0))
+ goto failed_mount;
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
+ (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
+ EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+ EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+ printk(KERN_WARNING
+ "EXT4-fs warning: feature flags set on rev 0 fs, "
+ "running e2fsck is recommended\n");
+ /*
+ * Check feature flags regardless of the revision level, since we
+ * previously didn't change the revision level when setting the flags,
+ * so there is a chance incompat flags are set on a rev 0 filesystem.
+ */
+ features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
+ if (features) {
+ printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
+ "unsupported optional features (%x).\n",
+ sb->s_id, le32_to_cpu(features));
+ goto failed_mount;
+ }
+ features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
+ if (!(sb->s_flags & MS_RDONLY) && features) {
+ printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
+ "unsupported optional features (%x).\n",
+ sb->s_id, le32_to_cpu(features));
+ goto failed_mount;
+ }
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+ blocksize > EXT4_MAX_BLOCK_SIZE) {
+ printk(KERN_ERR
+ "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
+ blocksize, sb->s_id);
+ goto failed_mount;
+ }
+
+ hblock = bdev_hardsect_size(sb->s_bdev);
+ if (sb->s_blocksize != blocksize) {
+ /*
+		 * Make sure the filesystem blocksize is at least as large
+		 * as the hardware sector size of the device.
+ */
+ if (blocksize < hblock) {
+ printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
+ "device blocksize %d.\n", blocksize, hblock);
+ goto failed_mount;
+ }
+
+ brelse (bh);
+ sb_set_blocksize(sb, blocksize);
+ logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+ offset = do_div(logical_sb_block, blocksize);
+ bh = sb_bread(sb, logical_sb_block);
+ if (!bh) {
+ printk(KERN_ERR
+ "EXT4-fs: Can't read superblock on 2nd try.\n");
+ goto failed_mount;
+ }
+ es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+ sbi->s_es = es;
+ if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+ printk (KERN_ERR
+ "EXT4-fs: Magic mismatch, very weird !\n");
+ goto failed_mount;
+ }
+ }
+
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+
+ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
+ sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
+ sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
+ } else {
+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+ if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
+ (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
+ (sbi->s_inode_size > blocksize)) {
+ printk (KERN_ERR
+ "EXT4-fs: unsupported inode size: %d\n",
+ sbi->s_inode_size);
+ goto failed_mount;
+ }
+ }
+ sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
+ le32_to_cpu(es->s_log_frag_size);
+ if (blocksize != sbi->s_frag_size) {
+ printk(KERN_ERR
+ "EXT4-fs: fragsize %lu != blocksize %u (unsupported)\n",
+ sbi->s_frag_size, blocksize);
+ goto failed_mount;
+ }
+ sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
+ sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
+ sbi->s_desc_size & (sbi->s_desc_size - 1)) {
+ printk(KERN_ERR
+ "EXT4-fs: unsupported descriptor size %lu\n",
+ sbi->s_desc_size);
+ goto failed_mount;
+ }
+ } else
+ sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+ sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+ sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
+ sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+ if (EXT4_INODE_SIZE(sb) == 0)
+ goto cantfind_ext4;
+ sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
+ if (sbi->s_inodes_per_block == 0)
+ goto cantfind_ext4;
+ sbi->s_itb_per_group = sbi->s_inodes_per_group /
+ sbi->s_inodes_per_block;
+ sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
+ sbi->s_sbh = bh;
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_addr_per_block_bits = log2(EXT4_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = log2(EXT4_DESC_PER_BLOCK(sb));
+ for (i=0; i < 4; i++)
+ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
+ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ printk (KERN_ERR
+ "EXT4-fs: #blocks per group too big: %lu\n",
+ sbi->s_blocks_per_group);
+ goto failed_mount;
+ }
+ if (sbi->s_frags_per_group > blocksize * 8) {
+ printk (KERN_ERR
+ "EXT4-fs: #fragments per group too big: %lu\n",
+ sbi->s_frags_per_group);
+ goto failed_mount;
+ }
+ if (sbi->s_inodes_per_group > blocksize * 8) {
+ printk (KERN_ERR
+ "EXT4-fs: #inodes per group too big: %lu\n",
+ sbi->s_inodes_per_group);
+ goto failed_mount;
+ }
+
+ if (ext4_blocks_count(es) >
+ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+ printk(KERN_ERR "EXT4-fs: filesystem on %s:"
+ " too large to mount safely\n", sb->s_id);
+ if (sizeof(sector_t) < 8)
+ printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
+ "enabled\n");
+ goto failed_mount;
+ }
+
+ if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
+ goto cantfind_ext4;
+ blocks_count = (ext4_blocks_count(es) -
+ le32_to_cpu(es->s_first_data_block) +
+ EXT4_BLOCKS_PER_GROUP(sb) - 1);
+ do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+ sbi->s_groups_count = blocks_count;
+ db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+ GFP_KERNEL);
+ if (sbi->s_group_desc == NULL) {
+ printk (KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed_mount;
+ }
+
+ bgl_lock_init(&sbi->s_blockgroup_lock);
+
+ for (i = 0; i < db_count; i++) {
+ block = descriptor_loc(sb, logical_sb_block, i);
+ sbi->s_group_desc[i] = sb_bread(sb, block);
+ if (!sbi->s_group_desc[i]) {
+ printk (KERN_ERR "EXT4-fs: "
+ "can't read group descriptor %d\n", i);
+ db_count = i;
+ goto failed_mount2;
+ }
+ }
+ if (!ext4_check_descriptors (sb)) {
+ printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+ goto failed_mount2;
+ }
+ sbi->s_gdb_count = db_count;
+ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+ spin_lock_init(&sbi->s_next_gen_lock);
+
+ percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+
+	/* per filesystem reservation list head & lock */
+ spin_lock_init(&sbi->s_rsv_window_lock);
+ sbi->s_rsv_window_root = RB_ROOT;
+ /* Add a single, static dummy reservation to the start of the
+ * reservation window list --- it gives us a placeholder for
+ * append-at-start-of-list which makes the allocation logic
+ * _much_ simpler. */
+ sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+ sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+ sbi->s_rsv_window_head.rsv_alloc_hit = 0;
+ sbi->s_rsv_window_head.rsv_goal_size = 0;
+ ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
+
+ /*
+ * set up enough so that it can read an inode
+ */
+ sb->s_op = &ext4_sops;
+ sb->s_export_op = &ext4_export_ops;
+ sb->s_xattr = ext4_xattr_handlers;
+#ifdef CONFIG_QUOTA
+ sb->s_qcop = &ext4_qctl_operations;
+ sb->dq_op = &ext4_quota_operations;
+#endif
+ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+
+ sb->s_root = NULL;
+
+ needs_recovery = (es->s_last_orphan != 0 ||
+ EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_RECOVER));
+
+ /*
+ * The first inode we look at is the journal inode. Don't try
+ * root first: it may be modified in the journal!
+ */
+ if (!test_opt(sb, NOLOAD) &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (ext4_load_journal(sb, es, journal_devnum))
+ goto failed_mount3;
+ } else if (journal_inum) {
+ if (ext4_create_journal(sb, es, journal_inum))
+ goto failed_mount3;
+ } else {
+ if (!silent)
+ printk (KERN_ERR
+ "ext4: No journal on filesystem on %s\n",
+ sb->s_id);
+ goto failed_mount3;
+ }
+
+ /* We have now updated the journal if required, so we can
+ * validate the data journaling mode. */
+ switch (test_opt(sb, DATA_FLAGS)) {
+ case 0:
+ /* No mode set, assume a default based on the journal
+ * capabilities: ORDERED_DATA if the journal can
+ * cope, else JOURNAL_DATA
+ */
+ if (jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
+ set_opt(sbi->s_mount_opt, ORDERED_DATA);
+ else
+ set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+ break;
+
+ case EXT4_MOUNT_ORDERED_DATA:
+ case EXT4_MOUNT_WRITEBACK_DATA:
+ if (!jbd2_journal_check_available_features
+ (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+ printk(KERN_ERR "EXT4-fs: Journal does not support "
+ "requested data journaling mode\n");
+ goto failed_mount4;
+ }
+ default:
+ break;
+ }
+
+ if (test_opt(sb, NOBH)) {
+ if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
+ "its supported only with writeback mode\n");
+ clear_opt(sbi->s_mount_opt, NOBH);
+ }
+ }
+ /*
+ * The jbd2_journal_load will have done any necessary log recovery,
+ * so we can safely mount the rest of the filesystem now.
+ */
+
+ root = iget(sb, EXT4_ROOT_INO);
+ sb->s_root = d_alloc_root(root);
+ if (!sb->s_root) {
+ printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+ iput(root);
+ goto failed_mount4;
+ }
+ if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+ dput(sb->s_root);
+ sb->s_root = NULL;
+ printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
+ goto failed_mount4;
+ }
+
+ ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+ /*
+ * akpm: core read_super() calls in here with the superblock locked.
+ * That deadlocks, because orphan cleanup needs to lock the superblock
+ * in numerous places. Here we just pop the lock - it's relatively
+ * harmless, because we are now ready to accept write_super() requests,
+ * and aviro says that's the only reason for hanging onto the
+ * superblock lock.
+ */
+ EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
+ ext4_orphan_cleanup(sb, es);
+ EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+ if (needs_recovery)
+ printk (KERN_INFO "EXT4-fs: recovery complete.\n");
+ ext4_mark_recovery_complete(sb, es);
+ printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
+ test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
+ test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
+ "writeback");
+
+ ext4_ext_init(sb);
+
+ lock_kernel();
+ return 0;
+
+cantfind_ext4:
+ if (!silent)
+ printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
+ sb->s_id);
+ goto failed_mount;
+
+failed_mount4:
+ jbd2_journal_destroy(sbi->s_journal);
+failed_mount3:
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+failed_mount2:
+ for (i = 0; i < db_count; i++)
+ brelse(sbi->s_group_desc[i]);
+ kfree(sbi->s_group_desc);
+failed_mount:
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kfree(sbi->s_qf_names[i]);
+#endif
+ ext4_blkdev_remove(sbi);
+ brelse(bh);
+out_fail:
+ sb->s_fs_info = NULL;
+ kfree(sbi);
+ lock_kernel();
+ return -EINVAL;
+}
+
+/*
+ * Setup any per-fs journal parameters now. We'll do this both on
+ * initial mount, once the journal has been initialised but before we've
+ * done any recovery; and again on any subsequent remount.
+ */
+static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (sbi->s_commit_interval)
+ journal->j_commit_interval = sbi->s_commit_interval;
+ /* We could also set up an ext4-specific default for the commit
+ * interval here, but for now we'll just fall back to the jbd
+ * default. */
+
+ spin_lock(&journal->j_state_lock);
+ if (test_opt(sb, BARRIER))
+ journal->j_flags |= JBD2_BARRIER;
+ else
+ journal->j_flags &= ~JBD2_BARRIER;
+ spin_unlock(&journal->j_state_lock);
+}
+
+static journal_t *ext4_get_journal(struct super_block *sb,
+ unsigned int journal_inum)
+{
+ struct inode *journal_inode;
+ journal_t *journal;
+
+ /* First, test for the existence of a valid inode on disk. Bad
+ * things happen if we iget() an unused inode, as the subsequent
+ * iput() will try to delete it. */
+
+ journal_inode = iget(sb, journal_inum);
+ if (!journal_inode) {
+ printk(KERN_ERR "EXT4-fs: no journal found.\n");
+ return NULL;
+ }
+ if (!journal_inode->i_nlink) {
+ make_bad_inode(journal_inode);
+ iput(journal_inode);
+ printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+ return NULL;
+ }
+
+ jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+ journal_inode, journal_inode->i_size);
+ if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
+ printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+ iput(journal_inode);
+ return NULL;
+ }
+
+ journal = jbd2_journal_init_inode(journal_inode);
+ if (!journal) {
+ printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+ iput(journal_inode);
+ return NULL;
+ }
+ journal->j_private = sb;
+ ext4_init_journal_params(sb, journal);
+ return journal;
+}
+
+static journal_t *ext4_get_dev_journal(struct super_block *sb,
+ dev_t j_dev)
+{
+ struct buffer_head * bh;
+ journal_t *journal;
+ ext4_fsblk_t start;
+ ext4_fsblk_t len;
+ int hblock, blocksize;
+ ext4_fsblk_t sb_block;
+ unsigned long offset;
+ struct ext4_super_block * es;
+ struct block_device *bdev;
+
+ bdev = ext4_blkdev_get(j_dev);
+ if (bdev == NULL)
+ return NULL;
+
+ if (bd_claim(bdev, sb)) {
+ printk(KERN_ERR
+ "EXT4: failed to claim external journal device.\n");
+ blkdev_put(bdev);
+ return NULL;
+ }
+
+ blocksize = sb->s_blocksize;
+ hblock = bdev_hardsect_size(bdev);
+ if (blocksize < hblock) {
+ printk(KERN_ERR
+ "EXT4-fs: blocksize too small for journal device.\n");
+ goto out_bdev;
+ }
+
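+	/*
+	 * The external journal device has an ext4-style superblock at byte
+	 * offset 1024 (EXT4_MIN_BLOCK_SIZE); work out which block and
+	 * intra-block offset that corresponds to at our blocksize.
+	 */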
+ sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
+ offset = EXT4_MIN_BLOCK_SIZE % blocksize;
+ set_blocksize(bdev, blocksize);
+ if (!(bh = __bread(bdev, sb_block, blocksize))) {
+ printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
+ "external journal\n");
+ goto out_bdev;
+ }
+
+ es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+ if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
+ !(le32_to_cpu(es->s_feature_incompat) &
+ EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+ printk(KERN_ERR "EXT4-fs: external journal has "
+ "bad superblock\n");
+ brelse(bh);
+ goto out_bdev;
+ }
+
+ if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
+ printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+ brelse(bh);
+ goto out_bdev;
+ }
+
+ len = ext4_blocks_count(es);
+ start = sb_block + 1;
+ brelse(bh); /* we're done with the superblock */
+
+ journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
+ start, len, blocksize);
+ if (!journal) {
+ printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+ goto out_bdev;
+ }
+ journal->j_private = sb;
+ ll_rw_block(READ, 1, &journal->j_sb_buffer);
+ wait_on_buffer(journal->j_sb_buffer);
+ if (!buffer_uptodate(journal->j_sb_buffer)) {
+ printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+ goto out_journal;
+ }
+ if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
+ printk(KERN_ERR "EXT4-fs: External journal has more than one "
+ "user (unsupported) - %d\n",
+ be32_to_cpu(journal->j_superblock->s_nr_users));
+ goto out_journal;
+ }
+ EXT4_SB(sb)->journal_bdev = bdev;
+ ext4_init_journal_params(sb, journal);
+ return journal;
+out_journal:
+ jbd2_journal_destroy(journal);
+out_bdev:
+ ext4_blkdev_put(bdev);
+ return NULL;
+}
+
+static int ext4_load_journal(struct super_block *sb,
+ struct ext4_super_block *es,
+ unsigned long journal_devnum)
+{
+ journal_t *journal;
+ unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
+ dev_t journal_dev;
+ int err = 0;
+ int really_read_only;
+
+ if (journal_devnum &&
+ journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+ printk(KERN_INFO "EXT4-fs: external journal device major/minor "
+ "numbers have changed\n");
+ journal_dev = new_decode_dev(journal_devnum);
+ } else
+ journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
+
+ really_read_only = bdev_read_only(sb->s_bdev);
+
+ /*
+ * Are we loading a blank journal or performing recovery after a
+ * crash? For recovery, we need to check in advance whether we
+ * can get read-write access to the device.
+ */
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ if (sb->s_flags & MS_RDONLY) {
+ printk(KERN_INFO "EXT4-fs: INFO: recovery "
+ "required on readonly filesystem.\n");
+ if (really_read_only) {
+ printk(KERN_ERR "EXT4-fs: write access "
+ "unavailable, cannot proceed.\n");
+ return -EROFS;
+ }
+ printk (KERN_INFO "EXT4-fs: write access will "
+ "be enabled during recovery.\n");
+ }
+ }
+
+ if (journal_inum && journal_dev) {
+ printk(KERN_ERR "EXT4-fs: filesystem has both journal "
+ "and inode journals!\n");
+ return -EINVAL;
+ }
+
+ if (journal_inum) {
+ if (!(journal = ext4_get_journal(sb, journal_inum)))
+ return -EINVAL;
+ } else {
+ if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
+ return -EINVAL;
+ }
+
+ if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
+ err = jbd2_journal_update_format(journal);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: error updating journal.\n");
+ jbd2_journal_destroy(journal);
+ return err;
+ }
+ }
+
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+ err = jbd2_journal_wipe(journal, !really_read_only);
+ if (!err)
+ err = jbd2_journal_load(journal);
+
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+ jbd2_journal_destroy(journal);
+ return err;
+ }
+
+ EXT4_SB(sb)->s_journal = journal;
+ ext4_clear_journal_err(sb, es);
+
+ if (journal_devnum &&
+ journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+ es->s_journal_dev = cpu_to_le32(journal_devnum);
+ sb->s_dirt = 1;
+
+ /* Make sure we flush the recovery flag to disk. */
+ ext4_commit_super(sb, es, 1);
+ }
+
+ return 0;
+}
+
+static int ext4_create_journal(struct super_block * sb,
+ struct ext4_super_block * es,
+ unsigned int journal_inum)
+{
+ journal_t *journal;
+
+ if (sb->s_flags & MS_RDONLY) {
+ printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
+ "create journal.\n");
+ return -EROFS;
+ }
+
+ if (!(journal = ext4_get_journal(sb, journal_inum)))
+ return -EINVAL;
+
+ printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
+ journal_inum);
+
+ if (jbd2_journal_create(journal)) {
+ printk(KERN_ERR "EXT4-fs: error creating journal.\n");
+ jbd2_journal_destroy(journal);
+ return -EIO;
+ }
+
+ EXT4_SB(sb)->s_journal = journal;
+
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
+
+ es->s_journal_inum = cpu_to_le32(journal_inum);
+ sb->s_dirt = 1;
+
+ /* Make sure we flush the recovery flag to disk. */
+ ext4_commit_super(sb, es, 1);
+
+ return 0;
+}
+
+static void ext4_commit_super (struct super_block * sb,
+ struct ext4_super_block * es,
+ int sync)
+{
+ struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+
+ if (!sbh)
+ return;
+ es->s_wtime = cpu_to_le32(get_seconds());
+ ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
+ es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+ BUFFER_TRACE(sbh, "marking dirty");
+ mark_buffer_dirty(sbh);
+ if (sync)
+ sync_dirty_buffer(sbh);
+}
+
+
+/*
+ * Have we just finished recovery? If so, and if we are mounting (or
+ * remounting) the filesystem readonly, then we will end up with a
+ * consistent fs on disk. Record that fact.
+ */
+static void ext4_mark_recovery_complete(struct super_block * sb,
+ struct ext4_super_block * es)
+{
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+
+ jbd2_journal_lock_updates(journal);
+ jbd2_journal_flush(journal);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+ sb->s_flags & MS_RDONLY) {
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ sb->s_dirt = 0;
+ ext4_commit_super(sb, es, 1);
+ }
+ jbd2_journal_unlock_updates(journal);
+}
+
+/*
+ * If we are mounting (or read-write remounting) a filesystem whose journal
+ * has recorded an error from a previous lifetime, move that error to the
+ * main filesystem now.
+ */
+static void ext4_clear_journal_err(struct super_block * sb,
+ struct ext4_super_block * es)
+{
+ journal_t *journal;
+ int j_errno;
+ const char *errstr;
+
+ journal = EXT4_SB(sb)->s_journal;
+
+ /*
+ * Now check for any error status which may have been recorded in the
+ * journal by a prior ext4_error() or ext4_abort()
+ */
+
+ j_errno = jbd2_journal_errno(journal);
+ if (j_errno) {
+ char nbuf[16];
+
+ errstr = ext4_decode_error(sb, j_errno, nbuf);
+ ext4_warning(sb, __FUNCTION__, "Filesystem error recorded "
+ "from previous mount: %s", errstr);
+ ext4_warning(sb, __FUNCTION__, "Marking fs in need of "
+ "filesystem check.");
+
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ ext4_commit_super (sb, es, 1);
+
+ jbd2_journal_clear_err(journal);
+ }
+}
+
+/*
+ * Force the running and committing transactions to commit,
+ * and wait on the commit.
+ */
+int ext4_force_commit(struct super_block *sb)
+{
+ journal_t *journal;
+ int ret;
+
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ journal = EXT4_SB(sb)->s_journal;
+ sb->s_dirt = 0;
+ ret = ext4_journal_force_commit(journal);
+ return ret;
+}
+
+/*
+ * Ext4 always journals updates to the superblock itself, so we don't
+ * have to propagate any other updates to the superblock on disk at this
+ * point. Just start an async writeback to get the buffers on their way
+ * to the disk.
+ *
+ * This implicitly triggers the writebehind on sync().
+ */
+
+static void ext4_write_super (struct super_block * sb)
+{
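+	/* write_super is called with sb->s_lock held; if the trylock
+	 * succeeds, the lock was not held and something is badly wrong. */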
+ if (mutex_trylock(&sb->s_lock) != 0)
+ BUG();
+ sb->s_dirt = 0;
+}
+
+static int ext4_sync_fs(struct super_block *sb, int wait)
+{
+ tid_t target;
+
+ sb->s_dirt = 0;
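+	/* Ask jbd2 to start a commit of the running transaction; if 'wait'
+	 * is set, block until that commit has reached the disk. */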
+ if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ if (wait)
+ jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+ }
+ return 0;
+}
+
+/*
+ * LVM calls this function before a (read-only) snapshot is created. This
+ * gives us a chance to flush the journal completely and mark the fs clean.
+ */
+static void ext4_write_super_lockfs(struct super_block *sb)
+{
+ sb->s_dirt = 0;
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ journal_t *journal = EXT4_SB(sb)->s_journal;
+
+ /* Now we set up the journal barrier. */
+ jbd2_journal_lock_updates(journal);
+ jbd2_journal_flush(journal);
+
+ /* Journal blocked and flushed, clear needs_recovery flag. */
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+ }
+}
+
+/*
+ * Called by LVM after the snapshot is done. We need to reset the RECOVER
+ * flag here, even though the filesystem is not technically dirty yet.
+ */
+static void ext4_unlockfs(struct super_block *sb)
+{
+ if (!(sb->s_flags & MS_RDONLY)) {
+ lock_super(sb);
+		/* Reset the needs_recovery flag before the fs is unlocked. */
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+ unlock_super(sb);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+}
+
+static int ext4_remount (struct super_block * sb, int * flags, char * data)
+{
+ struct ext4_super_block * es;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t n_blocks_count = 0;
+ unsigned long old_sb_flags;
+ struct ext4_mount_options old_opts;
+ int err;
+#ifdef CONFIG_QUOTA
+ int i;
+#endif
+
+ /* Store the original options */
+ old_sb_flags = sb->s_flags;
+ old_opts.s_mount_opt = sbi->s_mount_opt;
+ old_opts.s_resuid = sbi->s_resuid;
+ old_opts.s_resgid = sbi->s_resgid;
+ old_opts.s_commit_interval = sbi->s_commit_interval;
+#ifdef CONFIG_QUOTA
+ old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
+ for (i = 0; i < MAXQUOTAS; i++)
+ old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+#endif
+
+ /*
+ * Allow the "check" option to be passed as a remount option.
+ */
+ if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+ err = -EINVAL;
+ goto restore_opts;
+ }
+
+ if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
+ ext4_abort(sb, __FUNCTION__, "Abort forced by user");
+
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+ es = sbi->s_es;
+
+ ext4_init_journal_params(sb, sbi->s_journal);
+
+ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
+ n_blocks_count > ext4_blocks_count(es)) {
+ if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
+ err = -EROFS;
+ goto restore_opts;
+ }
+
+ if (*flags & MS_RDONLY) {
+ /*
+ * First of all, the unconditional stuff we have to do
+ * to disable replay of the journal when we next remount
+ */
+ sb->s_flags |= MS_RDONLY;
+
+ /*
+ * OK, test if we are remounting a valid rw partition
+ * readonly, and if so set the rdonly flag and then
+ * mark the partition as valid again.
+ */
+ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
+ (sbi->s_mount_state & EXT4_VALID_FS))
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+ ext4_mark_recovery_complete(sb, es);
+ } else {
+ __le32 ret;
+ if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
+ printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+ "remount RDWR because of unsupported "
+ "optional features (%x).\n",
+ sb->s_id, le32_to_cpu(ret));
+ err = -EROFS;
+ goto restore_opts;
+ }
+ /*
+ * Mounting a RDONLY partition read-write, so reread
+ * and store the current valid flag. (It may have
+ * been changed by e2fsck since we originally mounted
+ * the partition.)
+ */
+ ext4_clear_journal_err(sb, es);
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ if ((err = ext4_group_extend(sb, es, n_blocks_count)))
+ goto restore_opts;
+ if (!ext4_setup_super (sb, es, 0))
+ sb->s_flags &= ~MS_RDONLY;
+ }
+ }
+#ifdef CONFIG_QUOTA
+ /* Release old quota file names */
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (old_opts.s_qf_names[i] &&
+ old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+ kfree(old_opts.s_qf_names[i]);
+#endif
+ return 0;
+restore_opts:
+ sb->s_flags = old_sb_flags;
+ sbi->s_mount_opt = old_opts.s_mount_opt;
+ sbi->s_resuid = old_opts.s_resuid;
+ sbi->s_resgid = old_opts.s_resgid;
+ sbi->s_commit_interval = old_opts.s_commit_interval;
+#ifdef CONFIG_QUOTA
+ sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (sbi->s_qf_names[i] &&
+ old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+ kfree(sbi->s_qf_names[i]);
+ sbi->s_qf_names[i] = old_opts.s_qf_names[i];
+ }
+#endif
+ return err;
+}
+
+static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ ext4_fsblk_t overhead;
+ int i;
+
+ if (test_opt (sb, MINIX_DF))
+ overhead = 0;
+ else {
+ unsigned long ngroups;
+ ngroups = EXT4_SB(sb)->s_groups_count;
+ smp_rmb();
+
+ /*
+ * Compute the overhead (FS structures)
+ */
+
+ /*
+ * All of the blocks before first_data_block are
+ * overhead
+ */
+ overhead = le32_to_cpu(es->s_first_data_block);
+
+ /*
+ * Add the overhead attributed to the superblock and
+ * block group descriptors. If the sparse superblocks
+ * feature is turned on, then not all groups have this.
+ */
+ for (i = 0; i < ngroups; i++) {
+ overhead += ext4_bg_has_super(sb, i) +
+ ext4_bg_num_gdb(sb, i);
+ cond_resched();
+ }
+
+ /*
+ * Every block group has an inode bitmap, a block
+ * bitmap, and an inode table.
+ */
+ overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group));
+ }
+
+ buf->f_type = EXT4_SUPER_MAGIC;
+ buf->f_bsize = sb->s_blocksize;
+ buf->f_blocks = ext4_blocks_count(es) - overhead;
+ buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+ buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
+ if (buf->f_bfree < ext4_r_blocks_count(es))
+ buf->f_bavail = 0;
+ buf->f_files = le32_to_cpu(es->s_inodes_count);
+ buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+ buf->f_namelen = EXT4_NAME_LEN;
+ return 0;
+}
+
+/* Helper function for writing quotas on sync - we need to start the transaction
+ * before the quota file is locked for write. Otherwise there are possible deadlocks:
+ * Process 1 Process 2
+ * ext4_create() quota_sync()
+ * jbd2_journal_start() write_dquot()
+ * DQUOT_INIT() down(dqio_mutex)
+ * down(dqio_mutex) jbd2_journal_start()
+ *
+ */
+
+#ifdef CONFIG_QUOTA
+
+static inline struct inode *dquot_to_inode(struct dquot *dquot)
+{
+ return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
+}
+
+static int ext4_dquot_initialize(struct inode *inode, int type)
+{
+ handle_t *handle;
+ int ret, err;
+
+ /* We may create quota structure so we need to reserve enough blocks */
+ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = dquot_initialize(inode, type);
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+}
+
+static int ext4_dquot_drop(struct inode *inode)
+{
+ handle_t *handle;
+ int ret, err;
+
+ /* We may delete quota structure so we need to reserve enough blocks */
+ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = dquot_drop(inode);
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+}
+
+static int ext4_write_dquot(struct dquot *dquot)
+{
+ int ret, err;
+ handle_t *handle;
+ struct inode *inode;
+
+ inode = dquot_to_inode(dquot);
+ handle = ext4_journal_start(inode,
+ EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = dquot_commit(dquot);
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+}
+
+static int ext4_acquire_dquot(struct dquot *dquot)
+{
+ int ret, err;
+ handle_t *handle;
+
+ handle = ext4_journal_start(dquot_to_inode(dquot),
+ EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = dquot_acquire(dquot);
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+}
+
+static int ext4_release_dquot(struct dquot *dquot)
+{
+ int ret, err;
+ handle_t *handle;
+
+ handle = ext4_journal_start(dquot_to_inode(dquot),
+ EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = dquot_release(dquot);
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+}
+
+static int ext4_mark_dquot_dirty(struct dquot *dquot)
+{
+ /* Are we journalling quotas? */
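+	/* If so, write the dquot immediately so the update goes into the
+	 * currently running transaction rather than being written back
+	 * lazily by the generic quota code. */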
+ if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
+ EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+ dquot_mark_dquot_dirty(dquot);
+ return ext4_write_dquot(dquot);
+ } else {
+ return dquot_mark_dquot_dirty(dquot);
+ }
+}
+
+static int ext4_write_info(struct super_block *sb, int type)
+{
+ int ret, err;
+ handle_t *handle;
+
+ /* Data block + inode block */
+ handle = ext4_journal_start(sb->s_root->d_inode, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ret = dquot_commit_info(sb, type);
+ err = ext4_journal_stop(handle);
+ if (!ret)
+ ret = err;
+ return ret;
+}
+
+/*
+ * Turn on quotas during mount time - we need to find
+ * the quota file and such...
+ */
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+ return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+ EXT4_SB(sb)->s_jquota_fmt, type);
+}
+
+/*
+ * Standard function to be called on quota_on
+ */
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+ char *path)
+{
+ int err;
+ struct nameidata nd;
+
+ if (!test_opt(sb, QUOTA))
+ return -EINVAL;
+ /* Not journalling quota? */
+ if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
+ !EXT4_SB(sb)->s_qf_names[GRPQUOTA])
+ return vfs_quota_on(sb, type, format_id, path);
+ err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+ if (err)
+ return err;
+ /* Quotafile not on the same filesystem? */
+ if (nd.mnt->mnt_sb != sb) {
+ path_release(&nd);
+ return -EXDEV;
+ }
+ /* Quotafile not of fs root? */
+ if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
+ printk(KERN_WARNING
+ "EXT4-fs: Quota file not on filesystem root. "
+ "Journalled quota will not work.\n");
+ path_release(&nd);
+ return vfs_quota_on(sb, type, format_id, path);
+}
+
+/* Read data from quotafile - avoid pagecache and such because we cannot afford
+ * acquiring the locks... As quota files are never truncated and quota code
+ * itself serializes the operations (and no one else should touch the files)
+ * we don't have to be afraid of races */
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+ int err = 0;
+ int offset = off & (sb->s_blocksize - 1);
+ int tocopy;
+ size_t toread;
+ struct buffer_head *bh;
+ loff_t i_size = i_size_read(inode);
+
+ if (off > i_size)
+ return 0;
+ if (off+len > i_size)
+ len = i_size-off;
+ toread = len;
+ while (toread > 0) {
+ tocopy = sb->s_blocksize - offset < toread ?
+ sb->s_blocksize - offset : toread;
+ bh = ext4_bread(NULL, inode, blk, 0, &err);
+ if (err)
+ return err;
+ if (!bh) /* A hole? */
+ memset(data, 0, tocopy);
+ else
+ memcpy(data, bh->b_data+offset, tocopy);
+ brelse(bh);
+ offset = 0;
+ toread -= tocopy;
+ data += tocopy;
+ blk++;
+ }
+ return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off)
+{
+ struct inode *inode = sb_dqopt(sb)->files[type];
+ sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+ int err = 0;
+ int offset = off & (sb->s_blocksize - 1);
+ int tocopy;
+ int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
+ size_t towrite = len;
+ struct buffer_head *bh;
+ handle_t *handle = journal_current_handle();
+
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
+ while (towrite > 0) {
+ tocopy = sb->s_blocksize - offset < towrite ?
+ sb->s_blocksize - offset : towrite;
+ bh = ext4_bread(handle, inode, blk, 1, &err);
+ if (!bh)
+ goto out;
+ if (journal_quota) {
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ brelse(bh);
+ goto out;
+ }
+ }
+ lock_buffer(bh);
+ memcpy(bh->b_data+offset, data, tocopy);
+ flush_dcache_page(bh->b_page);
+ unlock_buffer(bh);
+ if (journal_quota)
+ err = ext4_journal_dirty_metadata(handle, bh);
+ else {
+ /* Always do at least ordered writes for quotas */
+ err = ext4_journal_dirty_data(handle, bh);
+ mark_buffer_dirty(bh);
+ }
+ brelse(bh);
+ if (err)
+ goto out;
+ offset = 0;
+ towrite -= tocopy;
+ data += tocopy;
+ blk++;
+ }
+out:
+ if (len == towrite)
+ return err;
+ if (inode->i_size < off+len-towrite) {
+ i_size_write(inode, off+len-towrite);
+ EXT4_I(inode)->i_disksize = inode->i_size;
+ }
+ inode->i_version++;
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ ext4_mark_inode_dirty(handle, inode);
+ mutex_unlock(&inode->i_mutex);
+ return len - towrite;
+}
+
+#endif
+
+static int ext4_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+}
+
+static struct file_system_type ext4dev_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext4dev",
+ .get_sb = ext4_get_sb,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+static int __init init_ext4_fs(void)
+{
+ int err = init_ext4_xattr();
+ if (err)
+ return err;
+ err = init_inodecache();
+ if (err)
+ goto out1;
+ err = register_filesystem(&ext4dev_fs_type);
+ if (err)
+ goto out;
+ return 0;
+out:
+ destroy_inodecache();
+out1:
+ exit_ext4_xattr();
+ return err;
+}
+
+static void __exit exit_ext4_fs(void)
+{
+ unregister_filesystem(&ext4dev_fs_type);
+ destroy_inodecache();
+ exit_ext4_xattr();
+}
+
+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_LICENSE("GPL");
+module_init(init_ext4_fs)
+module_exit(exit_ext4_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 000000000000..fcf527286d75
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,54 @@
+/*
+ * linux/fs/ext4/symlink.c
+ *
+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ * from
+ *
+ * linux/fs/minix/symlink.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * ext4 symlink handling code
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/namei.h>
+#include "xattr.h"
+
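+/*
+ * Fast symlinks keep the target path inline in the inode's i_data area,
+ * so following the link just hands that string to the VFS; no data block
+ * needs to be read.
+ */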
+static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
+ nd_set_link(nd, (char*)ei->i_data);
+ return NULL;
+}
+
+struct inode_operations ext4_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+struct inode_operations ext4_fast_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = ext4_follow_link,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ext4_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 000000000000..63233cd946a7
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1317 @@
+/*
+ * linux/fs/ext4/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
+ * Extended attributes for symlinks and special files added per
+ * suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ * Red Hat Inc.
+ * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
+ * and Andreas Gruenbacher <agruen@suse.de>.
+ */
+
+/*
+ * Extended attributes are stored directly in inodes (on file systems with
+ * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
+ * field contains the block number if an inode uses an additional block. All
+ * attributes must fit in the inode and one additional block. Blocks that
+ * contain the identical set of attributes may be shared among several inodes.
+ * Identical blocks are detected by keeping a cache of blocks that have
+ * recently been accessed.
+ *
+ * The attributes in inodes and on blocks have a different header; the entries
+ * are stored in the same format:
+ *
+ * +------------------+
+ * | header |
+ * | entry 1 | |
+ * | entry 2 | | growing downwards
+ * | entry 3 | v
+ * | four null bytes |
+ * | . . . |
+ * | value 1 | ^
+ * | value 3 | | growing upwards
+ * | value 2 | |
+ * +------------------+
+ *
+ * The header is followed by multiple entry descriptors. In disk blocks, the
+ * entry descriptors are kept sorted. In inodes, they are unsorted. The
+ * attribute values are aligned to the end of the block in no specific order.
+ *
+ * Locking strategy
+ * ----------------
+ * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count can change. Multiple writers to the same block are synchronized
+ * by the buffer lock.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/mbcache.h>
+#include <linux/quotaops.h>
+#include <linux/rwsem.h>
+#include "xattr.h"
+#include "acl.h"
+
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define IHDR(inode, raw_inode) \
+ ((struct ext4_xattr_ibody_header *) \
+ ((void *)raw_inode + \
+ EXT4_GOOD_OLD_INODE_SIZE + \
+ EXT4_I(inode)->i_extra_isize))
+#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+
+#ifdef EXT4_XATTR_DEBUG
+# define ea_idebug(inode, f...) do { \
+ printk(KERN_DEBUG "inode %s:%lu: ", \
+ inode->i_sb->s_id, inode->i_ino); \
+ printk(f); \
+ printk("\n"); \
+ } while (0)
+# define ea_bdebug(bh, f...) do { \
+ char b[BDEVNAME_SIZE]; \
+ printk(KERN_DEBUG "block %s:%lu: ", \
+ bdevname(bh->b_bdev, b), \
+ (unsigned long) bh->b_blocknr); \
+ printk(f); \
+ printk("\n"); \
+ } while (0)
+#else
+# define ea_idebug(f...)
+# define ea_bdebug(f...)
+#endif
+
+static void ext4_xattr_cache_insert(struct buffer_head *);
+static struct buffer_head *ext4_xattr_cache_find(struct inode *,
+ struct ext4_xattr_header *,
+ struct mb_cache_entry **);
+static void ext4_xattr_rehash(struct ext4_xattr_header *,
+ struct ext4_xattr_entry *);
+
+static struct mb_cache *ext4_xattr_cache;
+
+static struct xattr_handler *ext4_xattr_handler_map[] = {
+ [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+ [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+#endif
+ [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4DEV_FS_SECURITY
+ [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
+#endif
+};
+
+struct xattr_handler *ext4_xattr_handlers[] = {
+ &ext4_xattr_user_handler,
+ &ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+ &ext4_xattr_acl_access_handler,
+ &ext4_xattr_acl_default_handler,
+#endif
+#ifdef CONFIG_EXT4DEV_FS_SECURITY
+ &ext4_xattr_security_handler,
+#endif
+ NULL
+};
+
+static inline struct xattr_handler *
+ext4_xattr_handler(int name_index)
+{
+ struct xattr_handler *handler = NULL;
+
+ if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
+ handler = ext4_xattr_handler_map[name_index];
+ return handler;
+}
+
+/*
+ * Inode operation listxattr()
+ *
+ * dentry->d_inode->i_mutex: don't care
+ */
+ssize_t
+ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ return ext4_xattr_list(dentry->d_inode, buffer, size);
+}
+
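+/*
+ * Walk the entry descriptors and fail if the chain of entries would run
+ * past the end of the containing buffer.
+ */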
+static int
+ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
+{
+ while (!IS_LAST_ENTRY(entry)) {
+ struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
+ if ((void *)next >= end)
+ return -EIO;
+ entry = next;
+ }
+ return 0;
+}
+
+static inline int
+ext4_xattr_check_block(struct buffer_head *bh)
+{
+ int error;
+
+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+ BHDR(bh)->h_blocks != cpu_to_le32(1))
+ return -EIO;
+ error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+ return error;
+}
+
+static inline int
+ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
+{
+ size_t value_size = le32_to_cpu(entry->e_value_size);
+
+ if (entry->e_value_block != 0 || value_size > size ||
+ le16_to_cpu(entry->e_value_offs) + value_size > size)
+ return -EIO;
+ return 0;
+}
+
+static int
+ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
+ const char *name, size_t size, int sorted)
+{
+ struct ext4_xattr_entry *entry;
+ size_t name_len;
+ int cmp = 1;
+
+ if (name == NULL)
+ return -EINVAL;
+ name_len = strlen(name);
+ entry = *pentry;
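+	/* Entries in xattr blocks are kept sorted, so a sorted search may
+	 * stop as soon as it passes the point where the name would sort;
+	 * in-inode entries are unsorted and are scanned for an exact match. */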
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ cmp = name_index - entry->e_name_index;
+ if (!cmp)
+ cmp = name_len - entry->e_name_len;
+ if (!cmp)
+ cmp = memcmp(name, entry->e_name, name_len);
+ if (cmp <= 0 && (sorted || cmp == 0))
+ break;
+ }
+ *pentry = entry;
+ if (!cmp && ext4_xattr_check_entry(entry, size))
+ return -EIO;
+ return cmp ? -ENODATA : 0;
+}
+
+static int
+ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
+ void *buffer, size_t buffer_size)
+{
+ struct buffer_head *bh = NULL;
+ struct ext4_xattr_entry *entry;
+ size_t size;
+ int error;
+
+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
+ name_index, name, buffer, (long)buffer_size);
+
+ error = -ENODATA;
+ if (!EXT4_I(inode)->i_file_acl)
+ goto cleanup;
+ ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+ if (!bh)
+ goto cleanup;
+ ea_bdebug(bh, "b_count=%d, refcount=%d",
+ atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+ if (ext4_xattr_check_block(bh)) {
+bad_block: ext4_error(inode->i_sb, __FUNCTION__,
+ "inode %lu: bad block %llu", inode->i_ino,
+ EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ goto cleanup;
+ }
+ ext4_xattr_cache_insert(bh);
+ entry = BFIRST(bh);
+ error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
+ if (error == -EIO)
+ goto bad_block;
+ if (error)
+ goto cleanup;
+ size = le32_to_cpu(entry->e_value_size);
+ if (buffer) {
+ error = -ERANGE;
+ if (size > buffer_size)
+ goto cleanup;
+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
+ size);
+ }
+ error = size;
+
+cleanup:
+ brelse(bh);
+ return error;
+}
+
+static int
+ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
+ void *buffer, size_t buffer_size)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_entry *entry;
+ struct ext4_inode *raw_inode;
+ struct ext4_iloc iloc;
+ size_t size;
+ void *end;
+ int error;
+
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ return -ENODATA;
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+ raw_inode = ext4_raw_inode(&iloc);
+ header = IHDR(inode, raw_inode);
+ entry = IFIRST(header);
+ end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ error = ext4_xattr_check_names(entry, end);
+ if (error)
+ goto cleanup;
+ error = ext4_xattr_find_entry(&entry, name_index, name,
+ end - (void *)entry, 0);
+ if (error)
+ goto cleanup;
+ size = le32_to_cpu(entry->e_value_size);
+ if (buffer) {
+ error = -ERANGE;
+ if (size > buffer_size)
+ goto cleanup;
+ memcpy(buffer, (void *)IFIRST(header) +
+ le16_to_cpu(entry->e_value_offs), size);
+ }
+ error = size;
+
+cleanup:
+ brelse(iloc.bh);
+ return error;
+}
+
+/*
+ * ext4_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+ void *buffer, size_t buffer_size)
+{
+ int error;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
+ buffer_size);
+ if (error == -ENODATA)
+ error = ext4_xattr_block_get(inode, name_index, name, buffer,
+ buffer_size);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return error;
+}
+
+static int
+ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
+ char *buffer, size_t buffer_size)
+{
+ size_t rest = buffer_size;
+
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ struct xattr_handler *handler =
+ ext4_xattr_handler(entry->e_name_index);
+
+ if (handler) {
+ size_t size = handler->list(inode, buffer, rest,
+ entry->e_name,
+ entry->e_name_len);
+ if (buffer) {
+ if (size > rest)
+ return -ERANGE;
+ buffer += size;
+ }
+ rest -= size;
+ }
+ }
+ return buffer_size - rest;
+}
+
+static int
+ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+{
+ struct buffer_head *bh = NULL;
+ int error;
+
+ ea_idebug(inode, "buffer=%p, buffer_size=%ld",
+ buffer, (long)buffer_size);
+
+ error = 0;
+ if (!EXT4_I(inode)->i_file_acl)
+ goto cleanup;
+ ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+ bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ if (!bh)
+ goto cleanup;
+ ea_bdebug(bh, "b_count=%d, refcount=%d",
+ atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+ if (ext4_xattr_check_block(bh)) {
+ ext4_error(inode->i_sb, __FUNCTION__,
+ "inode %lu: bad block %llu", inode->i_ino,
+ EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ goto cleanup;
+ }
+ ext4_xattr_cache_insert(bh);
+ error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+
+cleanup:
+ brelse(bh);
+
+ return error;
+}
+
+static int
+ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_inode *raw_inode;
+ struct ext4_iloc iloc;
+ void *end;
+ int error;
+
+ if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+ return 0;
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+ raw_inode = ext4_raw_inode(&iloc);
+ header = IHDR(inode, raw_inode);
+ end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ error = ext4_xattr_check_names(IFIRST(header), end);
+ if (error)
+ goto cleanup;
+ error = ext4_xattr_list_entries(inode, IFIRST(header),
+ buffer, buffer_size);
+
+cleanup:
+ brelse(iloc.bh);
+ return error;
+}
+
+/*
+ * ext4_xattr_list()
+ *
+ * Copy a list of attribute names into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+{
+ int i_error, b_error;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
+ if (i_error < 0) {
+ b_error = 0;
+ } else {
+ if (buffer) {
+ buffer += i_error;
+ buffer_size -= i_error;
+ }
+ b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
+ if (b_error < 0)
+ i_error = 0;
+ }
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return i_error + b_error;
+}
+
+/*
+ * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is
+ * not set, set it.
+ */
+static void ext4_xattr_update_super_block(handle_t *handle,
+ struct super_block *sb)
+{
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+ return;
+
+ lock_super(sb);
+ if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
+ EXT4_SB(sb)->s_es->s_feature_compat |=
+ cpu_to_le32(EXT4_FEATURE_COMPAT_EXT_ATTR);
+ sb->s_dirt = 1;
+ ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+ }
+ unlock_super(sb);
+}
+
+/*
+ * Release the xattr block BH: If the reference count is > 1, decrement
+ * it; otherwise free the block.
+ */
+static void
+ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct mb_cache_entry *ce = NULL;
+
+ ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+ ea_bdebug(bh, "refcount now=0; freeing");
+ if (ce)
+ mb_cache_entry_free(ce);
+ ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
+ get_bh(bh);
+ ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+ } else {
+ if (ext4_journal_get_write_access(handle, bh) == 0) {
+ lock_buffer(bh);
+ BHDR(bh)->h_refcount = cpu_to_le32(
+ le32_to_cpu(BHDR(bh)->h_refcount) - 1);
+ ext4_journal_dirty_metadata(handle, bh);
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+ DQUOT_FREE_BLOCK(inode, 1);
+ unlock_buffer(bh);
+ ea_bdebug(bh, "refcount now=%d; releasing",
+ le32_to_cpu(BHDR(bh)->h_refcount));
+ }
+ if (ce)
+ mb_cache_entry_release(ce);
+ }
+}
+
+struct ext4_xattr_info {
+ int name_index;
+ const char *name;
+ const void *value;
+ size_t value_len;
+};
+
+struct ext4_xattr_search {
+ struct ext4_xattr_entry *first;
+ void *base;
+ void *end;
+ struct ext4_xattr_entry *here;
+ int not_found;
+};
+
+static int
+ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+{
+ struct ext4_xattr_entry *last;
+ size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+
+ /* Compute min_offs and last. */
+ last = s->first;
+ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+ if (!last->e_value_block && last->e_value_size) {
+ size_t offs = le16_to_cpu(last->e_value_offs);
+ if (offs < min_offs)
+ min_offs = offs;
+ }
+ }
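+	/* Free space is the gap between the end of the entry descriptors
+	 * and the lowest value, minus the four zero bytes that terminate
+	 * the entry list. */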
+ free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+ if (!s->not_found) {
+ if (!s->here->e_value_block && s->here->e_value_size) {
+ size_t size = le32_to_cpu(s->here->e_value_size);
+ free += EXT4_XATTR_SIZE(size);
+ }
+ free += EXT4_XATTR_LEN(name_len);
+ }
+ if (i->value) {
+ if (free < EXT4_XATTR_SIZE(i->value_len) ||
+ free < EXT4_XATTR_LEN(name_len) +
+ EXT4_XATTR_SIZE(i->value_len))
+ return -ENOSPC;
+ }
+
+ if (i->value && s->not_found) {
+ /* Insert the new name. */
+ size_t size = EXT4_XATTR_LEN(name_len);
+ size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
+ memmove((void *)s->here + size, s->here, rest);
+ memset(s->here, 0, size);
+ s->here->e_name_index = i->name_index;
+ s->here->e_name_len = name_len;
+ memcpy(s->here->e_name, i->name, name_len);
+ } else {
+ if (!s->here->e_value_block && s->here->e_value_size) {
+ void *first_val = s->base + min_offs;
+ size_t offs = le16_to_cpu(s->here->e_value_offs);
+ void *val = s->base + offs;
+ size_t size = EXT4_XATTR_SIZE(
+ le32_to_cpu(s->here->e_value_size));
+
+ if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
+ /* The old and the new value have the same
+ size. Just replace. */
+ s->here->e_value_size =
+ cpu_to_le32(i->value_len);
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD); /* Clear pad bytes. */
+ memcpy(val, i->value, i->value_len);
+ return 0;
+ }
+
+ /* Remove the old value. */
+ memmove(first_val + size, first_val, val - first_val);
+ memset(first_val, 0, size);
+ s->here->e_value_size = 0;
+ s->here->e_value_offs = 0;
+ min_offs += size;
+
+ /* Adjust all value offsets. */
+ last = s->first;
+ while (!IS_LAST_ENTRY(last)) {
+ size_t o = le16_to_cpu(last->e_value_offs);
+ if (!last->e_value_block &&
+ last->e_value_size && o < offs)
+ last->e_value_offs =
+ cpu_to_le16(o + size);
+ last = EXT4_XATTR_NEXT(last);
+ }
+ }
+ if (!i->value) {
+ /* Remove the old name. */
+ size_t size = EXT4_XATTR_LEN(name_len);
+ last = ENTRY((void *)last - size);
+ memmove(s->here, (void *)s->here + size,
+ (void *)last - (void *)s->here + sizeof(__u32));
+ memset(last, 0, size);
+ }
+ }
+
+ if (i->value) {
+ /* Insert the new value. */
+ s->here->e_value_size = cpu_to_le32(i->value_len);
+ if (i->value_len) {
+ size_t size = EXT4_XATTR_SIZE(i->value_len);
+ void *val = s->base + min_offs - size;
+ s->here->e_value_offs = cpu_to_le16(min_offs - size);
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD); /* Clear the pad bytes. */
+ memcpy(val, i->value, i->value_len);
+ }
+ }
+ return 0;
+}
+
+struct ext4_xattr_block_find {
+ struct ext4_xattr_search s;
+ struct buffer_head *bh;
+};
+
+static int
+ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_block_find *bs)
+{
+ struct super_block *sb = inode->i_sb;
+ int error;
+
+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
+ i->name_index, i->name, i->value, (long)i->value_len);
+
+ if (EXT4_I(inode)->i_file_acl) {
+ /* The inode already has an extended attribute block. */
+ bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ if (!bs->bh)
+ goto cleanup;
+ ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
+ atomic_read(&(bs->bh->b_count)),
+ le32_to_cpu(BHDR(bs->bh)->h_refcount));
+ if (ext4_xattr_check_block(bs->bh)) {
+ ext4_error(sb, __FUNCTION__,
+ "inode %lu: bad block %llu", inode->i_ino,
+ EXT4_I(inode)->i_file_acl);
+ error = -EIO;
+ goto cleanup;
+ }
+ /* Find the named attribute. */
+ bs->s.base = BHDR(bs->bh);
+ bs->s.first = BFIRST(bs->bh);
+ bs->s.end = bs->bh->b_data + bs->bh->b_size;
+ bs->s.here = bs->s.first;
+ error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
+ i->name, bs->bh->b_size, 1);
+ if (error && error != -ENODATA)
+ goto cleanup;
+ bs->s.not_found = error;
+ }
+ error = 0;
+
+cleanup:
+ return error;
+}
+
+static int
+ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_block_find *bs)
+{
+ struct super_block *sb = inode->i_sb;
+ struct buffer_head *new_bh = NULL;
+ struct ext4_xattr_search *s = &bs->s;
+ struct mb_cache_entry *ce = NULL;
+ int error;
+
+#define header(x) ((struct ext4_xattr_header *)(x))
+
+ if (i->value && i->value_len > sb->s_blocksize)
+ return -ENOSPC;
+ if (s->base) {
+ ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+ bs->bh->b_blocknr);
+ if (header(s->base)->h_refcount == cpu_to_le32(1)) {
+ if (ce) {
+ mb_cache_entry_free(ce);
+ ce = NULL;
+ }
+ ea_bdebug(bs->bh, "modifying in-place");
+ error = ext4_journal_get_write_access(handle, bs->bh);
+ if (error)
+ goto cleanup;
+ lock_buffer(bs->bh);
+ error = ext4_xattr_set_entry(i, s);
+ if (!error) {
+ if (!IS_LAST_ENTRY(s->first))
+ ext4_xattr_rehash(header(s->base),
+ s->here);
+ ext4_xattr_cache_insert(bs->bh);
+ }
+ unlock_buffer(bs->bh);
+ if (error == -EIO)
+ goto bad_block;
+ if (!error)
+ error = ext4_journal_dirty_metadata(handle,
+ bs->bh);
+ if (error)
+ goto cleanup;
+ goto inserted;
+ } else {
+ int offset = (char *)s->here - bs->bh->b_data;
+
+ if (ce) {
+ mb_cache_entry_release(ce);
+ ce = NULL;
+ }
+ ea_bdebug(bs->bh, "cloning");
+ s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+ s->end = s->base + bs->bh->b_size;
+ }
+ } else {
+ /* Allocate a buffer where we construct the new block. */
+ s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
+ /* assert(header == s->base) */
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ memset(s->base, 0, sb->s_blocksize);
+ header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+ header(s->base)->h_blocks = cpu_to_le32(1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->first = ENTRY(header(s->base)+1);
+ s->here = ENTRY(header(s->base)+1);
+ s->end = s->base + sb->s_blocksize;
+ }
+
+ error = ext4_xattr_set_entry(i, s);
+ if (error == -EIO)
+ goto bad_block;
+ if (error)
+ goto cleanup;
+ if (!IS_LAST_ENTRY(s->first))
+ ext4_xattr_rehash(header(s->base), s->here);
+
+inserted:
+ if (!IS_LAST_ENTRY(s->first)) {
+ new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
+ if (new_bh) {
+ /* We found an identical block in the cache. */
+ if (new_bh == bs->bh)
+ ea_bdebug(new_bh, "keeping");
+ else {
+ /* The old block is released after updating
+ the inode. */
+ error = -EDQUOT;
+ if (DQUOT_ALLOC_BLOCK(inode, 1))
+ goto cleanup;
+ error = ext4_journal_get_write_access(handle,
+ new_bh);
+ if (error)
+ goto cleanup_dquot;
+ lock_buffer(new_bh);
+ BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
+ le32_to_cpu(BHDR(new_bh)->h_refcount));
+ ea_bdebug(new_bh, "reusing; refcount now=%d",
+ le32_to_cpu(BHDR(new_bh)->h_refcount));
+ unlock_buffer(new_bh);
+ error = ext4_journal_dirty_metadata(handle,
+ new_bh);
+ if (error)
+ goto cleanup_dquot;
+ }
+ mb_cache_entry_release(ce);
+ ce = NULL;
+ } else if (bs->bh && s->base == bs->bh->b_data) {
+ /* We were modifying this block in-place. */
+ ea_bdebug(bs->bh, "keeping this block");
+ new_bh = bs->bh;
+ get_bh(new_bh);
+ } else {
+ /* We need to allocate a new block */
+ ext4_fsblk_t goal = le32_to_cpu(
+ EXT4_SB(sb)->s_es->s_first_data_block) +
+ (ext4_fsblk_t)EXT4_I(inode)->i_block_group *
+ EXT4_BLOCKS_PER_GROUP(sb);
+ ext4_fsblk_t block = ext4_new_block(handle, inode,
+ goal, &error);
+ if (error)
+ goto cleanup;
+ ea_idebug(inode, "creating block %d", block);
+
+ new_bh = sb_getblk(sb, block);
+ if (!new_bh) {
+getblk_failed:
+ ext4_free_blocks(handle, inode, block, 1);
+ error = -EIO;
+ goto cleanup;
+ }
+ lock_buffer(new_bh);
+ error = ext4_journal_get_create_access(handle, new_bh);
+ if (error) {
+ unlock_buffer(new_bh);
+ goto getblk_failed;
+ }
+ memcpy(new_bh->b_data, s->base, new_bh->b_size);
+ set_buffer_uptodate(new_bh);
+ unlock_buffer(new_bh);
+ ext4_xattr_cache_insert(new_bh);
+ error = ext4_journal_dirty_metadata(handle, new_bh);
+ if (error)
+ goto cleanup;
+ }
+ }
+
+ /* Update the inode. */
+ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+
+ /* Drop the previous xattr block. */
+ if (bs->bh && bs->bh != new_bh)
+ ext4_xattr_release_block(handle, inode, bs->bh);
+ error = 0;
+
+cleanup:
+ if (ce)
+ mb_cache_entry_release(ce);
+ brelse(new_bh);
+ if (!(bs->bh && s->base == bs->bh->b_data))
+ kfree(s->base);
+
+ return error;
+
+cleanup_dquot:
+ DQUOT_FREE_BLOCK(inode, 1);
+ goto cleanup;
+
+bad_block:
+ ext4_error(inode->i_sb, __FUNCTION__,
+ "inode %lu: bad block %llu", inode->i_ino,
+ EXT4_I(inode)->i_file_acl);
+ goto cleanup;
+
+#undef header
+}
+
+struct ext4_xattr_ibody_find {
+ struct ext4_xattr_search s;
+ struct ext4_iloc iloc;
+};
+
+static int
+ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_inode *raw_inode;
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+ raw_inode = ext4_raw_inode(&is->iloc);
+ header = IHDR(inode, raw_inode);
+ is->s.base = is->s.first = IFIRST(header);
+ is->s.here = is->s.first;
+ is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ error = ext4_xattr_check_names(IFIRST(header), is->s.end);
+ if (error)
+ return error;
+ /* Find the named attribute. */
+ error = ext4_xattr_find_entry(&is->s.here, i->name_index,
+ i->name, is->s.end -
+ (void *)is->s.base, 0);
+ if (error && error != -ENODATA)
+ return error;
+ is->s.not_found = error;
+ }
+ return 0;
+}
+
+static int
+ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_search *s = &is->s;
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return -ENOSPC;
+ error = ext4_xattr_set_entry(i, s);
+ if (error)
+ return error;
+ header = IHDR(inode, ext4_raw_inode(&is->iloc));
+ if (!IS_LAST_ENTRY(s->first)) {
+ header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+ EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+ } else {
+ header->h_magic = cpu_to_le32(0);
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+ }
+ return 0;
+}
+
+/*
+ * ext4_xattr_set_handle()
+ *
+ * Create, replace or remove an extended attribute for this inode.  Value
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must already exist and must not yet
+ * exist prior to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
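+/*
+ * Internally the attribute is stored in the inode body when it fits there
+ * (ext4_xattr_ibody_set) and spills into the separate attribute block
+ * (ext4_xattr_block_set) on -ENOSPC; when a value moves from one location
+ * to the other, the stale copy in the old location is removed afterwards.
+ */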
+int
+ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+ const char *name, const void *value, size_t value_len,
+ int flags)
+{
+ struct ext4_xattr_info i = {
+ .name_index = name_index,
+ .name = name,
+ .value = value,
+ .value_len = value_len,
+
+ };
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_block_find bs = {
+ .s = { .not_found = -ENODATA, },
+ };
+ int error;
+
+ if (!name)
+ return -EINVAL;
+ if (strlen(name) > 255)
+ return -ERANGE;
+ down_write(&EXT4_I(inode)->xattr_sem);
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ goto cleanup;
+
+ if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+ struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
+ memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+ }
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto cleanup;
+ if (is.s.not_found)
+ error = ext4_xattr_block_find(inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ if (is.s.not_found && bs.s.not_found) {
+ error = -ENODATA;
+ if (flags & XATTR_REPLACE)
+ goto cleanup;
+ error = 0;
+ if (!value)
+ goto cleanup;
+ } else {
+ error = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto cleanup;
+ }
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+ if (!value) {
+ if (!is.s.not_found)
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ else if (!bs.s.not_found)
+ error = ext4_xattr_block_set(handle, inode, &i, &bs);
+ } else {
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+ if (!error && !bs.s.not_found) {
+ i.value = NULL;
+ error = ext4_xattr_block_set(handle, inode, &i, &bs);
+ } else if (error == -ENOSPC) {
+ error = ext4_xattr_block_set(handle, inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ if (!is.s.not_found) {
+ i.value = NULL;
+ error = ext4_xattr_ibody_set(handle, inode, &i,
+ &is);
+ }
+ }
+ }
+ if (!error) {
+ ext4_xattr_update_super_block(handle, inode->i_sb);
+ inode->i_ctime = CURRENT_TIME_SEC;
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+ /*
+ * The bh is consumed by ext4_mark_iloc_dirty, even with
+ * error != 0.
+ */
+ is.iloc.bh = NULL;
+ if (IS_SYNC(inode))
+ handle->h_sync = 1;
+ }
+
+cleanup:
+ brelse(is.iloc.bh);
+ brelse(bs.bh);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ return error;
+}
+
+/*
+ * ext4_xattr_set()
+ *
+ * Like ext4_xattr_set_handle, but start from an inode. This extended
+ * attribute modification is a filesystem transaction by itself.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+ const void *value, size_t value_len, int flags)
+{
+ handle_t *handle;
+ int error, retries = 0;
+
+retry:
+ handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ } else {
+ int error2;
+
+ error = ext4_xattr_set_handle(handle, inode, name_index, name,
+ value, value_len, flags);
+ error2 = ext4_journal_stop(handle);
+ if (error == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ if (error == 0)
+ error = error2;
+ }
+
+ return error;
+}
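+/*
+ * For example, a caller that wants to create an attribute that must not
+ * already exist would use something like
+ *
+ *     ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, "foo", data, len,
+ *                    XATTR_CREATE);
+ *
+ * and would remove it again by passing a NULL value.  (The name "foo" and
+ * the data/len buffer are purely illustrative; the real callers are the
+ * xattr handlers further below.)
+ */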
+
+/*
+ * ext4_xattr_delete_inode()
+ *
+ * Free extended attribute resources associated with this inode. This
+ * is called immediately before an inode is freed. We have exclusive
+ * access to the inode.
+ */
+void
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+ struct buffer_head *bh = NULL;
+
+ if (!EXT4_I(inode)->i_file_acl)
+ goto cleanup;
+ bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+ if (!bh) {
+ ext4_error(inode->i_sb, __FUNCTION__,
+ "inode %lu: block %llu read error", inode->i_ino,
+ EXT4_I(inode)->i_file_acl);
+ goto cleanup;
+ }
+ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+ BHDR(bh)->h_blocks != cpu_to_le32(1)) {
+ ext4_error(inode->i_sb, __FUNCTION__,
+ "inode %lu: bad block %llu", inode->i_ino,
+ EXT4_I(inode)->i_file_acl);
+ goto cleanup;
+ }
+ ext4_xattr_release_block(handle, inode, bh);
+ EXT4_I(inode)->i_file_acl = 0;
+
+cleanup:
+ brelse(bh);
+}
+
+/*
+ * ext4_xattr_put_super()
+ *
+ * This is called when a file system is unmounted.
+ */
+void
+ext4_xattr_put_super(struct super_block *sb)
+{
+ mb_cache_shrink(sb->s_bdev);
+}
+
+/*
+ * ext4_xattr_cache_insert()
+ *
+ * Create a new entry in the extended attribute cache, and insert
+ * it unless such an entry is already in the cache.
+ *
+ * Errors are ignored; on failure the block is simply not added to the cache.
+ */
+static void
+ext4_xattr_cache_insert(struct buffer_head *bh)
+{
+ __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+ struct mb_cache_entry *ce;
+ int error;
+
+ ce = mb_cache_entry_alloc(ext4_xattr_cache);
+ if (!ce) {
+ ea_bdebug(bh, "out of memory");
+ return;
+ }
+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+ if (error) {
+ mb_cache_entry_free(ce);
+ if (error == -EBUSY) {
+ ea_bdebug(bh, "already in cache");
+ error = 0;
+ }
+ } else {
+ ea_bdebug(bh, "inserting [%x]", (int)hash);
+ mb_cache_entry_release(ce);
+ }
+}
+
+/*
+ * ext4_xattr_cmp()
+ *
+ * Compare two extended attribute blocks for equality.
+ *
+ * Returns 0 if the blocks are equal, 1 if they differ, and
+ * a negative error number on errors.
+ */
+static int
+ext4_xattr_cmp(struct ext4_xattr_header *header1,
+ struct ext4_xattr_header *header2)
+{
+ struct ext4_xattr_entry *entry1, *entry2;
+
+ entry1 = ENTRY(header1+1);
+ entry2 = ENTRY(header2+1);
+ while (!IS_LAST_ENTRY(entry1)) {
+ if (IS_LAST_ENTRY(entry2))
+ return 1;
+ if (entry1->e_hash != entry2->e_hash ||
+ entry1->e_name_index != entry2->e_name_index ||
+ entry1->e_name_len != entry2->e_name_len ||
+ entry1->e_value_size != entry2->e_value_size ||
+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
+ return 1;
+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
+ return -EIO;
+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+ (char *)header2 + le16_to_cpu(entry2->e_value_offs),
+ le32_to_cpu(entry1->e_value_size)))
+ return 1;
+
+ entry1 = EXT4_XATTR_NEXT(entry1);
+ entry2 = EXT4_XATTR_NEXT(entry2);
+ }
+ if (!IS_LAST_ENTRY(entry2))
+ return 1;
+ return 0;
+}
+
+/*
+ * ext4_xattr_cache_find()
+ *
+ * Find an identical extended attribute block.
+ *
+ * Returns a pointer to the block found, or NULL if such a block was
+ * not found or an error occurred.
+ */
+static struct buffer_head *
+ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
+ struct mb_cache_entry **pce)
+{
+ __u32 hash = le32_to_cpu(header->h_hash);
+ struct mb_cache_entry *ce;
+
+ if (!header->h_hash)
+ return NULL; /* never share */
+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
+again:
+ ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
+ inode->i_sb->s_bdev, hash);
+ while (ce) {
+ struct buffer_head *bh;
+
+ if (IS_ERR(ce)) {
+ if (PTR_ERR(ce) == -EAGAIN)
+ goto again;
+ break;
+ }
+ bh = sb_bread(inode->i_sb, ce->e_block);
+ if (!bh) {
+ ext4_error(inode->i_sb, __FUNCTION__,
+ "inode %lu: block %lu read error",
+ inode->i_ino, (unsigned long) ce->e_block);
+ } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
+ EXT4_XATTR_REFCOUNT_MAX) {
+ ea_idebug(inode, "block %lu refcount %d>=%d",
+ (unsigned long) ce->e_block,
+ le32_to_cpu(BHDR(bh)->h_refcount),
+ EXT4_XATTR_REFCOUNT_MAX);
+ } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
+ *pce = ce;
+ return bh;
+ }
+ brelse(bh);
+ ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+ }
+ return NULL;
+}
+
+#define NAME_HASH_SHIFT 5
+#define VALUE_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
+ struct ext4_xattr_entry *entry)
+{
+ __u32 hash = 0;
+ char *name = entry->e_name;
+ int n;
+
+ for (n=0; n < entry->e_name_len; n++) {
+ hash = (hash << NAME_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+ *name++;
+ }
+
+ if (entry->e_value_block == 0 && entry->e_value_size != 0) {
+ __le32 *value = (__le32 *)((char *)header +
+ le16_to_cpu(entry->e_value_offs));
+ for (n = (le32_to_cpu(entry->e_value_size) +
+ EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
+ hash = (hash << VALUE_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+ le32_to_cpu(*value++);
+ }
+ }
+ entry->e_hash = cpu_to_le32(hash);
+}
+
+#undef NAME_HASH_SHIFT
+#undef VALUE_HASH_SHIFT
+
+#define BLOCK_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_rehash()
+ *
+ * Re-compute the extended attribute hash value after an entry has changed.
+ */
+static void ext4_xattr_rehash(struct ext4_xattr_header *header,
+ struct ext4_xattr_entry *entry)
+{
+ struct ext4_xattr_entry *here;
+ __u32 hash = 0;
+
+ ext4_xattr_hash_entry(header, entry);
+ here = ENTRY(header+1);
+ while (!IS_LAST_ENTRY(here)) {
+ if (!here->e_hash) {
+ /* Block is not shared if an entry's hash value == 0 */
+ hash = 0;
+ break;
+ }
+ hash = (hash << BLOCK_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
+ le32_to_cpu(here->e_hash);
+ here = EXT4_XATTR_NEXT(here);
+ }
+ header->h_hash = cpu_to_le32(hash);
+}
+
+#undef BLOCK_HASH_SHIFT
+
+int __init
+init_ext4_xattr(void)
+{
+ ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
+ sizeof(struct mb_cache_entry) +
+ sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+ if (!ext4_xattr_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void
+exit_ext4_xattr(void)
+{
+ if (ext4_xattr_cache)
+ mb_cache_destroy(ext4_xattr_cache);
+ ext4_xattr_cache = NULL;
+}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 000000000000..79432b35398f
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,145 @@
+/*
+ File: fs/ext4/xattr.h
+
+ On-disk format of extended attributes for the ext4 filesystem.
+
+ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define EXT4_XATTR_MAGIC 0xEA020000
+
+/* Maximum number of references to one attribute block */
+#define EXT4_XATTR_REFCOUNT_MAX 1024
+
+/* Name indexes */
+#define EXT4_XATTR_INDEX_USER 1
+#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2
+#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3
+#define EXT4_XATTR_INDEX_TRUSTED 4
+#define EXT4_XATTR_INDEX_LUSTRE 5
+#define EXT4_XATTR_INDEX_SECURITY 6
+
+struct ext4_xattr_header {
+ __le32 h_magic; /* magic number for identification */
+ __le32 h_refcount; /* reference count */
+ __le32 h_blocks; /* number of disk blocks used */
+ __le32 h_hash; /* hash value of all attributes */
+ __u32 h_reserved[4]; /* zero right now */
+};
+
+struct ext4_xattr_ibody_header {
+ __le32 h_magic; /* magic number for identification */
+};
+
+struct ext4_xattr_entry {
+ __u8 e_name_len; /* length of name */
+ __u8 e_name_index; /* attribute name index */
+ __le16 e_value_offs; /* offset in disk block of value */
+ __le32 e_value_block; /* disk block attribute is stored on (n/i) */
+ __le32 e_value_size; /* size of attribute value */
+ __le32 e_hash; /* hash value of name and value */
+ char e_name[0]; /* attribute name */
+};
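+/*
+ * On disk, an attribute block consists of an ext4_xattr_header followed by a
+ * list of ext4_xattr_entry descriptors, terminated by an entry whose first
+ * four bytes are zero.  The attribute values live at the end of the same
+ * block and are located through e_value_offs; e_value_block is not
+ * implemented and is expected to be zero.  Attributes stored in the inode
+ * body use the same entry format after an ext4_xattr_ibody_header.
+ */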
+
+#define EXT4_XATTR_PAD_BITS 2
+#define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS)
+#define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1)
+#define EXT4_XATTR_LEN(name_len) \
+ (((name_len) + EXT4_XATTR_ROUND + \
+ sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
+#define EXT4_XATTR_NEXT(entry) \
+ ( (struct ext4_xattr_entry *)( \
+ (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
+#define EXT4_XATTR_SIZE(size) \
+ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
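+/*
+ * For example, with sizeof(struct ext4_xattr_entry) == 16 and 4-byte
+ * padding, a 5-character name needs EXT4_XATTR_LEN(5) == (5 + 3 + 16) & ~3
+ * == 24 bytes of descriptor space, and a 13-byte value occupies
+ * EXT4_XATTR_SIZE(13) == 16 bytes.
+ */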
+
+# ifdef CONFIG_EXT4DEV_FS_XATTR
+
+extern struct xattr_handler ext4_xattr_user_handler;
+extern struct xattr_handler ext4_xattr_trusted_handler;
+extern struct xattr_handler ext4_xattr_acl_access_handler;
+extern struct xattr_handler ext4_xattr_acl_default_handler;
+extern struct xattr_handler ext4_xattr_security_handler;
+
+extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
+
+extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ext4_xattr_list(struct inode *, char *, size_t);
+extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
+extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+
+extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
+extern void ext4_xattr_put_super(struct super_block *);
+
+extern int init_ext4_xattr(void);
+extern void exit_ext4_xattr(void);
+
+extern struct xattr_handler *ext4_xattr_handlers[];
+
+# else /* CONFIG_EXT4DEV_FS_XATTR */
+
+static inline int
+ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+ void *buffer, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int
+ext4_xattr_list(struct inode *inode, void *buffer, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int
+ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int
+ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+ const char *name, const void *value, size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+}
+
+static inline void
+ext4_xattr_put_super(struct super_block *sb)
+{
+}
+
+static inline int
+init_ext4_xattr(void)
+{
+ return 0;
+}
+
+static inline void
+exit_ext4_xattr(void)
+{
+}
+
+#define ext4_xattr_handlers NULL
+
+# endif /* CONFIG_EXT4DEV_FS_XATTR */
+
+#ifdef CONFIG_EXT4DEV_FS_SECURITY
+extern int ext4_init_security(handle_t *handle, struct inode *inode,
+ struct inode *dir);
+#else
+static inline int ext4_init_security(handle_t *handle, struct inode *inode,
+ struct inode *dir)
+{
+ return 0;
+}
+#endif
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 000000000000..b6a6861951f9
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,77 @@
+/*
+ * linux/fs/ext4/xattr_security.c
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/security.h>
+#include "xattr.h"
+
+static size_t
+ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
+
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(list+prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int
+ext4_xattr_security_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
+ buffer, size);
+}
+
+static int
+ext4_xattr_security_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
+ value, size, flags);
+}
+
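+/*
+ * ext4_init_security() is called while a new inode is being created, within
+ * the same transaction, so that the label chosen by the security module
+ * (e.g. SELinux) reaches disk together with the inode it describes.
+ */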
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+ int err;
+ size_t len;
+ void *value;
+ char *name;
+
+ err = security_inode_init_security(inode, dir, &name, &value, &len);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
+ name, value, len, 0);
+ kfree(name);
+ kfree(value);
+ return err;
+}
+
+struct xattr_handler ext4_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = ext4_xattr_security_list,
+ .get = ext4_xattr_security_get,
+ .set = ext4_xattr_security_set,
+};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000000..b76f2dbc82da
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,62 @@
+/*
+ * linux/fs/ext4/xattr_trusted.c
+ * Handler for trusted extended attributes.
+ *
+ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include "xattr.h"
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+static size_t
+ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(list+prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int
+ext4_xattr_trusted_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
+ buffer, size);
+}
+
+static int
+ext4_xattr_trusted_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
+ value, size, flags);
+}
+
+struct xattr_handler ext4_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = ext4_xattr_trusted_list,
+ .get = ext4_xattr_trusted_get,
+ .set = ext4_xattr_trusted_set,
+};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 000000000000..c53cded0761a
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,64 @@
+/*
+ * linux/fs/ext4/xattr_user.c
+ * Handler for extended user attributes.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include "xattr.h"
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t
+ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return 0;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_USER_PREFIX, prefix_len);
+ memcpy(list+prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int
+ext4_xattr_user_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+ return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
+}
+
+static int
+ext4_xattr_user_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (!test_opt(inode->i_sb, XATTR_USER))
+ return -EOPNOTSUPP;
+ return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
+ value, size, flags);
+}
+
+struct xattr_handler ext4_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = ext4_xattr_user_list,
+ .get = ext4_xattr_user_get,
+ .set = ext4_xattr_user_set,
+};
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f4b8f8b3fbdd..8337451e7897 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -13,6 +13,7 @@
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/backing-dev.h>
#include <linux/blkdev.h>
int fat_generic_ioctl(struct inode *inode, struct file *filp,
@@ -118,7 +119,7 @@ static int fat_file_release(struct inode *inode, struct file *filp)
if ((filp->f_mode & FMODE_WRITE) &&
MSDOS_SB(inode->i_sb)->options.flush) {
fat_flush_inodes(inode->i_sb, inode, NULL);
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
}
return 0;
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 045738032a83..78945b53b0f8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -384,7 +384,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
le16_to_cpu(de->cdate)) + secs;
inode->i_ctime.tv_nsec = csecs * 10000000;
inode->i_atime.tv_sec =
- date_dos2unix(le16_to_cpu(0), le16_to_cpu(de->adate));
+ date_dos2unix(0, le16_to_cpu(de->adate));
inode->i_atime.tv_nsec = 0;
} else
inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -1472,7 +1472,7 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
ret = writeback_inode(i1);
if (!ret && i2)
ret = writeback_inode(i2);
- if (!ret && sb) {
+ if (!ret) {
struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
ret = filemap_flush(mapping);
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8605155db171..cfc8f81e60d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -138,6 +138,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
struct fuse_entry_out outarg;
struct fuse_conn *fc;
struct fuse_req *req;
+ struct dentry *parent;
/* Doesn't hurt to "reset" the validity timeout */
fuse_invalidate_entry_cache(entry);
@@ -151,8 +152,10 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
if (IS_ERR(req))
return 0;
- fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
+ parent = dget_parent(entry);
+ fuse_lookup_init(req, parent->d_inode, entry, &outarg);
request_send(fc, req);
+ dput(parent);
err = req->out.h.error;
/* Zero nodeid is same as -ENOENT */
if (!err && !outarg.nodeid)
@@ -163,7 +166,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
fuse_send_forget(fc, req, outarg.nodeid, 1);
return 0;
}
+ spin_lock(&fc->lock);
fi->nlookup ++;
+ spin_unlock(&fc->lock);
}
fuse_put_request(fc, req);
if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
@@ -175,22 +180,6 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
return 1;
}
-/*
- * Check if there's already a hashed alias of this directory inode.
- * If yes, then lookup and mkdir must not create a new alias.
- */
-static int dir_alias(struct inode *inode)
-{
- if (S_ISDIR(inode->i_mode)) {
- struct dentry *alias = d_find_alias(inode);
- if (alias) {
- dput(alias);
- return 1;
- }
- }
- return 0;
-}
-
static int invalid_nodeid(u64 nodeid)
{
return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -206,6 +195,24 @@ static int valid_mode(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
+/*
+ * Add a directory inode to a dentry, ensuring that no other dentry
+ * refers to this inode. Called with fc->inst_mutex held.
+ */
+static int fuse_d_add_directory(struct dentry *entry, struct inode *inode)
+{
+ struct dentry *alias = d_find_alias(inode);
+ if (alias) {
+ /* This tries to shrink the subtree below alias */
+ fuse_invalidate_entry(alias);
+ dput(alias);
+ if (!list_empty(&inode->i_dentry))
+ return -EBUSY;
+ }
+ d_add(entry, inode);
+ return 0;
+}
+
static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
struct nameidata *nd)
{
@@ -241,11 +248,17 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
if (err && err != -ENOENT)
return ERR_PTR(err);
- if (inode && dir_alias(inode)) {
- iput(inode);
- return ERR_PTR(-EIO);
- }
- d_add(entry, inode);
+ if (inode && S_ISDIR(inode->i_mode)) {
+ mutex_lock(&fc->inst_mutex);
+ err = fuse_d_add_directory(entry, inode);
+ mutex_unlock(&fc->inst_mutex);
+ if (err) {
+ iput(inode);
+ return ERR_PTR(err);
+ }
+ } else
+ d_add(entry, inode);
+
entry->d_op = &fuse_dentry_operations;
if (!err)
fuse_change_timeout(entry, &outarg);
@@ -401,12 +414,22 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
}
fuse_put_request(fc, req);
- if (dir_alias(inode)) {
- iput(inode);
- return -EIO;
- }
+ if (S_ISDIR(inode->i_mode)) {
+ struct dentry *alias;
+ mutex_lock(&fc->inst_mutex);
+ alias = d_find_alias(inode);
+ if (alias) {
+ /* New directory must have moved since mkdir */
+ mutex_unlock(&fc->inst_mutex);
+ dput(alias);
+ iput(inode);
+ return -EBUSY;
+ }
+ d_instantiate(entry, inode);
+ mutex_unlock(&fc->inst_mutex);
+ } else
+ d_instantiate(entry, inode);
- d_instantiate(entry, inode);
fuse_change_timeout(entry, &outarg);
fuse_invalidate_attr(dir);
return 0;
@@ -935,14 +958,30 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
}
}
+static void fuse_vmtruncate(struct inode *inode, loff_t offset)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int need_trunc;
+
+ spin_lock(&fc->lock);
+ need_trunc = inode->i_size > offset;
+ i_size_write(inode, offset);
+ spin_unlock(&fc->lock);
+
+ if (need_trunc) {
+ struct address_space *mapping = inode->i_mapping;
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, offset);
+ }
+}
+
/*
* Set attributes, and at the same time refresh them.
*
* Truncation is slightly complicated, because the 'truncate' request
* may fail, in which case we don't want to touch the mapping.
- * vmtruncate() doesn't allow for this case. So do the rlimit
- * checking by hand and call vmtruncate() only after the file has
- * actually been truncated.
+ * vmtruncate() doesn't allow for this case, so do the rlimit checking
+ * and the actual truncation by hand.
*/
static int fuse_setattr(struct dentry *entry, struct iattr *attr)
{
@@ -993,12 +1032,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
make_bad_inode(inode);
err = -EIO;
} else {
- if (is_truncate) {
- loff_t origsize = i_size_read(inode);
- i_size_write(inode, outarg.attr.size);
- if (origsize > outarg.attr.size)
- vmtruncate(inode, outarg.attr.size);
- }
+ if (is_truncate)
+ fuse_vmtruncate(inode, outarg.attr.size);
fuse_change_attributes(inode, &outarg.attr);
fi->i_time = time_to_jiffies(outarg.attr_valid,
outarg.attr_valid_nsec);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 183626868eea..2bb5ace3882d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -481,8 +481,10 @@ static int fuse_commit_write(struct file *file, struct page *page,
err = -EIO;
if (!err) {
pos += count;
- if (pos > i_size_read(inode))
+ spin_lock(&fc->lock);
+ if (pos > inode->i_size)
i_size_write(inode, pos);
+ spin_unlock(&fc->lock);
if (offset == 0 && to == PAGE_CACHE_SIZE) {
clear_page_dirty(page);
@@ -586,8 +588,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
}
fuse_put_request(fc, req);
if (res > 0) {
- if (write && pos > i_size_read(inode))
- i_size_write(inode, pos);
+ if (write) {
+ spin_lock(&fc->lock);
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+ spin_unlock(&fc->lock);
+ }
*ppos = pos;
}
fuse_invalidate_attr(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 69c7750d55b8..91edb8932d90 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -239,6 +239,9 @@ struct fuse_conn {
 /** Lock protecting accesses to members of this structure */
spinlock_t lock;
+ /** Mutex protecting against directory alias creation */
+ struct mutex inst_mutex;
+
/** Refcount */
atomic_t count;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d0a9aee01f2..fc4203570370 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -109,6 +109,7 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
{
+ struct fuse_conn *fc = get_fuse_conn(inode);
if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
invalidate_inode_pages(inode->i_mapping);
@@ -117,7 +118,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
inode->i_nlink = attr->nlink;
inode->i_uid = attr->uid;
inode->i_gid = attr->gid;
+ spin_lock(&fc->lock);
i_size_write(inode, attr->size);
+ spin_unlock(&fc->lock);
inode->i_blocks = attr->blocks;
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
@@ -130,7 +133,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
{
inode->i_mode = attr->mode & S_IFMT;
- i_size_write(inode, attr->size);
+ inode->i_size = attr->size;
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
fuse_init_file_inode(inode);
@@ -169,7 +172,6 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
struct inode *inode;
struct fuse_inode *fi;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- int retried = 0;
retry:
inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
@@ -183,16 +185,16 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
fuse_init_inode(inode, attr);
unlock_new_inode(inode);
} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
- BUG_ON(retried);
/* Inode has changed type, any I/O on the old should fail */
make_bad_inode(inode);
iput(inode);
- retried = 1;
goto retry;
}
fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
fi->nlookup ++;
+ spin_unlock(&fc->lock);
fuse_change_attributes(inode, attr);
return inode;
}
@@ -377,6 +379,7 @@ static struct fuse_conn *new_conn(void)
fc = kzalloc(sizeof(*fc), GFP_KERNEL);
if (fc) {
spin_lock_init(&fc->lock);
+ mutex_init(&fc->inst_mutex);
atomic_set(&fc->count, 1);
init_waitqueue_head(&fc->waitq);
init_waitqueue_head(&fc->blocked_waitq);
@@ -396,8 +399,10 @@ static struct fuse_conn *new_conn(void)
void fuse_conn_put(struct fuse_conn *fc)
{
- if (atomic_dec_and_test(&fc->count))
+ if (atomic_dec_and_test(&fc->count)) {
+ mutex_destroy(&fc->inst_mutex);
kfree(fc);
+ }
}
struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
+config GFS2_FS
+ tristate "GFS2 file system support"
+ depends on EXPERIMENTAL
+ select FS_POSIX_ACL
+ help
+ A cluster filesystem.
+
+ Allows a cluster of computers to simultaneously use a block device
+ that is shared between them (with FC, iSCSI, NBD, etc.). GFS reads
+ and writes to the block device like a local filesystem, but also uses
+ a lock module to allow the computers to coordinate their I/O so
+ filesystem consistency is maintained. One of the nifty features of
+ GFS is perfect consistency -- changes made to the filesystem on one
+ machine show up immediately on all other machines in the cluster.
+
+ To use the GFS2 filesystem, you will need to enable one or more of
+ the locking modules below. Documentation and utilities for GFS2 can
+ be found here: http://sources.redhat.com/cluster
+
+config GFS2_FS_LOCKING_NOLOCK
+ tristate "GFS2 \"nolock\" locking module"
+ depends on GFS2_FS
+ help
+ Single node locking module for GFS2.
+
+ Use this module if you want to use GFS2 on a single node without
+ its clustering features. You can still take advantage of the
+ large file support, and upgrade to running a full cluster later on
+ if required.
+
+ If you will only be using GFS2 in cluster mode, you do not need this
+ module.
+
+config GFS2_FS_LOCKING_DLM
+ tristate "GFS2 DLM locking module"
+ depends on GFS2_FS
+ select DLM
+ help
+ Multiple node locking module for GFS2
+
+ Most users of GFS2 will require this module. It provides the locking
+ interface between GFS2 and the DLM, which is required to use GFS2
+ in a cluster environment.
+
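+# For example, a single-node test setup would typically build:
+#   CONFIG_GFS2_FS=m
+#   CONFIG_GFS2_FS_LOCKING_NOLOCK=m
+# while cluster deployments additionally enable CONFIG_GFS2_FS_LOCKING_DLM.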
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..e3f1ada643ac
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
+obj-$(CONFIG_GFS2_FS) += gfs2.o
+gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+ glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
+ mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
+ ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
+ recovery.o rgrp.o super.o sys.o trans.o util.o
+
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
+
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..5f959b8ce406
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+
+#define ACL_ACCESS 1
+#define ACL_DEFAULT 0
+
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+ struct gfs2_ea_request *er,
+ int *remove, mode_t *mode)
+{
+ struct posix_acl *acl;
+ int error;
+
+ error = gfs2_acl_validate_remove(ip, access);
+ if (error)
+ return error;
+
+ if (!er->er_data)
+ return -EINVAL;
+
+ acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (!acl) {
+ *remove = 1;
+ return 0;
+ }
+
+ error = posix_acl_valid(acl);
+ if (error)
+ goto out;
+
+ if (access) {
+ error = posix_acl_equiv_mode(acl, mode);
+ if (!error)
+ *remove = 1;
+ else if (error > 0)
+ error = 0;
+ }
+
+out:
+ posix_acl_release(acl);
+ return error;
+}
+
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
+{
+ if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
+ return -EOPNOTSUPP;
+ if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
+ return -EPERM;
+ if (S_ISLNK(ip->i_di.di_mode))
+ return -EOPNOTSUPP;
+ if (!access && !S_ISDIR(ip->i_di.di_mode))
+ return -EACCES;
+
+ return 0;
+}
+
+static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
+ struct gfs2_ea_location *el, char **data, unsigned int *len)
+{
+ struct gfs2_ea_request er;
+ struct gfs2_ea_location el_this;
+ int error;
+
+ if (!ip->i_di.di_eattr)
+ return 0;
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ if (access) {
+ er.er_name = GFS2_POSIX_ACL_ACCESS;
+ er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+ } else {
+ er.er_name = GFS2_POSIX_ACL_DEFAULT;
+ er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+ }
+ er.er_type = GFS2_EATYPE_SYS;
+
+ if (!el)
+ el = &el_this;
+
+ error = gfs2_ea_find(ip, &er, el);
+ if (error)
+ return error;
+ if (!el->el_ea)
+ return 0;
+ if (!GFS2_EA_DATA_LEN(el->el_ea))
+ goto out;
+
+ er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
+ er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+ error = -ENOMEM;
+ if (!er.er_data)
+ goto out;
+
+ error = gfs2_ea_get_copy(ip, el, er.er_data);
+ if (error)
+ goto out_kfree;
+
+ if (acl) {
+ *acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+ if (IS_ERR(*acl))
+ error = PTR_ERR(*acl);
+ }
+
+out_kfree:
+ if (error || !data)
+ kfree(er.er_data);
+ else {
+ *data = er.er_data;
+ *len = er.er_data_len;
+ }
+out:
+ if (error || el == &el_this)
+ brelse(el->el_bh);
+ return error;
+}
+
+/**
+ * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
+ * @inode: the file we want to do something to
+ * @mask: what we want to do
+ *
+ * Returns: errno
+ */
+
+int gfs2_check_acl_locked(struct inode *inode, int mask)
+{
+ struct posix_acl *acl = NULL;
+ int error;
+
+ error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+ if (error)
+ return error;
+
+ if (acl) {
+ error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ return error;
+ }
+
+ return -EAGAIN;
+}
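+/*
+ * The -EAGAIN above is the conventional "no ACL present" answer for a VFS
+ * check_acl callback: generic_permission() treats it as a cue to fall back
+ * to the ordinary mode-bit checks instead of reporting an error.
+ */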
+
+int gfs2_check_acl(struct inode *inode, int mask)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ int error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (!error) {
+ error = gfs2_check_acl_locked(inode, mask);
+ gfs2_glock_dq_uninit(&i_gh);
+ }
+
+ return error;
+}
+
+static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_assert_withdraw(sdp,
+ (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
+ ip->i_di.di_mode = mode;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+
+ return 0;
+}
+
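+/*
+ * gfs2_acl_create() below applies the parent directory's default ACL to a
+ * freshly created inode: with no default ACL only the umask is applied to
+ * the mode, a new directory inherits the default ACL as its own default,
+ * and posix_acl_create_masq() then derives the access ACL and the adjusted
+ * mode bits.
+ */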
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct posix_acl *acl = NULL, *clone;
+ struct gfs2_ea_request er;
+ mode_t mode = ip->i_di.di_mode;
+ int error;
+
+ if (!sdp->sd_args.ar_posix_acl)
+ return 0;
+ if (S_ISLNK(ip->i_di.di_mode))
+ return 0;
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ er.er_type = GFS2_EATYPE_SYS;
+
+ error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
+ &er.er_data, &er.er_data_len);
+ if (error)
+ return error;
+ if (!acl) {
+ mode &= ~current->fs->umask;
+ if (mode != ip->i_di.di_mode)
+ error = munge_mode(ip, mode);
+ return error;
+ }
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ error = -ENOMEM;
+ if (!clone)
+ goto out;
+ posix_acl_release(acl);
+ acl = clone;
+
+ if (S_ISDIR(ip->i_di.di_mode)) {
+ er.er_name = GFS2_POSIX_ACL_DEFAULT;
+ er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+ error = gfs2_system_eaops.eo_set(ip, &er);
+ if (error)
+ goto out;
+ }
+
+ error = posix_acl_create_masq(acl, &mode);
+ if (error < 0)
+ goto out;
+ if (error > 0) {
+ er.er_name = GFS2_POSIX_ACL_ACCESS;
+ er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+ posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
+ er.er_mode = mode;
+ er.er_flags = GFS2_ERF_MODE;
+ error = gfs2_system_eaops.eo_set(ip, &er);
+ if (error)
+ goto out;
+ } else
+ munge_mode(ip, mode);
+
+out:
+ posix_acl_release(acl);
+ kfree(er.er_data);
+ return error;
+}
+
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
+{
+ struct posix_acl *acl = NULL, *clone;
+ struct gfs2_ea_location el;
+ char *data;
+ unsigned int len;
+ int error;
+
+ error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+ if (error)
+ return error;
+ if (!acl)
+ return gfs2_setattr_simple(ip, attr);
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ error = -ENOMEM;
+ if (!clone)
+ goto out;
+ posix_acl_release(acl);
+ acl = clone;
+
+ error = posix_acl_chmod_masq(acl, attr->ia_mode);
+ if (!error) {
+ posix_acl_to_xattr(acl, data, len);
+ error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+ }
+
+out:
+ posix_acl_release(acl);
+ brelse(el.el_bh);
+ kfree(data);
+ return error;
+}
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..05c294fe0d78
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __ACL_DOT_H__
+#define __ACL_DOT_H__
+
+#include "incore.h"
+
+#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
+#define GFS2_POSIX_ACL_ACCESS_LEN 16
+#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
+#define GFS2_POSIX_ACL_DEFAULT_LEN 17
+
+#define GFS2_ACL_IS_ACCESS(name, len) \
+ ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
+ !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
+
+#define GFS2_ACL_IS_DEFAULT(name, len) \
+ ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
+ !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
+
+struct gfs2_ea_request;
+
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+ struct gfs2_ea_request *er,
+ int *remove, mode_t *mode);
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
+int gfs2_check_acl_locked(struct inode *inode, int mask);
+int gfs2_check_acl(struct inode *inode, int mask);
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+
+#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..06e9a8cb45e9
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1222 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "dir.h"
+#include "util.h"
+#include "ops_address.h"
+
+/* This doesn't need to be that large, since the maximum number of 64-bit
+ * pointers in a 4k block is 512, so a __u16 index is plenty. Keeping the
+ * structure small also saves stack space.
+ */
+struct metapath {
+ __u16 mp_list[GFS2_MAX_META_HEIGHT];
+};
+
+typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct buffer_head *bh, u64 *top,
+ u64 *bottom, unsigned int height,
+ void *data);
+
+struct strip_mine {
+ int sm_first;
+ unsigned int sm_height;
+};
+
+/**
+ * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @block: the block number that was allocated
+ * @page: any locked page held by the caller process (may be NULL)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
+ u64 block, struct page *page)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
+ struct buffer_head *bh;
+ int release = 0;
+
+ if (!page || page->index) {
+ page = grab_cache_page(inode->i_mapping, 0);
+ if (!page)
+ return -ENOMEM;
+ release = 1;
+ }
+
+ if (!PageUptodate(page)) {
+ void *kaddr = kmap(page);
+
+ memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+ ip->i_di.di_size);
+ memset(kaddr + ip->i_di.di_size, 0,
+ PAGE_CACHE_SIZE - ip->i_di.di_size);
+ kunmap(page);
+
+ SetPageUptodate(page);
+ }
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << inode->i_blkbits,
+ (1 << BH_Uptodate));
+
+ bh = page_buffers(page);
+
+ if (!buffer_mapped(bh))
+ map_bh(bh, inode->i_sb, block);
+
+ set_buffer_uptodate(bh);
+ if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+ gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ mark_buffer_dirty(bh);
+
+ if (release) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
+ * @ip: The GFS2 inode to unstuff
+ * @page: any locked page held by the caller process (may be NULL)
+ *
+ * This routine unstuffs a dinode and returns it to a "normal" state such
+ * that the height can be grown in the traditional way.
+ *
+ * Returns: errno
+ */
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
+{
+ struct buffer_head *bh, *dibh;
+ struct gfs2_dinode *di;
+ u64 block = 0;
+ int isdir = gfs2_is_dir(ip);
+ int error;
+
+ down_write(&ip->i_rw_mutex);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (ip->i_di.di_size) {
+ /* Get a free block, fill it with the stuffed data,
+ and write it out to disk */
+
+ if (isdir) {
+ block = gfs2_alloc_meta(ip);
+
+ error = gfs2_dir_get_new_buffer(ip, block, &bh);
+ if (error)
+ goto out_brelse;
+ gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
+ dibh, sizeof(struct gfs2_dinode));
+ brelse(bh);
+ } else {
+ block = gfs2_alloc_data(ip);
+
+ error = gfs2_unstuffer_page(ip, dibh, block, page);
+ if (error)
+ goto out_brelse;
+ }
+ }
+
+ /* Set up the pointer to the new block */
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ di = (struct gfs2_dinode *)dibh->b_data;
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+ if (ip->i_di.di_size) {
+ *(__be64 *)(di + 1) = cpu_to_be64(block);
+ ip->i_di.di_blocks++;
+ di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+ }
+
+ ip->i_di.di_height = 1;
+ di->di_height = cpu_to_be16(1);
+
+out_brelse:
+ brelse(dibh);
+out:
+ up_write(&ip->i_rw_mutex);
+ return error;
+}
+
+/**
+ * calc_tree_height - Calculate the height of a metadata tree
+ * @ip: The GFS2 inode
+ * @size: The proposed size of the file
+ *
+ * Work out how tall a metadata tree needs to be in order to accommodate a
+ * file of a particular size. If size is less than the current size of
+ * the inode, then the current size of the inode is used instead of the
+ * supplied one.
+ *
+ * Returns: the height the tree should be
+ */
+
+static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 *arr;
+ unsigned int max, height;
+
+ if (ip->i_di.di_size > size)
+ size = ip->i_di.di_size;
+
+ if (gfs2_is_dir(ip)) {
+ arr = sdp->sd_jheightsize;
+ max = sdp->sd_max_jheight;
+ } else {
+ arr = sdp->sd_heightsize;
+ max = sdp->sd_max_height;
+ }
+
+ for (height = 0; height < max; height++)
+ if (arr[height] >= size)
+ break;
+
+ return height;
+}
+
+/**
+ * build_height - Build a metadata tree of the requested height
+ * @ip: The GFS2 inode
+ * @height: The height to build to
+ *
+ *
+ */
+
+static int build_height(struct inode *inode, unsigned height)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ unsigned new_height = height - ip->i_di.di_height;
+ struct buffer_head *dibh;
+ struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
+ struct gfs2_dinode *di;
+ int error;
+ u64 *bp;
+ u64 bn;
+ unsigned n;
+
+ if (height <= ip->i_di.di_height)
+ return 0;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ for(n = 0; n < new_height; n++) {
+ bn = gfs2_alloc_meta(ip);
+ blocks[n] = gfs2_meta_new(ip->i_gl, bn);
+ gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
+ }
+
+ n = 0;
+ bn = blocks[0]->b_blocknr;
+ if (new_height > 1) {
+ for(; n < new_height-1; n++) {
+ gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
+ GFS2_FORMAT_IN);
+ gfs2_buffer_clear_tail(blocks[n],
+ sizeof(struct gfs2_meta_header));
+ bp = (u64 *)(blocks[n]->b_data +
+ sizeof(struct gfs2_meta_header));
+ *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
+ brelse(blocks[n]);
+ blocks[n] = NULL;
+ }
+ }
+ gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+ gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
+ dibh, sizeof(struct gfs2_dinode));
+ brelse(blocks[n]);
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ di = (struct gfs2_dinode *)dibh->b_data;
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+ *(__be64 *)(di + 1) = cpu_to_be64(bn);
+ ip->i_di.di_height += new_height;
+ ip->i_di.di_blocks += new_height;
+ di->di_height = cpu_to_be16(ip->i_di.di_height);
+ di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+ brelse(dibh);
+ return error;
+}
+
+/**
+ * find_metapath - Find path through the metadata tree
+ * @ip: The inode pointer
+ * @mp: The metapath to return the result in
+ * @block: The disk block to look up
+ *
+ * This routine returns a struct metapath structure that defines a path
+ * through the metadata of inode "ip" to get to block "block".
+ *
+ * Example:
+ * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a
+ * filesystem with a blocksize of 4096.
+ *
+ * find_metapath() would return a struct metapath structure with
+ * mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
+ *
+ * That means that in order to get to the block containing the byte at
+ * offset 101342453, we would load the indirect block pointed to by pointer
+ * 0 in the dinode. We would then load the indirect block pointed to by
+ * pointer 48 in that indirect block. We would then load the data block
+ * pointed to by pointer 165 in that indirect block.
+ *
+ * ----------------------------------------
+ * | Dinode | |
+ * | | 4|
+ * | |0 1 2 3 4 5 9|
+ * | | 6|
+ * ----------------------------------------
+ * |
+ * |
+ * V
+ * ----------------------------------------
+ * | Indirect Block |
+ * | 5|
+ * | 4 4 4 4 4 5 5 1|
+ * |0 5 6 7 8 9 0 1 2|
+ * ----------------------------------------
+ * |
+ * |
+ * V
+ * ----------------------------------------
+ * | Indirect Block |
+ * | 1 1 1 1 1 5|
+ * | 6 6 6 6 6 1|
+ * |0 3 4 5 6 7 2|
+ * ----------------------------------------
+ * |
+ * |
+ * V
+ * ----------------------------------------
+ * | Data block containing offset |
+ * | 101342453 |
+ * | |
+ * | |
+ * ----------------------------------------
+ *
+ */
+
+static void find_metapath(struct gfs2_inode *ip, u64 block,
+ struct metapath *mp)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 b = block;
+ unsigned int i;
+
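+	/* do_div() divides b in place and returns the remainder, so each
+	   iteration peels off the pointer index for one level, filling
+	   mp_list[] from the deepest level of the tree upwards. */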
+ for (i = ip->i_di.di_height; i--;)
+ mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
+}
+
+/**
+ * metapointer - Return pointer to the block pointer at the given height
+ * @bh: The buffer containing the pointers for the current height
+ * @boundary: Set to 1 if the returned pointer is the last one in @bh
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ *
+ * Return a pointer to the block number of the next height of the metadata
+ * tree given a buffer containing the pointer to the current height of the
+ * metadata tree.
+ */
+
+static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
+ unsigned int height, const struct metapath *mp)
+{
+ unsigned int head_size = (height > 0) ?
+ sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
+ u64 *ptr;
+ *boundary = 0;
+ ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+ if (ptr + 1 == (u64 *)(bh->b_data + bh->b_size))
+ *boundary = 1;
+ return ptr;
+}
+
+/**
+ * lookup_block - Get the next metadata block in metadata tree
+ * @ip: The GFS2 inode
+ * @bh: Buffer containing the pointers to metadata blocks
+ * @height: The height of the tree (0 = dinode)
+ * @mp: The metapath
+ * @create: Non-zero if we may create a new metadata block
+ * @new: Used to indicate if we did create a new metadata block
+ * @block: the returned disk block number
+ *
+ * Given a metadata tree that is complete down to a particular height, check
+ * whether a block exists at the next height for this path. If it does not,
+ * and @create is set, allocate one and link it in. The block number of the
+ * next height of the metadata tree is returned through @block.
+ *
+ * Returns: 1 if the pointer used was the last one in its buffer (a
+ * "boundary"), 0 otherwise
+ */
+
+static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
+ unsigned int height, struct metapath *mp, int create,
+ int *new, u64 *block)
+{
+ int boundary;
+ u64 *ptr = metapointer(bh, &boundary, height, mp);
+
+ if (*ptr) {
+ *block = be64_to_cpu(*ptr);
+ return boundary;
+ }
+
+ *block = 0;
+
+ if (!create)
+ return 0;
+
+ if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
+ *block = gfs2_alloc_data(ip);
+ else
+ *block = gfs2_alloc_meta(ip);
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ *ptr = cpu_to_be64(*block);
+ ip->i_di.di_blocks++;
+
+ *new = 1;
+ return 0;
+}
+
+/**
+ * gfs2_block_pointers - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @create: Non-zero if blocks may be allocated to satisfy the mapping
+ * @bh_map: The bh to be mapped
+ * @mp: metapath to use
+ *
+ * Find the block number on the current device which corresponds to an
+ * inode's block. If the block had to be created, "new" will be set.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
+ struct buffer_head *bh_map, struct metapath *mp)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct buffer_head *bh;
+ unsigned int bsize;
+ unsigned int height;
+ unsigned int end_of_metadata;
+ unsigned int x;
+ int error = 0;
+ int new = 0;
+ u64 dblock = 0;
+ int boundary;
+ unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+
+ BUG_ON(maxlen == 0);
+
+ if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
+ return 0;
+
+ bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
+
+ height = calc_tree_height(ip, (lblock + 1) * bsize);
+ if (ip->i_di.di_height < height) {
+ if (!create)
+ return 0;
+
+ error = build_height(inode, height);
+ if (error)
+ return error;
+ }
+
+ find_metapath(ip, lblock, mp);
+ end_of_metadata = ip->i_di.di_height - 1;
+
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return error;
+
+ for (x = 0; x < end_of_metadata; x++) {
+ lookup_block(ip, bh, x, mp, create, &new, &dblock);
+ brelse(bh);
+ if (!dblock)
+ return 0;
+
+ error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
+ if (error)
+ return error;
+ }
+
+ boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
+ clear_buffer_mapped(bh_map);
+ clear_buffer_new(bh_map);
+ clear_buffer_boundary(bh_map);
+
+ if (dblock) {
+ map_bh(bh_map, inode->i_sb, dblock);
+ if (boundary)
+ set_buffer_boundary(bh_map);
+ if (new) {
+ struct buffer_head *dibh;
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+ set_buffer_new(bh_map);
+ goto out_brelse;
+ }
+ while(--maxlen && !buffer_boundary(bh_map)) {
+ u64 eblock;
+
+ mp->mp_list[end_of_metadata]++;
+ boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
+ if (eblock != ++dblock)
+ break;
+ bh_map->b_size += (1 << inode->i_blkbits);
+ if (boundary)
+ set_buffer_boundary(bh_map);
+ }
+ }
+out_brelse:
+ brelse(bh);
+ return 0;
+}
+
+static inline void bmap_lock(struct inode *inode, int create)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ if (create)
+ down_write(&ip->i_rw_mutex);
+ else
+ down_read(&ip->i_rw_mutex);
+}
+
+static inline void bmap_unlock(struct inode *inode, int create)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ if (create)
+ up_write(&ip->i_rw_mutex);
+ else
+ up_read(&ip->i_rw_mutex);
+}
+
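+/*
+ * gfs2_block_map() is the locked entry point for mapping a single logical
+ * block: it wraps gfs2_block_pointers() with ip->i_rw_mutex held for read
+ * (lookup only) or for write (when @create allows allocation), as chosen by
+ * bmap_lock()/bmap_unlock() above.
+ */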
+int gfs2_block_map(struct inode *inode, u64 lblock, int create,
+ struct buffer_head *bh)
+{
+ struct metapath mp;
+ int ret;
+
+ bmap_lock(inode, create);
+ ret = gfs2_block_pointers(inode, lblock, create, bh, &mp);
+ bmap_unlock(inode, create);
+ return ret;
+}
+
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
+{
+ struct metapath mp;
+ struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
+ int ret;
+ int create = *new;
+
+ BUG_ON(!extlen);
+ BUG_ON(!dblock);
+ BUG_ON(!new);
+
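+	/* Pre-size the dummy bh so that gfs2_block_pointers() will try to map
+	   a run of blocks rather than a single one; b_size is turned back into
+	   an extent length (in blocks) below. */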
+ bh.b_size = 1 << (inode->i_blkbits + 5);
+ bmap_lock(inode, create);
+ ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp);
+ bmap_unlock(inode, create);
+ *extlen = bh.b_size >> inode->i_blkbits;
+ *dblock = bh.b_blocknr;
+ if (buffer_new(&bh))
+ *new = 1;
+ else
+ *new = 0;
+ return ret;
+}
+
+/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @bc: the call to make for each piece of metadata
+ * @data: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct metapath *mp, unsigned int height,
+ u64 block, int first, block_call_t bc,
+ void *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *bh = NULL;
+ u64 *top, *bottom;
+ u64 bn;
+ int error;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
+ if (!height) {
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return error;
+ dibh = bh;
+
+ top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+ bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+ } else {
+ error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+ if (error)
+ return error;
+
+ top = (u64 *)(bh->b_data + mh_size) +
+ (first ? mp->mp_list[height] : 0);
+
+ bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+ }
+
+ error = bc(ip, dibh, bh, top, bottom, height, data);
+ if (error)
+ goto out;
+
+ if (height < ip->i_di.di_height - 1)
+ for (; top < bottom; top++, first = 0) {
+ if (!*top)
+ continue;
+
+ bn = be64_to_cpu(*top);
+
+ error = recursive_scan(ip, dibh, mp, height + 1, bn,
+ first, bc, data);
+ if (error)
+ break;
+ }
+
+out:
+ brelse(bh);
+ return error;
+}
+
+/**
+ * do_strip - Look for a particular layer of the file and strip it off
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @bh: A buffer of pointers
+ * @top: The first pointer in the buffer
+ * @bottom: One more than the last pointer
+ * @height: the height this buffer is at
+ * @data: a pointer to a struct strip_mine
+ *
+ * Returns: errno
+ */
+
+static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
+ struct buffer_head *bh, u64 *top, u64 *bottom,
+ unsigned int height, void *data)
+{
+ struct strip_mine *sm = data;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrp_list rlist;
+ u64 bn, bstart;
+ u32 blen;
+ u64 *p;
+ unsigned int rg_blocks = 0;
+ int metadata;
+ unsigned int revokes = 0;
+ int x;
+ int error;
+
+ if (!*top)
+ sm->sm_first = 0;
+
+ if (height != sm->sm_height)
+ return 0;
+
+ if (sm->sm_first) {
+ top++;
+ sm->sm_first = 0;
+ }
+
+ metadata = (height != ip->i_di.di_height - 1);
+ if (metadata)
+ revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+
+ error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+ if (error)
+ return error;
+
+ memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+ bstart = 0;
+ blen = 0;
+
+ for (p = top; p < bottom; p++) {
+ if (!*p)
+ continue;
+
+ bn = be64_to_cpu(*p);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+
+ bstart = bn;
+ blen = 1;
+ }
+ }
+
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+ else
+ goto out; /* Nothing to do */
+
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+ for (x = 0; x < rlist.rl_rgrps; x++) {
+ struct gfs2_rgrpd *rgd;
+ rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ rg_blocks += rgd->rd_ri.ri_length;
+ }
+
+ error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+ if (error)
+ goto out_rlist;
+
+ error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
+ RES_INDIRECT + RES_STATFS + RES_QUOTA,
+ revokes);
+ if (error)
+ goto out_rg_gunlock;
+
+ down_write(&ip->i_rw_mutex);
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ bstart = 0;
+ blen = 0;
+
+ for (p = top; p < bottom; p++) {
+ if (!*p)
+ continue;
+
+ bn = be64_to_cpu(*p);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart) {
+ if (metadata)
+ gfs2_free_meta(ip, bstart, blen);
+ else
+ gfs2_free_data(ip, bstart, blen);
+ }
+
+ bstart = bn;
+ blen = 1;
+ }
+
+ *p = 0;
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+ }
+ if (bstart) {
+ if (metadata)
+ gfs2_free_meta(ip, bstart, blen);
+ else
+ gfs2_free_data(ip, bstart, blen);
+ }
+
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+ up_write(&ip->i_rw_mutex);
+
+ gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+ gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+ gfs2_rlist_free(&rlist);
+out:
+ gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+ return error;
+}
+
+/**
+ * do_grow - Make a file look bigger than it is
+ * @ip: the inode
+ * @size: the size to set the file to
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Returns: errno
+ */
+
+static int do_grow(struct gfs2_inode *ip, u64 size)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al;
+ struct buffer_head *dibh;
+ unsigned int h;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+ if (error)
+ goto out_gunlock_q;
+
+ al->al_requested = sdp->sd_max_height + RES_DATA;
+
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(sdp,
+ sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
+ RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+ if (error)
+ goto out_ipres;
+
+ if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+ if (gfs2_is_stuffed(ip)) {
+ error = gfs2_unstuff_dinode(ip, NULL);
+ if (error)
+ goto out_end_trans;
+ }
+
+ h = calc_tree_height(ip, size);
+ if (ip->i_di.di_height < h) {
+ down_write(&ip->i_rw_mutex);
+ error = build_height(&ip->i_inode, h);
+ up_write(&ip->i_rw_mutex);
+ if (error)
+ goto out_end_trans;
+ }
+ }
+
+ ip->i_di.di_size = size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out_end_trans;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_ipres:
+ gfs2_inplace_release(ip);
+out_gunlock_q:
+ gfs2_quota_unlock(ip);
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+
+/**
+ * gfs2_block_truncate_page - Deal with zeroing out data for truncate
+ * @mapping: The address space of the inode being truncated
+ *
+ * This is partly borrowed from ext3.
+ */
+static int gfs2_block_truncate_page(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ loff_t from = inode->i_size;
+ unsigned long index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize, iblock, length, pos;
+ struct buffer_head *bh;
+ struct page *page;
+ void *kaddr;
+ int err;
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return 0;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ /* Find the buffer that contains "offset" */
+ bh = page_buffers(page);
+ pos = blocksize;
+ while (offset >= pos) {
+ bh = bh->b_this_page;
+ iblock++;
+ pos += blocksize;
+ }
+
+ err = 0;
+
+ if (!buffer_mapped(bh)) {
+ gfs2_get_block(inode, iblock, bh, 0);
+ /* unmapped? It's a hole - nothing to do */
+ if (!buffer_mapped(bh))
+ goto unlock;
+ }
+
+ /* Ok, it's mapped. Make sure it's up-to-date */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+
+ if (!buffer_uptodate(bh)) {
+ err = -EIO;
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ /* Uhhuh. Read error. Complain and punt. */
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+
+ if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+ gfs2_trans_add_bh(ip->i_gl, bh, 0);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + offset, 0, length);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
+static int trunc_start(struct gfs2_inode *ip, u64 size)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *dibh;
+ int journaled = gfs2_is_jdata(ip);
+ int error;
+
+ error = gfs2_trans_begin(sdp,
+ RES_DINODE + (journaled ? RES_JDATA : 0), 0);
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (gfs2_is_stuffed(ip)) {
+ ip->i_di.di_size = size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
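+		/* A positive return tells do_shrink() that the truncate is
+		   already complete: a stuffed file needs no separate
+		   deallocation pass. */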
+ error = 1;
+
+ } else {
+ if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
+ error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
+
+ if (!error) {
+ ip->i_di.di_size = size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ }
+ }
+
+ brelse(dibh);
+
+out:
+ gfs2_trans_end(sdp);
+ return error;
+}
+
+static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+{
+ unsigned int height = ip->i_di.di_height;
+ u64 lblock;
+ struct metapath mp;
+ int error;
+
+ if (!size)
+ lblock = 0;
+ else
+ lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
+
+ find_metapath(ip, lblock, &mp);
+ gfs2_alloc_get(ip);
+
+ error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
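+	/* One recursive_scan() pass per level of the tree, starting at
+	   sm_height == di_height - 1 (whose pointers reference data blocks)
+	   and working down to 0, so data blocks are freed first and the
+	   metadata is dismantled from the leaves back towards the dinode. */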
+ while (height--) {
+ struct strip_mine sm;
+ sm.sm_first = !!size;
+ sm.sm_height = height;
+
+ error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
+ if (error)
+ break;
+ }
+
+ gfs2_quota_unhold(ip);
+
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+static int trunc_end(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error)
+ return error;
+
+ down_write(&ip->i_rw_mutex);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (!ip->i_di.di_size) {
+ ip->i_di.di_height = 0;
+ ip->i_di.di_goal_meta =
+ ip->i_di.di_goal_data =
+ ip->i_num.no_addr;
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+ }
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+
+out:
+ up_write(&ip->i_rw_mutex);
+ gfs2_trans_end(sdp);
+ return error;
+}
+
+/**
+ * do_shrink - make a file smaller
+ * @ip: the inode
+ * @size: the size to make the file
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Returns: errno
+ */
+
+static int do_shrink(struct gfs2_inode *ip, u64 size)
+{
+ int error;
+
+ error = trunc_start(ip, size);
+ if (error < 0)
+ return error;
+ if (error > 0)
+ return 0;
+
+ error = trunc_dealloc(ip, size);
+ if (!error)
+ error = trunc_end(ip);
+
+ return error;
+}
+
+/**
+ * gfs2_truncatei - make a file a given size
+ * @ip: the inode
+ * @size: the size to make the file
+ *
+ * The file size can grow, shrink, or stay the same size.
+ *
+ * Returns: errno
+ */
+
+int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+{
+ int error;
+
+ if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
+ return -EINVAL;
+
+ if (size > ip->i_di.di_size)
+ error = do_grow(ip, size);
+ else
+ error = do_shrink(ip, size);
+
+ return error;
+}
+
+int gfs2_truncatei_resume(struct gfs2_inode *ip)
+{
+ int error;
+ error = trunc_dealloc(ip, ip->i_di.di_size);
+ if (!error)
+ error = trunc_end(ip);
+ return error;
+}
+
+int gfs2_file_dealloc(struct gfs2_inode *ip)
+{
+ return trunc_dealloc(ip, 0);
+}
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+ unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ unsigned int tmp;
+
+ if (gfs2_is_dir(ip)) {
+ *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
+ *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
+ } else {
+ *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+ *ind_blocks = 3 * (sdp->sd_max_height - 1);
+ }
+
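+	/* If the data blocks cannot all hang off the dinode's direct
+	   pointers, add a level of indirect blocks (one per sd_inptrs
+	   pointers) per pass until a single level can reference them all. */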
+ for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
+ tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+ *ind_blocks += tmp;
+ }
+}
+
+/**
+ * gfs2_write_alloc_required - figure out if a write will require an allocation
+ * @ip: the file being written to
+ * @offset: the offset to write to
+ * @len: the number of bytes being written
+ * @alloc_required: set to 1 if an alloc is required, 0 otherwise
+ *
+ * Returns: errno
+ */
+
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ unsigned int len, int *alloc_required)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 lblock, lblock_stop, dblock;
+ u32 extlen;
+ int new = 0;
+ int error = 0;
+
+ *alloc_required = 0;
+
+ if (!len)
+ return 0;
+
+ if (gfs2_is_stuffed(ip)) {
+ if (offset + len >
+ sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+ *alloc_required = 1;
+ return 0;
+ }
+
+ if (gfs2_is_dir(ip)) {
+ unsigned int bsize = sdp->sd_jbsize;
+ lblock = offset;
+ do_div(lblock, bsize);
+ lblock_stop = offset + len + bsize - 1;
+ do_div(lblock_stop, bsize);
+ } else {
+ unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+ lblock = offset >> shift;
+ lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+ }
+
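+	/* Walk the affected range one mapped extent at a time; the first
+	   logical block with no backing disk block means the write needs an
+	   allocation. */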
+ for (; lblock < lblock_stop; lblock += extlen) {
+ error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+ if (error)
+ return error;
+
+ if (!dblock) {
+ *alloc_required = 1;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..ac2fd04370dc
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __BMAP_DOT_H__
+#define __BMAP_DOT_H__
+
+struct inode;
+struct gfs2_inode;
+struct page;
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh);
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
+
+int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
+int gfs2_truncatei_resume(struct gfs2_inode *ip);
+int gfs2_file_dealloc(struct gfs2_inode *ip);
+
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+ unsigned int *data_blocks,
+ unsigned int *ind_blocks);
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+ unsigned int len, int *alloc_required);
+
+#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cab1f68d4685
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "daemon.h"
+#include "glock.h"
+#include "log.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+
+/* This uses schedule_timeout() instead of msleep() because it's good for
+ the daemons to wake up more often than the timeout when unmounting so
+ the user's unmount doesn't sit there forever.
+
+ The kthread functions used to start these daemons block and flush signals. */
+
+/**
+ * gfs2_scand - Look for cached glocks and inodes to toss from memory
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * One of these daemons runs, finding candidates to add to sd_reclaim_list.
+ * See gfs2_glockd()
+ */
+
+int gfs2_scand(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+
+ while (!kthread_should_stop()) {
+ gfs2_scand_internal(sdp);
+ t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_glockd - Reclaim unused glock structures
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
+ * Number of daemons can be set by user, with num_glockd mount option.
+ */
+
+int gfs2_glockd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+
+ while (!kthread_should_stop()) {
+ while (atomic_read(&sdp->sd_reclaim_count))
+ gfs2_reclaim_glock(sdp);
+
+ wait_event_interruptible(sdp->sd_reclaim_wq,
+ (atomic_read(&sdp->sd_reclaim_count) ||
+ kthread_should_stop()));
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_recoverd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+
+ while (!kthread_should_stop()) {
+ gfs2_check_journals(sdp);
+ t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ */
+
+int gfs2_logd(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ struct gfs2_holder ji_gh;
+ unsigned long t;
+
+ while (!kthread_should_stop()) {
+ /* Advance the log tail */
+
+ t = sdp->sd_log_flush_time +
+ gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+
+ gfs2_ail1_empty(sdp, DIO_ALL);
+
+ if (time_after_eq(jiffies, t)) {
+ gfs2_log_flush(sdp, NULL);
+ sdp->sd_log_flush_time = jiffies;
+ }
+
+ /* Check for latest journal index */
+
+ t = sdp->sd_jindex_refresh_time +
+ gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
+
+ if (time_after_eq(jiffies, t)) {
+ if (!gfs2_jindex_hold(sdp, &ji_gh))
+ gfs2_glock_dq_uninit(&ji_gh);
+ sdp->sd_jindex_refresh_time = jiffies;
+ }
+
+ t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+ struct gfs2_sbd *sdp = data;
+ unsigned long t;
+ int error;
+
+ while (!kthread_should_stop()) {
+ /* Update the master statfs file */
+
+ t = sdp->sd_statfs_sync_time +
+ gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+
+ if (time_after_eq(jiffies, t)) {
+ error = gfs2_statfs_sync(sdp);
+ if (error &&
+ error != -EROFS &&
+ !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ fs_err(sdp, "quotad: (1) error=%d\n", error);
+ sdp->sd_statfs_sync_time = jiffies;
+ }
+
+ /* Update quota file */
+
+ t = sdp->sd_quota_sync_time +
+ gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
+
+ if (time_after_eq(jiffies, t)) {
+ error = gfs2_quota_sync(sdp);
+ if (error &&
+ error != -EROFS &&
+ !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ fs_err(sdp, "quotad: (2) error=%d\n", error);
+ sdp->sd_quota_sync_time = jiffies;
+ }
+
+ gfs2_quota_scan(sdp);
+
+ t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
+ schedule_timeout_interruptible(t);
+ }
+
+ return 0;
+}
+
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..801007120fb2
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DAEMON_DOT_H__
+#define __DAEMON_DOT_H__
+
+int gfs2_scand(void *data);
+int gfs2_glockd(void *data);
+int gfs2_recoverd(void *data);
+int gfs2_logd(void *data);
+int gfs2_quotad(void *data);
+
+#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..e24af28b1a12
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1957 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+/*
+ * Implements Extendible Hashing as described in:
+ * "Extendible Hashing" by Fagin, et al in
+ * __ACM Trans. on Database Systems__, Sept 1979.
+ *
+ *
+ * Here's the layout of dirents which is essentially the same as that of ext2
+ * within a single block. The field de_name_len is the number of bytes
+ * actually required for the name (no null terminator). The field de_rec_len
+ * is the number of bytes allocated to the dirent. The offset of the next
+ * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
+ * deleted, the preceding dirent inherits its allocated space, ie
+ * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
+ * by adding de_rec_len to the current dirent, this essentially causes the
+ * deleted dirent to get jumped over when iterating through all the dirents.
+ *
+ * When deleting the first dirent in a block, there is no previous dirent so
+ * the field de_inum.no_addr is set to zero to designate it as deleted. When
+ * allocating a dirent, gfs2_dirent_alloc iterates through the dirents in a
+ * block. If the first dirent has (de_inum.no_addr == 0) and de_rec_len is
+ * large enough, this first
+ * dirent is allocated. Otherwise it must go through all the 'used' dirents
+ * searching for one in which the amount of total space minus the amount of
+ * used space will provide enough space for the new dirent.
+ *
+ * There are two types of blocks in which dirents reside. In a stuffed dinode,
+ * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
+ * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
+ * beginning of the leaf block. The dirents reside in leaves when
+ *
+ * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ *
+ * Otherwise, the dirents are "linear", within a single stuffed dinode block.
+ *
+ * When the dirents are in leaves, the actual contents of the directory file are
+ * used as an array of 64-bit block pointers pointing to the leaf blocks. The
+ * dirents are NOT in the directory file itself. There can be more than one
+ * block pointer in the array that points to the same leaf. In fact, when a
+ * directory is first converted from linear to exhash, all of the pointers
+ * point to the same leaf.
+ *
+ * When a leaf is completely full, the size of the hash table can be
+ * doubled, up to the maximum size hard coded into GFS2_DIR_MAX_DEPTH.
+ * Only once the hash table has reached that maximum size are overflowing
+ * leaves chained together in a linked list.
+ */
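+
+/*
+ * For example, if a block holds dirents A, B and C in that order and B is
+ * unlinked, A->de_rec_len grows by B->de_rec_len, so walking the block via
+ * (dirent + de_rec_len) steps straight from A to C and B's space becomes
+ * available for reuse by a later gfs2_dirent_alloc().
+ */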
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/vmalloc.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "bmap.h"
+#include "util.h"
+
+#define IS_LEAF 1 /* Hashed (leaf) directory */
+#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */
+
+#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
+#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+
+typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
+ u64 leaf_no, void *data);
+typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
+ const struct qstr *name, void *opaque);
+
+
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp)
+{
+ struct buffer_head *bh;
+
+ bh = gfs2_meta_new(ip->i_gl, block);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
+ gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+ *bhp = bh;
+ return 0;
+}
+
+static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp)
+{
+ struct buffer_head *bh;
+ int error;
+
+ error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+ if (error)
+ return error;
+ if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
+ brelse(bh);
+ return -EIO;
+ }
+ *bhp = bh;
+ return 0;
+}
+
+static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
+ unsigned int offset, unsigned int size)
+{
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
+ if (ip->i_di.di_size < offset + size)
+ ip->i_di.di_size = offset + size;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+ brelse(dibh);
+
+ return size;
+}
+
+/**
+ * gfs2_dir_write_data - Write directory information to the inode
+ * @ip: The GFS2 inode
+ * @buf: The buffer containing information to be written
+ * @offset: The file offset to start writing at
+ * @size: The amount of data to write
+ *
+ * Returns: The number of bytes correctly written or error code
+ */
+static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
+ u64 offset, unsigned int size)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *dibh;
+ u64 lblock, dblock;
+ u32 extlen = 0;
+ unsigned int o;
+ int copied = 0;
+ int error = 0;
+
+ if (!size)
+ return 0;
+
+ if (gfs2_is_stuffed(ip) &&
+ offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+ return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
+ size);
+
+ if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+ return -EINVAL;
+
+ if (gfs2_is_stuffed(ip)) {
+ error = gfs2_unstuff_dinode(ip, NULL);
+ if (error)
+ return error;
+ }
+
+ lblock = offset;
+ o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
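+	/* Each journaled-data block starts with a gfs2_meta_header, so only
+	   sd_jbsize bytes per block carry directory data; the file offset
+	   therefore maps to logical block lblock at in-block offset o. */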
+
+ while (copied < size) {
+ unsigned int amount;
+ struct buffer_head *bh;
+ int new = 0;
+
+ amount = size - copied;
+ if (amount > sdp->sd_sb.sb_bsize - o)
+ amount = sdp->sd_sb.sb_bsize - o;
+
+ if (!extlen) {
+ new = 1;
+ error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+ &dblock, &extlen);
+ if (error)
+ goto fail;
+ error = -EIO;
+ if (gfs2_assert_withdraw(sdp, dblock))
+ goto fail;
+ }
+
+ if (amount == sdp->sd_jbsize || new)
+ error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
+ else
+ error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
+
+ if (error)
+ goto fail;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ memcpy(bh->b_data + o, buf, amount);
+ brelse(bh);
+
+ buf += amount;
+ copied += amount;
+ lblock++;
+ dblock++;
+ extlen--;
+
+ o = sizeof(struct gfs2_meta_header);
+ }
+
+out:
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ if (ip->i_di.di_size < offset + copied)
+ ip->i_di.di_size = offset + copied;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+
+ return copied;
+fail:
+ if (copied)
+ goto out;
+ return error;
+}
+
+static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
+ u64 offset, unsigned int size)
+{
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ offset += sizeof(struct gfs2_dinode);
+ memcpy(buf, dibh->b_data + offset, size);
+ brelse(dibh);
+ }
+
+ return (error) ? error : size;
+}
+
+
+/**
+ * gfs2_dir_read_data - Read data from a directory inode
+ * @ip: The GFS2 Inode
+ * @buf: The buffer to place result into
+ * @offset: File offset to begin reading from
+ * @size: Amount of data to transfer
+ * @ra: Non-zero to read ahead the rest of the extent being read
+ *
+ * Returns: The amount of data actually copied or the error
+ */
+static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
+ unsigned int size, unsigned ra)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ u64 lblock, dblock;
+ u32 extlen = 0;
+ unsigned int o;
+ int copied = 0;
+ int error = 0;
+
+ if (offset >= ip->i_di.di_size)
+ return 0;
+
+ if (offset + size > ip->i_di.di_size)
+ size = ip->i_di.di_size - offset;
+
+ if (!size)
+ return 0;
+
+ if (gfs2_is_stuffed(ip))
+ return gfs2_dir_read_stuffed(ip, buf, offset, size);
+
+ if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+ return -EINVAL;
+
+ lblock = offset;
+ o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+
+ while (copied < size) {
+ unsigned int amount;
+ struct buffer_head *bh;
+ int new;
+
+ amount = size - copied;
+ if (amount > sdp->sd_sb.sb_bsize - o)
+ amount = sdp->sd_sb.sb_bsize - o;
+
+ if (!extlen) {
+ new = 0;
+ error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+ &dblock, &extlen);
+ if (error || !dblock)
+ goto fail;
+ BUG_ON(extlen < 1);
+ if (!ra)
+ extlen = 1;
+ bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+ } else {
+ error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+ if (error)
+ goto fail;
+ }
+ error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
+ if (error) {
+ brelse(bh);
+ goto fail;
+ }
+ dblock++;
+ extlen--;
+ memcpy(buf, bh->b_data + o, amount);
+ brelse(bh);
+ buf += amount;
+ copied += amount;
+ lblock++;
+ o = sizeof(struct gfs2_meta_header);
+ }
+
+ return copied;
+fail:
+ return (copied) ? copied : error;
+}
+
+static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
+ const struct qstr *name, int ret)
+{
+ if (dent->de_inum.no_addr != 0 &&
+ be32_to_cpu(dent->de_hash) == name->hash &&
+ be16_to_cpu(dent->de_name_len) == name->len &&
+ memcmp(dent+1, name->name, name->len) == 0)
+ return ret;
+ return 0;
+}
+
+static int gfs2_dirent_find(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+{
+ return __gfs2_dirent_find(dent, name, 1);
+}
+
+static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+{
+ return __gfs2_dirent_find(dent, name, 2);
+}
+
+/*
+ * name->name holds ptr to start of block.
+ * name->len holds size of block.
+ */
+static int gfs2_dirent_last(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+{
+ const char *start = name->name;
+ const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
+ if (name->len == (end - start))
+ return 1;
+ return 0;
+}
+
+static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+{
+ unsigned required = GFS2_DIRENT_SIZE(name->len);
+ unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+ unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+ if (!dent->de_inum.no_addr)
+ actual = GFS2_DIRENT_SIZE(0);
+ if (totlen - actual >= required)
+ return 1;
+ return 0;
+}
+
+struct dirent_gather {
+ const struct gfs2_dirent **pdent;
+ unsigned offset;
+};
+
+static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
+ const struct qstr *name,
+ void *opaque)
+{
+ struct dirent_gather *g = opaque;
+ if (dent->de_inum.no_addr) {
+ g->pdent[g->offset++] = dent;
+ }
+ return 0;
+}
+
+/*
+ * Other possible things to check:
+ * - Inode located within filesystem size (and on valid block)
+ * - Valid directory entry type
+ * Not sure how heavy-weight we want to make this... could also check
+ * hash is correct for example, but that would take a lot of extra time.
+ * For now the most important thing is to check that the various sizes
+ * are correct.
+ */
+static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
+ unsigned int size, unsigned int len, int first)
+{
+ const char *msg = "gfs2_dirent too small";
+ if (unlikely(size < sizeof(struct gfs2_dirent)))
+ goto error;
+ msg = "gfs2_dirent misaligned";
+ if (unlikely(offset & 0x7))
+ goto error;
+ msg = "gfs2_dirent points beyond end of block";
+ if (unlikely(offset + size > len))
+ goto error;
+ msg = "zero inode number";
+ if (unlikely(!first && !dent->de_inum.no_addr))
+ goto error;
+ msg = "name length is greater than space in dirent";
+ if (dent->de_inum.no_addr &&
+ unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
+ size))
+ goto error;
+ return 0;
+error:
+ printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
+ first ? "first in block" : "not first in block");
+ return -EIO;
+}
+
+static int gfs2_dirent_offset(const void *buf)
+{
+ const struct gfs2_meta_header *h = buf;
+ int offset;
+
+ BUG_ON(buf == NULL);
+
+ switch(be32_to_cpu(h->mh_type)) {
+ case GFS2_METATYPE_LF:
+ offset = sizeof(struct gfs2_leaf);
+ break;
+ case GFS2_METATYPE_DI:
+ offset = sizeof(struct gfs2_dinode);
+ break;
+ default:
+ goto wrong_type;
+ }
+ return offset;
+wrong_type:
+ printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
+ be32_to_cpu(h->mh_type));
+ return -1;
+}
+
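+/*
+ * Walk every dirent in @buf, handing each one to @scan along with @name and
+ * @opaque. The callback's return value steers the result: 0 keeps scanning
+ * (NULL is returned if the end of the block is reached without a match),
+ * 1 returns the current dirent, 2 returns the previous dirent (or the
+ * current one when there is no previous), and a negative value becomes an
+ * ERR_PTR() returned to the caller.
+ */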
+static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
+ unsigned int len, gfs2_dscan_t scan,
+ const struct qstr *name,
+ void *opaque)
+{
+ struct gfs2_dirent *dent, *prev;
+ unsigned offset;
+ unsigned size;
+ int ret = 0;
+
+ ret = gfs2_dirent_offset(buf);
+ if (ret < 0)
+ goto consist_inode;
+
+ offset = ret;
+ prev = NULL;
+ dent = buf + offset;
+ size = be16_to_cpu(dent->de_rec_len);
+ if (gfs2_check_dirent(dent, offset, size, len, 1))
+ goto consist_inode;
+ do {
+ ret = scan(dent, name, opaque);
+ if (ret)
+ break;
+ offset += size;
+ if (offset == len)
+ break;
+ prev = dent;
+ dent = buf + offset;
+ size = be16_to_cpu(dent->de_rec_len);
+ if (gfs2_check_dirent(dent, offset, size, len, 0))
+ goto consist_inode;
+ } while(1);
+
+ switch(ret) {
+ case 0:
+ return NULL;
+ case 1:
+ return dent;
+ case 2:
+ return prev ? prev : dent;
+ default:
+ BUG_ON(ret > 0);
+ return ERR_PTR(ret);
+ }
+
+consist_inode:
+ gfs2_consist_inode(GFS2_I(inode));
+ return ERR_PTR(-EIO);
+}
+
+
+/**
+ * dirent_first - Return the first dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Pointer to list of dirents
+ *
+ * return first dirent whether bh points to leaf or stuffed dinode
+ *
+ * Returns: IS_LEAF, IS_DINODE, or -errno
+ */
+
+static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
+ struct gfs2_dirent **dent)
+{
+ struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
+
+ if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
+ if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
+ return -EIO;
+ *dent = (struct gfs2_dirent *)(bh->b_data +
+ sizeof(struct gfs2_leaf));
+ return IS_LEAF;
+ } else {
+ if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
+ return -EIO;
+ *dent = (struct gfs2_dirent *)(bh->b_data +
+ sizeof(struct gfs2_dinode));
+ return IS_DINODE;
+ }
+}
+
+static int dirent_check_reclen(struct gfs2_inode *dip,
+ const struct gfs2_dirent *d, const void *end_p)
+{
+ const void *ptr = d;
+ u16 rec_len = be16_to_cpu(d->de_rec_len);
+
+ if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
+ goto broken;
+ ptr += rec_len;
+ if (ptr < end_p)
+ return rec_len;
+ if (ptr == end_p)
+ return -ENOENT;
+broken:
+ gfs2_consist_inode(dip);
+ return -EIO;
+}
+
+/**
+ * dirent_next - Next dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Pointer to list of dirents
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
+ struct gfs2_dirent **dent)
+{
+ struct gfs2_dirent *cur = *dent, *tmp;
+ char *bh_end = bh->b_data + bh->b_size;
+ int ret;
+
+ ret = dirent_check_reclen(dip, cur, bh_end);
+ if (ret < 0)
+ return ret;
+
+ tmp = (void *)cur + ret;
+ ret = dirent_check_reclen(dip, tmp, bh_end);
+ if (ret == -EIO)
+ return ret;
+
+ /* Only the first dent could ever have de_inum.no_addr == 0 */
+ if (!tmp->de_inum.no_addr) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ *dent = tmp;
+ return 0;
+}
+
+/**
+ * dirent_del - Delete a dirent
+ * @dip: The GFS2 inode
+ * @bh: The buffer
+ * @prev: The previous dirent
+ * @cur: The current dirent
+ *
+ */
+
+static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
+ struct gfs2_dirent *prev, struct gfs2_dirent *cur)
+{
+ u16 cur_rec_len, prev_rec_len;
+
+ if (!cur->de_inum.no_addr) {
+ gfs2_consist_inode(dip);
+ return;
+ }
+
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+
+ /* If there is no prev entry, this is the first entry in the block.
+ The de_rec_len is already as big as it needs to be. Just zero
+ out the inode number and return. */
+
+ if (!prev) {
+ cur->de_inum.no_addr = 0; /* No endianness worries */
+ return;
+ }
+
+ /* Combine this dentry with the previous one. */
+
+ prev_rec_len = be16_to_cpu(prev->de_rec_len);
+ cur_rec_len = be16_to_cpu(cur->de_rec_len);
+
+ if ((char *)prev + prev_rec_len != (char *)cur)
+ gfs2_consist_inode(dip);
+ if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
+ gfs2_consist_inode(dip);
+
+ prev_rec_len += cur_rec_len;
+ prev->de_rec_len = cpu_to_be16(prev_rec_len);
+}
+
+/*
+ * Takes a dent from which to grab space as an argument. Returns the
+ * newly created dent.
+ */
+static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
+ struct gfs2_dirent *dent,
+ const struct qstr *name,
+ struct buffer_head *bh)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_dirent *ndent;
+ unsigned offset = 0, totlen;
+
+ if (dent->de_inum.no_addr)
+ offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+ totlen = be16_to_cpu(dent->de_rec_len);
+ BUG_ON(offset + name->len > totlen);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ ndent = (struct gfs2_dirent *)((char *)dent + offset);
+ dent->de_rec_len = cpu_to_be16(offset);
+ gfs2_qstr2dirent(name, totlen - offset, ndent);
+ return ndent;
+}
+
+static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
+ struct buffer_head *bh,
+ const struct qstr *name)
+{
+ struct gfs2_dirent *dent;
+ dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+ gfs2_dirent_find_space, name, NULL);
+ if (!dent || IS_ERR(dent))
+ return dent;
+ return gfs2_init_dirent(inode, dent, name, bh);
+}
+
+static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
+ struct buffer_head **bhp)
+{
+ int error;
+
+ error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+ if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
+ /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
+ error = -EIO;
+ }
+
+ return error;
+}
+
+/**
+ * get_leaf_nr - Get a leaf number associated with the index
+ * @dip: The GFS2 inode
+ * @index: Index into the directory's hash table
+ * @leaf_out: Returns the block number of the leaf at @index
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
+ u64 *leaf_out)
+{
+ u64 leaf_no;
+ int error;
+
+ error = gfs2_dir_read_data(dip, (char *)&leaf_no,
+ index * sizeof(u64),
+ sizeof(u64), 0);
+ if (error != sizeof(u64))
+ return (error < 0) ? error : -EIO;
+
+ *leaf_out = be64_to_cpu(leaf_no);
+
+ return 0;
+}
+
+static int get_first_leaf(struct gfs2_inode *dip, u32 index,
+ struct buffer_head **bh_out)
+{
+ u64 leaf_no;
+ int error;
+
+ error = get_leaf_nr(dip, index, &leaf_no);
+ if (!error)
+ error = get_leaf(dip, leaf_no, bh_out);
+
+ return error;
+}
+
+static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
+ const struct qstr *name,
+ gfs2_dscan_t scan,
+ struct buffer_head **pbh)
+{
+ struct buffer_head *bh;
+ struct gfs2_dirent *dent;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ int error;
+
+ if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ struct gfs2_leaf *leaf;
+ unsigned hsize = 1 << ip->i_di.di_depth;
+ unsigned index;
+ u64 ln;
+ if (hsize * sizeof(u64) != ip->i_di.di_size) {
+ gfs2_consist_inode(ip);
+ return ERR_PTR(-EIO);
+ }
+
+ index = name->hash >> (32 - ip->i_di.di_depth);
+ error = get_first_leaf(ip, index, &bh);
+ if (error)
+ return ERR_PTR(error);
+ do {
+ dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+ scan, name, NULL);
+ if (dent)
+ goto got_dent;
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ ln = be64_to_cpu(leaf->lf_next);
+ brelse(bh);
+ if (!ln)
+ break;
+
+ error = get_leaf(ip, ln, &bh);
+ } while(!error);
+
+ return error ? ERR_PTR(error) : NULL;
+ }
+
+
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return ERR_PTR(error);
+ dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
+got_dent:
+ if (unlikely(dent == NULL || IS_ERR(dent))) {
+ brelse(bh);
+ bh = NULL;
+ }
+ *pbh = bh;
+ return dent;
+}
+
+static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ u64 bn = gfs2_alloc_meta(ip);
+ struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
+ struct gfs2_leaf *leaf;
+ struct gfs2_dirent *dent;
+ struct qstr name = { .name = "", .len = 0, .hash = 0 };
+ if (!bh)
+ return NULL;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ leaf->lf_depth = cpu_to_be16(depth);
+ leaf->lf_entries = 0;
+ leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+ leaf->lf_next = 0;
+ memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
+ dent = (struct gfs2_dirent *)(leaf+1);
+ gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
+ *pbh = bh;
+ return leaf;
+}
+
+/**
+ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
+ * @inode: The directory inode to be converted
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dir_make_exhash(struct inode *inode)
+{
+ struct gfs2_inode *dip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_dirent *dent;
+ struct qstr args;
+ struct buffer_head *bh, *dibh;
+ struct gfs2_leaf *leaf;
+ int y;
+ u32 x;
+ u64 *lp, bn;
+ int error;
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (error)
+ return error;
+
+ /* Turn over a new leaf */
+
+ leaf = new_leaf(inode, &bh, 0);
+ if (!leaf)
+ return -ENOSPC;
+ bn = bh->b_blocknr;
+
+ gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
+ leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+
+ /* Copy dirents */
+
+ gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
+ sizeof(struct gfs2_dinode));
+
+ /* Find last entry */
+
+ x = 0;
+ args.len = bh->b_size - sizeof(struct gfs2_dinode) +
+ sizeof(struct gfs2_leaf);
+ args.name = bh->b_data;
+ dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
+ gfs2_dirent_last, &args, NULL);
+ if (!dent) {
+ brelse(bh);
+ brelse(dibh);
+ return -EIO;
+ }
+ if (IS_ERR(dent)) {
+ brelse(bh);
+ brelse(dibh);
+ return PTR_ERR(dent);
+ }
+
+ /* Adjust the last dirent's record length
+ (Remember that dent still points to the last entry.) */
+
+ dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
+ sizeof(struct gfs2_dinode) -
+ sizeof(struct gfs2_leaf));
+
+ brelse(bh);
+
+ /* We're done with the new leaf block, now setup the new
+ hash table. */
+
+ gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
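+	/* Initially every slot of the new hash table points at the single
+	   leaf created above; dir_split_leaf() makes the pointers diverge as
+	   the directory grows. */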
+ lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
+
+ for (x = sdp->sd_hash_ptrs; x--; lp++)
+ *lp = cpu_to_be64(bn);
+
+ dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+ dip->i_di.di_blocks++;
+ dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+ dip->i_di.di_payload_format = 0;
+
+ for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
+ dip->i_di.di_depth = y;
+
+ gfs2_dinode_out(&dip->i_di, dibh->b_data);
+
+ brelse(dibh);
+
+ return 0;
+}
+
+/**
+ * dir_split_leaf - Split a leaf block into two
+ * @inode: The directory inode to be split
+ * @name: The name of the entry being inserted (used to pick the leaf)
+ *
+ * Returns: 0 on success, 1 if the leaf cannot be split any further,
+ * or error code on failure
+ */
+
+static int dir_split_leaf(struct inode *inode, const struct qstr *name)
+{
+ struct gfs2_inode *dip = GFS2_I(inode);
+ struct buffer_head *nbh, *obh, *dibh;
+ struct gfs2_leaf *nleaf, *oleaf;
+ struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
+ u32 start, len, half_len, divider;
+ u64 bn, *lp, leaf_no;
+ u32 index;
+ int x, moved = 0;
+ int error;
+
+ index = name->hash >> (32 - dip->i_di.di_depth);
+ error = get_leaf_nr(dip, index, &leaf_no);
+ if (error)
+ return error;
+
+ /* Get the old leaf block */
+ error = get_leaf(dip, leaf_no, &obh);
+ if (error)
+ return error;
+
+ oleaf = (struct gfs2_leaf *)obh->b_data;
+ if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
+ brelse(obh);
+ return 1; /* can't split */
+ }
+
+ gfs2_trans_add_bh(dip->i_gl, obh, 1);
+
+ nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
+ if (!nleaf) {
+ brelse(obh);
+ return -ENOSPC;
+ }
+ bn = nbh->b_blocknr;
+
+ /* Compute the start and len of leaf pointers in the hash table. */
+ len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+ half_len = len >> 1;
+ if (!half_len) {
+ printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
+ gfs2_consist_inode(dip);
+ error = -EIO;
+ goto fail_brelse;
+ }
+
+ start = (index & ~(len - 1));
+
+ /* Change the pointers.
+ Don't bother distinguishing stuffed from non-stuffed.
+ This code is complicated enough already. */
+ lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
+ /* Change the pointers */
+ for (x = 0; x < half_len; x++)
+ lp[x] = cpu_to_be64(bn);
+
+ error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
+ half_len * sizeof(u64));
+ if (error != half_len * sizeof(u64)) {
+ if (error >= 0)
+ error = -EIO;
+ goto fail_lpfree;
+ }
+
+ kfree(lp);
+
+ /* Compute the divider */
+ divider = (start + half_len) << (32 - dip->i_di.di_depth);
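+	/* Hash values below the divider belong to the new leaf (whose hash
+	   table slots were just rewritten above); entries at or above it stay
+	   in the old leaf. */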
+
+ /* Copy the entries */
+ dirent_first(dip, obh, &dent);
+
+ do {
+ next = dent;
+ if (dirent_next(dip, obh, &next))
+ next = NULL;
+
+ if (dent->de_inum.no_addr &&
+ be32_to_cpu(dent->de_hash) < divider) {
+ struct qstr str;
+ str.name = (char*)(dent+1);
+ str.len = be16_to_cpu(dent->de_name_len);
+ str.hash = be32_to_cpu(dent->de_hash);
+ new = gfs2_dirent_alloc(inode, nbh, &str);
+ if (IS_ERR(new)) {
+ error = PTR_ERR(new);
+ break;
+ }
+
+ new->de_inum = dent->de_inum; /* No endian worries */
+ new->de_type = dent->de_type; /* No endian worries */
+ nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
+
+ dirent_del(dip, obh, prev, dent);
+
+ if (!oleaf->lf_entries)
+ gfs2_consist_inode(dip);
+ oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
+
+ if (!prev)
+ prev = dent;
+
+ moved = 1;
+ } else {
+ prev = dent;
+ }
+ dent = next;
+ } while (dent);
+
+ oleaf->lf_depth = nleaf->lf_depth;
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
+ dip->i_di.di_blocks++;
+ gfs2_dinode_out(&dip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ brelse(obh);
+ brelse(nbh);
+
+ return error;
+
+fail_lpfree:
+ kfree(lp);
+
+fail_brelse:
+ brelse(obh);
+ brelse(nbh);
+ return error;
+}
+
+/**
+ * dir_double_exhash - Double size of ExHash table
+ * @dip: The GFS2 dinode
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_double_exhash(struct gfs2_inode *dip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct buffer_head *dibh;
+ u32 hsize;
+ u64 *buf;
+ u64 *from, *to;
+ u64 block;
+ int x;
+ int error = 0;
+
+ hsize = 1 << dip->i_di.di_depth;
+ if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ /* Allocate both the "from" and "to" buffers in one big chunk */
+
+ buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+
+ for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+ error = gfs2_dir_read_data(dip, (char *)buf,
+ block * sdp->sd_hash_bsize,
+ sdp->sd_hash_bsize, 1);
+ if (error != sdp->sd_hash_bsize) {
+ if (error >= 0)
+ error = -EIO;
+ goto fail;
+ }
+
+ from = buf;
+ to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
+
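+		/* Duplicate each pointer so the hash table doubles in size */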
+ for (x = sdp->sd_hash_ptrs; x--; from++) {
+			*to++ = *from;	/* No endianness worries */
+ *to++ = *from;
+ }
+
+ error = gfs2_dir_write_data(dip,
+ (char *)buf + sdp->sd_hash_bsize,
+ block * sdp->sd_sb.sb_bsize,
+ sdp->sd_sb.sb_bsize);
+ if (error != sdp->sd_sb.sb_bsize) {
+ if (error >= 0)
+ error = -EIO;
+ goto fail;
+ }
+ }
+
+ kfree(buf);
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (!gfs2_assert_withdraw(sdp, !error)) {
+ dip->i_di.di_depth++;
+ gfs2_dinode_out(&dip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ return error;
+
+fail:
+ kfree(buf);
+ return error;
+}
+
+/**
+ * compare_dents - compare directory entries by hash value
+ * @a: first dent
+ * @b: second dent
+ *
+ * When comparing the hash entries of @a to @b:
+ * gt: returns 1
+ * lt: returns -1
+ * eq: falls back to comparing name lengths, then the names themselves
+ */
+
+static int compare_dents(const void *a, const void *b)
+{
+ const struct gfs2_dirent *dent_a, *dent_b;
+ u32 hash_a, hash_b;
+ int ret = 0;
+
+ dent_a = *(const struct gfs2_dirent **)a;
+ hash_a = be32_to_cpu(dent_a->de_hash);
+
+ dent_b = *(const struct gfs2_dirent **)b;
+ hash_b = be32_to_cpu(dent_b->de_hash);
+
+ if (hash_a > hash_b)
+ ret = 1;
+ else if (hash_a < hash_b)
+ ret = -1;
+ else {
+ unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
+ unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
+
+ if (len_a > len_b)
+ ret = 1;
+ else if (len_a < len_b)
+ ret = -1;
+ else
+ ret = memcmp(dent_a + 1, dent_b + 1, len_a);
+ }
+
+ return ret;
+}
+
+/**
+ * do_filldir_main - read out directory entries
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ * @darr: an array of struct gfs2_dirent pointers to read
+ * @entries: the number of entries in darr
+ * @copied: pointer to int that's non-zero if an entry has been copied out
+ *
+ * Jump through some hoops to make sure that if there are hash collisions,
+ * they are read out at the beginning of a buffer. We want to minimize
+ * the possibility that they will fall into different readdir buffers or
+ * that someone will want to seek to that location.
+ *
+ * Returns: errno, >0 on exception from filldir
+ */
+
+static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
+ void *opaque, gfs2_filldir_t filldir,
+ const struct gfs2_dirent **darr, u32 entries,
+ int *copied)
+{
+ const struct gfs2_dirent *dent, *dent_next;
+ struct gfs2_inum inum;
+ u64 off, off_next;
+ unsigned int x, y;
+ int run = 0;
+ int error = 0;
+
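+	/* Sort by hash (then name) so colliding entries end up adjacent */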
+ sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+
+ dent_next = darr[0];
+ off_next = be32_to_cpu(dent_next->de_hash);
+ off_next = gfs2_disk_hash2offset(off_next);
+
+ for (x = 0, y = 1; x < entries; x++, y++) {
+ dent = dent_next;
+ off = off_next;
+
+ if (y < entries) {
+ dent_next = darr[y];
+ off_next = be32_to_cpu(dent_next->de_hash);
+ off_next = gfs2_disk_hash2offset(off_next);
+
+ if (off < *offset)
+ continue;
+ *offset = off;
+
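+			/* Entries with identical hashes form a run; never split a run across filldir calls */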
+ if (off_next == off) {
+ if (*copied && !run)
+ return 1;
+ run = 1;
+ } else
+ run = 0;
+ } else {
+ if (off < *offset)
+ continue;
+ *offset = off;
+ }
+
+ gfs2_inum_in(&inum, (char *)&dent->de_inum);
+
+ error = filldir(opaque, (const char *)(dent + 1),
+ be16_to_cpu(dent->de_name_len),
+ off, &inum,
+ be16_to_cpu(dent->de_type));
+ if (error)
+ return 1;
+
+ *copied = 1;
+ }
+
+	/* Increment *offset by one, so the next time we come into the
+	   do_filldir function, we get the next entry instead of the last one
+	   in the current leaf */
+
+ (*offset)++;
+
+ return 0;
+}
+
+static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
+ gfs2_filldir_t filldir, int *copied,
+ unsigned *depth, u64 leaf_no)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct buffer_head *bh;
+ struct gfs2_leaf *lf;
+ unsigned entries = 0;
+ unsigned leaves = 0;
+ const struct gfs2_dirent **darr, *dent;
+ struct dirent_gather g;
+ struct buffer_head **larr;
+ int leaf = 0;
+ int error, i;
+ u64 lfn = leaf_no;
+
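+	/* First pass: walk the chain of leaves to count leaves and entries */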
+ do {
+ error = get_leaf(ip, lfn, &bh);
+ if (error)
+ goto out;
+ lf = (struct gfs2_leaf *)bh->b_data;
+ if (leaves == 0)
+ *depth = be16_to_cpu(lf->lf_depth);
+ entries += be16_to_cpu(lf->lf_entries);
+ leaves++;
+ lfn = be64_to_cpu(lf->lf_next);
+ brelse(bh);
+ } while(lfn);
+
+ if (!entries)
+ return 0;
+
+ error = -ENOMEM;
+ larr = vmalloc((leaves + entries) * sizeof(void *));
+ if (!larr)
+ goto out;
+ darr = (const struct gfs2_dirent **)(larr + leaves);
+ g.pdent = darr;
+ g.offset = 0;
+ lfn = leaf_no;
+
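+	/* Second pass: gather pointers to every entry, holding the leaf buffers */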
+ do {
+ error = get_leaf(ip, lfn, &bh);
+ if (error)
+ goto out_kfree;
+ lf = (struct gfs2_leaf *)bh->b_data;
+ lfn = be64_to_cpu(lf->lf_next);
+ if (lf->lf_entries) {
+ dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+ gfs2_dirent_gather, NULL, &g);
+ error = PTR_ERR(dent);
+ if (IS_ERR(dent)) {
+ goto out_kfree;
+ }
+ error = 0;
+ larr[leaf++] = bh;
+ } else {
+ brelse(bh);
+ }
+ } while(lfn);
+
+ error = do_filldir_main(ip, offset, opaque, filldir, darr,
+ entries, copied);
+out_kfree:
+ for(i = 0; i < leaf; i++)
+ brelse(larr[i]);
+ vfree(larr);
+out:
+ return error;
+}
+
+/**
+ * dir_e_read - Reads the entries from a directory into a filldir buffer
+ * @inode: the directory inode
+ * @offset: the hash of the last entry read shifted to the right once
+ * @opaque: opaque data passed through to the filldir function
+ * @filldir: points to the filldir function to use
+ *
+ * Returns: errno
+ */
+
+static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
+ gfs2_filldir_t filldir)
+{
+ struct gfs2_inode *dip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ u32 hsize, len = 0;
+ u32 ht_offset, lp_offset, ht_offset_cur = -1;
+ u32 hash, index;
+ u64 *lp;
+ int copied = 0;
+ int error = 0;
+ unsigned depth = 0;
+
+ hsize = 1 << dip->i_di.di_depth;
+ if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ hash = gfs2_dir_offset2hash(*offset);
+ index = hash >> (32 - dip->i_di.di_depth);
+
+ lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+ if (!lp)
+ return -ENOMEM;
+
+ while (index < hsize) {
+ lp_offset = index & (sdp->sd_hash_ptrs - 1);
+ ht_offset = index - lp_offset;
+
+ if (ht_offset_cur != ht_offset) {
+ error = gfs2_dir_read_data(dip, (char *)lp,
+ ht_offset * sizeof(u64),
+ sdp->sd_hash_bsize, 1);
+ if (error != sdp->sd_hash_bsize) {
+ if (error >= 0)
+ error = -EIO;
+ goto out;
+ }
+ ht_offset_cur = ht_offset;
+ }
+
+ error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+ &copied, &depth,
+ be64_to_cpu(lp[lp_offset]));
+ if (error)
+ break;
+
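+		/* Skip all the hash table slots that point at the leaf chain just read */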
+ len = 1 << (dip->i_di.di_depth - depth);
+ index = (index & ~(len - 1)) + len;
+ }
+
+out:
+ kfree(lp);
+ if (error > 0)
+ error = 0;
+ return error;
+}
+
+int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+ gfs2_filldir_t filldir)
+{
+ struct gfs2_inode *dip = GFS2_I(inode);
+ struct dirent_gather g;
+ const struct gfs2_dirent **darr, *dent;
+ struct buffer_head *dibh;
+ int copied = 0;
+ int error;
+
+ if (!dip->i_di.di_entries)
+ return 0;
+
+ if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+ return dir_e_read(inode, offset, opaque, filldir);
+
+ if (!gfs2_is_stuffed(dip)) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (error)
+ return error;
+
+ error = -ENOMEM;
+ darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
+ GFP_KERNEL);
+ if (darr) {
+ g.pdent = darr;
+ g.offset = 0;
+ dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
+ gfs2_dirent_gather, NULL, &g);
+ if (IS_ERR(dent)) {
+ error = PTR_ERR(dent);
+ goto out;
+ }
+ error = do_filldir_main(dip, offset, opaque, filldir, darr,
+ dip->i_di.di_entries, &copied);
+out:
+ kfree(darr);
+ }
+
+ if (error > 0)
+ error = 0;
+
+ brelse(dibh);
+
+ return error;
+}
+
+/**
+ * gfs2_dir_search - Search a directory
+ * @dir: The directory inode to search
+ * @name: The name to look up
+ * @inum: If found, filled in with the entry's inode number (may be NULL)
+ * @type: If found, filled in with the entry's type (may be NULL)
+ *
+ * This routine searches a directory for a file or another directory.
+ * Assumes a glock is held on @dir.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_search(struct inode *dir, const struct qstr *name,
+ struct gfs2_inum *inum, unsigned int *type)
+{
+ struct buffer_head *bh;
+ struct gfs2_dirent *dent;
+
+ dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+ if (dent) {
+ if (IS_ERR(dent))
+ return PTR_ERR(dent);
+ if (inum)
+ gfs2_inum_in(inum, (char *)&dent->de_inum);
+ if (type)
+ *type = be16_to_cpu(dent->de_type);
+ brelse(bh);
+ return 0;
+ }
+ return -ENOENT;
+}
+
+static int dir_new_leaf(struct inode *inode, const struct qstr *name)
+{
+ struct buffer_head *bh, *obh;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_leaf *leaf, *oleaf;
+ int error;
+ u32 index;
+ u64 bn;
+
+ index = name->hash >> (32 - ip->i_di.di_depth);
+ error = get_first_leaf(ip, index, &obh);
+ if (error)
+ return error;
+ do {
+ oleaf = (struct gfs2_leaf *)obh->b_data;
+ bn = be64_to_cpu(oleaf->lf_next);
+ if (!bn)
+ break;
+ brelse(obh);
+ error = get_leaf(ip, bn, &obh);
+ if (error)
+ return error;
+ } while(1);
+
+ gfs2_trans_add_bh(ip->i_gl, obh, 1);
+
+ leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
+ if (!leaf) {
+ brelse(obh);
+ return -ENOSPC;
+ }
+ oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
+ brelse(bh);
+ brelse(obh);
+
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ return error;
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ ip->i_di.di_blocks++;
+ gfs2_dinode_out(&ip->i_di, bh->b_data);
+ brelse(bh);
+ return 0;
+}
+
+/**
+ * gfs2_dir_add - Add new filename into directory
+ * @inode: The directory inode
+ * @name: The new name
+ * @inum: The inode number of the entry
+ * @type: The type of the entry
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_add(struct inode *inode, const struct qstr *name,
+ const struct gfs2_inum *inum, unsigned type)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct buffer_head *bh;
+ struct gfs2_dirent *dent;
+ struct gfs2_leaf *leaf;
+ int error;
+
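+	/* Keep trying to make room, then retry the insert: convert to exhash,
+	   split the leaf, double the hash table, or chain on a new leaf */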
+ while(1) {
+ dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
+ &bh);
+ if (dent) {
+ if (IS_ERR(dent))
+ return PTR_ERR(dent);
+ dent = gfs2_init_dirent(inode, dent, name, bh);
+ gfs2_inum_out(inum, (char *)&dent->de_inum);
+ dent->de_type = cpu_to_be16(type);
+ if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
+ }
+ brelse(bh);
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ break;
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ ip->i_di.di_entries++;
+ ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&ip->i_di, bh->b_data);
+ brelse(bh);
+ error = 0;
+ break;
+ }
+ if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+ error = dir_make_exhash(inode);
+ if (error)
+ break;
+ continue;
+ }
+ error = dir_split_leaf(inode, name);
+ if (error == 0)
+ continue;
+ if (error < 0)
+ break;
+ if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+ error = dir_double_exhash(ip);
+ if (error)
+ break;
+ error = dir_split_leaf(inode, name);
+ if (error < 0)
+ break;
+ if (error == 0)
+ continue;
+ }
+ error = dir_new_leaf(inode, name);
+ if (!error)
+ continue;
+ error = -ENOSPC;
+ break;
+ }
+ return error;
+}
+
+
+/**
+ * gfs2_dir_del - Delete a directory entry
+ * @dip: The GFS2 inode
+ * @name: The name of the entry to be deleted
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
+{
+ struct gfs2_dirent *dent, *prev = NULL;
+ struct buffer_head *bh;
+ int error;
+
+ /* Returns _either_ the entry (if its first in block) or the
+ previous entry otherwise */
+ dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
+ if (!dent) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+ if (IS_ERR(dent)) {
+ gfs2_consist_inode(dip);
+ return PTR_ERR(dent);
+ }
+ /* If not first in block, adjust pointers accordingly */
+ if (gfs2_dirent_find(dent, name, NULL) == 0) {
+ prev = dent;
+ dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
+ }
+
+ dirent_del(dip, bh, prev, dent);
+ if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
+ u16 entries = be16_to_cpu(leaf->lf_entries);
+ if (!entries)
+ gfs2_consist_inode(dip);
+ leaf->lf_entries = cpu_to_be16(--entries);
+ }
+ brelse(bh);
+
+ error = gfs2_meta_inode_buffer(dip, &bh);
+ if (error)
+ return error;
+
+ if (!dip->i_di.di_entries)
+ gfs2_consist_inode(dip);
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ dip->i_di.di_entries--;
+ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&dip->i_di, bh->b_data);
+ brelse(bh);
+ mark_inode_dirty(&dip->i_inode);
+
+ return error;
+}
+
+/**
+ * gfs2_dir_mvino - Change inode number of directory entry
+ * @dip: The GFS2 inode
+ * @filename: The name of the entry to change
+ * @inum: The new inode number
+ * @new_type: The new type of the entry
+ *
+ * This routine changes the inode number of a directory entry. It's used
+ * by rename to change ".." when a directory is moved.
+ * Assumes a glock is held on @dip.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ struct gfs2_inum *inum, unsigned int new_type)
+{
+ struct buffer_head *bh;
+ struct gfs2_dirent *dent;
+ int error;
+
+ dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
+ if (!dent) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+ if (IS_ERR(dent))
+ return PTR_ERR(dent);
+
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ gfs2_inum_out(inum, (char *)&dent->de_inum);
+ dent->de_type = cpu_to_be16(new_type);
+
+ if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+ brelse(bh);
+ error = gfs2_meta_inode_buffer(dip, &bh);
+ if (error)
+ return error;
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ }
+
+ dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+ gfs2_dinode_out(&dip->i_di, bh->b_data);
+ brelse(bh);
+ return 0;
+}
+
+/**
+ * foreach_leaf - call a function for each leaf in a directory
+ * @dip: the directory
+ * @lc: the function to call for each leaf
+ * @data: private data to pass to it
+ *
+ * Returns: errno
+ */
+
+static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct buffer_head *bh;
+ struct gfs2_leaf *leaf;
+ u32 hsize, len;
+ u32 ht_offset, lp_offset, ht_offset_cur = -1;
+ u32 index = 0;
+ u64 *lp;
+ u64 leaf_no;
+ int error = 0;
+
+ hsize = 1 << dip->i_di.di_depth;
+ if (hsize * sizeof(u64) != dip->i_di.di_size) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+ if (!lp)
+ return -ENOMEM;
+
+ while (index < hsize) {
+ lp_offset = index & (sdp->sd_hash_ptrs - 1);
+ ht_offset = index - lp_offset;
+
+ if (ht_offset_cur != ht_offset) {
+ error = gfs2_dir_read_data(dip, (char *)lp,
+ ht_offset * sizeof(u64),
+ sdp->sd_hash_bsize, 1);
+ if (error != sdp->sd_hash_bsize) {
+ if (error >= 0)
+ error = -EIO;
+ goto out;
+ }
+ ht_offset_cur = ht_offset;
+ }
+
+ leaf_no = be64_to_cpu(lp[lp_offset]);
+ if (leaf_no) {
+ error = get_leaf(dip, leaf_no, &bh);
+ if (error)
+ goto out;
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
+ brelse(bh);
+
+ error = lc(dip, index, len, leaf_no, data);
+ if (error)
+ goto out;
+
+ index = (index & ~(len - 1)) + len;
+ } else
+ index++;
+ }
+
+ if (index != hsize) {
+ gfs2_consist_inode(dip);
+ error = -EIO;
+ }
+
+out:
+ kfree(lp);
+
+ return error;
+}
+
+/**
+ * leaf_dealloc - Deallocate a directory leaf
+ * @dip: the directory
+ * @index: the hash table offset in the directory
+ * @len: the number of pointers to this leaf
+ * @leaf_no: the leaf number
+ * @data: not used
+ *
+ * Returns: errno
+ */
+
+static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
+ u64 leaf_no, void *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct gfs2_leaf *tmp_leaf;
+ struct gfs2_rgrp_list rlist;
+ struct buffer_head *bh, *dibh;
+ u64 blk, nblk;
+ unsigned int rg_blocks = 0, l_blocks = 0;
+ char *ht;
+ unsigned int x, size = len * sizeof(u64);
+ int error;
+
+ memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+ ht = kzalloc(size, GFP_KERNEL);
+ if (!ht)
+ return -ENOMEM;
+
+ gfs2_alloc_get(dip);
+
+ error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+ if (error)
+ goto out_qs;
+
+ /* Count the number of leaves */
+
+ for (blk = leaf_no; blk; blk = nblk) {
+ error = get_leaf(dip, blk, &bh);
+ if (error)
+ goto out_rlist;
+ tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+ nblk = be64_to_cpu(tmp_leaf->lf_next);
+ brelse(bh);
+
+ gfs2_rlist_add(sdp, &rlist, blk);
+ l_blocks++;
+ }
+
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+ for (x = 0; x < rlist.rl_rgrps; x++) {
+ struct gfs2_rgrpd *rgd;
+ rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ rg_blocks += rgd->rd_ri.ri_length;
+ }
+
+ error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+ if (error)
+ goto out_rlist;
+
+ error = gfs2_trans_begin(sdp,
+ rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
+ RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+ if (error)
+ goto out_rg_gunlock;
+
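+	/* Walk the leaf chain again, this time freeing each leaf block */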
+ for (blk = leaf_no; blk; blk = nblk) {
+ error = get_leaf(dip, blk, &bh);
+ if (error)
+ goto out_end_trans;
+ tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+ nblk = be64_to_cpu(tmp_leaf->lf_next);
+ brelse(bh);
+
+ gfs2_free_meta(dip, blk, 1);
+
+ if (!dip->i_di.di_blocks)
+ gfs2_consist_inode(dip);
+ dip->i_di.di_blocks--;
+ }
+
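+	/* Clear the hash table pointers that referenced the freed leaves */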
+ error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
+ if (error != size) {
+ if (error >= 0)
+ error = -EIO;
+ goto out_end_trans;
+ }
+
+ error = gfs2_meta_inode_buffer(dip, &dibh);
+ if (error)
+ goto out_end_trans;
+
+ gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ gfs2_dinode_out(&dip->i_di, dibh->b_data);
+ brelse(dibh);
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_rg_gunlock:
+ gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+ gfs2_rlist_free(&rlist);
+ gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+out_qs:
+ gfs2_quota_unhold(dip);
+out:
+ gfs2_alloc_put(dip);
+ kfree(ht);
+ return error;
+}
+
+/**
+ * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
+ * @dip: the directory
+ *
+ * Dealloc all on-disk directory leaves to FREEMETA state
+ * Change on-disk inode type to "regular file"
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct buffer_head *bh;
+ int error;
+
+ /* Dealloc on-disk leaves to FREEMETA state */
+ error = foreach_leaf(dip, leaf_dealloc, NULL);
+ if (error)
+ return error;
+
+ /* Make this a regular file in case we crash.
+ (We don't want to free these blocks a second time.) */
+
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(dip, &bh);
+ if (!error) {
+ gfs2_trans_add_bh(dip->i_gl, bh, 1);
+ ((struct gfs2_dinode *)bh->b_data)->di_mode =
+ cpu_to_be32(S_IFREG);
+ brelse(bh);
+ }
+
+ gfs2_trans_end(sdp);
+
+ return error;
+}
+
+/**
+ * gfs2_diradd_alloc_required - find if adding entry will require an allocation
+ * @inode: the directory being written to
+ * @name: the filename that's going to be added
+ *
+ * Returns: 1 if alloc required, 0 if not, -ve on error
+ */
+
+int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
+{
+ struct gfs2_dirent *dent;
+ struct buffer_head *bh;
+
+ dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
+ if (!dent) {
+ return 1;
+ }
+ if (IS_ERR(dent))
+ return PTR_ERR(dent);
+ brelse(bh);
+ return 0;
+}
+
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..371233419b07
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+#include <linux/dcache.h>
+
+struct inode;
+struct gfs2_inode;
+struct gfs2_inum;
+
+/**
+ * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+
+typedef int (*gfs2_filldir_t) (void *opaque,
+ const char *name, unsigned int length,
+ u64 offset,
+ struct gfs2_inum *inum, unsigned int type);
+
+int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
+ struct gfs2_inum *inum, unsigned int *type);
+int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+ const struct gfs2_inum *inum, unsigned int type);
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
+ gfs2_filldir_t filldir);
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+ struct gfs2_inum *new_inum, unsigned int new_type);
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+
+int gfs2_diradd_alloc_required(struct inode *dir,
+ const struct qstr *filename);
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+ struct buffer_head **bhp);
+
+static inline u32 gfs2_disk_hash(const char *data, int len)
+{
+ return crc32_le((u32)~0, data, len) ^ (u32)~0;
+}
+
+
+static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
+{
+ name->name = fname;
+ name->len = strlen(fname);
+ name->hash = gfs2_disk_hash(name->name, name->len);
+}
+
+/* N.B. This probably ought to take inum & type as args as well */
+static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
+{
+ dent->de_inum.no_addr = cpu_to_be64(0);
+ dent->de_inum.no_formal_ino = cpu_to_be64(0);
+ dent->de_hash = cpu_to_be32(name->hash);
+ dent->de_rec_len = cpu_to_be16(reclen);
+ dent->de_name_len = cpu_to_be16(name->len);
+ dent->de_type = cpu_to_be16(0);
+ memset(dent->__pad, 0, sizeof(dent->__pad));
+ memcpy(dent + 1, name->name, name->len);
+}
+
+#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..92c54e9b0dc3
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "util.h"
+
+/**
+ * gfs2_ea_name2type - get the type of the ea, and strip the type prefix from the name
+ * @name: the full extended attribute name, including its type prefix
+ * @truncated_name: if non-NULL, set to the name with the prefix stripped
+ *
+ * Returns: GFS2_EATYPE_XXX
+ */
+
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
+{
+ unsigned int type;
+
+ if (strncmp(name, "system.", 7) == 0) {
+ type = GFS2_EATYPE_SYS;
+ if (truncated_name)
+ *truncated_name = name + sizeof("system.") - 1;
+ } else if (strncmp(name, "user.", 5) == 0) {
+ type = GFS2_EATYPE_USR;
+ if (truncated_name)
+ *truncated_name = name + sizeof("user.") - 1;
+ } else if (strncmp(name, "security.", 9) == 0) {
+ type = GFS2_EATYPE_SECURITY;
+ if (truncated_name)
+ *truncated_name = name + sizeof("security.") - 1;
+ } else {
+ type = GFS2_EATYPE_UNUSED;
+ if (truncated_name)
+ *truncated_name = NULL;
+ }
+
+ return type;
+}
+
+static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct inode *inode = &ip->i_inode;
+ int error = permission(inode, MAY_READ, NULL);
+ if (error)
+ return error;
+
+ return gfs2_ea_get_i(ip, er);
+}
+
+static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct inode *inode = &ip->i_inode;
+
+ if (S_ISREG(inode->i_mode) ||
+ (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
+ int error = permission(inode, MAY_WRITE, NULL);
+ if (error)
+ return error;
+ } else
+ return -EPERM;
+
+ return gfs2_ea_set_i(ip, er);
+}
+
+static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct inode *inode = &ip->i_inode;
+
+ if (S_ISREG(inode->i_mode) ||
+ (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
+ int error = permission(inode, MAY_WRITE, NULL);
+ if (error)
+ return error;
+ } else
+ return -EPERM;
+
+ return gfs2_ea_remove_i(ip, er);
+}
+
+static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
+ !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
+ (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
+ GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
+ return -EOPNOTSUPP;
+
+ return gfs2_ea_get_i(ip, er);
+}
+
+static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ int remove = 0;
+ int error;
+
+ if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
+ if (!(er->er_flags & GFS2_ERF_MODE)) {
+ er->er_mode = ip->i_di.di_mode;
+ er->er_flags |= GFS2_ERF_MODE;
+ }
+ error = gfs2_acl_validate_set(ip, 1, er,
+ &remove, &er->er_mode);
+ if (error)
+ return error;
+ error = gfs2_ea_set_i(ip, er);
+ if (error)
+ return error;
+ if (remove)
+ gfs2_ea_remove_i(ip, er);
+ return 0;
+
+ } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
+ error = gfs2_acl_validate_set(ip, 0, er,
+ &remove, NULL);
+ if (error)
+ return error;
+ if (!remove)
+ error = gfs2_ea_set_i(ip, er);
+ else {
+ error = gfs2_ea_remove_i(ip, er);
+ if (error == -ENODATA)
+ error = 0;
+ }
+ return error;
+ }
+
+ return -EPERM;
+}
+
+static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
+ int error = gfs2_acl_validate_remove(ip, 1);
+ if (error)
+ return error;
+
+ } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
+ int error = gfs2_acl_validate_remove(ip, 0);
+ if (error)
+ return error;
+
+ } else
+ return -EPERM;
+
+ return gfs2_ea_remove_i(ip, er);
+}
+
+static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct inode *inode = &ip->i_inode;
+ int error = permission(inode, MAY_READ, NULL);
+ if (error)
+ return error;
+
+ return gfs2_ea_get_i(ip, er);
+}
+
+static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct inode *inode = &ip->i_inode;
+ int error = permission(inode, MAY_WRITE, NULL);
+ if (error)
+ return error;
+
+ return gfs2_ea_set_i(ip, er);
+}
+
+static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct inode *inode = &ip->i_inode;
+ int error = permission(inode, MAY_WRITE, NULL);
+ if (error)
+ return error;
+
+ return gfs2_ea_remove_i(ip, er);
+}
+
+static struct gfs2_eattr_operations gfs2_user_eaops = {
+ .eo_get = user_eo_get,
+ .eo_set = user_eo_set,
+ .eo_remove = user_eo_remove,
+ .eo_name = "user",
+};
+
+struct gfs2_eattr_operations gfs2_system_eaops = {
+ .eo_get = system_eo_get,
+ .eo_set = system_eo_set,
+ .eo_remove = system_eo_remove,
+ .eo_name = "system",
+};
+
+static struct gfs2_eattr_operations gfs2_security_eaops = {
+ .eo_get = security_eo_get,
+ .eo_set = security_eo_set,
+ .eo_remove = security_eo_remove,
+ .eo_name = "security",
+};
+
+struct gfs2_eattr_operations *gfs2_ea_ops[] = {
+ NULL,
+ &gfs2_user_eaops,
+ &gfs2_system_eaops,
+ &gfs2_security_eaops,
+};
+
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..508b4f7a2449
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EAOPS_DOT_H__
+#define __EAOPS_DOT_H__
+
+struct gfs2_ea_request;
+struct gfs2_inode;
+
+struct gfs2_eattr_operations {
+ int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+ int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+ int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+ char *eo_name;
+};
+
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
+
+extern struct gfs2_eattr_operations gfs2_system_eaops;
+
+extern struct gfs2_eattr_operations *gfs2_ea_ops[];
+
+#endif /* __EAOPS_DOT_H__ */
+
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..a65a4ccfd4dd
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1501 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * ea_calc_size - returns the actual number of bytes the request will take up
+ * (not counting any unstuffed data blocks)
+ * @sdp: the superblock
+ * @er: the EA request
+ * @size: returns the calculated size
+ *
+ * Returns: 1 if the EA should be stuffed
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+ unsigned int *size)
+{
+ *size = GFS2_EAREQ_SIZE_STUFFED(er);
+ if (*size <= sdp->sd_jbsize)
+ return 1;
+
+ *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+
+ return 0;
+}
+
+static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+{
+ unsigned int size;
+
+ if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+ return -ERANGE;
+
+ ea_calc_size(sdp, er, &size);
+
+ /* This can only happen with 512 byte blocks */
+ if (size > sdp->sd_jbsize)
+ return -ERANGE;
+
+ return 0;
+}
+
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea,
+ struct gfs2_ea_header *prev, void *private);
+
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+ ea_call_t ea_call, void *data)
+{
+ struct gfs2_ea_header *ea, *prev = NULL;
+ int error = 0;
+
+ if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+ return -EIO;
+
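+	/* Walk every EA record in the block, sanity checking each one */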
+ for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+ if (!GFS2_EA_REC_LEN(ea))
+ goto fail;
+ if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+ bh->b_data + bh->b_size))
+ goto fail;
+ if (!GFS2_EATYPE_VALID(ea->ea_type))
+ goto fail;
+
+ error = ea_call(ip, bh, ea, prev, data);
+ if (error)
+ return error;
+
+ if (GFS2_EA_IS_LAST(ea)) {
+ if ((char *)GFS2_EA2NEXT(ea) !=
+ bh->b_data + bh->b_size)
+ goto fail;
+ break;
+ }
+ }
+
+ return error;
+
+fail:
+ gfs2_consist_inode(ip);
+ return -EIO;
+}
+
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+ struct buffer_head *bh, *eabh;
+ u64 *eablk, *end;
+ int error;
+
+ error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+ if (error)
+ return error;
+
+ if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+ error = ea_foreach_i(ip, bh, ea_call, data);
+ goto out;
+ }
+
+ if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+ error = -EIO;
+ goto out;
+ }
+
+ eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+ end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+
+ for (; eablk < end; eablk++) {
+ u64 bn;
+
+ if (!*eablk)
+ break;
+ bn = be64_to_cpu(*eablk);
+
+ error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+ if (error)
+ break;
+ error = ea_foreach_i(ip, eabh, ea_call, data);
+ brelse(eabh);
+ if (error)
+ break;
+ }
+out:
+ brelse(bh);
+ return error;
+}
+
+struct ea_find {
+ struct gfs2_ea_request *ef_er;
+ struct gfs2_ea_location *ef_el;
+};
+
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+ void *private)
+{
+ struct ea_find *ef = private;
+ struct gfs2_ea_request *er = ef->ef_er;
+
+ if (ea->ea_type == GFS2_EATYPE_UNUSED)
+ return 0;
+
+ if (ea->ea_type == er->er_type) {
+ if (ea->ea_name_len == er->er_name_len &&
+ !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+ struct gfs2_ea_location *el = ef->ef_el;
+ get_bh(bh);
+ el->el_bh = bh;
+ el->el_ea = ea;
+ el->el_prev = prev;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ struct gfs2_ea_location *el)
+{
+ struct ea_find ef;
+ int error;
+
+ ef.ef_er = er;
+ ef.ef_el = el;
+
+ memset(el, 0, sizeof(struct gfs2_ea_location));
+
+ error = ea_foreach(ip, ea_find_i, &ef);
+ if (error > 0)
+ return 0;
+
+ return error;
+}
+
+/**
+ * ea_dealloc_unstuffed - free the data blocks referenced by an unstuffed EA
+ * @ip: the inode the EA belongs to
+ * @bh: the buffer head of the EA block
+ * @ea: the EA header whose data blocks are being freed
+ * @prev: the previous EA header in the block, or NULL
+ * @private: if non-NULL, keep the emptied EA record instead of merging it into @prev
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG. But watch, this may not always
+ * be true.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea,
+ struct gfs2_ea_header *prev, void *private)
+{
+ int *leave = private;
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_holder rg_gh;
+ struct buffer_head *dibh;
+ u64 *dataptrs, bn = 0;
+ u64 bstart = 0;
+ unsigned int blen = 0;
+ unsigned int blks = 0;
+ unsigned int x;
+ int error;
+
+ if (GFS2_EA_IS_STUFFED(ea))
+ return 0;
+
+ dataptrs = GFS2_EA2DATAPTRS(ea);
+ for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+ if (*dataptrs) {
+ blks++;
+ bn = be64_to_cpu(*dataptrs);
+ }
+ }
+ if (!blks)
+ return 0;
+
+ rgd = gfs2_blk2rgrpd(sdp, bn);
+ if (!rgd) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+ if (error)
+ return error;
+
+ error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE +
+ RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+ if (error)
+ goto out_gunlock;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ dataptrs = GFS2_EA2DATAPTRS(ea);
+ for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+ if (!*dataptrs)
+ break;
+ bn = be64_to_cpu(*dataptrs);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen);
+ bstart = bn;
+ blen = 1;
+ }
+
+ *dataptrs = 0;
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+ }
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen);
+
+ if (prev && !leave) {
+ u32 len;
+
+ len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+ prev->ea_rec_len = cpu_to_be32(len);
+
+ if (GFS2_EA_IS_LAST(ea))
+ prev->ea_flags |= GFS2_EAFLAG_LAST;
+ } else {
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+ ea->ea_num_ptrs = 0;
+ }
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+
+out_gunlock:
+ gfs2_glock_dq_uninit(&rg_gh);
+ return error;
+}
+
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea,
+ struct gfs2_ea_header *prev, int leave)
+{
+ struct gfs2_alloc *al;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out_alloc;
+
+ error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+ if (error)
+ goto out_quota;
+
+ error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
+
+ gfs2_glock_dq_uninit(&al->al_ri_gh);
+
+out_quota:
+ gfs2_quota_unhold(ip);
+out_alloc:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+struct ea_list {
+ struct gfs2_ea_request *ei_er;
+ unsigned int ei_size;
+};
+
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+ void *private)
+{
+ struct ea_list *ei = private;
+ struct gfs2_ea_request *er = ei->ei_er;
+ unsigned int ea_size = gfs2_ea_strlen(ea);
+
+ if (ea->ea_type == GFS2_EATYPE_UNUSED)
+ return 0;
+
+ if (er->er_data_len) {
+ char *prefix = NULL;
+ unsigned int l = 0;
+ char c = 0;
+
+ if (ei->ei_size + ea_size > er->er_data_len)
+ return -ERANGE;
+
+ switch (ea->ea_type) {
+ case GFS2_EATYPE_USR:
+ prefix = "user.";
+ l = 5;
+ break;
+ case GFS2_EATYPE_SYS:
+ prefix = "system.";
+ l = 7;
+ break;
+ case GFS2_EATYPE_SECURITY:
+ prefix = "security.";
+ l = 9;
+ break;
+ }
+
+ BUG_ON(l == 0);
+
+ memcpy(er->er_data + ei->ei_size, prefix, l);
+ memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
+ ea->ea_name_len);
+ memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
+ }
+
+ ei->ei_size += ea_size;
+
+ return 0;
+}
+
+/**
+ * gfs2_ea_list - list the extended attributes of an inode
+ * @ip: the inode
+ * @er: the request; er_data and er_data_len describe the caller's buffer
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_data || !er->er_data_len) {
+ er->er_data = NULL;
+ er->er_data_len = 0;
+ }
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return error;
+
+ if (ip->i_di.di_eattr) {
+ struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+
+ error = ea_foreach(ip, ea_list_i, &ei);
+ if (!error)
+ error = ei.ei_size;
+ }
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+/**
+ * ea_get_unstuffed - actually copies the unstuffed data into the
+ * request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @data: The buffer to copy the unstuffed data into
+ *
+ * Returns: errno
+ */
+
+static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+ char *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head **bh;
+ unsigned int amount = GFS2_EA_DATA_LEN(ea);
+ unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+ u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+ unsigned int x;
+ int error = 0;
+
+ bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!bh)
+ return -ENOMEM;
+
+ for (x = 0; x < nptrs; x++) {
+ error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+ bh + x);
+ if (error) {
+ while (x--)
+ brelse(bh[x]);
+ goto out;
+ }
+ dataptrs++;
+ }
+
+ for (x = 0; x < nptrs; x++) {
+ error = gfs2_meta_wait(sdp, bh[x]);
+ if (error) {
+ for (; x < nptrs; x++)
+ brelse(bh[x]);
+ goto out;
+ }
+ if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+ for (; x < nptrs; x++)
+ brelse(bh[x]);
+ error = -EIO;
+ goto out;
+ }
+
+ memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
+ (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+ amount -= sdp->sd_jbsize;
+ data += sdp->sd_jbsize;
+
+ brelse(bh[x]);
+ }
+
+out:
+ kfree(bh);
+ return error;
+}
+
+int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ char *data)
+{
+ if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+ memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
+ return 0;
+ } else
+ return ea_get_unstuffed(ip, el->el_ea, data);
+}
+
+/**
+ * gfs2_ea_get_i - read an extended attribute (caller already holds the glock)
+ * @ip: The GFS2 inode
+ * @er: The request structure
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_ea_location el;
+ int error;
+
+ if (!ip->i_di.di_eattr)
+ return -ENODATA;
+
+ error = gfs2_ea_find(ip, er, &el);
+ if (error)
+ return error;
+ if (!el.el_ea)
+ return -ENODATA;
+
+ if (er->er_data_len) {
+ if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
+ error = -ERANGE;
+ else
+ error = gfs2_ea_get_copy(ip, &el, er->er_data);
+ }
+ if (!error)
+ error = GFS2_EA_DATA_LEN(el.el_ea);
+
+ brelse(el.el_bh);
+
+ return error;
+}
+
+/**
+ * gfs2_ea_get - read an extended attribute from an inode
+ * @ip: The GFS2 inode
+ * @er: The request structure
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_name_len ||
+ er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
+ if (!er->er_data || !er->er_data_len) {
+ er->er_data = NULL;
+ er->er_data_len = 0;
+ }
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return error;
+
+ error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * Returns: errno
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_ea_header *ea;
+ u64 block;
+
+ block = gfs2_alloc_meta(ip);
+
+ *bhp = gfs2_meta_new(ip->i_gl, block);
+ gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+ gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+ gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+ ea = GFS2_EA_BH2FIRST(*bhp);
+ ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+ ea->ea_flags = GFS2_EAFLAG_LAST;
+ ea->ea_num_ptrs = 0;
+
+ ip->i_di.di_blocks++;
+
+ return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ * necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
+ *
+ * Returns: errno
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+ struct gfs2_ea_request *er)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ ea->ea_data_len = cpu_to_be32(er->er_data_len);
+ ea->ea_name_len = er->er_name_len;
+ ea->ea_type = er->er_type;
+ ea->__pad = 0;
+
+ memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+ if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+ ea->ea_num_ptrs = 0;
+ memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+ } else {
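+		/* Too big to stuff in the EA block: spread the data over separate blocks */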
+ u64 *dataptr = GFS2_EA2DATAPTRS(ea);
+ const char *data = er->er_data;
+ unsigned int data_len = er->er_data_len;
+ unsigned int copy;
+ unsigned int x;
+
+ ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+ for (x = 0; x < ea->ea_num_ptrs; x++) {
+ struct buffer_head *bh;
+ u64 block;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
+ block = gfs2_alloc_meta(ip);
+
+ bh = gfs2_meta_new(ip->i_gl, block);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+ ip->i_di.di_blocks++;
+
+ copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+ data_len;
+ memcpy(bh->b_data + mh_size, data, copy);
+ if (copy < sdp->sd_jbsize)
+ memset(bh->b_data + mh_size + copy, 0,
+ sdp->sd_jbsize - copy);
+
+ *dataptr++ = cpu_to_be64(bh->b_blocknr);
+ data += copy;
+ data_len -= copy;
+
+ brelse(bh);
+ }
+
+ gfs2_assert_withdraw(sdp, !data_len);
+ }
+
+ return 0;
+}
+
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+ struct gfs2_ea_request *er, void *private);
+
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ unsigned int blks,
+ ea_skeleton_call_t skeleton_call, void *private)
+{
+ struct gfs2_alloc *al;
+ struct buffer_head *dibh;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+ if (error)
+ goto out_gunlock_q;
+
+ al->al_requested = blks;
+
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+ blks + al->al_rgd->rd_ri.ri_length +
+ RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+ if (error)
+ goto out_ipres;
+
+ error = skeleton_call(ip, er, private);
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ if (er->er_flags & GFS2_ERF_MODE) {
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+ (ip->i_di.di_mode & S_IFMT) ==
+ (er->er_mode & S_IFMT));
+ ip->i_di.di_mode = er->er_mode;
+ }
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+out_end_trans:
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+ gfs2_inplace_release(ip);
+out_gunlock_q:
+ gfs2_quota_unlock(ip);
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ void *private)
+{
+ struct buffer_head *bh;
+ int error;
+
+ error = ea_alloc_blk(ip, &bh);
+ if (error)
+ return error;
+
+ ip->i_di.di_eattr = bh->b_blocknr;
+ error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
+
+ brelse(bh);
+
+ return error;
+}
+
+/**
+ * ea_init - initializes a new eattr block
+ * @ip: the inode the new EA block belongs to
+ * @er: the EA set request being satisfied
+ *
+ * Returns: errno
+ */
+
+static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
+ unsigned int blks = 1;
+
+ if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
+ blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+
+ return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+}
+
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+ u32 ea_size = GFS2_EA_SIZE(ea);
+ struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+ ea_size);
+ u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+ int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+ ea->ea_rec_len = cpu_to_be32(ea_size);
+ ea->ea_flags ^= last;
+
+ new->ea_rec_len = cpu_to_be32(new_size);
+ new->ea_flags = last;
+
+ return new;
+}
+
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+ struct gfs2_ea_location *el)
+{
+ struct gfs2_ea_header *ea = el->el_ea;
+ struct gfs2_ea_header *prev = el->el_prev;
+ u32 len;
+
+ gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+ if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+ return;
+ } else if (GFS2_EA2NEXT(prev) != ea) {
+ prev = GFS2_EA2NEXT(prev);
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+ }
+
+ len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+ prev->ea_rec_len = cpu_to_be32(len);
+
+ if (GFS2_EA_IS_LAST(ea))
+ prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+struct ea_set {
+ int ea_split;
+
+ struct gfs2_ea_request *es_er;
+ struct gfs2_ea_location *es_el;
+
+ struct buffer_head *es_bh;
+ struct gfs2_ea_header *es_ea;
+};
+
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct ea_set *es)
+{
+ struct gfs2_ea_request *er = es->es_er;
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+ if (error)
+ return error;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+ if (es->ea_split)
+ ea = ea_split_ea(ea);
+
+ ea_write(ip, ea, er);
+
+ if (es->es_el)
+ ea_set_remove_stuffed(ip, es->es_el);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ if (er->er_flags & GFS2_ERF_MODE) {
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+ (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
+ ip->i_di.di_mode = er->er_mode;
+ }
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+out:
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+ return error;
+}
+
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+ struct gfs2_ea_request *er, void *private)
+{
+ struct ea_set *es = private;
+ struct gfs2_ea_header *ea = es->es_ea;
+ int error;
+
+ gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+
+ if (es->ea_split)
+ ea = ea_split_ea(ea);
+
+ error = ea_write(ip, ea, er);
+ if (error)
+ return error;
+
+ if (es->es_el)
+ ea_set_remove_stuffed(ip, es->es_el);
+
+ return 0;
+}
+
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+ struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+ void *private)
+{
+ struct ea_set *es = private;
+ unsigned int size;
+ int stuffed;
+ int error;
+
+ stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+
+ if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+ if (GFS2_EA_REC_LEN(ea) < size)
+ return 0;
+ if (!GFS2_EA_IS_STUFFED(ea)) {
+ error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+ if (error)
+ return error;
+ }
+ es->ea_split = 0;
+ } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+ es->ea_split = 1;
+ else
+ return 0;
+
+ if (stuffed) {
+ error = ea_set_simple_noalloc(ip, bh, ea, es);
+ if (error)
+ return error;
+ } else {
+ unsigned int blks;
+
+ es->es_bh = bh;
+ es->es_ea = ea;
+ blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+ GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+ error = ea_alloc_skeleton(ip, es->es_er, blks,
+ ea_set_simple_alloc, es);
+ if (error)
+ return error;
+ }
+
+ return 1;
+}
+
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ void *private)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *indbh, *newbh;
+ u64 *eablk;
+ int error;
+ int mh_size = sizeof(struct gfs2_meta_header);
+
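+	/* Find (or create) an indirect EA block with a free slot, then hang a new EA block off it */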
+ if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+ u64 *end;
+
+ error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+ &indbh);
+ if (error)
+ return error;
+
+ if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+ error = -EIO;
+ goto out;
+ }
+
+ eablk = (u64 *)(indbh->b_data + mh_size);
+ end = eablk + sdp->sd_inptrs;
+
+ for (; eablk < end; eablk++)
+ if (!*eablk)
+ break;
+
+ if (eablk == end) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ } else {
+ u64 blk;
+
+ blk = gfs2_alloc_meta(ip);
+
+ indbh = gfs2_meta_new(ip->i_gl, blk);
+ gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+ gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+ gfs2_buffer_clear_tail(indbh, mh_size);
+
+ eablk = (u64 *)(indbh->b_data + mh_size);
+ *eablk = cpu_to_be64(ip->i_di.di_eattr);
+ ip->i_di.di_eattr = blk;
+ ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+ ip->i_di.di_blocks++;
+
+ eablk++;
+ }
+
+ error = ea_alloc_blk(ip, &newbh);
+ if (error)
+ goto out;
+
+ *eablk = cpu_to_be64((u64)newbh->b_blocknr);
+ error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+ brelse(newbh);
+ if (error)
+ goto out;
+
+ if (private)
+ ea_set_remove_stuffed(ip, private);
+
+out:
+ brelse(indbh);
+ return error;
+}
+
+static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+ struct gfs2_ea_location *el)
+{
+ struct ea_set es;
+ unsigned int blks = 2;
+ int error;
+
+ memset(&es, 0, sizeof(struct ea_set));
+ es.es_er = er;
+ es.es_el = el;
+
+ error = ea_foreach(ip, ea_set_simple, &es);
+ if (error > 0)
+ return 0;
+ if (error)
+ return error;
+
+ if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+ blks++;
+ if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+ blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+ return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+}
+
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+ struct gfs2_ea_location *el)
+{
+ if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
+ el->el_prev = GFS2_EA2NEXT(el->el_prev);
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+ GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+ }
+
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
+}
+
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_ea_location el;
+ int error;
+
+ if (!ip->i_di.di_eattr) {
+ if (er->er_flags & XATTR_REPLACE)
+ return -ENODATA;
+ return ea_init(ip, er);
+ }
+
+ error = gfs2_ea_find(ip, er, &el);
+ if (error)
+ return error;
+
+ if (el.el_ea) {
+ if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+ brelse(el.el_bh);
+ return -EPERM;
+ }
+
+ error = -EEXIST;
+ if (!(er->er_flags & XATTR_CREATE)) {
+ int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+ error = ea_set_i(ip, er, &el);
+ if (!error && unstuffed)
+ ea_set_remove_unstuffed(ip, &el);
+ }
+
+ brelse(el.el_bh);
+ } else {
+ error = -ENODATA;
+ if (!(er->er_flags & XATTR_REPLACE))
+ error = ea_set_i(ip, er, NULL);
+ }
+
+ return error;
+}
+
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
+ if (!er->er_data || !er->er_data_len) {
+ er->er_data = NULL;
+ er->er_data_len = 0;
+ }
+ error = ea_check_size(GFS2_SB(&ip->i_inode), er);
+ if (error)
+ return error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ return error;
+
+ if (IS_IMMUTABLE(&ip->i_inode))
+ error = -EPERM;
+ else
+ error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+ struct gfs2_ea_header *ea = el->el_ea;
+ struct gfs2_ea_header *prev = el->el_prev;
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+ if (error)
+ return error;
+
+ gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+ if (prev) {
+ u32 len;
+
+ len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+ prev->ea_rec_len = cpu_to_be32(len);
+
+ if (GFS2_EA_IS_LAST(ea))
+ prev->ea_flags |= GFS2_EAFLAG_LAST;
+ } else
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+ return error;
+}
+
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_ea_location el;
+ int error;
+
+ if (!ip->i_di.di_eattr)
+ return -ENODATA;
+
+ error = gfs2_ea_find(ip, er, &el);
+ if (error)
+ return error;
+ if (!el.el_ea)
+ return -ENODATA;
+
+ if (GFS2_EA_IS_STUFFED(el.el_ea))
+ error = ea_remove_stuffed(ip, &el);
+ else
+ error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
+ 0);
+
+ brelse(el.el_bh);
+
+ return error;
+}
+
+/**
+ * gfs2_ea_remove - removes an extended attribute from an inode
+ * @ip: pointer to the inode of the target file
+ * @er: request information
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+ return -EINVAL;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ return error;
+
+ if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+ error = -EPERM;
+ else
+ error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+ struct gfs2_ea_header *ea, char *data)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head **bh;
+ unsigned int amount = GFS2_EA_DATA_LEN(ea);
+ unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+ u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+ unsigned int x;
+ int error;
+
+ bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+ if (!bh)
+ return -ENOMEM;
+
+ error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+ if (error)
+ goto out;
+
+ for (x = 0; x < nptrs; x++) {
+ error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+ bh + x);
+ if (error) {
+ while (x--)
+ brelse(bh[x]);
+ goto fail;
+ }
+ dataptrs++;
+ }
+
+ for (x = 0; x < nptrs; x++) {
+ error = gfs2_meta_wait(sdp, bh[x]);
+ if (error) {
+ for (; x < nptrs; x++)
+ brelse(bh[x]);
+ goto fail;
+ }
+ if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+ for (; x < nptrs; x++)
+ brelse(bh[x]);
+ error = -EIO;
+ goto fail;
+ }
+
+ gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+
+ memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
+ (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+ amount -= sdp->sd_jbsize;
+ data += sdp->sd_jbsize;
+
+ brelse(bh[x]);
+ }
+
+out:
+ kfree(bh);
+ return error;
+
+fail:
+ gfs2_trans_end(sdp);
+ kfree(bh);
+ return error;
+}
+
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ struct iattr *attr, char *data)
+{
+ struct buffer_head *dibh;
+ int error;
+
+ if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+ if (error)
+ return error;
+
+ gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+ memcpy(GFS2_EA2DATA(el->el_ea), data,
+ GFS2_EA_DATA_LEN(el->el_ea));
+ } else
+ error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ error = inode_setattr(&ip->i_inode, attr);
+ gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+ gfs2_inode_attr_out(ip);
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+ return error;
+}
+
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrp_list rlist;
+ struct buffer_head *indbh, *dibh;
+ u64 *eablk, *end;
+ unsigned int rg_blocks = 0;
+ u64 bstart = 0;
+ unsigned int blen = 0;
+ unsigned int blks = 0;
+ unsigned int x;
+ int error;
+
+ memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+ error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+ if (error)
+ return error;
+
+ if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+ error = -EIO;
+ goto out;
+ }
+
+ eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+ end = eablk + sdp->sd_inptrs;
+
+ for (; eablk < end; eablk++) {
+ u64 bn;
+
+ if (!*eablk)
+ break;
+ bn = be64_to_cpu(*eablk);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+ bstart = bn;
+ blen = 1;
+ }
+ blks++;
+ }
+ if (bstart)
+ gfs2_rlist_add(sdp, &rlist, bstart);
+ else
+ goto out;
+
+ gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+ for (x = 0; x < rlist.rl_rgrps; x++) {
+ struct gfs2_rgrpd *rgd;
+ rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+ rg_blocks += rgd->rd_ri.ri_length;
+ }
+
+ error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+ if (error)
+ goto out_rlist_free;
+
+ error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+ RES_STATFS + RES_QUOTA, blks);
+ if (error)
+ goto out_gunlock;
+
+ gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+
+ eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+ bstart = 0;
+ blen = 0;
+
+ for (; eablk < end; eablk++) {
+ u64 bn;
+
+ if (!*eablk)
+ break;
+ bn = be64_to_cpu(*eablk);
+
+ if (bstart + blen == bn)
+ blen++;
+ else {
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen);
+ bstart = bn;
+ blen = 1;
+ }
+
+ *eablk = 0;
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+ }
+ if (bstart)
+ gfs2_free_meta(ip, bstart, blen);
+
+ ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+
+out_gunlock:
+ gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+ gfs2_rlist_free(&rlist);
+out:
+ brelse(indbh);
+ return error;
+}
+
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_rgrpd *rgd;
+ struct buffer_head *dibh;
+ int error;
+
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+ if (!rgd) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+ &al->al_rgd_gh);
+ if (error)
+ return error;
+
+ error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+ RES_QUOTA, 1);
+ if (error)
+ goto out_gunlock;
+
+ gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+
+ ip->i_di.di_eattr = 0;
+ if (!ip->i_di.di_blocks)
+ gfs2_consist_inode(ip);
+ ip->i_di.di_blocks--;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+
+out_gunlock:
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+ struct gfs2_alloc *al;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out_alloc;
+
+ error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+ if (error)
+ goto out_quota;
+
+ error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+ if (error)
+ goto out_rindex;
+
+ if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+ error = ea_dealloc_indirect(ip);
+ if (error)
+ goto out_rindex;
+ }
+
+ error = ea_dealloc_block(ip);
+
+out_rindex:
+ gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_quota:
+ gfs2_quota_unhold(ip);
+out_alloc:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
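To illustrate the setting path above (a minimal sketch, not part of the patch itself): a caller fills in a struct gfs2_ea_request — declared in the eattr.h hunk below — and hands it to gfs2_ea_set(); gfs2_ea_set_i() then honours XATTR_CREATE (-EEXIST if the attribute already exists) and XATTR_REPLACE (-ENODATA if it does not). The helper name and the inode pointer here are hypothetical:

	/* Illustrative sketch only -- assumes a valid, referenced gfs2_inode. */
	static int example_set_user_xattr(struct gfs2_inode *ip)
	{
		char value[] = "hello";
		struct gfs2_ea_request er = {
			.er_name     = "comment",
			.er_name_len = 7,
			.er_data     = value,
			.er_data_len = sizeof(value) - 1,
			.er_type     = GFS2_EATYPE_USR,
			.er_flags    = XATTR_CREATE,	/* fail if it already exists */
		};

		/* gfs2_ea_set() validates the request, takes the inode glock
		 * exclusively, rejects immutable inodes and dispatches through
		 * gfs2_ea_ops[er.er_type]->eo_set(). */
		return gfs2_ea_set(ip, &er);
	}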
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ffa65947d686
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+struct gfs2_inode;
+struct iattr;
+
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+ ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+ (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
+
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
+ sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
+
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+#define GFS2_EA2DATAPTRS(ea) \
+((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+#define GFS2_ERF_MODE 0x80000000
+
+struct gfs2_ea_request {
+ const char *er_name;
+ char *er_data;
+ unsigned int er_name_len;
+ unsigned int er_data_len;
+ unsigned int er_type; /* GFS2_EATYPE_... */
+ int er_flags;
+ mode_t er_mode;
+};
+
+struct gfs2_ea_location {
+ struct buffer_head *el_bh;
+ struct gfs2_ea_header *el_ea;
+ struct gfs2_ea_header *el_prev;
+};
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+/* Exported to acl.c */
+
+int gfs2_ea_find(struct gfs2_inode *ip,
+ struct gfs2_ea_request *er,
+ struct gfs2_ea_location *el);
+int gfs2_ea_get_copy(struct gfs2_inode *ip,
+ struct gfs2_ea_location *el,
+ char *data);
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+ struct iattr *attr, char *data);
+
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
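+	/* Length of the textual prefix ("user." = 5, "system." = 7,
+	 * "security." = 9) plus the attribute name and a trailing NUL. */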
+ switch (ea->ea_type) {
+ case GFS2_EATYPE_USR:
+ return 5 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SYS:
+ return 7 + ea->ea_name_len + 1;
+ case GFS2_EATYPE_SECURITY:
+ return 9 + ea->ea_name_len + 1;
+ default:
+ return 0;
+ }
+}
+
+#endif /* __EATTR_DOT_H__ */
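The accessor macros above encode the on-disk record layout: a struct gfs2_ea_header, then the name, then either the stuffed data or an array of block pointers, with records chained by ea_rec_len until one carries GFS2_EAFLAG_LAST. A minimal sketch of walking a block with these macros (the helper name is hypothetical; the buffer is assumed to hold an already-verified EA block):

	/* Illustrative sketch only -- not part of the patch. */
	static void example_print_ea_names(struct buffer_head *bh)
	{
		struct gfs2_ea_header *ea;

		for (ea = GFS2_EA_BH2FIRST(bh); ; ea = GFS2_EA2NEXT(ea)) {
			if (ea->ea_type != GFS2_EATYPE_UNUSED)
				printk(KERN_INFO "ea: %.*s (%u bytes)\n",
				       ea->ea_name_len, GFS2_EA2NAME(ea),
				       GFS2_EA_DATA_LEN(ea));
			if (GFS2_EA_IS_LAST(ea))
				break;
		}
	}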
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..3bb11c0f8b56
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GFS2_DOT_H__
+#define __GFS2_DOT_H__
+
+enum {
+ NO_CREATE = 0,
+ CREATE = 1,
+};
+
+enum {
+ NO_WAIT = 0,
+ WAIT = 1,
+};
+
+enum {
+ NO_FORCE = 0,
+ FORCE = 1,
+};
+
+#define GFS2_FAST_NAME_SIZE 8
+
+#endif /* __GFS2_DOT_H__ */
+
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..78fe0fae23ff
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2231 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/sort.h>
+#include <linux/jhash.h>
+#include <linux/kallsyms.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/list.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "super.h"
+#include "util.h"
+
+struct greedy {
+ struct gfs2_holder gr_gh;
+ struct work_struct gr_work;
+};
+
+struct gfs2_gl_hash_bucket {
+ struct hlist_head hb_list;
+};
+
+typedef void (*glock_examiner) (struct gfs2_glock * gl);
+
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
+static int dump_glock(struct gfs2_glock *gl);
+static int dump_inode(struct gfs2_inode *ip);
+
+#define GFS2_GL_HASH_SHIFT 15
+#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
+#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
+
+static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+
+/*
+ * Despite what you might think, the numbers below are not arbitrary :-)
+ * They are taken from the ipv4 routing hash code, which is well tested
+ * and thus should be nearly optimal. Later on we might tweak the numbers
+ * but for now this should be fine.
+ *
+ * The reason for putting the locks in a separate array from the list heads
+ * is that we can have fewer locks than list heads and save memory. We use
+ * the same hash function for both, but with a different hash mask.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
+ defined(CONFIG_PROVE_LOCKING)
+
+#ifdef CONFIG_LOCKDEP
+# define GL_HASH_LOCK_SZ 256
+#else
+# if NR_CPUS >= 32
+# define GL_HASH_LOCK_SZ 4096
+# elif NR_CPUS >= 16
+# define GL_HASH_LOCK_SZ 2048
+# elif NR_CPUS >= 8
+# define GL_HASH_LOCK_SZ 1024
+# elif NR_CPUS >= 4
+# define GL_HASH_LOCK_SZ 512
+# else
+# define GL_HASH_LOCK_SZ 256
+# endif
+#endif
+
+/* We never want more locks than chains */
+#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
+# undef GL_HASH_LOCK_SZ
+# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
+#endif
+
+static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
+
+static inline rwlock_t *gl_lock_addr(unsigned int x)
+{
+ return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
+}
+#else /* not SMP, so no spinlocks required */
+static inline rwlock_t *gl_lock_addr(unsigned int x)
+{
+ return NULL;
+}
+#endif
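Making the sizing comment above concrete (a sketch only, with the lock-array size assumed to be 256 for the example): the bucket table and the smaller lock array are indexed from the same hash value, just with different power-of-two masks, so one rwlock guards many hash chains.

	/* Sketch only: with GFS2_GL_HASH_SIZE = 1 << 15 buckets and, say,
	 * 256 chain locks (one possible GL_HASH_LOCK_SZ above), the same
	 * hash picks both -- gl_lock_addr() just applies the smaller mask. */
	static void example_hash_to_indices(unsigned int h)
	{
		unsigned int bucket = h & GFS2_GL_HASH_MASK;	/* 0 .. 32767 */
		unsigned int lock   = bucket & (256 - 1);	/* 0 .. 255   */

		/* Every chain whose bucket index shares the low eight bits
		 * is protected by the same rwlock. */
		(void)bucket;
		(void)lock;
	}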
+
+/**
+ * relaxed_state_ok - is a requested lock compatible with the current lock mode?
+ * @actual: the current state of the lock
+ * @requested: the lock state that was requested by the caller
+ * @flags: the modifier flags passed in by the caller
+ *
+ * Returns: 1 if the locks are compatible, 0 otherwise
+ */
+
+static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
+ int flags)
+{
+ if (actual == requested)
+ return 1;
+
+ if (flags & GL_EXACT)
+ return 0;
+
+ if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
+ return 1;
+
+ if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * gl_hash() - Turn glock number into hash bucket number
+ * @sdp: The GFS2 superblock
+ * @name: The lock name
+ *
+ * Returns: The number of the corresponding hash bucket
+ */
+
+static unsigned int gl_hash(const struct gfs2_sbd *sdp,
+ const struct lm_lockname *name)
+{
+ unsigned int h;
+
+ h = jhash(&name->ln_number, sizeof(u64), 0);
+ h = jhash(&name->ln_type, sizeof(unsigned int), h);
+ h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
+ h &= GFS2_GL_HASH_MASK;
+
+ return h;
+}
+
+/**
+ * glock_free() - Perform a few checks and then release struct gfs2_glock
+ * @gl: The glock to release
+ *
+ * Also calls lock module to release its internal structure for this glock.
+ *
+ */
+
+static void glock_free(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct inode *aspace = gl->gl_aspace;
+
+ gfs2_lm_put_lock(sdp, gl->gl_lock);
+
+ if (aspace)
+ gfs2_aspace_put(aspace);
+
+ kmem_cache_free(gfs2_glock_cachep, gl);
+}
+
+/**
+ * gfs2_glock_hold() - increment reference count on glock
+ * @gl: The glock to hold
+ *
+ */
+
+void gfs2_glock_hold(struct gfs2_glock *gl)
+{
+ atomic_inc(&gl->gl_ref);
+}
+
+/**
+ * gfs2_glock_put() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ */
+
+int gfs2_glock_put(struct gfs2_glock *gl)
+{
+ int rv = 0;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+
+ write_lock(gl_lock_addr(gl->gl_hash));
+ if (atomic_dec_and_test(&gl->gl_ref)) {
+ hlist_del(&gl->gl_list);
+ write_unlock(gl_lock_addr(gl->gl_hash));
+ BUG_ON(spin_is_locked(&gl->gl_spin));
+ gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
+ gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
+ gfs2_assert(sdp, list_empty(&gl->gl_holders));
+ gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
+ gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
+ gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+ glock_free(gl);
+ rv = 1;
+ goto out;
+ }
+ write_unlock(gl_lock_addr(gl->gl_hash));
+out:
+ return rv;
+}
+
+/**
+ * queue_empty - check to see if a glock's queue is empty
+ * @gl: the glock
+ * @head: the head of the queue to check
+ *
+ * This function protects the list in the event that a process already
+ * has a holder on the list and is adding a second holder for itself.
+ * The glmutex lock is what generally prevents processes from working
+ * on the same glock at once, but the special case of adding a second
+ * holder for yourself ("recursive" locking) doesn't involve locking
+ * glmutex, making the spin lock necessary.
+ *
+ * Returns: 1 if the queue is empty
+ */
+
+static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
+{
+ int empty;
+ spin_lock(&gl->gl_spin);
+ empty = list_empty(head);
+ spin_unlock(&gl->gl_spin);
+ return empty;
+}
+
+/**
+ * search_bucket() - Find struct gfs2_glock by lock number
+ * @bucket: the bucket to search
+ * @name: The lock name
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested number
+ */
+
+static struct gfs2_glock *search_bucket(unsigned int hash,
+ const struct gfs2_sbd *sdp,
+ const struct lm_lockname *name)
+{
+ struct gfs2_glock *gl;
+ struct hlist_node *h;
+
+ hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
+ if (!lm_name_equal(&gl->gl_name, name))
+ continue;
+ if (gl->gl_sbd != sdp)
+ continue;
+
+ atomic_inc(&gl->gl_ref);
+
+ return gl;
+ }
+
+ return NULL;
+}
+
+/**
+ * gfs2_glock_find() - Find glock by lock number
+ * @sdp: The GFS2 superblock
+ * @name: The lock name
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested number
+ */
+
+static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
+ const struct lm_lockname *name)
+{
+ unsigned int hash = gl_hash(sdp, name);
+ struct gfs2_glock *gl;
+
+ read_lock(gl_lock_addr(hash));
+ gl = search_bucket(hash, sdp, name);
+ read_unlock(gl_lock_addr(hash));
+
+ return gl;
+}
+
+/**
+ * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
+ * @sdp: The GFS2 superblock
+ * @number: the lock number
+ * @glops: The glock_operations to use
+ * @create: If 0, don't create the glock if it doesn't exist
+ * @glp: the glock is returned here
+ *
+ * This does not lock a glock, just finds/creates structures for one.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops, int create,
+ struct gfs2_glock **glp)
+{
+ struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
+ struct gfs2_glock *gl, *tmp;
+ unsigned int hash = gl_hash(sdp, &name);
+ int error;
+
+ read_lock(gl_lock_addr(hash));
+ gl = search_bucket(hash, sdp, &name);
+ read_unlock(gl_lock_addr(hash));
+
+ if (gl || !create) {
+ *glp = gl;
+ return 0;
+ }
+
+ gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+ if (!gl)
+ return -ENOMEM;
+
+ gl->gl_flags = 0;
+ gl->gl_name = name;
+ atomic_set(&gl->gl_ref, 1);
+ gl->gl_state = LM_ST_UNLOCKED;
+ gl->gl_hash = hash;
+ gl->gl_owner = NULL;
+ gl->gl_ip = 0;
+ gl->gl_ops = glops;
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ gl->gl_vn = 0;
+ gl->gl_stamp = jiffies;
+ gl->gl_object = NULL;
+ gl->gl_sbd = sdp;
+ gl->gl_aspace = NULL;
+ lops_init_le(&gl->gl_le, &gfs2_glock_lops);
+
+ /* If this glock protects actual on-disk data or metadata blocks,
+ create a VFS inode to manage the pages/buffers holding them. */
+ if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
+ gl->gl_aspace = gfs2_aspace_get(sdp);
+ if (!gl->gl_aspace) {
+ error = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
+ if (error)
+ goto fail_aspace;
+
+ write_lock(gl_lock_addr(hash));
+ tmp = search_bucket(hash, sdp, &name);
+ if (tmp) {
+ write_unlock(gl_lock_addr(hash));
+ glock_free(gl);
+ gl = tmp;
+ } else {
+ hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
+ write_unlock(gl_lock_addr(hash));
+ }
+
+ *glp = gl;
+
+ return 0;
+
+fail_aspace:
+ if (gl->gl_aspace)
+ gfs2_aspace_put(gl->gl_aspace);
+fail:
+ kmem_cache_free(gfs2_glock_cachep, gl);
+ return error;
+}
+
+/**
+ * gfs2_holder_init - initialize a struct gfs2_holder in the default way
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+ struct gfs2_holder *gh)
+{
+ INIT_LIST_HEAD(&gh->gh_list);
+ gh->gh_gl = gl;
+ gh->gh_ip = (unsigned long)__builtin_return_address(0);
+ gh->gh_owner = current;
+ gh->gh_state = state;
+ gh->gh_flags = flags;
+ gh->gh_error = 0;
+ gh->gh_iflags = 0;
+ init_completion(&gh->gh_wait);
+
+ if (gh->gh_state == LM_ST_EXCLUSIVE)
+ gh->gh_flags |= GL_LOCAL_EXCL;
+
+ gfs2_glock_hold(gl);
+}
+
+/**
+ * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Don't mess with the glock.
+ *
+ */
+
+void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+{
+ gh->gh_state = state;
+ gh->gh_flags = flags;
+ if (gh->gh_state == LM_ST_EXCLUSIVE)
+ gh->gh_flags |= GL_LOCAL_EXCL;
+
+ gh->gh_iflags &= 1 << HIF_ALLOCED;
+ gh->gh_ip = (unsigned long)__builtin_return_address(0);
+}
+
+/**
+ * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_holder_uninit(struct gfs2_holder *gh)
+{
+ gfs2_glock_put(gh->gh_gl);
+ gh->gh_gl = NULL;
+ gh->gh_ip = 0;
+}
+
+/**
+ * gfs2_holder_get - get a struct gfs2_holder structure
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gfp_flags: the allocation flags used to allocate the holder
+ *
+ * Figure out how big an impact this function has. Either:
+ * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
+ * 2) Leave it like it is
+ *
+ * Returns: the holder structure, NULL on ENOMEM
+ */
+
+static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
+ unsigned int state,
+ int flags, gfp_t gfp_flags)
+{
+ struct gfs2_holder *gh;
+
+ gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
+ if (!gh)
+ return NULL;
+
+ gfs2_holder_init(gl, state, flags, gh);
+ set_bit(HIF_ALLOCED, &gh->gh_iflags);
+ gh->gh_ip = (unsigned long)__builtin_return_address(0);
+ return gh;
+}
+
+/**
+ * gfs2_holder_put - get rid of a struct gfs2_holder structure
+ * @gh: the holder structure
+ *
+ */
+
+static void gfs2_holder_put(struct gfs2_holder *gh)
+{
+ gfs2_holder_uninit(gh);
+ kfree(gh);
+}
+
+/**
+ * rq_mutex - process a mutex request in the queue
+ * @gh: the glock holder
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_mutex(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ list_del_init(&gh->gh_list);
+ /* gh->gh_error never examined. */
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ complete(&gh->gh_wait);
+
+ return 1;
+}
+
+/**
+ * rq_promote - process a promote request in the queue
+ * @gh: the glock holder
+ *
+ * Acquire a new inter-node lock, or change a lock to a more restrictive state.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_promote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+ if (list_empty(&gl->gl_holders)) {
+ gl->gl_req_gh = gh;
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ if (atomic_read(&sdp->sd_reclaim_count) >
+ gfs2_tune_get(sdp, gt_reclaim_limit) &&
+ !(gh->gh_flags & LM_FLAG_PRIORITY)) {
+ gfs2_reclaim_glock(sdp);
+ gfs2_reclaim_glock(sdp);
+ }
+
+ glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+ spin_lock(&gl->gl_spin);
+ }
+ return 1;
+ }
+
+ if (list_empty(&gl->gl_holders)) {
+ set_bit(HIF_FIRST, &gh->gh_iflags);
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ } else {
+ struct gfs2_holder *next_gh;
+ if (gh->gh_flags & GL_LOCAL_EXCL)
+ return 1;
+ next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
+ gh_list);
+ if (next_gh->gh_flags & GL_LOCAL_EXCL)
+ return 1;
+ }
+
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh->gh_error = 0;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+
+ complete(&gh->gh_wait);
+
+ return 0;
+}
+
+/**
+ * rq_demote - process a demote request in the queue
+ * @gh: the glock holder
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_demote(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ if (!list_empty(&gl->gl_holders))
+ return 1;
+
+ if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
+ list_del_init(&gh->gh_list);
+ gh->gh_error = 0;
+ spin_unlock(&gl->gl_spin);
+ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+ gfs2_holder_put(gh);
+ else
+ complete(&gh->gh_wait);
+ spin_lock(&gl->gl_spin);
+ } else {
+ gl->gl_req_gh = gh;
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ if (gh->gh_state == LM_ST_UNLOCKED ||
+ gl->gl_state != LM_ST_EXCLUSIVE)
+ glops->go_drop_th(gl);
+ else
+ glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+
+ spin_lock(&gl->gl_spin);
+ }
+
+ return 0;
+}
+
+/**
+ * rq_greedy - process a queued request to drop greedy status
+ * @gh: the glock holder
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_greedy(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ list_del_init(&gh->gh_list);
+ /* gh->gh_error never examined. */
+ clear_bit(GLF_GREEDY, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ gfs2_holder_uninit(gh);
+ kfree(container_of(gh, struct greedy, gr_gh));
+
+ spin_lock(&gl->gl_spin);
+
+ return 0;
+}
+
+/**
+ * run_queue - process holder structures on a glock
+ * @gl: the glock
+ *
+ */
+static void run_queue(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+ int blocked = 1;
+
+ for (;;) {
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ break;
+
+ if (!list_empty(&gl->gl_waiters1)) {
+ gh = list_entry(gl->gl_waiters1.next,
+ struct gfs2_holder, gh_list);
+
+ if (test_bit(HIF_MUTEX, &gh->gh_iflags))
+ blocked = rq_mutex(gh);
+ else
+ gfs2_assert_warn(gl->gl_sbd, 0);
+
+ } else if (!list_empty(&gl->gl_waiters2) &&
+ !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
+ gh = list_entry(gl->gl_waiters2.next,
+ struct gfs2_holder, gh_list);
+
+ if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
+ blocked = rq_demote(gh);
+ else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
+ blocked = rq_greedy(gh);
+ else
+ gfs2_assert_warn(gl->gl_sbd, 0);
+
+ } else if (!list_empty(&gl->gl_waiters3)) {
+ gh = list_entry(gl->gl_waiters3.next,
+ struct gfs2_holder, gh_list);
+
+ if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
+ blocked = rq_promote(gh);
+ else
+ gfs2_assert_warn(gl->gl_sbd, 0);
+
+ } else
+ break;
+
+ if (blocked)
+ break;
+ }
+}
+
+/**
+ * gfs2_glmutex_lock - acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Gives caller exclusive access to manipulate a glock structure.
+ */
+
+static void gfs2_glmutex_lock(struct gfs2_glock *gl)
+{
+ struct gfs2_holder gh;
+
+ gfs2_holder_init(gl, 0, 0, &gh);
+ set_bit(HIF_MUTEX, &gh.gh_iflags);
+
+ spin_lock(&gl->gl_spin);
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+ list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+ } else {
+ gl->gl_owner = current;
+ gl->gl_ip = (unsigned long)__builtin_return_address(0);
+ complete(&gh.gh_wait);
+ }
+ spin_unlock(&gl->gl_spin);
+
+ wait_for_completion(&gh.gh_wait);
+ gfs2_holder_uninit(&gh);
+}
+
+/**
+ * gfs2_glmutex_trylock - try to acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if the glock is acquired
+ */
+
+static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
+{
+ int acquired = 1;
+
+ spin_lock(&gl->gl_spin);
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+ acquired = 0;
+ } else {
+ gl->gl_owner = current;
+ gl->gl_ip = (unsigned long)__builtin_return_address(0);
+ }
+ spin_unlock(&gl->gl_spin);
+
+ return acquired;
+}
+
+/**
+ * gfs2_glmutex_unlock - release a local lock on a glock
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
+{
+ spin_lock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ gl->gl_owner = NULL;
+ gl->gl_ip = 0;
+ run_queue(gl);
+ BUG_ON(!spin_is_locked(&gl->gl_spin));
+ spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * handle_callback - add a demote request to a lock's queue
+ * @gl: the glock
+ * @state: the state the caller wants us to change to
+ *
+ * Note: This may fail silently if we are out of memory.
+ */
+
+static void handle_callback(struct gfs2_glock *gl, unsigned int state)
+{
+ struct gfs2_holder *gh, *new_gh = NULL;
+
+restart:
+ spin_lock(&gl->gl_spin);
+
+ list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+ if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
+ gl->gl_req_gh != gh) {
+ if (gh->gh_state != state)
+ gh->gh_state = LM_ST_UNLOCKED;
+ goto out;
+ }
+ }
+
+ if (new_gh) {
+ list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
+ new_gh = NULL;
+ } else {
+ spin_unlock(&gl->gl_spin);
+
+ new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
+ if (!new_gh)
+ return;
+ set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
+ set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
+
+ goto restart;
+ }
+
+out:
+ spin_unlock(&gl->gl_spin);
+
+ if (new_gh)
+ gfs2_holder_put(new_gh);
+}
+
+void gfs2_glock_inode_squish(struct inode *inode)
+{
+ struct gfs2_holder gh;
+ struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+ gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
+ set_bit(HIF_DEMOTE, &gh.gh_iflags);
+ spin_lock(&gl->gl_spin);
+ gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
+ list_add_tail(&gh.gh_list, &gl->gl_waiters2);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ wait_for_completion(&gh.gh_wait);
+ gfs2_holder_uninit(&gh);
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+ int held1, held2;
+
+ held1 = (gl->gl_state != LM_ST_UNLOCKED);
+ held2 = (new_state != LM_ST_UNLOCKED);
+
+ if (held1 != held2) {
+ if (held2)
+ gfs2_glock_hold(gl);
+ else
+ gfs2_glock_put(gl);
+ }
+
+ gl->gl_state = new_state;
+}
+
+/**
+ * xmote_bh - Called after the lock module is done acquiring a lock
+ * @gl: The glock in question
+ * @ret: the int returned from the lock module
+ *
+ */
+
+static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh = gl->gl_req_gh;
+ int prev_state = gl->gl_state;
+ int op_done = 1;
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
+
+ state_change(gl, ret & LM_OUT_ST_MASK);
+
+ if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
+ if (glops->go_inval)
+ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+ } else if (gl->gl_state == LM_ST_DEFERRED) {
+ /* We might not want to do this here.
+ Look at moving to the inode glops. */
+ if (glops->go_inval)
+ glops->go_inval(gl, DIO_DATA);
+ }
+
+ /* Deal with each possible exit condition */
+
+ if (!gh)
+ gl->gl_stamp = jiffies;
+ else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = -EIO;
+ spin_unlock(&gl->gl_spin);
+ } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ if (gl->gl_state == gh->gh_state ||
+ gl->gl_state == LM_ST_UNLOCKED) {
+ gh->gh_error = 0;
+ } else {
+ if (gfs2_assert_warn(sdp, gh->gh_flags &
+ (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
+ fs_warn(sdp, "ret = 0x%.8X\n", ret);
+ gh->gh_error = GLR_TRYFAILED;
+ }
+ spin_unlock(&gl->gl_spin);
+
+ if (ret & LM_OUT_CANCELED)
+ handle_callback(gl, LM_ST_UNLOCKED);
+
+ } else if (ret & LM_OUT_CANCELED) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = GLR_CANCELED;
+ spin_unlock(&gl->gl_spin);
+
+ } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+ spin_lock(&gl->gl_spin);
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh->gh_error = 0;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ spin_unlock(&gl->gl_spin);
+
+ set_bit(HIF_FIRST, &gh->gh_iflags);
+
+ op_done = 0;
+
+ } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = GLR_TRYFAILED;
+ spin_unlock(&gl->gl_spin);
+
+ } else {
+ if (gfs2_assert_withdraw(sdp, 0) == -1)
+ fs_err(sdp, "ret = 0x%.8X\n", ret);
+ }
+
+ if (glops->go_xmote_bh)
+ glops->go_xmote_bh(gl);
+
+ if (op_done) {
+ spin_lock(&gl->gl_spin);
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ }
+
+ gfs2_glock_put(gl);
+
+ if (gh) {
+ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+ gfs2_holder_put(gh);
+ else
+ complete(&gh->gh_wait);
+ }
+}
+
+/**
+ * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
+ * @gl: The glock in question
+ * @state: the requested state
+ * @flags: modifier flags to the lock call
+ *
+ */
+
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
+ LM_FLAG_NOEXP | LM_FLAG_ANY |
+ LM_FLAG_PRIORITY);
+ unsigned int lck_ret;
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
+ gfs2_assert_warn(sdp, state != gl->gl_state);
+
+ if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+ glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+
+ gfs2_glock_hold(gl);
+ gl->gl_req_bh = xmote_bh;
+
+ lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
+
+ if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
+ return;
+
+ if (lck_ret & LM_OUT_ASYNC)
+ gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
+ else
+ xmote_bh(gl, lck_ret);
+}
+
+/**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status
+ *
+ * Wakes up the process waiting on the struct gfs2_holder (if any) and
+ * drops the reference on the glock that the top half took out
+ *
+ */
+
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh = gl->gl_req_gh;
+
+ clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, !ret);
+
+ state_change(gl, LM_ST_UNLOCKED);
+
+ if (glops->go_inval)
+ glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+
+ if (gh) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ gh->gh_error = 0;
+ spin_unlock(&gl->gl_spin);
+ }
+
+ if (glops->go_drop_bh)
+ glops->go_drop_bh(gl);
+
+ spin_lock(&gl->gl_spin);
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+
+ gfs2_glock_put(gl);
+
+ if (gh) {
+ if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+ gfs2_holder_put(gh);
+ else
+ complete(&gh->gh_wait);
+ }
+}
+
+/**
+ * gfs2_glock_drop_th - call into the lock module to unlock a lock
+ * @gl: the glock
+ *
+ */
+
+void gfs2_glock_drop_th(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ unsigned int ret;
+
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+ gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+ gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
+
+ if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+ glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+
+ gfs2_glock_hold(gl);
+ gl->gl_req_bh = drop_bh;
+
+ ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
+
+ if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
+ return;
+
+ if (!ret)
+ drop_bh(gl, ret);
+ else
+ gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
+}
+
+/**
+ * do_cancels - cancel requests for locks stuck waiting on an expire flag
+ * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
+ *
+ * Don't cancel GL_NOCANCEL requests.
+ */
+
+static void do_cancels(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+
+ spin_lock(&gl->gl_spin);
+
+ while (gl->gl_req_gh != gh &&
+ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+ !list_empty(&gh->gh_list)) {
+ if (gl->gl_req_bh && !(gl->gl_req_gh &&
+ (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
+ spin_unlock(&gl->gl_spin);
+ gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+ msleep(100);
+ spin_lock(&gl->gl_spin);
+ } else {
+ spin_unlock(&gl->gl_spin);
+ msleep(100);
+ spin_lock(&gl->gl_spin);
+ }
+ }
+
+ spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * glock_wait_internal - wait on a glock acquisition
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success
+ */
+
+static int glock_wait_internal(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ if (test_bit(HIF_ABORTED, &gh->gh_iflags))
+ return -EIO;
+
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ spin_lock(&gl->gl_spin);
+ if (gl->gl_req_gh != gh &&
+ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+ !list_empty(&gh->gh_list)) {
+ list_del_init(&gh->gh_list);
+ gh->gh_error = GLR_TRYFAILED;
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ return gh->gh_error;
+ }
+ spin_unlock(&gl->gl_spin);
+ }
+
+ if (gh->gh_flags & LM_FLAG_PRIORITY)
+ do_cancels(gh);
+
+ wait_for_completion(&gh->gh_wait);
+
+ if (gh->gh_error)
+ return gh->gh_error;
+
+ gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
+ gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
+ gh->gh_flags));
+
+ if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
+ gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+
+ if (glops->go_lock) {
+ gh->gh_error = glops->go_lock(gh);
+ if (gh->gh_error) {
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+ spin_unlock(&gl->gl_spin);
+ }
+ }
+
+ spin_lock(&gl->gl_spin);
+ gl->gl_req_gh = NULL;
+ gl->gl_req_bh = NULL;
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ }
+
+ return gh->gh_error;
+}
+
+static inline struct gfs2_holder *
+find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, head, gh_list) {
+ if (gh->gh_owner == owner)
+ return gh;
+ }
+
+ return NULL;
+}
+
+/**
+ * add_to_queue - Add a holder to the wait queue (but look for recursion)
+ * @gh: the holder structure to add
+ *
+ */
+
+static void add_to_queue(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_holder *existing;
+
+ BUG_ON(!gh->gh_owner);
+
+ existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+ if (existing) {
+ print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+ printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
+ printk(KERN_INFO "lock type : %d lock state : %d\n",
+ existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
+ print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+ printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
+ printk(KERN_INFO "lock type : %d lock state : %d\n",
+ gl->gl_name.ln_type, gl->gl_state);
+ BUG();
+ }
+
+ existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+ if (existing) {
+ print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+ print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+ BUG();
+ }
+
+ if (gh->gh_flags & LM_FLAG_PRIORITY)
+ list_add(&gh->gh_list, &gl->gl_waiters3);
+ else
+ list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+}
+
+/**
+ * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
+ * @gh: the holder structure
+ *
+ * if (gh->gh_flags & GL_ASYNC), this never returns an error
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_nq(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ int error = 0;
+
+restart:
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+ set_bit(HIF_ABORTED, &gh->gh_iflags);
+ return -EIO;
+ }
+
+ set_bit(HIF_PROMOTE, &gh->gh_iflags);
+
+ spin_lock(&gl->gl_spin);
+ add_to_queue(gh);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+
+ if (!(gh->gh_flags & GL_ASYNC)) {
+ error = glock_wait_internal(gh);
+ if (error == GLR_CANCELED) {
+ msleep(100);
+ goto restart;
+ }
+ }
+
+ clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+ if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
+ dump_glock(gl);
+
+ return error;
+}
+
+/**
+ * gfs2_glock_poll - poll to see if an async request has been completed
+ * @gh: the holder
+ *
+ * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
+ */
+
+int gfs2_glock_poll(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ int ready = 0;
+
+ spin_lock(&gl->gl_spin);
+
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ ready = 1;
+ else if (list_empty(&gh->gh_list)) {
+ if (gh->gh_error == GLR_CANCELED) {
+ spin_unlock(&gl->gl_spin);
+ msleep(100);
+ if (gfs2_glock_nq(gh))
+ return 1;
+ return 0;
+ } else
+ ready = 1;
+ }
+
+ spin_unlock(&gl->gl_spin);
+
+ return ready;
+}
+
+/**
+ * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
+ * @gh: the holder structure
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_wait(struct gfs2_holder *gh)
+{
+ int error;
+
+ error = glock_wait_internal(gh);
+ if (error == GLR_CANCELED) {
+ msleep(100);
+ gh->gh_flags &= ~GL_ASYNC;
+ error = gfs2_glock_nq(gh);
+ }
+
+ return error;
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ */
+
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED);
+
+ gfs2_glmutex_lock(gl);
+
+ spin_lock(&gl->gl_spin);
+ list_del_init(&gh->gh_list);
+
+ if (list_empty(&gl->gl_holders)) {
+ spin_unlock(&gl->gl_spin);
+
+ if (glops->go_unlock)
+ glops->go_unlock(gh);
+
+ gl->gl_stamp = jiffies;
+
+ spin_lock(&gl->gl_spin);
+ }
+
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * gfs2_glock_prefetch - Try to prefetch a glock
+ * @gl: the glock
+ * @state: the state to prefetch in
+ * @flags: flags passed to go_xmote_th()
+ *
+ */
+
+static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
+ int flags)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ spin_lock(&gl->gl_spin);
+
+ if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
+ !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
+ !list_empty(&gl->gl_waiters3) ||
+ relaxed_state_ok(gl->gl_state, state, flags)) {
+ spin_unlock(&gl->gl_spin);
+ return;
+ }
+
+ set_bit(GLF_PREFETCH, &gl->gl_flags);
+ set_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+
+ glops->go_xmote_th(gl, state, flags);
+}
+
+static void greedy_work(void *data)
+{
+ struct greedy *gr = data;
+ struct gfs2_holder *gh = &gr->gr_gh;
+ struct gfs2_glock *gl = gh->gh_gl;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+ clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+
+ if (glops->go_greedy)
+ glops->go_greedy(gl);
+
+ spin_lock(&gl->gl_spin);
+
+ if (list_empty(&gl->gl_waiters2)) {
+ clear_bit(GLF_GREEDY, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_holder_uninit(gh);
+ kfree(gr);
+ } else {
+ gfs2_glock_hold(gl);
+ list_add_tail(&gh->gh_list, &gl->gl_waiters2);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+ }
+}
+
+/**
+ * gfs2_glock_be_greedy - defer queued demote requests on a glock for a while
+ * @gl: the glock
+ * @time: the number of jiffies to wait before processing queued demotes
+ *
+ * Returns: 0 if go_greedy will be called, 1 otherwise
+ */
+
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
+{
+ struct greedy *gr;
+ struct gfs2_holder *gh;
+
+ if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
+ test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
+ return 1;
+
+ gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
+ if (!gr) {
+ clear_bit(GLF_GREEDY, &gl->gl_flags);
+ return 1;
+ }
+ gh = &gr->gr_gh;
+
+ gfs2_holder_init(gl, 0, 0, gh);
+ set_bit(HIF_GREEDY, &gh->gh_iflags);
+ INIT_WORK(&gr->gr_work, greedy_work, gr);
+
+ set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+ schedule_delayed_work(&gr->gr_work, time);
+
+ return 0;
+}
+
+/**
+ * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
+{
+ gfs2_glock_dq(gh);
+ gfs2_holder_uninit(gh);
+}
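For reference, the pattern used throughout the eattr.c hunk earlier in this patch pairs gfs2_glock_nq_init() — holder initialisation plus enqueue — with gfs2_glock_dq_uninit() above, keeping the holder on the caller's stack. A minimal sketch (the helper name is hypothetical):

	/* Illustrative sketch only -- assumes a valid struct gfs2_inode *ip. */
	static int example_under_inode_glock(struct gfs2_inode *ip)
	{
		struct gfs2_holder gh;
		int error;

		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
		if (error)
			return error;

		/* ... read inode fields safely while the glock is held ... */

		gfs2_glock_dq_uninit(&gh);
		return 0;
	}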
+
+/**
+ * gfs2_glock_nq_num - acquire a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ * @gh: the struct gfs2_holder
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ unsigned int state, int flags, struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl;
+ int error;
+
+ error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+ if (!error) {
+ error = gfs2_glock_nq_init(gl, state, flags, gh);
+ gfs2_glock_put(gl);
+ }
+
+ return error;
+}
+
+/**
+ * glock_compare - Compare two struct gfs2_glock structures for sorting
+ * @arg_a: the first structure
+ * @arg_b: the second structure
+ *
+ */
+
+static int glock_compare(const void *arg_a, const void *arg_b)
+{
+ const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+ const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+ const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+ const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+
+ if (a->ln_number > b->ln_number)
+ return 1;
+ if (a->ln_number < b->ln_number)
+ return -1;
+ if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+ return 1;
+ if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL))
+ return 1;
+ return 0;
+}
+
+/**
+ * nq_m_sync - synchronously acquire more than one glock in deadlock-free order
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Returns: 0 on success (all glocks acquired),
+ * errno on failure (no glocks acquired)
+ */
+
+static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
+ struct gfs2_holder **p)
+{
+ unsigned int x;
+ int error = 0;
+
+ for (x = 0; x < num_gh; x++)
+ p[x] = &ghs[x];
+
+ sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
+
+ for (x = 0; x < num_gh; x++) {
+ p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+
+ error = gfs2_glock_nq(p[x]);
+ if (error) {
+ while (x--)
+ gfs2_glock_dq(p[x]);
+ break;
+ }
+ }
+
+ return error;
+}
+
+/**
+ * gfs2_glock_nq_m - acquire multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Figure out how big an impact this function has. Either:
+ * 1) Replace this code with code that calls gfs2_glock_prefetch()
+ * 2) Forget async stuff and just call nq_m_sync()
+ * 3) Leave it like it is
+ *
+ * Returns: 0 on success (all glocks acquired),
+ * errno on failure (no glocks acquired)
+ */
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+ int *e;
+ unsigned int x;
+ int borked = 0, serious = 0;
+ int error = 0;
+
+ if (!num_gh)
+ return 0;
+
+ if (num_gh == 1) {
+ ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+ return gfs2_glock_nq(ghs);
+ }
+
+ e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+
+ for (x = 0; x < num_gh; x++) {
+ ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
+ error = gfs2_glock_nq(&ghs[x]);
+ if (error) {
+ borked = 1;
+ serious = error;
+ num_gh = x;
+ break;
+ }
+ }
+
+ for (x = 0; x < num_gh; x++) {
+ error = e[x] = glock_wait_internal(&ghs[x]);
+ if (error) {
+ borked = 1;
+ if (error != GLR_TRYFAILED && error != GLR_CANCELED)
+ serious = error;
+ }
+ }
+
+ if (!borked) {
+ kfree(e);
+ return 0;
+ }
+
+ for (x = 0; x < num_gh; x++)
+ if (!e[x])
+ gfs2_glock_dq(&ghs[x]);
+
+ if (serious)
+ error = serious;
+ else {
+ for (x = 0; x < num_gh; x++)
+ gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
+ &ghs[x]);
+ error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
+ }
+
+ kfree(e);
+
+ return error;
+}
+
+/**
+ * gfs2_glock_dq_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ */
+
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+ unsigned int x;
+
+ for (x = 0; x < num_gh; x++)
+ gfs2_glock_dq(&ghs[x]);
+}
+
+/**
+ * gfs2_glock_dq_uninit_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ */
+
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+ unsigned int x;
+
+ for (x = 0; x < num_gh; x++)
+ gfs2_glock_dq_uninit(&ghs[x]);
+}
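The multi-glock helpers above acquire a whole array of holders in deadlock-free order (nq_m_sync() sorts by lock number) and release them as a group, mirroring how ea_dealloc_indirect() earlier in this patch locks its resource-group list. A minimal sketch (the helper name is hypothetical; the holders are assumed to have been set up with gfs2_holder_init()):

	/* Illustrative sketch only -- not part of the patch. */
	static int example_lock_group(struct gfs2_holder *ghs, unsigned int num)
	{
		int error;

		error = gfs2_glock_nq_m(num, ghs);	/* all-or-nothing acquire */
		if (error)
			return error;

		/* ... operate while every glock in ghs[] is held ... */

		gfs2_glock_dq_m(num, ghs);		/* drop, keep the holders */
		return 0;
	}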
+
+/**
+ * gfs2_glock_prefetch_num - prefetch a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the acquisition
+ *
+ */
+
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ unsigned int state, int flags)
+{
+ struct gfs2_glock *gl;
+ int error;
+
+ if (atomic_read(&sdp->sd_reclaim_count) <
+ gfs2_tune_get(sdp, gt_reclaim_limit)) {
+ error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+ if (!error) {
+ gfs2_glock_prefetch(gl, state, flags);
+ gfs2_glock_put(gl);
+ }
+ }
+}
+
+/**
+ * gfs2_lvb_hold - attach a LVB to a glock
+ * @gl: The glock in question
+ *
+ */
+
+int gfs2_lvb_hold(struct gfs2_glock *gl)
+{
+ int error;
+
+ gfs2_glmutex_lock(gl);
+
+ if (!atomic_read(&gl->gl_lvb_count)) {
+ error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
+ if (error) {
+ gfs2_glmutex_unlock(gl);
+ return error;
+ }
+ gfs2_glock_hold(gl);
+ }
+ atomic_inc(&gl->gl_lvb_count);
+
+ gfs2_glmutex_unlock(gl);
+
+ return 0;
+}
+
+/**
+ * gfs2_lvb_unhold - detach a LVB from a glock
+ * @gl: The glock in question
+ *
+ */
+
+void gfs2_lvb_unhold(struct gfs2_glock *gl)
+{
+ gfs2_glock_hold(gl);
+ gfs2_glmutex_lock(gl);
+
+ gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
+ if (atomic_dec_and_test(&gl->gl_lvb_count)) {
+ gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+ gl->gl_lvb = NULL;
+ gfs2_glock_put(gl);
+ }
+
+ gfs2_glmutex_unlock(gl);
+ gfs2_glock_put(gl);
+}
+
+static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ unsigned int state)
+{
+ struct gfs2_glock *gl;
+
+ gl = gfs2_glock_find(sdp, name);
+ if (!gl)
+ return;
+
+ if (gl->gl_ops->go_callback)
+ gl->gl_ops->go_callback(gl, state);
+ handle_callback(gl, state);
+
+ spin_lock(&gl->gl_spin);
+ run_queue(gl);
+ spin_unlock(&gl->gl_spin);
+
+ gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_glock_cb - Callback used by locking module
+ * @sdp: Pointer to the superblock
+ * @type: Type of callback
+ * @data: Type dependent data pointer
+ *
+ * Called by the locking module when it wants to tell us something.
+ * Either we need to drop a lock, one of our ASYNC requests completed, or
+ * a journal from another client needs to be recovered.
+ */
+
+void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
+{
+ struct gfs2_sbd *sdp = cb_data;
+
+ switch (type) {
+ case LM_CB_NEED_E:
+ blocking_cb(sdp, data, LM_ST_UNLOCKED);
+ return;
+
+ case LM_CB_NEED_D:
+ blocking_cb(sdp, data, LM_ST_DEFERRED);
+ return;
+
+ case LM_CB_NEED_S:
+ blocking_cb(sdp, data, LM_ST_SHARED);
+ return;
+
+ case LM_CB_ASYNC: {
+ struct lm_async_cb *async = data;
+ struct gfs2_glock *gl;
+
+ gl = gfs2_glock_find(sdp, &async->lc_name);
+ if (gfs2_assert_warn(sdp, gl))
+ return;
+ if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
+ gl->gl_req_bh(gl, async->lc_ret);
+ gfs2_glock_put(gl);
+ return;
+ }
+
+ case LM_CB_NEED_RECOVERY:
+ gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
+ if (sdp->sd_recoverd_process)
+ wake_up_process(sdp->sd_recoverd_process);
+ return;
+
+ case LM_CB_DROPLOCKS:
+ gfs2_gl_hash_clear(sdp, NO_WAIT);
+ gfs2_quota_scan(sdp);
+ return;
+
+ default:
+ gfs2_assert_warn(sdp, 0);
+ return;
+ }
+}
+
+/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int demote_ok(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ int demote = 1;
+
+ if (test_bit(GLF_STICKY, &gl->gl_flags))
+ demote = 0;
+ else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
+ demote = time_after_eq(jiffies, gl->gl_stamp +
+ gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
+ else if (glops->go_demote_ok)
+ demote = glops->go_demote_ok(gl);
+
+ return demote;
+}
+
+/**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ */
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+
+ spin_lock(&sdp->sd_reclaim_lock);
+ if (list_empty(&gl->gl_reclaim)) {
+ gfs2_glock_hold(gl);
+ list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
+ atomic_inc(&sdp->sd_reclaim_count);
+ }
+ spin_unlock(&sdp->sd_reclaim_lock);
+
+ wake_up(&sdp->sd_reclaim_wq);
+}
+
+/**
+ * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
+ * @sdp: the filesystem
+ *
+ * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
+ * different glock and we notice that there are a lot of glocks in the
+ * reclaim list.
+ *
+ */
+
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+{
+ struct gfs2_glock *gl;
+
+ spin_lock(&sdp->sd_reclaim_lock);
+ if (list_empty(&sdp->sd_reclaim_list)) {
+ spin_unlock(&sdp->sd_reclaim_lock);
+ return;
+ }
+ gl = list_entry(sdp->sd_reclaim_list.next,
+ struct gfs2_glock, gl_reclaim);
+ list_del_init(&gl->gl_reclaim);
+ spin_unlock(&sdp->sd_reclaim_lock);
+
+ atomic_dec(&sdp->sd_reclaim_count);
+ atomic_inc(&sdp->sd_reclaimed);
+
+ if (gfs2_glmutex_trylock(gl)) {
+ if (queue_empty(gl, &gl->gl_holders) &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+ handle_callback(gl, LM_ST_UNLOCKED);
+ gfs2_glmutex_unlock(gl);
+ }
+
+ gfs2_glock_put(gl);
+}
+
+/**
+ * examine_bucket - Call a function for each glock in a hash bucket
+ * @examiner: the function
+ * @sdp: the filesystem
+ * @hash: the hash bucket number
+ *
+ * Returns: 1 if the bucket has entries
+ */
+
+static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
+ unsigned int hash)
+{
+ struct gfs2_glock *gl, *prev = NULL;
+ int has_entries = 0;
+ struct hlist_head *head = &gl_hash_table[hash].hb_list;
+
+ read_lock(gl_lock_addr(hash));
+ /* Can't use hlist_for_each_entry - don't want prefetch here */
+ if (hlist_empty(head))
+ goto out;
+ gl = list_entry(head->first, struct gfs2_glock, gl_list);
+ while(1) {
+ if (gl->gl_sbd == sdp) {
+ gfs2_glock_hold(gl);
+ read_unlock(gl_lock_addr(hash));
+ if (prev)
+ gfs2_glock_put(prev);
+ prev = gl;
+ examiner(gl);
+ has_entries = 1;
+ read_lock(gl_lock_addr(hash));
+ }
+ if (gl->gl_list.next == NULL)
+ break;
+ gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
+ }
+out:
+ read_unlock(gl_lock_addr(hash));
+ if (prev)
+ gfs2_glock_put(prev);
+ return has_entries;
+}
+
+/**
+ * scan_glock - look at a glock and see if we can reclaim it
+ * @gl: the glock to look at
+ *
+ */
+
+static void scan_glock(struct gfs2_glock *gl)
+{
+ if (gl->gl_ops == &gfs2_inode_glops)
+ return;
+
+ if (gfs2_glmutex_trylock(gl)) {
+ if (queue_empty(gl, &gl->gl_holders) &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+ goto out_schedule;
+ gfs2_glmutex_unlock(gl);
+ }
+ return;
+
+out_schedule:
+ gfs2_glmutex_unlock(gl);
+ gfs2_glock_schedule_for_reclaim(gl);
+}
+
+/**
+ * gfs2_scand_internal - Look for glocks and inodes to toss from memory
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_scand_internal(struct gfs2_sbd *sdp)
+{
+ unsigned int x;
+
+ for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+ examine_bucket(scan_glock, sdp, x);
+}
+
+/**
+ * clear_glock - look at a glock and see if we can free it from glock cache
+ * @gl: the glock to look at
+ *
+ */
+
+static void clear_glock(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ int released;
+
+ spin_lock(&sdp->sd_reclaim_lock);
+ if (!list_empty(&gl->gl_reclaim)) {
+ list_del_init(&gl->gl_reclaim);
+ atomic_dec(&sdp->sd_reclaim_count);
+ spin_unlock(&sdp->sd_reclaim_lock);
+ released = gfs2_glock_put(gl);
+ gfs2_assert(sdp, !released);
+ } else {
+ spin_unlock(&sdp->sd_reclaim_lock);
+ }
+
+ if (gfs2_glmutex_trylock(gl)) {
+ if (queue_empty(gl, &gl->gl_holders) &&
+ gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED);
+ gfs2_glmutex_unlock(gl);
+ }
+}
+
+/**
+ * gfs2_gl_hash_clear - Empty out the glock hash table
+ * @sdp: the filesystem
+ * @wait: wait until it's all gone
+ *
+ * Called when unmounting the filesystem, or when the inter-node lock manager
+ * requests DROPLOCKS because it is running out of capacity.
+ */
+
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+{
+ unsigned long t;
+ unsigned int x;
+ int cont;
+
+ t = jiffies;
+
+ for (;;) {
+ cont = 0;
+ for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+ if (examine_bucket(clear_glock, sdp, x))
+ cont = 1;
+ }
+
+ if (!wait || !cont)
+ break;
+
+ if (time_after_eq(jiffies,
+ t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
+ fs_warn(sdp, "Unmount seems to be stalled. "
+ "Dumping lock state...\n");
+ gfs2_dump_lockstate(sdp);
+ t = jiffies;
+ }
+
+ invalidate_inodes(sdp->sd_vfs);
+ msleep(10);
+ }
+}
+
+/*
+ * Diagnostic routines to help debug distributed deadlock
+ */
+
+/**
+ * dump_holder - print information about a glock holder
+ * @str: a string naming the type of holder
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int dump_holder(char *str, struct gfs2_holder *gh)
+{
+ unsigned int x;
+ int error = -ENOBUFS;
+
+ printk(KERN_INFO " %s\n", str);
+ printk(KERN_INFO " owner = %ld\n",
+ (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
+ printk(KERN_INFO " gh_state = %u\n", gh->gh_state);
+ printk(KERN_INFO " gh_flags =");
+ for (x = 0; x < 32; x++)
+ if (gh->gh_flags & (1 << x))
+ printk(" %u", x);
+ printk(" \n");
+ printk(KERN_INFO " error = %d\n", gh->gh_error);
+ printk(KERN_INFO " gh_iflags =");
+ for (x = 0; x < 32; x++)
+ if (test_bit(x, &gh->gh_iflags))
+ printk(" %u", x);
+ printk(" \n");
+ print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip);
+
+ error = 0;
+
+ return error;
+}
+
+/**
+ * dump_inode - print information about an inode
+ * @ip: the inode
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int dump_inode(struct gfs2_inode *ip)
+{
+ unsigned int x;
+ int error = -ENOBUFS;
+
+ printk(KERN_INFO " Inode:\n");
+ printk(KERN_INFO " num = %llu %llu\n",
+ (unsigned long long)ip->i_num.no_formal_ino,
+ (unsigned long long)ip->i_num.no_addr);
+ printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode));
+ printk(KERN_INFO " i_flags =");
+ for (x = 0; x < 32; x++)
+ if (test_bit(x, &ip->i_flags))
+ printk(" %u", x);
+ printk(" \n");
+
+ error = 0;
+
+ return error;
+}
+
+/**
+ * dump_glock - print information about a glock
+ * @gl: the glock
+ * @count: where we are in the buffer
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int dump_glock(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+ unsigned int x;
+ int error = -ENOBUFS;
+
+ spin_lock(&gl->gl_spin);
+
+ printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ printk(KERN_INFO " gl_flags =");
+ for (x = 0; x < 32; x++) {
+ if (test_bit(x, &gl->gl_flags))
+ printk(" %u", x);
+ }
+ printk(" \n");
+ printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref));
+ printk(KERN_INFO " gl_state = %u\n", gl->gl_state);
+ printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm);
+ print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip);
+ printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+ printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+ printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+ printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no");
+ printk(KERN_INFO " le = %s\n",
+ (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
+ printk(KERN_INFO " reclaim = %s\n",
+ (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+ if (gl->gl_aspace)
+ printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
+ gl->gl_aspace->i_mapping->nrpages);
+ else
+ printk(KERN_INFO " aspace = no\n");
+ printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count));
+ if (gl->gl_req_gh) {
+ error = dump_holder("Request", gl->gl_req_gh);
+ if (error)
+ goto out;
+ }
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ error = dump_holder("Holder", gh);
+ if (error)
+ goto out;
+ }
+ list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
+ error = dump_holder("Waiter1", gh);
+ if (error)
+ goto out;
+ }
+ list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+ error = dump_holder("Waiter2", gh);
+ if (error)
+ goto out;
+ }
+ list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
+ error = dump_holder("Waiter3", gh);
+ if (error)
+ goto out;
+ }
+ if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
+ if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
+ list_empty(&gl->gl_holders)) {
+ error = dump_inode(gl->gl_object);
+ if (error)
+ goto out;
+ } else {
+ error = -ENOBUFS;
+ printk(KERN_INFO " Inode: busy\n");
+ }
+ }
+
+ error = 0;
+
+out:
+ spin_unlock(&gl->gl_spin);
+ return error;
+}
+
+/**
+ * gfs2_dump_lockstate - print out the current lockstate
+ * @sdp: the filesystem
+ *
+ * Dumps the current lock state to the console.
+ *
+ */
+
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
+{
+ struct gfs2_glock *gl;
+ struct hlist_node *h;
+ unsigned int x;
+ int error = 0;
+
+ for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+
+ read_lock(gl_lock_addr(x));
+
+ hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
+ if (gl->gl_sbd != sdp)
+ continue;
+
+ error = dump_glock(gl);
+ if (error)
+ break;
+ }
+
+ read_unlock(gl_lock_addr(x));
+
+ if (error)
+ break;
+ }
+
+
+ return error;
+}
+
+int __init gfs2_glock_init(void)
+{
+ unsigned i;
+ for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
+ INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
+ }
+#ifdef GL_HASH_LOCK_SZ
+ for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
+ rwlock_init(&gl_hash_locks[i]);
+ }
+#endif
+ return 0;
+}
+
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2b2a889ee2cc
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GLOCK_DOT_H__
+#define __GLOCK_DOT_H__
+
+#include "incore.h"
+
+/* Flags for lock requests; used in gfs2_holder gh_flag field.
+ From lm_interface.h:
+#define LM_FLAG_TRY 0x00000001
+#define LM_FLAG_TRY_1CB 0x00000002
+#define LM_FLAG_NOEXP 0x00000004
+#define LM_FLAG_ANY 0x00000008
+#define LM_FLAG_PRIORITY 0x00000010 */
+
+#define GL_LOCAL_EXCL 0x00000020
+#define GL_ASYNC 0x00000040
+#define GL_EXACT 0x00000080
+#define GL_SKIP 0x00000100
+#define GL_ATIME 0x00000200
+#define GL_NOCACHE 0x00000400
+#define GL_NOCANCEL 0x00001000
+#define GL_AOP 0x00004000
+#define GL_DUMP 0x00008000
+
+#define GLR_TRYFAILED 13
+#define GLR_CANCELED 14
+
+static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+ int locked = 0;
+
+ /* Look in glock's list of holders for one with current task as owner */
+ spin_lock(&gl->gl_spin);
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (gh->gh_owner == current) {
+ locked = 1;
+ break;
+ }
+ }
+ spin_unlock(&gl->gl_spin);
+
+ return locked;
+}
+
+static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
+{
+ return gl->gl_state == LM_ST_EXCLUSIVE;
+}
+
+static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
+{
+ return gl->gl_state == LM_ST_DEFERRED;
+}
+
+static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
+{
+ return gl->gl_state == LM_ST_SHARED;
+}
+
+static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
+{
+ int ret;
+ spin_lock(&gl->gl_spin);
+ ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
+ spin_unlock(&gl->gl_spin);
+ return ret;
+}
+
+int gfs2_glock_get(struct gfs2_sbd *sdp,
+ u64 number, const struct gfs2_glock_operations *glops,
+ int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+int gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+ struct gfs2_holder *gh);
+void gfs2_holder_reinit(unsigned int state, unsigned flags,
+ struct gfs2_holder *gh);
+void gfs2_holder_uninit(struct gfs2_holder *gh);
+
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
+void gfs2_glock_drop_th(struct gfs2_glock *gl);
+
+int gfs2_glock_nq(struct gfs2_holder *gh);
+int gfs2_glock_poll(struct gfs2_holder *gh);
+int gfs2_glock_wait(struct gfs2_holder *gh);
+void gfs2_glock_dq(struct gfs2_holder *gh);
+
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
+
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
+ u64 number, const struct gfs2_glock_operations *glops,
+ unsigned int state, int flags, struct gfs2_holder *gh);
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
+ const struct gfs2_glock_operations *glops,
+ unsigned int state, int flags);
+void gfs2_glock_inode_squish(struct inode *inode);
+
+/**
+ * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Returns: 0, GLR_*, or errno
+ */
+
+static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
+ unsigned int state, int flags,
+ struct gfs2_holder *gh)
+{
+ int error;
+
+ gfs2_holder_init(gl, state, flags, gh);
+
+ error = gfs2_glock_nq(gh);
+ if (error)
+ gfs2_holder_uninit(gh);
+
+ return error;
+}
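
A minimal usage sketch for the enqueue/dequeue helpers above, assuming the
caller already holds a reference on gl; do_work_under_lock() is a hypothetical
placeholder:

    struct gfs2_holder gh;
    int error;

    /* Blocking: sleep until a shared hold is granted. */
    error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &gh);
    if (error)
            return error;
    do_work_under_lock(gl);
    gfs2_glock_dq_uninit(&gh);

    /* Non-blocking: with LM_FLAG_TRY a contended request returns
       GLR_TRYFAILED instead of sleeping. */
    error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_TRY, &gh);
    if (error == GLR_TRYFAILED)
            return 0; /* somebody else holds it; retry later */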
+
+/* Lock Value Block functions */
+
+int gfs2_lvb_hold(struct gfs2_glock *gl);
+void gfs2_lvb_unhold(struct gfs2_glock *gl);
+
+void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
+
+void gfs2_scand_internal(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+
+int __init gfs2_glock_init(void);
+
+#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..41a6b6818a50
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,615 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "util.h"
+#include "trans.h"
+
+/**
+ * gfs2_ail_empty_gl - remove all buffers for a given lock from the AIL
+ * @gl: the glock
+ *
+ * None of the buffers should be dirty, locked, or pinned.
+ */
+
+static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ unsigned int blocks;
+ struct list_head *head = &gl->gl_ail_list;
+ struct gfs2_bufdata *bd;
+ struct buffer_head *bh;
+ u64 blkno;
+ int error;
+
+ blocks = atomic_read(&gl->gl_ail_count);
+ if (!blocks)
+ return;
+
+ error = gfs2_trans_begin(sdp, 0, blocks);
+ if (gfs2_assert_withdraw(sdp, !error))
+ return;
+
+ gfs2_log_lock(sdp);
+ while (!list_empty(head)) {
+ bd = list_entry(head->next, struct gfs2_bufdata,
+ bd_ail_gl_list);
+ bh = bd->bd_bh;
+ blkno = bh->b_blocknr;
+ gfs2_assert_withdraw(sdp, !buffer_busy(bh));
+
+ bd->bd_ail = NULL;
+ list_del(&bd->bd_ail_st_list);
+ list_del(&bd->bd_ail_gl_list);
+ atomic_dec(&gl->gl_ail_count);
+ brelse(bh);
+ gfs2_log_unlock(sdp);
+
+ gfs2_trans_add_revoke(sdp, blkno);
+
+ gfs2_log_lock(sdp);
+ }
+ gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+ gfs2_log_unlock(sdp);
+
+ gfs2_trans_end(sdp);
+ gfs2_log_flush(sdp, NULL);
+}
+
+/**
+ * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_pte_inval(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip;
+ struct inode *inode;
+
+ ip = gl->gl_object;
+ if (!ip || !S_ISREG(ip->i_di.di_mode))
+ return;
+ inode = &ip->i_inode;
+
+ if (!test_bit(GIF_PAGED, &ip->i_flags))
+ return;
+
+ unmap_shared_mapping_range(inode->i_mapping, 0, 0);
+
+ if (test_bit(GIF_SW_PAGED, &ip->i_flags))
+ set_bit(GLF_DIRTY, &gl->gl_flags);
+
+ clear_bit(GIF_SW_PAGED, &ip->i_flags);
+}
+
+/**
+ * gfs2_page_inval - Invalidate all pages associated with a glock
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_page_inval(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip;
+ struct inode *inode;
+
+ ip = gl->gl_object;
+ if (!ip || !S_ISREG(ip->i_di.di_mode))
+ return;
+ inode = &ip->i_inode;
+
+ truncate_inode_pages(inode->i_mapping, 0);
+ gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
+ clear_bit(GIF_PAGED, &ip->i_flags);
+}
+
+/**
+ * gfs2_page_wait - Wait for writeback of data
+ * @gl: the glock
+ *
+ * Syncs data (not metadata) for a regular file.
+ * No-op for all other types.
+ */
+
+static void gfs2_page_wait(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip = gl->gl_object;
+ struct inode *inode = &ip->i_inode;
+ struct address_space *mapping = inode->i_mapping;
+ int error;
+
+ if (!S_ISREG(ip->i_di.di_mode))
+ return;
+
+ error = filemap_fdatawait(mapping);
+
+ /* Put back any errors cleared by filemap_fdatawait()
+ so they can be caught by someone who can pass them
+ up to user space. */
+
+ if (error == -ENOSPC)
+ set_bit(AS_ENOSPC, &mapping->flags);
+ else if (error)
+ set_bit(AS_EIO, &mapping->flags);
+
+}
+
+static void gfs2_page_writeback(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip = gl->gl_object;
+ struct inode *inode = &ip->i_inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ if (!S_ISREG(ip->i_di.di_mode))
+ return;
+
+ filemap_fdatawrite(mapping);
+}
+
+/**
+ * meta_go_sync - sync out the metadata for this glock
+ * @gl: the glock
+ * @flags: DIO_*
+ *
+ * Called when demoting or unlocking an EX glock. We must flush
+ * to disk all dirty buffers/pages relating to this glock, and must not
+ * return to the caller to demote/unlock the glock until I/O is complete.
+ */
+
+static void meta_go_sync(struct gfs2_glock *gl, int flags)
+{
+ if (!(flags & DIO_METADATA))
+ return;
+
+ if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
+ gfs2_log_flush(gl->gl_sbd, gl);
+ gfs2_meta_sync(gl);
+ if (flags & DIO_RELEASE)
+ gfs2_ail_empty_gl(gl);
+ }
+
+}
+
+/**
+ * meta_go_inval - invalidate the metadata for this glock
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void meta_go_inval(struct gfs2_glock *gl, int flags)
+{
+ if (!(flags & DIO_METADATA))
+ return;
+
+ gfs2_meta_inval(gl);
+ gl->gl_vn++;
+}
+
+/**
+ * inode_go_xmote_th - promote/demote a glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags:
+ *
+ */
+
+static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+ int flags)
+{
+ if (gl->gl_state != LM_ST_UNLOCKED)
+ gfs2_pte_inval(gl);
+ gfs2_glock_xmote_th(gl, state, flags);
+}
+
+/**
+ * inode_go_xmote_bh - After promoting/demoting a glock
+ * @gl: the glock
+ *
+ */
+
+static void inode_go_xmote_bh(struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh = gl->gl_req_gh;
+ struct buffer_head *bh;
+ int error;
+
+ if (gl->gl_state != LM_ST_UNLOCKED &&
+ (!gh || !(gh->gh_flags & GL_SKIP))) {
+ error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
+ if (!error)
+ brelse(bh);
+ }
+}
+
+/**
+ * inode_go_drop_th - unlock a glock
+ * @gl: the glock
+ *
+ * Invoked from rq_demote().
+ * Another node needs the lock in EXCLUSIVE mode, or the lock (unused for too
+ * long) is being purged from our node's glock cache; we are dropping the lock.
+ */
+
+static void inode_go_drop_th(struct gfs2_glock *gl)
+{
+ gfs2_pte_inval(gl);
+ gfs2_glock_drop_th(gl);
+}
+
+/**
+ * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * @gl: the glock protecting the inode
+ * @flags:
+ *
+ */
+
+static void inode_go_sync(struct gfs2_glock *gl, int flags)
+{
+ int meta = (flags & DIO_METADATA);
+ int data = (flags & DIO_DATA);
+
+ if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
+ if (meta && data) {
+ gfs2_page_writeback(gl);
+ gfs2_log_flush(gl->gl_sbd, gl);
+ gfs2_meta_sync(gl);
+ gfs2_page_wait(gl);
+ clear_bit(GLF_DIRTY, &gl->gl_flags);
+ } else if (meta) {
+ gfs2_log_flush(gl->gl_sbd, gl);
+ gfs2_meta_sync(gl);
+ } else if (data) {
+ gfs2_page_writeback(gl);
+ gfs2_page_wait(gl);
+ }
+ if (flags & DIO_RELEASE)
+ gfs2_ail_empty_gl(gl);
+ }
+}
+
+/**
+ * inode_go_inval - prepare an inode glock to be released
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void inode_go_inval(struct gfs2_glock *gl, int flags)
+{
+ int meta = (flags & DIO_METADATA);
+ int data = (flags & DIO_DATA);
+
+ if (meta) {
+ gfs2_meta_inval(gl);
+ gl->gl_vn++;
+ }
+ if (data)
+ gfs2_page_inval(gl);
+}
+
+/**
+ * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int inode_go_demote_ok(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ int demote = 0;
+
+ if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
+ demote = 1;
+ else if (!sdp->sd_args.ar_localcaching &&
+ time_after_eq(jiffies, gl->gl_stamp +
+ gfs2_tune_get(sdp, gt_demote_secs) * HZ))
+ demote = 1;
+
+ return demote;
+}
+
+/**
+ * inode_go_lock - operation done after an inode lock is locked by a process
+ * @gl: the glock
+ * @flags:
+ *
+ * Returns: errno
+ */
+
+static int inode_go_lock(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_inode *ip = gl->gl_object;
+ int error = 0;
+
+ if (!ip)
+ return 0;
+
+ if (ip->i_vn != gl->gl_vn) {
+ error = gfs2_inode_refresh(ip);
+ if (error)
+ return error;
+ gfs2_inode_attr_in(ip);
+ }
+
+ if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
+ (gl->gl_state == LM_ST_EXCLUSIVE) &&
+ (gh->gh_flags & GL_LOCAL_EXCL))
+ error = gfs2_truncatei_resume(ip);
+
+ return error;
+}
+
+/**
+ * inode_go_unlock - operation done before an inode lock is unlocked by a
+ * process
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void inode_go_unlock(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_inode *ip = gl->gl_object;
+
+ if (ip == NULL)
+ return;
+ if (test_bit(GLF_DIRTY, &gl->gl_flags))
+ gfs2_inode_attr_in(ip);
+ gfs2_meta_cache_flush(ip);
+}
+
+/**
+ * inode_greedy -
+ * @gl: the glock
+ *
+ */
+
+static void inode_greedy(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_inode *ip = gl->gl_object;
+ unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
+ unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
+ unsigned int new_time;
+
+ spin_lock(&ip->i_spin);
+
+ if (time_after(ip->i_last_pfault + quantum, jiffies)) {
+ new_time = ip->i_greedy + quantum;
+ if (new_time > max)
+ new_time = max;
+ } else {
+ new_time = ip->i_greedy - quantum;
+ if (!new_time || new_time > max)
+ new_time = 1;
+ }
+
+ ip->i_greedy = new_time;
+
+ spin_unlock(&ip->i_spin);
+
+ iput(&ip->i_inode);
+}
+
+/**
+ * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int rgrp_go_demote_ok(struct gfs2_glock *gl)
+{
+ return !gl->gl_aspace->i_mapping->nrpages;
+}
+
+/**
+ * rgrp_go_lock - operation done after an rgrp lock is locked by
+ * the first holder on this node.
+ * @gl: the glock
+ * @flags:
+ *
+ * Returns: errno
+ */
+
+static int rgrp_go_lock(struct gfs2_holder *gh)
+{
+ return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
+}
+
+/**
+ * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
+ * the last holder on this node.
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void rgrp_go_unlock(struct gfs2_holder *gh)
+{
+ gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
+}
+
+/**
+ * trans_go_xmote_th - promote/demote the transaction glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags:
+ *
+ */
+
+static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+ int flags)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+
+ if (gl->gl_state != LM_ST_UNLOCKED &&
+ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ gfs2_meta_syncfs(sdp);
+ gfs2_log_shutdown(sdp);
+ }
+
+ gfs2_glock_xmote_th(gl, state, flags);
+}
+
+/**
+ * trans_go_xmote_bh - After promoting/demoting the transaction glock
+ * @gl: the glock
+ *
+ */
+
+static void trans_go_xmote_bh(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+ struct gfs2_glock *j_gl = ip->i_gl;
+ struct gfs2_log_header head;
+ int error;
+
+ if (gl->gl_state != LM_ST_UNLOCKED &&
+ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
+ j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+
+ error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+ if (error)
+ gfs2_consist(sdp);
+ if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
+ gfs2_consist(sdp);
+
+ /* Initialize the head of the log */
+ if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+ sdp->sd_log_sequence = head.lh_sequence + 1;
+ gfs2_log_pointers_init(sdp, head.lh_blkno);
+ }
+ }
+}
+
+/**
+ * trans_go_drop_th - unlock the transaction glock
+ * @gl: the glock
+ *
+ * We want to sync the device even with localcaching. Remember
+ * that localcaching journal replay only marks buffers dirty.
+ */
+
+static void trans_go_drop_th(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ gfs2_meta_syncfs(sdp);
+ gfs2_log_shutdown(sdp);
+ }
+
+ gfs2_glock_drop_th(gl);
+}
+
+/**
+ * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int quota_go_demote_ok(struct gfs2_glock *gl)
+{
+ return !atomic_read(&gl->gl_lvb_count);
+}
+
+const struct gfs2_glock_operations gfs2_meta_glops = {
+ .go_xmote_th = gfs2_glock_xmote_th,
+ .go_drop_th = gfs2_glock_drop_th,
+ .go_type = LM_TYPE_META,
+};
+
+const struct gfs2_glock_operations gfs2_inode_glops = {
+ .go_xmote_th = inode_go_xmote_th,
+ .go_xmote_bh = inode_go_xmote_bh,
+ .go_drop_th = inode_go_drop_th,
+ .go_sync = inode_go_sync,
+ .go_inval = inode_go_inval,
+ .go_demote_ok = inode_go_demote_ok,
+ .go_lock = inode_go_lock,
+ .go_unlock = inode_go_unlock,
+ .go_greedy = inode_greedy,
+ .go_type = LM_TYPE_INODE,
+};
+
+const struct gfs2_glock_operations gfs2_rgrp_glops = {
+ .go_xmote_th = gfs2_glock_xmote_th,
+ .go_drop_th = gfs2_glock_drop_th,
+ .go_sync = meta_go_sync,
+ .go_inval = meta_go_inval,
+ .go_demote_ok = rgrp_go_demote_ok,
+ .go_lock = rgrp_go_lock,
+ .go_unlock = rgrp_go_unlock,
+ .go_type = LM_TYPE_RGRP,
+};
+
+const struct gfs2_glock_operations gfs2_trans_glops = {
+ .go_xmote_th = trans_go_xmote_th,
+ .go_xmote_bh = trans_go_xmote_bh,
+ .go_drop_th = trans_go_drop_th,
+ .go_type = LM_TYPE_NONDISK,
+};
+
+const struct gfs2_glock_operations gfs2_iopen_glops = {
+ .go_xmote_th = gfs2_glock_xmote_th,
+ .go_drop_th = gfs2_glock_drop_th,
+ .go_type = LM_TYPE_IOPEN,
+};
+
+const struct gfs2_glock_operations gfs2_flock_glops = {
+ .go_xmote_th = gfs2_glock_xmote_th,
+ .go_drop_th = gfs2_glock_drop_th,
+ .go_type = LM_TYPE_FLOCK,
+};
+
+const struct gfs2_glock_operations gfs2_nondisk_glops = {
+ .go_xmote_th = gfs2_glock_xmote_th,
+ .go_drop_th = gfs2_glock_drop_th,
+ .go_type = LM_TYPE_NONDISK,
+};
+
+const struct gfs2_glock_operations gfs2_quota_glops = {
+ .go_xmote_th = gfs2_glock_xmote_th,
+ .go_drop_th = gfs2_glock_drop_th,
+ .go_demote_ok = quota_go_demote_ok,
+ .go_type = LM_TYPE_QUOTA,
+};
+
+const struct gfs2_glock_operations gfs2_journal_glops = {
+ .go_xmote_th = gfs2_glock_xmote_th,
+ .go_drop_th = gfs2_glock_drop_th,
+ .go_type = LM_TYPE_JOURNAL,
+};
+
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..a1d9b5b024e6
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GLOPS_DOT_H__
+#define __GLOPS_DOT_H__
+
+#include "incore.h"
+
+extern const struct gfs2_glock_operations gfs2_meta_glops;
+extern const struct gfs2_glock_operations gfs2_inode_glops;
+extern const struct gfs2_glock_operations gfs2_rgrp_glops;
+extern const struct gfs2_glock_operations gfs2_trans_glops;
+extern const struct gfs2_glock_operations gfs2_iopen_glops;
+extern const struct gfs2_glock_operations gfs2_flock_glops;
+extern const struct gfs2_glock_operations gfs2_nondisk_glops;
+extern const struct gfs2_glock_operations gfs2_quota_glops;
+extern const struct gfs2_glock_operations gfs2_journal_glops;
+
+#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..118dc693d111
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,634 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __INCORE_DOT_H__
+#define __INCORE_DOT_H__
+
+#include <linux/fs.h>
+
+#define DIO_WAIT 0x00000010
+#define DIO_METADATA 0x00000020
+#define DIO_DATA 0x00000040
+#define DIO_RELEASE 0x00000080
+#define DIO_ALL 0x00000100
+
+struct gfs2_log_operations;
+struct gfs2_log_element;
+struct gfs2_holder;
+struct gfs2_glock;
+struct gfs2_quota_data;
+struct gfs2_trans;
+struct gfs2_ail;
+struct gfs2_jdesc;
+struct gfs2_sbd;
+
+typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
+
+/*
+ * Structure of operations that are associated with each
+ * type of element in the log.
+ */
+
+struct gfs2_log_operations {
+ void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
+ void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+ void (*lo_before_commit) (struct gfs2_sbd *sdp);
+ void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
+ void (*lo_before_scan) (struct gfs2_jdesc *jd,
+ struct gfs2_log_header *head, int pass);
+ int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
+ struct gfs2_log_descriptor *ld, __be64 *ptr,
+ int pass);
+ void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
+ const char *lo_name;
+};
+
+struct gfs2_log_element {
+ struct list_head le_list;
+ const struct gfs2_log_operations *le_ops;
+};
+
+struct gfs2_bitmap {
+ struct buffer_head *bi_bh;
+ char *bi_clone;
+ u32 bi_offset;
+ u32 bi_start;
+ u32 bi_len;
+};
+
+struct gfs2_rgrpd {
+ struct list_head rd_list; /* Link with superblock */
+ struct list_head rd_list_mru;
+ struct list_head rd_recent; /* Recently used rgrps */
+ struct gfs2_glock *rd_gl; /* Glock for this rgrp */
+ struct gfs2_rindex rd_ri;
+ struct gfs2_rgrp rd_rg;
+ u64 rd_rg_vn;
+ struct gfs2_bitmap *rd_bits;
+ unsigned int rd_bh_count;
+ struct mutex rd_mutex;
+ u32 rd_free_clone;
+ struct gfs2_log_element rd_le;
+ u32 rd_last_alloc_data;
+ u32 rd_last_alloc_meta;
+ struct gfs2_sbd *rd_sbd;
+};
+
+enum gfs2_state_bits {
+ BH_Pinned = BH_PrivateStart,
+ BH_Escaped = BH_PrivateStart + 1,
+};
+
+BUFFER_FNS(Pinned, pinned)
+TAS_BUFFER_FNS(Pinned, pinned)
+BUFFER_FNS(Escaped, escaped)
+TAS_BUFFER_FNS(Escaped, escaped)
+
+struct gfs2_bufdata {
+ struct buffer_head *bd_bh;
+ struct gfs2_glock *bd_gl;
+
+ struct list_head bd_list_tr;
+ struct gfs2_log_element bd_le;
+
+ struct gfs2_ail *bd_ail;
+ struct list_head bd_ail_st_list;
+ struct list_head bd_ail_gl_list;
+};
+
+struct gfs2_glock_operations {
+ void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
+ int flags);
+ void (*go_xmote_bh) (struct gfs2_glock * gl);
+ void (*go_drop_th) (struct gfs2_glock * gl);
+ void (*go_drop_bh) (struct gfs2_glock * gl);
+ void (*go_sync) (struct gfs2_glock * gl, int flags);
+ void (*go_inval) (struct gfs2_glock * gl, int flags);
+ int (*go_demote_ok) (struct gfs2_glock * gl);
+ int (*go_lock) (struct gfs2_holder * gh);
+ void (*go_unlock) (struct gfs2_holder * gh);
+ void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
+ void (*go_greedy) (struct gfs2_glock * gl);
+ const int go_type;
+};
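
The glock core is expected to dispatch through this vector with NULL checks,
since the per-type tables in glops.c above fill in only a subset of the hooks;
a hedged sketch of that calling pattern, not the exact code in glock.c:

    static int call_go_lock(struct gfs2_holder *gh)
    {
            const struct gfs2_glock_operations *glops = gh->gh_gl->gl_ops;

            return glops->go_lock ? glops->go_lock(gh) : 0;
    }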
+
+enum {
+ /* Actions */
+ HIF_MUTEX = 0,
+ HIF_PROMOTE = 1,
+ HIF_DEMOTE = 2,
+ HIF_GREEDY = 3,
+
+ /* States */
+ HIF_ALLOCED = 4,
+ HIF_DEALLOC = 5,
+ HIF_HOLDER = 6,
+ HIF_FIRST = 7,
+ HIF_ABORTED = 9,
+};
+
+struct gfs2_holder {
+ struct list_head gh_list;
+
+ struct gfs2_glock *gh_gl;
+ struct task_struct *gh_owner;
+ unsigned int gh_state;
+ unsigned gh_flags;
+
+ int gh_error;
+ unsigned long gh_iflags;
+ struct completion gh_wait;
+ unsigned long gh_ip;
+};
+
+enum {
+ GLF_LOCK = 1,
+ GLF_STICKY = 2,
+ GLF_PREFETCH = 3,
+ GLF_DIRTY = 5,
+ GLF_SKIP_WAITERS2 = 6,
+ GLF_GREEDY = 7,
+};
+
+struct gfs2_glock {
+ struct hlist_node gl_list;
+ unsigned long gl_flags; /* GLF_... */
+ struct lm_lockname gl_name;
+ atomic_t gl_ref;
+
+ spinlock_t gl_spin;
+
+ unsigned int gl_state;
+ unsigned int gl_hash;
+ struct task_struct *gl_owner;
+ unsigned long gl_ip;
+ struct list_head gl_holders;
+ struct list_head gl_waiters1; /* HIF_MUTEX */
+ struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */
+ struct list_head gl_waiters3; /* HIF_PROMOTE */
+
+ const struct gfs2_glock_operations *gl_ops;
+
+ struct gfs2_holder *gl_req_gh;
+ gfs2_glop_bh_t gl_req_bh;
+
+ void *gl_lock;
+ char *gl_lvb;
+ atomic_t gl_lvb_count;
+
+ u64 gl_vn;
+ unsigned long gl_stamp;
+ void *gl_object;
+
+ struct list_head gl_reclaim;
+
+ struct gfs2_sbd *gl_sbd;
+
+ struct inode *gl_aspace;
+ struct gfs2_log_element gl_le;
+ struct list_head gl_ail_list;
+ atomic_t gl_ail_count;
+};
+
+struct gfs2_alloc {
+ /* Quota stuff */
+
+ struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
+ struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
+ unsigned int al_qd_num;
+
+ u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+ u32 al_alloced; /* Filled in by gfs2_alloc_*() */
+
+ /* Filled in by gfs2_inplace_reserve() */
+
+ unsigned int al_line;
+ char *al_file;
+ struct gfs2_holder al_ri_gh;
+ struct gfs2_holder al_rgd_gh;
+ struct gfs2_rgrpd *al_rgd;
+
+};
+
+enum {
+ GIF_QD_LOCKED = 1,
+ GIF_PAGED = 2,
+ GIF_SW_PAGED = 3,
+};
+
+struct gfs2_inode {
+ struct inode i_inode;
+ struct gfs2_inum i_num;
+
+ unsigned long i_flags; /* GIF_... */
+
+ u64 i_vn;
+ struct gfs2_dinode i_di; /* To be replaced by ref to block */
+
+ struct gfs2_glock *i_gl; /* Move into i_gh? */
+ struct gfs2_holder i_iopen_gh;
+ struct gfs2_holder i_gh; /* for prepare/commit_write only */
+ struct gfs2_alloc i_alloc;
+ u64 i_last_rg_alloc;
+
+ spinlock_t i_spin;
+ struct rw_semaphore i_rw_mutex;
+ unsigned int i_greedy;
+ unsigned long i_last_pfault;
+
+ struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
+};
+
+/*
+ * Since i_inode is the first element of struct gfs2_inode,
+ * this is effectively a cast.
+ */
+static inline struct gfs2_inode *GFS2_I(struct inode *inode)
+{
+ return container_of(inode, struct gfs2_inode, i_inode);
+}
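
Because i_inode is embedded rather than pointed to, the conversion is free in
both directions; a small illustration, assuming ip points at a valid struct
gfs2_inode:

    struct inode *inode = &ip->i_inode;      /* gfs2 view -> VFS view */
    struct gfs2_inode *same = GFS2_I(inode); /* VFS view -> gfs2 view; same == ip */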
+
+/* To be removed? */
+static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
+{
+ return inode->i_sb->s_fs_info;
+}
+
+enum {
+ GFF_DID_DIRECT_ALLOC = 0,
+ GFF_EXLOCK = 1,
+};
+
+struct gfs2_file {
+ unsigned long f_flags; /* GFF_... */
+ struct mutex f_fl_mutex;
+ struct gfs2_holder f_fl_gh;
+};
+
+struct gfs2_revoke {
+ struct gfs2_log_element rv_le;
+ u64 rv_blkno;
+};
+
+struct gfs2_revoke_replay {
+ struct list_head rr_list;
+ u64 rr_blkno;
+ unsigned int rr_where;
+};
+
+enum {
+ QDF_USER = 0,
+ QDF_CHANGE = 1,
+ QDF_LOCKED = 2,
+};
+
+struct gfs2_quota_lvb {
+ __be32 qb_magic;
+ u32 __pad;
+ __be64 qb_limit; /* Hard limit of # blocks to alloc */
+ __be64 qb_warn; /* Warn user when alloc is above this # */
+ __be64 qb_value; /* Current # blocks allocated */
+};
+
+struct gfs2_quota_data {
+ struct list_head qd_list;
+ unsigned int qd_count;
+
+ u32 qd_id;
+ unsigned long qd_flags; /* QDF_... */
+
+ s64 qd_change;
+ s64 qd_change_sync;
+
+ unsigned int qd_slot;
+ unsigned int qd_slot_count;
+
+ struct buffer_head *qd_bh;
+ struct gfs2_quota_change *qd_bh_qc;
+ unsigned int qd_bh_count;
+
+ struct gfs2_glock *qd_gl;
+ struct gfs2_quota_lvb qd_qb;
+
+ u64 qd_sync_gen;
+ unsigned long qd_last_warn;
+ unsigned long qd_last_touched;
+};
+
+struct gfs2_log_buf {
+ struct list_head lb_list;
+ struct buffer_head *lb_bh;
+ struct buffer_head *lb_real;
+};
+
+struct gfs2_trans {
+ unsigned long tr_ip;
+
+ unsigned int tr_blocks;
+ unsigned int tr_revokes;
+ unsigned int tr_reserved;
+
+ struct gfs2_holder tr_t_gh;
+
+ int tr_touched;
+
+ unsigned int tr_num_buf;
+ unsigned int tr_num_buf_new;
+ unsigned int tr_num_buf_rm;
+ struct list_head tr_list_buf;
+
+ unsigned int tr_num_revoke;
+ unsigned int tr_num_revoke_rm;
+};
+
+struct gfs2_ail {
+ struct list_head ai_list;
+
+ unsigned int ai_first;
+ struct list_head ai_ail1_list;
+ struct list_head ai_ail2_list;
+
+ u64 ai_sync_gen;
+};
+
+struct gfs2_jdesc {
+ struct list_head jd_list;
+
+ struct inode *jd_inode;
+ unsigned int jd_jid;
+ int jd_dirty;
+
+ unsigned int jd_blocks;
+};
+
+#define GFS2_GLOCKD_DEFAULT 1
+#define GFS2_GLOCKD_MAX 16
+
+#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF
+#define GFS2_QUOTA_OFF 0
+#define GFS2_QUOTA_ACCOUNT 1
+#define GFS2_QUOTA_ON 2
+
+#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
+#define GFS2_DATA_WRITEBACK 1
+#define GFS2_DATA_ORDERED 2
+
+struct gfs2_args {
+ char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
+ char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
+ char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
+ int ar_spectator; /* Don't get a journal because we're always RO */
+ int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
+ int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
+ int ar_localcaching; /* Local-style caching (dangerous on multihost) */
+ int ar_debug; /* Oops on errors instead of trying to be graceful */
+ int ar_upgrade; /* Upgrade ondisk/multihost format */
+ unsigned int ar_num_glockd; /* Number of glockd threads */
+ int ar_posix_acl; /* Enable posix acls */
+ int ar_quota; /* off/account/on */
+ int ar_suiddir; /* suiddir support */
+ int ar_data; /* ordered/writeback */
+};
+
+struct gfs2_tune {
+ spinlock_t gt_spin;
+
+ unsigned int gt_ilimit;
+ unsigned int gt_ilimit_tries;
+ unsigned int gt_ilimit_min;
+ unsigned int gt_demote_secs; /* Cache retention for unheld glock */
+ unsigned int gt_incore_log_blocks;
+ unsigned int gt_log_flush_secs;
+ unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
+
+ unsigned int gt_scand_secs;
+ unsigned int gt_recoverd_secs;
+ unsigned int gt_logd_secs;
+ unsigned int gt_quotad_secs;
+
+ unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
+ unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
+ unsigned int gt_quota_scale_num; /* Numerator */
+ unsigned int gt_quota_scale_den; /* Denominator */
+ unsigned int gt_quota_cache_secs;
+ unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
+ unsigned int gt_atime_quantum; /* Min secs between atime updates */
+ unsigned int gt_new_files_jdata;
+ unsigned int gt_new_files_directio;
+ unsigned int gt_max_atomic_write; /* Split big writes into this size */
+ unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
+ unsigned int gt_lockdump_size;
+ unsigned int gt_stall_secs; /* Detects trouble! */
+ unsigned int gt_complain_secs;
+ unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
+ unsigned int gt_entries_per_readdir;
+ unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
+ unsigned int gt_greedy_default;
+ unsigned int gt_greedy_quantum;
+ unsigned int gt_greedy_max;
+ unsigned int gt_statfs_quantum;
+ unsigned int gt_statfs_slow;
+};
+
+enum {
+ SDF_JOURNAL_CHECKED = 0,
+ SDF_JOURNAL_LIVE = 1,
+ SDF_SHUTDOWN = 2,
+ SDF_NOATIME = 3,
+};
+
+#define GFS2_FSNAME_LEN 256
+
+struct gfs2_sbd {
+ struct super_block *sd_vfs;
+ struct super_block *sd_vfs_meta;
+ struct kobject sd_kobj;
+ unsigned long sd_flags; /* SDF_... */
+ struct gfs2_sb sd_sb;
+
+ /* Constants computed on mount */
+
+ u32 sd_fsb2bb;
+ u32 sd_fsb2bb_shift;
+ u32 sd_diptrs; /* Number of pointers in a dinode */
+ u32 sd_inptrs; /* Number of pointers in an indirect block */
+ u32 sd_jbsize; /* Size of a journaled data block */
+ u32 sd_hash_bsize; /* sizeof(exhash block) */
+ u32 sd_hash_bsize_shift;
+ u32 sd_hash_ptrs; /* Number of pointers in a hash block */
+ u32 sd_qc_per_block;
+ u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
+ u32 sd_max_height; /* Max height of a file's metadata tree */
+ u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
+ u32 sd_max_jheight; /* Max height of journaled file's meta tree */
+ u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
+
+ struct gfs2_args sd_args; /* Mount arguments */
+ struct gfs2_tune sd_tune; /* Filesystem tuning structure */
+
+ /* Lock Stuff */
+
+ struct lm_lockstruct sd_lockstruct;
+ struct list_head sd_reclaim_list;
+ spinlock_t sd_reclaim_lock;
+ wait_queue_head_t sd_reclaim_wq;
+ atomic_t sd_reclaim_count;
+ struct gfs2_holder sd_live_gh;
+ struct gfs2_glock *sd_rename_gl;
+ struct gfs2_glock *sd_trans_gl;
+
+ /* Inode Stuff */
+
+ struct inode *sd_master_dir;
+ struct inode *sd_jindex;
+ struct inode *sd_inum_inode;
+ struct inode *sd_statfs_inode;
+ struct inode *sd_ir_inode;
+ struct inode *sd_sc_inode;
+ struct inode *sd_qc_inode;
+ struct inode *sd_rindex;
+ struct inode *sd_quota_inode;
+
+ /* Inum stuff */
+
+ struct mutex sd_inum_mutex;
+
+ /* StatFS stuff */
+
+ spinlock_t sd_statfs_spin;
+ struct mutex sd_statfs_mutex;
+ struct gfs2_statfs_change sd_statfs_master;
+ struct gfs2_statfs_change sd_statfs_local;
+ unsigned long sd_statfs_sync_time;
+
+ /* Resource group stuff */
+
+ u64 sd_rindex_vn;
+ spinlock_t sd_rindex_spin;
+ struct mutex sd_rindex_mutex;
+ struct list_head sd_rindex_list;
+ struct list_head sd_rindex_mru_list;
+ struct list_head sd_rindex_recent_list;
+ struct gfs2_rgrpd *sd_rindex_forward;
+ unsigned int sd_rgrps;
+
+ /* Journal index stuff */
+
+ struct list_head sd_jindex_list;
+ spinlock_t sd_jindex_spin;
+ struct mutex sd_jindex_mutex;
+ unsigned int sd_journals;
+ unsigned long sd_jindex_refresh_time;
+
+ struct gfs2_jdesc *sd_jdesc;
+ struct gfs2_holder sd_journal_gh;
+ struct gfs2_holder sd_jinode_gh;
+
+ struct gfs2_holder sd_ir_gh;
+ struct gfs2_holder sd_sc_gh;
+ struct gfs2_holder sd_qc_gh;
+
+ /* Daemon stuff */
+
+ struct task_struct *sd_scand_process;
+ struct task_struct *sd_recoverd_process;
+ struct task_struct *sd_logd_process;
+ struct task_struct *sd_quotad_process;
+ struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
+ unsigned int sd_glockd_num;
+
+ /* Quota stuff */
+
+ struct list_head sd_quota_list;
+ atomic_t sd_quota_count;
+ spinlock_t sd_quota_spin;
+ struct mutex sd_quota_mutex;
+
+ unsigned int sd_quota_slots;
+ unsigned int sd_quota_chunks;
+ unsigned char **sd_quota_bitmap;
+
+ u64 sd_quota_sync_gen;
+ unsigned long sd_quota_sync_time;
+
+ /* Log stuff */
+
+ spinlock_t sd_log_lock;
+
+ unsigned int sd_log_blks_reserved;
+ unsigned int sd_log_commited_buf;
+ unsigned int sd_log_commited_revoke;
+
+ unsigned int sd_log_num_gl;
+ unsigned int sd_log_num_buf;
+ unsigned int sd_log_num_revoke;
+ unsigned int sd_log_num_rg;
+ unsigned int sd_log_num_databuf;
+ unsigned int sd_log_num_jdata;
+ unsigned int sd_log_num_hdrs;
+
+ struct list_head sd_log_le_gl;
+ struct list_head sd_log_le_buf;
+ struct list_head sd_log_le_revoke;
+ struct list_head sd_log_le_rg;
+ struct list_head sd_log_le_databuf;
+
+ unsigned int sd_log_blks_free;
+ struct mutex sd_log_reserve_mutex;
+
+ u64 sd_log_sequence;
+ unsigned int sd_log_head;
+ unsigned int sd_log_tail;
+ int sd_log_idle;
+
+ unsigned long sd_log_flush_time;
+ struct rw_semaphore sd_log_flush_lock;
+ struct list_head sd_log_flush_list;
+
+ unsigned int sd_log_flush_head;
+ u64 sd_log_flush_wrapped;
+
+ struct list_head sd_ail1_list;
+ struct list_head sd_ail2_list;
+ u64 sd_ail_sync_gen;
+
+ /* Replay stuff */
+
+ struct list_head sd_revoke_list;
+ unsigned int sd_replay_tail;
+
+ unsigned int sd_found_blocks;
+ unsigned int sd_found_revokes;
+ unsigned int sd_replayed_blocks;
+
+ /* For quiescing the filesystem */
+
+ struct gfs2_holder sd_freeze_gh;
+ struct mutex sd_freeze_lock;
+ unsigned int sd_freeze_count;
+
+ /* Counters */
+
+ atomic_t sd_glock_count;
+ atomic_t sd_glock_held_count;
+ atomic_t sd_inode_count;
+ atomic_t sd_reclaimed;
+
+ char sd_fsname[GFS2_FSNAME_LEN];
+ char sd_table_name[GFS2_FSNAME_LEN];
+ char sd_proto_name[GFS2_FSNAME_LEN];
+
+ /* Debugging crud */
+
+ unsigned long sd_last_warning;
+ struct vfsmount *sd_gfs2mnt;
+};
+
+#endif /* __INCORE_DOT_H__ */
+
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..57c43ac47925
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1379 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <linux/security.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "bmap.h"
+#include "dir.h"
+#include "eattr.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_address.h"
+#include "ops_file.h"
+#include "ops_inode.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
+ * @ip: The GFS2 inode (with embedded disk inode data)
+ *
+ */
+
+void gfs2_inode_attr_in(struct gfs2_inode *ip)
+{
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_dinode *di = &ip->i_di;
+
+ inode->i_ino = ip->i_num.no_addr;
+
+ switch (di->di_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ inode->i_rdev = MKDEV(di->di_major, di->di_minor);
+ break;
+ default:
+ inode->i_rdev = 0;
+ break;
+ }
+
+ inode->i_mode = di->di_mode;
+ inode->i_nlink = di->di_nlink;
+ inode->i_uid = di->di_uid;
+ inode->i_gid = di->di_gid;
+ i_size_write(inode, di->di_size);
+ inode->i_atime.tv_sec = di->di_atime;
+ inode->i_mtime.tv_sec = di->di_mtime;
+ inode->i_ctime.tv_sec = di->di_ctime;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_mtime.tv_nsec = 0;
+ inode->i_ctime.tv_nsec = 0;
+ inode->i_blocks = di->di_blocks <<
+ (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+
+ if (di->di_flags & GFS2_DIF_IMMUTABLE)
+ inode->i_flags |= S_IMMUTABLE;
+ else
+ inode->i_flags &= ~S_IMMUTABLE;
+
+ if (di->di_flags & GFS2_DIF_APPENDONLY)
+ inode->i_flags |= S_APPEND;
+ else
+ inode->i_flags &= ~S_APPEND;
+}
+
+/**
+ * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
+ * @ip: The GFS2 inode
+ *
+ * Only copy out the attributes that we want the VFS layer
+ * to be able to modify.
+ */
+
+void gfs2_inode_attr_out(struct gfs2_inode *ip)
+{
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_dinode *di = &ip->i_di;
+ gfs2_assert_withdraw(GFS2_SB(inode),
+ (di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
+ di->di_mode = inode->i_mode;
+ di->di_uid = inode->i_uid;
+ di->di_gid = inode->i_gid;
+ di->di_atime = inode->i_atime.tv_sec;
+ di->di_mtime = inode->i_mtime.tv_sec;
+ di->di_ctime = inode->i_ctime.tv_sec;
+}
+
+static int iget_test(struct inode *inode, void *opaque)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_inum *inum = opaque;
+
+ if (ip && ip->i_num.no_addr == inum->no_addr)
+ return 1;
+
+ return 0;
+}
+
+static int iget_set(struct inode *inode, void *opaque)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_inum *inum = opaque;
+
+ ip->i_num = *inum;
+ return 0;
+}
+
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
+{
+ return ilookup5(sb, (unsigned long)inum->no_formal_ino,
+ iget_test, inum);
+}
+
+static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
+{
+ return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
+ iget_test, iget_set, inum);
+}
+
+/**
+ * gfs2_inode_lookup - Lookup an inode
+ * @sb: The super block
+ * @inum: The inode number
+ * @type: The type of the inode
+ *
+ * Returns: A VFS inode, or an error
+ */
+
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
+{
+ struct inode *inode = gfs2_iget(sb, inum);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_glock *io_gl;
+ int error;
+
+ if (inode->i_state & I_NEW) {
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ umode_t mode = DT2IF(type);
+ inode->i_private = ip;
+ inode->i_mode = mode;
+
+ if (S_ISREG(mode)) {
+ inode->i_op = &gfs2_file_iops;
+ inode->i_fop = &gfs2_file_fops;
+ inode->i_mapping->a_ops = &gfs2_file_aops;
+ } else if (S_ISDIR(mode)) {
+ inode->i_op = &gfs2_dir_iops;
+ inode->i_fop = &gfs2_dir_fops;
+ } else if (S_ISLNK(mode)) {
+ inode->i_op = &gfs2_symlink_iops;
+ } else {
+ inode->i_op = &gfs2_dev_iops;
+ }
+
+ error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+ if (unlikely(error))
+ goto fail;
+ ip->i_gl->gl_object = ip;
+
+ error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+ if (unlikely(error))
+ goto fail_put;
+
+ ip->i_vn = ip->i_gl->gl_vn - 1;
+ error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
+ if (unlikely(error))
+ goto fail_iopen;
+
+ gfs2_glock_put(io_gl);
+ unlock_new_inode(inode);
+ }
+
+ return inode;
+fail_iopen:
+ gfs2_glock_put(io_gl);
+fail_put:
+ ip->i_gl->gl_object = NULL;
+ gfs2_glock_put(ip->i_gl);
+fail:
+ iput(inode);
+ return ERR_PTR(error);
+}
+
+/**
+ * gfs2_inode_refresh - Refresh the incore copy of the dinode
+ * @ip: The GFS2 inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_inode_refresh(struct gfs2_inode *ip)
+{
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
+ brelse(dibh);
+ return -EIO;
+ }
+
+ gfs2_dinode_in(&ip->i_di, dibh->b_data);
+
+ brelse(dibh);
+
+ if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
+ if (gfs2_consist_inode(ip))
+ gfs2_dinode_print(&ip->i_di);
+ return -EIO;
+ }
+ if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
+ return -ESTALE;
+
+ ip->i_vn = ip->i_gl->gl_vn;
+
+ return 0;
+}
+
+int gfs2_dinode_dealloc(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al;
+ struct gfs2_rgrpd *rgd;
+ int error;
+
+ if (ip->i_di.di_blocks != 1) {
+ if (gfs2_consist_inode(ip))
+ gfs2_dinode_print(&ip->i_di);
+ return -EIO;
+ }
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+ if (error)
+ goto out_qs;
+
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+ if (!rgd) {
+ gfs2_consist_inode(ip);
+ error = -EIO;
+ goto out_rindex_relse;
+ }
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+ &al->al_rgd_gh);
+ if (error)
+ goto out_rindex_relse;
+
+ error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
+ if (error)
+ goto out_rg_gunlock;
+
+ gfs2_trans_add_gl(ip->i_gl);
+
+ gfs2_free_di(rgd, ip);
+
+ gfs2_trans_end(sdp);
+ clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+
+out_rg_gunlock:
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
+out_rindex_relse:
+ gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_qs:
+ gfs2_quota_unhold(ip);
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+/**
+ * gfs2_change_nlink - Change nlink count on inode
+ * @ip: The GFS2 inode
+ * @diff: The change in the nlink count required
+ *
+ * Returns: errno
+ */
+
+int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
+{
+ struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
+ struct buffer_head *dibh;
+ u32 nlink;
+ int error;
+
+ BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
+ nlink = ip->i_di.di_nlink + diff;
+
+ /* If we are reducing the nlink count, but the new value ends up being
+ bigger than the old one, we must have underflowed. */
+ if (diff < 0 && nlink > ip->i_di.di_nlink) {
+ if (gfs2_consist_inode(ip))
+ gfs2_dinode_print(&ip->i_di);
+ return -EIO;
+ }
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ ip->i_di.di_nlink = nlink;
+ ip->i_di.di_ctime = get_seconds();
+ ip->i_inode.i_nlink = nlink;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ mark_inode_dirty(&ip->i_inode);
+
+ if (ip->i_di.di_nlink == 0) {
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_holder ri_gh, rg_gh;
+
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto out;
+ error = -EIO;
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+ if (!rgd)
+ goto out_norgrp;
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+ if (error)
+ goto out_norgrp;
+
+ clear_nlink(&ip->i_inode);
+ gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
+ gfs2_glock_dq_uninit(&rg_gh);
+out_norgrp:
+ gfs2_glock_dq_uninit(&ri_gh);
+ }
+out:
+ return error;
+}
+
+struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
+{
+ struct qstr qstr;
+ gfs2_str2qstr(&qstr, name);
+ return gfs2_lookupi(dip, &qstr, 1, NULL);
+}
+
+
+/**
+ * gfs2_lookupi - Look up a filename in a directory and return its inode
+ * @dir: The directory inode
+ * @name: The name of the inode to look for
+ * @is_root: If 1, ignore the caller's permissions
+ * @nd: The VFS nameidata (may be NULL)
+ *
+ * There will always be a vnode (Linux VFS inode) for the directory unless
+ * @is_root is true.
+ *
+ * Returns: the inode on success, NULL if the name was not found, or an
+ * ERR_PTR on error
+ */
+
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+ int is_root, struct nameidata *nd)
+{
+ struct super_block *sb = dir->i_sb;
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_holder d_gh;
+ struct gfs2_inum inum;
+ unsigned int type;
+ int error = 0;
+ struct inode *inode = NULL;
+
+ if (!name->len || name->len > GFS2_FNAMESIZE)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
+ (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
+ dir == sb->s_root->d_inode)) {
+ igrab(dir);
+ return dir;
+ }
+
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+ if (error)
+ return ERR_PTR(error);
+
+ if (!is_root) {
+ error = permission(dir, MAY_EXEC, NULL);
+ if (error)
+ goto out;
+ }
+
+ error = gfs2_dir_search(dir, name, &inum, &type);
+ if (error)
+ goto out;
+
+ inode = gfs2_inode_lookup(sb, &inum, type);
+
+out:
+ gfs2_glock_dq_uninit(&d_gh);
+ if (error == -ENOENT)
+ return NULL;
+ return inode;
+}
+
+static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
+{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
+ struct buffer_head *bh;
+ struct gfs2_inum_range ir;
+ int error;
+
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error)
+ return error;
+ mutex_lock(&sdp->sd_inum_mutex);
+
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error) {
+ mutex_unlock(&sdp->sd_inum_mutex);
+ gfs2_trans_end(sdp);
+ return error;
+ }
+
+ gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+
+ if (ir.ir_length) {
+ *formal_ino = ir.ir_start++;
+ ir.ir_length--;
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_inum_range_out(&ir,
+ bh->b_data + sizeof(struct gfs2_dinode));
+ brelse(bh);
+ mutex_unlock(&sdp->sd_inum_mutex);
+ gfs2_trans_end(sdp);
+ return 0;
+ }
+
+ brelse(bh);
+
+ mutex_unlock(&sdp->sd_inum_mutex);
+ gfs2_trans_end(sdp);
+
+ return 1;
+}
+
+static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
+{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
+ struct gfs2_holder gh;
+ struct buffer_head *bh;
+ struct gfs2_inum_range ir;
+ int error;
+
+ error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (error)
+ return error;
+
+ error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
+ if (error)
+ goto out;
+ mutex_lock(&sdp->sd_inum_mutex);
+
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ goto out_end_trans;
+
+ gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+
+ if (!ir.ir_length) {
+ struct buffer_head *m_bh;
+ u64 x, y;
+
+ error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+ if (error)
+ goto out_brelse;
+
+ x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
+ x = y = be64_to_cpu(x);
+ ir.ir_start = x;
+ ir.ir_length = GFS2_INUM_QUANTUM;
+ x += GFS2_INUM_QUANTUM;
+ if (x < y)
+ gfs2_consist_inode(m_ip);
+ x = cpu_to_be64(x);
+ gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+ *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
+
+ brelse(m_bh);
+ }
+
+ *formal_ino = ir.ir_start++;
+ ir.ir_length--;
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+
+out_brelse:
+ brelse(bh);
+out_end_trans:
+ mutex_unlock(&sdp->sd_inum_mutex);
+ gfs2_trans_end(sdp);
+out:
+ gfs2_glock_dq_uninit(&gh);
+ return error;
+}
+
+static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
+{
+ int error;
+
+ error = pick_formal_ino_1(sdp, inum);
+ if (error <= 0)
+ return error;
+
+ error = pick_formal_ino_2(sdp, inum);
+
+ return error;
+}
+
+/**
+ * create_ok - OK to create a new on-disk inode here?
+ * @dip: Directory in which dinode is to be created
+ * @name: Name of new dinode
+ * @mode:
+ *
+ * Returns: errno
+ */
+
+static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
+ unsigned int mode)
+{
+ int error;
+
+ error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ if (error)
+ return error;
+
+ /* Don't create entries in an unlinked directory */
+ if (!dip->i_di.di_nlink)
+ return -EPERM;
+
+ error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
+ switch (error) {
+ case -ENOENT:
+ error = 0;
+ break;
+ case 0:
+ return -EEXIST;
+ default:
+ return error;
+ }
+
+ if (dip->i_di.di_entries == (u32)-1)
+ return -EFBIG;
+ if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1)
+ return -EMLINK;
+
+ return 0;
+}
+
+static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
+ unsigned int *uid, unsigned int *gid)
+{
+ if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
+ (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) {
+ if (S_ISDIR(*mode))
+ *mode |= S_ISUID;
+ else if (dip->i_di.di_uid != current->fsuid)
+ *mode &= ~07111;
+ *uid = dip->i_di.di_uid;
+ } else
+ *uid = current->fsuid;
+
+ if (dip->i_di.di_mode & S_ISGID) {
+ if (S_ISDIR(*mode))
+ *mode |= S_ISGID;
+ *gid = dip->i_di.di_gid;
+ } else
+ *gid = current->fsgid;
+}
+
+static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
+ u64 *generation)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ int error;
+
+ gfs2_alloc_get(dip);
+
+ dip->i_alloc.al_requested = RES_DINODE;
+ error = gfs2_inplace_reserve(dip);
+ if (error)
+ goto out;
+
+ error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
+ if (error)
+ goto out_ipreserv;
+
+ inum->no_addr = gfs2_alloc_di(dip, generation);
+
+ gfs2_trans_end(sdp);
+
+out_ipreserv:
+ gfs2_inplace_release(dip);
+out:
+ gfs2_alloc_put(dip);
+ return error;
+}
+
+/**
+ * init_dinode - Fill in a new dinode structure
+ * @dip: the directory this inode is being created in
+ * @gl: The glock covering the new inode
+ * @inum: the inode number
+ * @mode: the file permissions
+ * @uid: The uid to assign to the new inode
+ * @gid: The gid to assign to the new inode
+ *
+ */
+
+static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
+ const struct gfs2_inum *inum, unsigned int mode,
+ unsigned int uid, unsigned int gid,
+ const u64 *generation)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct gfs2_dinode *di;
+ struct buffer_head *dibh;
+
+ dibh = gfs2_meta_new(gl, inum->no_addr);
+ gfs2_trans_add_bh(gl, dibh, 1);
+ gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
+ gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+ di = (struct gfs2_dinode *)dibh->b_data;
+
+ di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
+ di->di_num.no_addr = cpu_to_be64(inum->no_addr);
+ di->di_mode = cpu_to_be32(mode);
+ di->di_uid = cpu_to_be32(uid);
+ di->di_gid = cpu_to_be32(gid);
+ di->di_nlink = cpu_to_be32(0);
+ di->di_size = cpu_to_be64(0);
+ di->di_blocks = cpu_to_be64(1);
+ di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
+ di->di_major = di->di_minor = cpu_to_be32(0);
+ di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
+ di->di_generation = cpu_to_be64(*generation);
+ di->di_flags = cpu_to_be32(0);
+
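+ /* New regular files pick up journaled-data and direct-I/O behaviour
+ from the parent directory's inherit flags or the per-fs defaults;
+ new directories propagate the inherit flags themselves. */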
+ if (S_ISREG(mode)) {
+ if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
+ gfs2_tune_get(sdp, gt_new_files_jdata))
+ di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
+ if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
+ gfs2_tune_get(sdp, gt_new_files_directio))
+ di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
+ } else if (S_ISDIR(mode)) {
+ di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+ GFS2_DIF_INHERIT_DIRECTIO);
+ di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+ GFS2_DIF_INHERIT_JDATA);
+ }
+
+ di->__pad1 = 0;
+ di->di_payload_format = cpu_to_be32(0);
+ di->di_height = cpu_to_be32(0);
+ di->__pad2 = 0;
+ di->__pad3 = 0;
+ di->di_depth = cpu_to_be16(0);
+ di->di_entries = cpu_to_be32(0);
+ memset(&di->__pad4, 0, sizeof(di->__pad4));
+ di->di_eattr = cpu_to_be64(0);
+ memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+
+ brelse(dibh);
+}
+
+static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
+ unsigned int mode, const struct gfs2_inum *inum,
+ const u64 *generation)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ unsigned int uid, gid;
+ int error;
+
+ munge_mode_uid_gid(dip, &mode, &uid, &gid);
+ gfs2_alloc_get(dip);
+
+ error = gfs2_quota_lock(dip, uid, gid);
+ if (error)
+ goto out;
+
+ error = gfs2_quota_check(dip, uid, gid);
+ if (error)
+ goto out_quota;
+
+ error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
+ if (error)
+ goto out_quota;
+
+ init_dinode(dip, gl, inum, mode, uid, gid, generation);
+ gfs2_quota_change(dip, +1, uid, gid);
+ gfs2_trans_end(sdp);
+
+out_quota:
+ gfs2_quota_unlock(dip);
+out:
+ gfs2_alloc_put(dip);
+ return error;
+}
+
+static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
+ struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct gfs2_alloc *al;
+ int alloc_required;
+ struct buffer_head *dibh;
+ int error;
+
+ al = gfs2_alloc_get(dip);
+
+ error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto fail;
+
+ error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
+ if (alloc_required < 0)
+ goto fail;
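+ /* If adding the entry needs new directory blocks, reserve space and
+ size the transaction for the worst-case directory growth; otherwise
+ a leaf block plus the two dinodes is enough. */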
+ if (alloc_required) {
+ error = gfs2_quota_check(dip, dip->i_di.di_uid,
+ dip->i_di.di_gid);
+ if (error)
+ goto fail_quota_locks;
+
+ al->al_requested = sdp->sd_max_dirres;
+
+ error = gfs2_inplace_reserve(dip);
+ if (error)
+ goto fail_quota_locks;
+
+ error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+ al->al_rgd->rd_ri.ri_length +
+ 2 * RES_DINODE +
+ RES_STATFS + RES_QUOTA, 0);
+ if (error)
+ goto fail_ipreserv;
+ } else {
+ error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
+ if (error)
+ goto fail_quota_locks;
+ }
+
+ error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
+ if (error)
+ goto fail_end_trans;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto fail_end_trans;
+ ip->i_di.di_nlink = 1;
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ return 0;
+
+fail_end_trans:
+ gfs2_trans_end(sdp);
+
+fail_ipreserv:
+ if (dip->i_alloc.al_rgd)
+ gfs2_inplace_release(dip);
+
+fail_quota_locks:
+ gfs2_quota_unlock(dip);
+
+fail:
+ gfs2_alloc_put(dip);
+ return error;
+}
+
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+ int err;
+ size_t len;
+ void *value;
+ char *name;
+ struct gfs2_ea_request er;
+
+ err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
+ &name, &value, &len);
+
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+
+ er.er_type = GFS2_EATYPE_SECURITY;
+ er.er_name = name;
+ er.er_data = value;
+ er.er_name_len = strlen(name);
+ er.er_data_len = len;
+
+ err = gfs2_ea_set_i(ip, &er);
+
+ kfree(value);
+ kfree(name);
+
+ return err;
+}
+
+/**
+ * gfs2_createi - Create a new inode
+ * @ghs: An array of two holders
+ * @name: The name of the new file
+ * @mode: the permissions on the new inode
+ *
+ * @ghs[0] is an initialized holder for the directory
+ * @ghs[1] is the holder for the inode lock
+ *
+ * On success, the glocks on both the directory and the new file are held.
+ * A transaction has been started and an inplace reservation is held, as well.
+ *
+ * Returns: A new inode, or an ERR_PTR on failure
+ */
+
+struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
+ unsigned int mode)
+{
+ struct inode *inode;
+ struct gfs2_inode *dip = ghs->gh_gl->gl_object;
+ struct inode *dir = &dip->i_inode;
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct gfs2_inum inum;
+ int error;
+ u64 generation;
+
+ if (!name->len || name->len > GFS2_FNAMESIZE)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
+ error = gfs2_glock_nq(ghs);
+ if (error)
+ goto fail;
+
+ error = create_ok(dip, name, mode);
+ if (error)
+ goto fail_gunlock;
+
+ error = pick_formal_ino(sdp, &inum.no_formal_ino);
+ if (error)
+ goto fail_gunlock;
+
+ error = alloc_dinode(dip, &inum, &generation);
+ if (error)
+ goto fail_gunlock;
+
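+ /* Glocks must be acquired in ascending disk-address order to avoid
+ deadlock: if the new inode's address is below the directory's, drop
+ the directory glock and take both locks again in order. */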
+ if (inum.no_addr < dip->i_num.no_addr) {
+ gfs2_glock_dq(ghs);
+
+ error = gfs2_glock_nq_num(sdp, inum.no_addr,
+ &gfs2_inode_glops, LM_ST_EXCLUSIVE,
+ GL_SKIP, ghs + 1);
+ if (error) {
+ return ERR_PTR(error);
+ }
+
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
+ error = gfs2_glock_nq(ghs);
+ if (error) {
+ gfs2_glock_dq_uninit(ghs + 1);
+ return ERR_PTR(error);
+ }
+
+ error = create_ok(dip, name, mode);
+ if (error)
+ goto fail_gunlock2;
+ } else {
+ error = gfs2_glock_nq_num(sdp, inum.no_addr,
+ &gfs2_inode_glops, LM_ST_EXCLUSIVE,
+ GL_SKIP, ghs + 1);
+ if (error)
+ goto fail_gunlock;
+ }
+
+ error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
+ if (error)
+ goto fail_gunlock2;
+
+ inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
+ if (IS_ERR(inode))
+ goto fail_gunlock2;
+
+ error = gfs2_inode_refresh(GFS2_I(inode));
+ if (error)
+ goto fail_iput;
+
+ error = gfs2_acl_create(dip, GFS2_I(inode));
+ if (error)
+ goto fail_iput;
+
+ error = gfs2_security_init(dip, GFS2_I(inode));
+ if (error)
+ goto fail_iput;
+
+ error = link_dinode(dip, name, GFS2_I(inode));
+ if (error)
+ goto fail_iput;
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ return inode;
+
+fail_iput:
+ iput(inode);
+fail_gunlock2:
+ gfs2_glock_dq_uninit(ghs + 1);
+fail_gunlock:
+ gfs2_glock_dq(ghs);
+fail:
+ return ERR_PTR(error);
+}
+
+/**
+ * gfs2_rmdiri - Remove a directory
+ * @dip: The parent directory of the directory to be removed
+ * @name: The name of the directory to be removed
+ * @ip: The GFS2 inode of the directory to be removed
+ *
+ * Assumes Glocks on dip and ip are held
+ *
+ * Returns: errno
+ */
+
+int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+ struct gfs2_inode *ip)
+{
+ struct qstr dotname;
+ int error;
+
+ if (ip->i_di.di_entries != 2) {
+ if (gfs2_consist_inode(ip))
+ gfs2_dinode_print(&ip->i_di);
+ return -EIO;
+ }
+
+ error = gfs2_dir_del(dip, name);
+ if (error)
+ return error;
+
+ error = gfs2_change_nlink(dip, -1);
+ if (error)
+ return error;
+
+ gfs2_str2qstr(&dotname, ".");
+ error = gfs2_dir_del(ip, &dotname);
+ if (error)
+ return error;
+
+ gfs2_str2qstr(&dotname, "..");
+ error = gfs2_dir_del(ip, &dotname);
+ if (error)
+ return error;
+
+ error = gfs2_change_nlink(ip, -2);
+ if (error)
+ return error;
+
+ return error;
+}
+
+/*
+ * gfs2_unlink_ok - check to see that an inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+
+int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+ struct gfs2_inode *ip)
+{
+ struct gfs2_inum inum;
+ unsigned int type;
+ int error;
+
+ if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+ return -EPERM;
+
+ if ((dip->i_di.di_mode & S_ISVTX) &&
+ dip->i_di.di_uid != current->fsuid &&
+ ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
+ return -EPERM;
+
+ if (IS_APPEND(&dip->i_inode))
+ return -EPERM;
+
+ error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ if (error)
+ return error;
+
+ error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
+ if (error)
+ return error;
+
+ if (!gfs2_inum_equal(&inum, &ip->i_num))
+ return -ENOENT;
+
+ if (IF2DT(ip->i_di.di_mode) != type) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root and make sure we don't encounter @this.
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: errno
+ */
+
+int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+ struct inode *dir = &to->i_inode;
+ struct super_block *sb = dir->i_sb;
+ struct inode *tmp;
+ struct qstr dotdot;
+ int error = 0;
+
+ gfs2_str2qstr(&dotdot, "..");
+
+ igrab(dir);
+
+ for (;;) {
+ if (dir == &this->i_inode) {
+ error = -EINVAL;
+ break;
+ }
+ if (dir == sb->s_root->d_inode) {
+ error = 0;
+ break;
+ }
+
+ tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
+ if (IS_ERR(tmp)) {
+ error = PTR_ERR(tmp);
+ break;
+ }
+
+ iput(dir);
+ dir = tmp;
+ }
+
+ iput(dir);
+
+ return error;
+}
+
+/**
+ * gfs2_readlinki - return the contents of a symlink
+ * @ip: the symlink's inode
+ * @buf: a pointer to the buffer to be filled
+ * @len: a pointer to the length of @buf
+ *
+ * If @buf is too small, a piece of memory is kmalloc()ed and needs
+ * to be freed by the caller.
+ *
+ * Returns: errno
+ */
+
+int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+{
+ struct gfs2_holder i_gh;
+ struct buffer_head *dibh;
+ unsigned int x;
+ int error;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+ error = gfs2_glock_nq_atime(&i_gh);
+ if (error) {
+ gfs2_holder_uninit(&i_gh);
+ return error;
+ }
+
+ if (!ip->i_di.di_size) {
+ gfs2_consist_inode(ip);
+ error = -EIO;
+ goto out;
+ }
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+
+ x = ip->i_di.di_size + 1;
+ if (x > *len) {
+ *buf = kmalloc(x, GFP_KERNEL);
+ if (!*buf) {
+ error = -ENOMEM;
+ goto out_brelse;
+ }
+ }
+
+ memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
+ *len = x;
+
+out_brelse:
+ brelse(dibh);
+out:
+ gfs2_glock_dq_uninit(&i_gh);
+ return error;
+}
+
+/**
+ * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
+ * conditionally update the inode's atime
+ * @gh: the holder to acquire
+ *
+ * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap.
+ * The atime is updated if the difference between the current time and the
+ * inode's current atime is greater than an interval specified at mount.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_nq_atime(struct gfs2_holder *gh)
+{
+ struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_inode *ip = gl->gl_object;
+ s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
+ unsigned int state;
+ int flags;
+ int error;
+
+ if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
+ gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
+ gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
+ return -EINVAL;
+
+ state = gh->gh_state;
+ flags = gh->gh_flags;
+
+ error = gfs2_glock_nq(gh);
+ if (error)
+ return error;
+
+ if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
+ (sdp->sd_vfs->s_flags & MS_RDONLY))
+ return 0;
+
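+ /* If the on-disk atime is older than the tunable quantum, trade the
+ holder up to an exclusive lock so the dinode can be rewritten. */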
+ curtime = get_seconds();
+ if (curtime - ip->i_di.di_atime >= quantum) {
+ gfs2_glock_dq(gh);
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
+ gh);
+ error = gfs2_glock_nq(gh);
+ if (error)
+ return error;
+
+ /* Verify that atime hasn't been updated while we were
+ trying to get the exclusive lock. */
+
+ curtime = get_seconds();
+ if (curtime - ip->i_di.di_atime >= quantum) {
+ struct buffer_head *dibh;
+ struct gfs2_dinode *di;
+
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error == -EROFS)
+ return 0;
+ if (error)
+ goto fail;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto fail_end_trans;
+
+ ip->i_di.di_atime = curtime;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ di = (struct gfs2_dinode *)dibh->b_data;
+ di->di_atime = cpu_to_be64(ip->i_di.di_atime);
+ brelse(dibh);
+
+ gfs2_trans_end(sdp);
+ }
+
+ /* If someone else has asked for the glock,
+ unlock and let them have it. Then reacquire
+ in the original state. */
+ if (gfs2_glock_is_blocking(gl)) {
+ gfs2_glock_dq(gh);
+ gfs2_holder_reinit(state, flags, gh);
+ return gfs2_glock_nq(gh);
+ }
+ }
+
+ return 0;
+
+fail_end_trans:
+ gfs2_trans_end(sdp);
+fail:
+ gfs2_glock_dq(gh);
+ return error;
+}
+
+/**
+ * glock_compare_atime - Compare two struct gfs2_glock structures for sort
+ * @arg_a: the first structure
+ * @arg_b: the second structure
+ *
+ * Returns: 1 if A > B
+ * -1 if A < B
+ * 0 if A == B
+ */
+
+static int glock_compare_atime(const void *arg_a, const void *arg_b)
+{
+ const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+ const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+ const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+ const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+
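+ /* Sort by lock number; for holders of the same lock, a shared request
+ sorts after an exclusive or atime-updating request. */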
+ if (a->ln_number > b->ln_number)
+ return 1;
+ if (a->ln_number < b->ln_number)
+ return -1;
+ if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+ return 1;
+ if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
+ * atime update
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Returns: 0 on success (all glocks acquired),
+ * errno on failure (no glocks acquired)
+ */
+
+int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+ struct gfs2_holder **p;
+ unsigned int x;
+ int error = 0;
+
+ if (!num_gh)
+ return 0;
+
+ if (num_gh == 1) {
+ ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+ if (ghs->gh_flags & GL_ATIME)
+ error = gfs2_glock_nq_atime(ghs);
+ else
+ error = gfs2_glock_nq(ghs);
+ return error;
+ }
+
+ p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ for (x = 0; x < num_gh; x++)
+ p[x] = &ghs[x];
+
+ sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime, NULL);
+
+ for (x = 0; x < num_gh; x++) {
+ p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+
+ if (p[x]->gh_flags & GL_ATIME)
+ error = gfs2_glock_nq_atime(p[x]);
+ else
+ error = gfs2_glock_nq(p[x]);
+
+ if (error) {
+ while (x--)
+ gfs2_glock_dq(p[x]);
+ break;
+ }
+ }
+
+ kfree(p);
+ return error;
+}
+
+static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+{
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ error = inode_setattr(&ip->i_inode, attr);
+ gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+ gfs2_inode_attr_out(ip);
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+ return error;
+}
+
+/**
+ * gfs2_setattr_simple - change attributes of an inode
+ * @ip: the inode
+ * @attr: the attributes to change
+ *
+ * Called with a reference on the vnode.
+ *
+ * Returns: errno
+ */
+
+int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+{
+ int error;
+
+ if (current->journal_info)
+ return __gfs2_setattr_simple(ip, attr);
+
+ error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
+ if (error)
+ return error;
+
+ error = __gfs2_setattr_simple(ip, attr);
+ gfs2_trans_end(GFS2_SB(&ip->i_inode));
+ return error;
+}
+
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..f5d861760579
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __INODE_DOT_H__
+#define __INODE_DOT_H__
+
+static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
+{
+ return !ip->i_di.di_height;
+}
+
+static inline int gfs2_is_jdata(struct gfs2_inode *ip)
+{
+ return ip->i_di.di_flags & GFS2_DIF_JDATA;
+}
+
+static inline int gfs2_is_dir(struct gfs2_inode *ip)
+{
+ return S_ISDIR(ip->i_di.di_mode);
+}
+
+void gfs2_inode_attr_in(struct gfs2_inode *ip);
+void gfs2_inode_attr_out(struct gfs2_inode *ip);
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
+
+int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+int gfs2_dinode_dealloc(struct gfs2_inode *inode);
+int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+ int is_root, struct nameidata *nd);
+struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
+ unsigned int mode);
+int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+ struct gfs2_inode *ip);
+int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+ struct gfs2_inode *ip);
+int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
+int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
+
+int gfs2_glock_nq_atime(struct gfs2_holder *gh);
+int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
+
+int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+
+struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
+
+#endif /* __INODE_DOT_H__ */
+
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..effe4a337c1d
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "lm.h"
+#include "super.h"
+#include "util.h"
+
+/**
+ * gfs2_lm_mount - mount a locking protocol
+ * @sdp: the filesystem
+ * @silent: if 1, don't complain if the FS isn't a GFS2 fs
+ *
+ * Returns: errno
+ */
+
+int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
+{
+ char *proto = sdp->sd_proto_name;
+ char *table = sdp->sd_table_name;
+ int flags = 0;
+ int error;
+
+ if (sdp->sd_args.ar_spectator)
+ flags |= LM_MFLAG_SPECTATOR;
+
+ fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
+
+ error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
+ gfs2_glock_cb, sdp,
+ GFS2_MIN_LVB_SIZE, flags,
+ &sdp->sd_lockstruct, &sdp->sd_kobj);
+ if (error) {
+ fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
+ proto, table, sdp->sd_args.ar_hostdata);
+ goto out;
+ }
+
+ if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+ gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+ gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
+ GFS2_MIN_LVB_SIZE)) {
+ gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+ goto out;
+ }
+
+ if (sdp->sd_args.ar_spectator)
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
+ else
+ snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
+ sdp->sd_lockstruct.ls_jid);
+
+ fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+
+ if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
+ !sdp->sd_args.ar_ignore_local_fs) {
+ sdp->sd_args.ar_localflocks = 1;
+ sdp->sd_args.ar_localcaching = 1;
+ }
+
+out:
+ return error;
+}
+
+void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
+ sdp->sd_lockstruct.ls_lockspace);
+}
+
+void gfs2_lm_unmount(struct gfs2_sbd *sdp)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+}
+
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+{
+ va_list args;
+
+ if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ return 0;
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ fs_err(sdp, "about to withdraw from the cluster\n");
+ BUG_ON(sdp->sd_args.ar_debug);
+
+ fs_err(sdp, "waiting for outstanding I/O\n");
+
+ /* FIXME: suspend dm device so outstanding bios complete
+ and all further I/O requests fail */
+
+ fs_err(sdp, "telling LM to withdraw\n");
+ gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+ fs_err(sdp, "withdrawn\n");
+ dump_stack();
+
+ return -1;
+}
+
+int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ void **lockp)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
+ sdp->sd_lockstruct.ls_lockspace, name, lockp);
+ return error;
+}
+
+void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
+}
+
+unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state, unsigned int req_state,
+ unsigned int flags)
+{
+ int ret = 0;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+ req_state, flags);
+ return ret;
+}
+
+unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state)
+{
+ int ret = 0;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
+ return ret;
+}
+
+void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
+}
+
+int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
+ return error;
+}
+
+void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
+}
+
+int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
+ sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+ return error;
+}
+
+int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_plock(
+ sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
+ return error;
+}
+
+int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ int error = -EIO;
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = sdp->sd_lockstruct.ls_ops->lm_punlock(
+ sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+ return error;
+}
+
+void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int message)
+{
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ sdp->sd_lockstruct.ls_ops->lm_recovery_done(
+ sdp->sd_lockstruct.ls_lockspace, jid, message);
+}
+
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..21cdc30ee08c
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LM_DOT_H__
+#define __LM_DOT_H__
+
+struct gfs2_sbd;
+
+#define GFS2_MIN_LVB_SIZE 32
+
+int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
+void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+ __attribute__ ((format(printf, 2, 3)));
+int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ void **lockp);
+void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
+unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state, unsigned int req_state,
+ unsigned int flags);
+unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state);
+void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
+int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
+void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
+int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl);
+int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, int cmd, struct file_lock *fl);
+int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl);
+void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+ unsigned int message);
+
+#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..663fee728783
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/lm_interface.h>
+
+struct lmh_wrapper {
+ struct list_head lw_list;
+ const struct lm_lockops *lw_ops;
+};
+
+/* List of registered low-level locking protocols. A file system selects one
+ of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+
+static LIST_HEAD(lmh_list);
+static DEFINE_MUTEX(lmh_lock);
+
+/**
+ * gfs2_register_lockproto - Register a low-level locking protocol
+ * @proto: the protocol definition
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+int gfs2_register_lockproto(const struct lm_lockops *proto)
+{
+ struct lmh_wrapper *lw;
+
+ mutex_lock(&lmh_lock);
+
+ list_for_each_entry(lw, &lmh_list, lw_list) {
+ if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
+ mutex_unlock(&lmh_lock);
+ printk(KERN_INFO "GFS2: protocol %s already exists\n",
+ proto->lm_proto_name);
+ return -EEXIST;
+ }
+ }
+
+ lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
+ if (!lw) {
+ mutex_unlock(&lmh_lock);
+ return -ENOMEM;
+ }
+
+ lw->lw_ops = proto;
+ list_add(&lw->lw_list, &lmh_list);
+
+ mutex_unlock(&lmh_lock);
+
+ return 0;
+}
+
+/**
+ * gfs2_unregister_lockproto - Unregister a low-level locking protocol
+ * @proto: the protocol definition
+ *
+ */
+
+void gfs2_unregister_lockproto(const struct lm_lockops *proto)
+{
+ struct lmh_wrapper *lw;
+
+ mutex_lock(&lmh_lock);
+
+ list_for_each_entry(lw, &lmh_list, lw_list) {
+ if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
+ list_del(&lw->lw_list);
+ mutex_unlock(&lmh_lock);
+ kfree(lw);
+ return;
+ }
+ }
+
+ mutex_unlock(&lmh_lock);
+
+ printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
+ proto->lm_proto_name);
+}
+
+/**
+ * gfs2_mount_lockproto - Mount a lock protocol
+ * @proto_name - the name of the protocol
+ * @table_name - the name of the lock space
+ * @host_data - data specific to this host
+ * @cb - the callback to the code using the lock module
+ * @sdp - The GFS2 superblock
+ * @min_lvb_size - the minimum LVB size that the caller can deal with
+ * @flags - LM_MFLAG_*
+ * @lockstruct - a structure returned describing the mount
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj)
+{
+ struct lmh_wrapper *lw = NULL;
+ int try = 0;
+ int error, found;
+
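+ /* If the requested protocol is not registered yet, try once to load
+ its module and then search the list again. */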
+retry:
+ mutex_lock(&lmh_lock);
+
+ found = 0;
+ list_for_each_entry(lw, &lmh_list, lw_list) {
+ if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (!try && capable(CAP_SYS_MODULE)) {
+ try = 1;
+ mutex_unlock(&lmh_lock);
+ request_module(proto_name);
+ goto retry;
+ }
+ printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
+ error = -ENOENT;
+ goto out;
+ }
+
+ if (!try_module_get(lw->lw_ops->lm_owner)) {
+ try = 0;
+ mutex_unlock(&lmh_lock);
+ msleep(1000);
+ goto retry;
+ }
+
+ error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
+ min_lvb_size, flags, lockstruct, fskobj);
+ if (error)
+ module_put(lw->lw_ops->lm_owner);
+out:
+ mutex_unlock(&lmh_lock);
+ return error;
+}
+
+void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
+{
+ mutex_lock(&lmh_lock);
+ lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+ if (lockstruct->ls_ops->lm_owner)
+ module_put(lockstruct->ls_ops->lm_owner);
+ mutex_unlock(&lmh_lock);
+}
+
+/**
+ * gfs2_withdraw_lockproto - abnormally unmount a lock module
+ * @lockstruct: the lockstruct passed into mount
+ *
+ */
+
+void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
+{
+ mutex_lock(&lmh_lock);
+ lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
+ if (lockstruct->ls_ops->lm_owner)
+ module_put(lockstruct->ls_ops->lm_owner);
+ mutex_unlock(&lmh_lock);
+}
+
+EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
+EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
+
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..89b93b6b45cf
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
+lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
+
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..b167addf9fd1
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include "lock_dlm.h"
+
+static char junk_lvb[GDLM_LVB_SIZE];
+
+static void queue_complete(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+
+ clear_bit(LFL_ACTIVE, &lp->flags);
+
+ spin_lock(&ls->async_lock);
+ list_add_tail(&lp->clist, &ls->complete);
+ spin_unlock(&ls->async_lock);
+ wake_up(&ls->thread_wait);
+}
+
+static inline void gdlm_ast(void *astarg)
+{
+ queue_complete(astarg);
+}
+
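+/* Blocking callback from the DLM: queue the lock for the lock_dlm thread,
+ remembering only the highest mode that has been requested so far. */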
+static inline void gdlm_bast(void *astarg, int mode)
+{
+ struct gdlm_lock *lp = astarg;
+ struct gdlm_ls *ls = lp->ls;
+
+ if (!mode) {
+ printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ return;
+ }
+
+ spin_lock(&ls->async_lock);
+ if (!lp->bast_mode) {
+ list_add_tail(&lp->blist, &ls->blocking);
+ lp->bast_mode = mode;
+ } else if (lp->bast_mode < mode)
+ lp->bast_mode = mode;
+ spin_unlock(&ls->async_lock);
+ wake_up(&ls->thread_wait);
+}
+
+void gdlm_queue_delayed(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+
+ spin_lock(&ls->async_lock);
+ list_add_tail(&lp->delay_list, &ls->delayed);
+ spin_unlock(&ls->async_lock);
+}
+
+/* convert gfs lock-state to dlm lock-mode */
+
+static s16 make_mode(s16 lmstate)
+{
+ switch (lmstate) {
+ case LM_ST_UNLOCKED:
+ return DLM_LOCK_NL;
+ case LM_ST_EXCLUSIVE:
+ return DLM_LOCK_EX;
+ case LM_ST_DEFERRED:
+ return DLM_LOCK_CW;
+ case LM_ST_SHARED:
+ return DLM_LOCK_PR;
+ }
+ gdlm_assert(0, "unknown LM state %d", lmstate);
+ return -1;
+}
+
+/* convert dlm lock-mode to gfs lock-state */
+
+s16 gdlm_make_lmstate(s16 dlmmode)
+{
+ switch (dlmmode) {
+ case DLM_LOCK_IV:
+ case DLM_LOCK_NL:
+ return LM_ST_UNLOCKED;
+ case DLM_LOCK_EX:
+ return LM_ST_EXCLUSIVE;
+ case DLM_LOCK_CW:
+ return LM_ST_DEFERRED;
+ case DLM_LOCK_PR:
+ return LM_ST_SHARED;
+ }
+ gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+ return -1;
+}
+
+/* Verify agreement with GFS on the current lock state. NB: DLM_LOCK_NL and
+ DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
+
+static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
+{
+ s16 cur = make_mode(cur_state);
+ if (lp->cur != DLM_LOCK_IV)
+ gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
+}
+
+static inline unsigned int make_flags(struct gdlm_lock *lp,
+ unsigned int gfs_flags,
+ s16 cur, s16 req)
+{
+ unsigned int lkf = 0;
+
+ if (gfs_flags & LM_FLAG_TRY)
+ lkf |= DLM_LKF_NOQUEUE;
+
+ if (gfs_flags & LM_FLAG_TRY_1CB) {
+ lkf |= DLM_LKF_NOQUEUE;
+ lkf |= DLM_LKF_NOQUEUEBAST;
+ }
+
+ if (gfs_flags & LM_FLAG_PRIORITY) {
+ lkf |= DLM_LKF_NOORDER;
+ lkf |= DLM_LKF_HEADQUE;
+ }
+
+ if (gfs_flags & LM_FLAG_ANY) {
+ if (req == DLM_LOCK_PR)
+ lkf |= DLM_LKF_ALTCW;
+ else if (req == DLM_LOCK_CW)
+ lkf |= DLM_LKF_ALTPR;
+ }
+
+ if (lp->lksb.sb_lkid != 0) {
+ lkf |= DLM_LKF_CONVERT;
+
+ /* Conversion deadlock avoidance by DLM */
+
+ if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
+ !(lkf & DLM_LKF_NOQUEUE) &&
+ cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
+ lkf |= DLM_LKF_CONVDEADLK;
+ }
+
+ if (lp->lvb)
+ lkf |= DLM_LKF_VALBLK;
+
+ return lkf;
+}
+
+/* make_strname - convert GFS lock numbers to a string */
+
+static inline void make_strname(struct lm_lockname *lockname,
+ struct gdlm_strname *str)
+{
+ sprintf(str->name, "%8x%16llx", lockname->ln_type,
+ (unsigned long long)lockname->ln_number);
+ str->namelen = GDLM_STRNAME_BYTES;
+}
+
+static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
+ struct gdlm_lock **lpp)
+{
+ struct gdlm_lock *lp;
+
+ lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
+ if (!lp)
+ return -ENOMEM;
+
+ lp->lockname = *name;
+ lp->ls = ls;
+ lp->cur = DLM_LOCK_IV;
+ lp->lvb = NULL;
+ lp->hold_null = NULL;
+ init_completion(&lp->ast_wait);
+ INIT_LIST_HEAD(&lp->clist);
+ INIT_LIST_HEAD(&lp->blist);
+ INIT_LIST_HEAD(&lp->delay_list);
+
+ spin_lock(&ls->async_lock);
+ list_add(&lp->all_list, &ls->all_locks);
+ ls->all_locks_count++;
+ spin_unlock(&ls->async_lock);
+
+ *lpp = lp;
+ return 0;
+}
+
+void gdlm_delete_lp(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+
+ spin_lock(&ls->async_lock);
+ if (!list_empty(&lp->clist))
+ list_del_init(&lp->clist);
+ if (!list_empty(&lp->blist))
+ list_del_init(&lp->blist);
+ if (!list_empty(&lp->delay_list))
+ list_del_init(&lp->delay_list);
+ gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ list_del_init(&lp->all_list);
+ ls->all_locks_count--;
+ spin_unlock(&ls->async_lock);
+
+ kfree(lp);
+}
+
+int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
+ void **lockp)
+{
+ struct gdlm_lock *lp;
+ int error;
+
+ error = gdlm_create_lp(lockspace, name, &lp);
+
+ *lockp = lp;
+ return error;
+}
+
+void gdlm_put_lock(void *lock)
+{
+ gdlm_delete_lp(lock);
+}
+
+unsigned int gdlm_do_lock(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+ struct gdlm_strname str;
+ int error, bast = 1;
+
+ /*
+ * While recovery is in progress, delay lock requests and submit them
+ * once recovery is done. Requests needed for recovery (NOEXP) and
+ * unlocks are allowed through.
+ */
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
+ gdlm_queue_delayed(lp);
+ return LM_OUT_ASYNC;
+ }
+
+ /*
+ * Submit the actual lock request.
+ */
+
+ if (test_bit(LFL_NOBAST, &lp->flags))
+ bast = 0;
+
+ make_strname(&lp->lockname, &str);
+
+ set_bit(LFL_ACTIVE, &lp->flags);
+
+ log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
+ lp->cur, lp->req, lp->lkf);
+
+ error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
+ str.name, str.namelen, 0, gdlm_ast, lp,
+ bast ? gdlm_bast : NULL);
+
+ if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
+ lp->lksb.sb_status = -EAGAIN;
+ queue_complete(lp);
+ error = 0;
+ }
+
+ if (error) {
+ log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
+ "flags=%lx", ls->fsname, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number, error,
+ lp->cur, lp->req, lp->lkf, lp->flags);
+ return LM_OUT_ERROR;
+ }
+ return LM_OUT_ASYNC;
+}
+
+static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+ unsigned int lkf = 0;
+ int error;
+
+ set_bit(LFL_DLM_UNLOCK, &lp->flags);
+ set_bit(LFL_ACTIVE, &lp->flags);
+
+ if (lp->lvb)
+ lkf = DLM_LKF_VALBLK;
+
+ log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->lksb.sb_lkid, lp->cur, lkf);
+
+ error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
+
+ if (error) {
+ log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
+ "flags=%lx", ls->fsname, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number, error,
+ lp->cur, lp->req, lp->lkf, lp->flags);
+ return LM_OUT_ERROR;
+ }
+ return LM_OUT_ASYNC;
+}
+
+unsigned int gdlm_lock(void *lock, unsigned int cur_state,
+ unsigned int req_state, unsigned int flags)
+{
+ struct gdlm_lock *lp = lock;
+
+ clear_bit(LFL_DLM_CANCEL, &lp->flags);
+ if (flags & LM_FLAG_NOEXP)
+ set_bit(LFL_NOBLOCK, &lp->flags);
+
+ check_cur_state(lp, cur_state);
+ lp->req = make_mode(req_state);
+ lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
+
+ return gdlm_do_lock(lp);
+}
+
+unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
+{
+ struct gdlm_lock *lp = lock;
+
+ clear_bit(LFL_DLM_CANCEL, &lp->flags);
+ if (lp->cur == DLM_LOCK_IV)
+ return 0;
+ return gdlm_do_unlock(lp);
+}
+
+void gdlm_cancel(void *lock)
+{
+ struct gdlm_lock *lp = lock;
+ struct gdlm_ls *ls = lp->ls;
+ int error, delay_list = 0;
+
+ if (test_bit(LFL_DLM_CANCEL, &lp->flags))
+ return;
+
+ log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number, lp->flags);
+
+ spin_lock(&ls->async_lock);
+ if (!list_empty(&lp->delay_list)) {
+ list_del_init(&lp->delay_list);
+ delay_list = 1;
+ }
+ spin_unlock(&ls->async_lock);
+
+ if (delay_list) {
+ set_bit(LFL_CANCEL, &lp->flags);
+ set_bit(LFL_ACTIVE, &lp->flags);
+ queue_complete(lp);
+ return;
+ }
+
+ if (!test_bit(LFL_ACTIVE, &lp->flags) ||
+ test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+ log_info("gdlm_cancel skip %x,%llx flags %lx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number, lp->flags);
+ return;
+ }
+
+ /* the lock is blocked in the dlm */
+
+ set_bit(LFL_DLM_CANCEL, &lp->flags);
+ set_bit(LFL_ACTIVE, &lp->flags);
+
+ error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
+ NULL, lp);
+
+ log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number, lp->flags);
+
+ if (error == -EBUSY)
+ clear_bit(LFL_DLM_CANCEL, &lp->flags);
+}
+
+static int gdlm_add_lvb(struct gdlm_lock *lp)
+{
+ char *lvb;
+
+ lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
+ if (!lvb)
+ return -ENOMEM;
+
+ lp->lksb.sb_lvbptr = lvb;
+ lp->lvb = lvb;
+ return 0;
+}
+
+static void gdlm_del_lvb(struct gdlm_lock *lp)
+{
+ kfree(lp->lvb);
+ lp->lvb = NULL;
+ lp->lksb.sb_lvbptr = NULL;
+}
+
+/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
+ the completion) because gfs won't call hold_lvb() during a callback (from
+ the context of a lock_dlm thread). */
+
+static int hold_null_lock(struct gdlm_lock *lp)
+{
+ struct gdlm_lock *lpn = NULL;
+ int error;
+
+ if (lp->hold_null) {
+ printk(KERN_INFO "lock_dlm: lvb already held\n");
+ return 0;
+ }
+
+ error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
+ if (error)
+ goto out;
+
+ lpn->lksb.sb_lvbptr = junk_lvb;
+ lpn->lvb = junk_lvb;
+
+ lpn->req = DLM_LOCK_NL;
+ lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
+ set_bit(LFL_NOBAST, &lpn->flags);
+ set_bit(LFL_INLOCK, &lpn->flags);
+
+ init_completion(&lpn->ast_wait);
+ gdlm_do_lock(lpn);
+ wait_for_completion(&lpn->ast_wait);
+ error = lpn->lksb.sb_status;
+ if (error) {
+ printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
+ error);
+ gdlm_delete_lp(lpn);
+ lpn = NULL;
+ }
+out:
+ lp->hold_null = lpn;
+ return error;
+}
+
+/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
+ the completion) because gfs may call unhold_lvb() during a callback (from
+ the context of a lock_dlm thread) which could cause a deadlock since the
+ other lock_dlm thread could be engaged in recovery. */
+
+static void unhold_null_lock(struct gdlm_lock *lp)
+{
+ struct gdlm_lock *lpn = lp->hold_null;
+
+ gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ lpn->lksb.sb_lvbptr = NULL;
+ lpn->lvb = NULL;
+ set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
+ gdlm_do_unlock(lpn);
+ lp->hold_null = NULL;
+}
+
+/* Acquire a NL lock because gfs requires the value block to remain
+ intact on the resource while the lvb is "held" even if it's holding no locks
+ on the resource. */
+
+int gdlm_hold_lvb(void *lock, char **lvbp)
+{
+ struct gdlm_lock *lp = lock;
+ int error;
+
+ error = gdlm_add_lvb(lp);
+ if (error)
+ return error;
+
+ *lvbp = lp->lvb;
+
+ error = hold_null_lock(lp);
+ if (error)
+ gdlm_del_lvb(lp);
+
+ return error;
+}
+
+void gdlm_unhold_lvb(void *lock, char *lvb)
+{
+ struct gdlm_lock *lp = lock;
+
+ unhold_null_lock(lp);
+ gdlm_del_lvb(lp);
+}
+
+void gdlm_submit_delayed(struct gdlm_ls *ls)
+{
+ struct gdlm_lock *lp, *safe;
+
+ spin_lock(&ls->async_lock);
+ list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
+ list_del_init(&lp->delay_list);
+ list_add_tail(&lp->delay_list, &ls->submit);
+ }
+ spin_unlock(&ls->async_lock);
+ wake_up(&ls->thread_wait);
+}
+
+int gdlm_release_all_locks(struct gdlm_ls *ls)
+{
+ struct gdlm_lock *lp, *safe;
+ int count = 0;
+
+ spin_lock(&ls->async_lock);
+ list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
+ list_del_init(&lp->all_list);
+
+ if (lp->lvb && lp->lvb != junk_lvb)
+ kfree(lp->lvb);
+ kfree(lp);
+ count++;
+ }
+ spin_unlock(&ls->async_lock);
+
+ return count;
+}
+
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..33af707a4d3f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef LOCK_DLM_DOT_H
+#define LOCK_DLM_DOT_H
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/socket.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/fcntl.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+
+#include <linux/dlm.h>
+#include <linux/lm_interface.h>
+
+/*
+ * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
+ * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module
+ * as "lock_dlm".
+ */
+
+#define GDLM_STRNAME_BYTES 24
+#define GDLM_LVB_SIZE 32
+#define GDLM_DROP_COUNT 50000
+#define GDLM_DROP_PERIOD 60
+#define GDLM_NAME_LEN 128
+
+/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
+ We sprintf these numbers into a 24 byte string of hex values to make them
+ human-readable (to make debugging simpler). */
+
+struct gdlm_strname {
+ unsigned char name[GDLM_STRNAME_BYTES];
+ unsigned short namelen;
+};
+
+enum {
+ DFL_BLOCK_LOCKS = 0,
+ DFL_SPECTATOR = 1,
+ DFL_WITHDRAW = 2,
+};
+
+struct gdlm_ls {
+ u32 id;
+ int jid;
+ int first;
+ int first_done;
+ unsigned long flags;
+ struct kobject kobj;
+ char clustername[GDLM_NAME_LEN];
+ char fsname[GDLM_NAME_LEN];
+ int fsflags;
+ dlm_lockspace_t *dlm_lockspace;
+ lm_callback_t fscb;
+ struct gfs2_sbd *sdp;
+ int recover_jid;
+ int recover_jid_done;
+ int recover_jid_status;
+ spinlock_t async_lock;
+ struct list_head complete;
+ struct list_head blocking;
+ struct list_head delayed;
+ struct list_head submit;
+ struct list_head all_locks;
+ u32 all_locks_count;
+ wait_queue_head_t wait_control;
+ struct task_struct *thread1;
+ struct task_struct *thread2;
+ wait_queue_head_t thread_wait;
+ unsigned long drop_time;
+ int drop_locks_count;
+ int drop_locks_period;
+};
+
+enum {
+ LFL_NOBLOCK = 0,
+ LFL_NOCACHE = 1,
+ LFL_DLM_UNLOCK = 2,
+ LFL_DLM_CANCEL = 3,
+ LFL_SYNC_LVB = 4,
+ LFL_FORCE_PROMOTE = 5,
+ LFL_REREQUEST = 6,
+ LFL_ACTIVE = 7,
+ LFL_INLOCK = 8,
+ LFL_CANCEL = 9,
+ LFL_NOBAST = 10,
+ LFL_HEADQUE = 11,
+ LFL_UNLOCK_DELETE = 12,
+};
+
+struct gdlm_lock {
+ struct gdlm_ls *ls;
+ struct lm_lockname lockname;
+ char *lvb;
+ struct dlm_lksb lksb;
+
+ s16 cur;
+ s16 req;
+ s16 prev_req;
+ u32 lkf; /* dlm flags DLM_LKF_ */
+ unsigned long flags; /* lock_dlm flags LFL_ */
+
+ int bast_mode; /* protected by async_lock */
+ struct completion ast_wait;
+
+ struct list_head clist; /* complete */
+ struct list_head blist; /* blocking */
+ struct list_head delay_list; /* delayed */
+ struct list_head all_list; /* all locks for the fs */
+ struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
+};
+
+#define gdlm_assert(assertion, fmt, args...) \
+do { \
+ if (unlikely(!(assertion))) { \
+ printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
+ "lock_dlm: " fmt "\n", \
+ #assertion, ##args); \
+ BUG(); \
+ } \
+} while (0)
+
+#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
+#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg)
+#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
+#ifdef LOCK_DLM_LOG_DEBUG
+#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
+#else
+#define log_debug(fmt, arg...)
+#endif
+
+/* sysfs.c */
+
+int gdlm_sysfs_init(void);
+void gdlm_sysfs_exit(void);
+int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
+void gdlm_kobject_release(struct gdlm_ls *);
+
+/* thread.c */
+
+int gdlm_init_threads(struct gdlm_ls *);
+void gdlm_release_threads(struct gdlm_ls *);
+
+/* lock.c */
+
+s16 gdlm_make_lmstate(s16);
+void gdlm_queue_delayed(struct gdlm_lock *);
+void gdlm_submit_delayed(struct gdlm_ls *);
+int gdlm_release_all_locks(struct gdlm_ls *);
+void gdlm_delete_lp(struct gdlm_lock *);
+unsigned int gdlm_do_lock(struct gdlm_lock *);
+
+int gdlm_get_lock(void *, struct lm_lockname *, void **);
+void gdlm_put_lock(void *);
+unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
+unsigned int gdlm_unlock(void *, unsigned int);
+void gdlm_cancel(void *);
+int gdlm_hold_lvb(void *, char **);
+void gdlm_unhold_lvb(void *, char *);
+
+/* plock.c */
+
+int gdlm_plock_init(void);
+void gdlm_plock_exit(void);
+int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
+ struct file_lock *);
+int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
+ struct file_lock *);
+int gdlm_punlock(void *, struct lm_lockname *, struct file *,
+ struct file_lock *);
+#endif
+
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..2194b1d5b5ec
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/init.h>
+
+#include "lock_dlm.h"
+
+extern int gdlm_drop_count;
+extern int gdlm_drop_period;
+
+extern struct lm_lockops gdlm_ops;
+
+static int __init init_lock_dlm(void)
+{
+ int error;
+
+ error = gfs2_register_lockproto(&gdlm_ops);
+ if (error) {
+ printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n",
+ error);
+ return error;
+ }
+
+ error = gdlm_sysfs_init();
+ if (error) {
+ gfs2_unregister_lockproto(&gdlm_ops);
+ return error;
+ }
+
+ error = gdlm_plock_init();
+ if (error) {
+ gdlm_sysfs_exit();
+ gfs2_unregister_lockproto(&gdlm_ops);
+ return error;
+ }
+
+ gdlm_drop_count = GDLM_DROP_COUNT;
+ gdlm_drop_period = GDLM_DROP_PERIOD;
+
+ printk(KERN_INFO
+ "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
+ return 0;
+}
+
+static void __exit exit_lock_dlm(void)
+{
+ gdlm_plock_exit();
+ gdlm_sysfs_exit();
+ gfs2_unregister_lockproto(&gdlm_ops);
+}
+
+module_init(init_lock_dlm);
+module_exit(exit_lock_dlm);
+
+MODULE_DESCRIPTION("GFS DLM Locking Module");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..cdd1694e889b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include "lock_dlm.h"
+
+int gdlm_drop_count;
+int gdlm_drop_period;
+const struct lm_lockops gdlm_ops;
+
+
+static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
+ int flags, char *table_name)
+{
+ struct gdlm_ls *ls;
+ char buf[256], *p;
+
+ ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
+ if (!ls)
+ return NULL;
+
+ ls->drop_locks_count = gdlm_drop_count;
+ ls->drop_locks_period = gdlm_drop_period;
+ ls->fscb = cb;
+ ls->sdp = sdp;
+ ls->fsflags = flags;
+ spin_lock_init(&ls->async_lock);
+ INIT_LIST_HEAD(&ls->complete);
+ INIT_LIST_HEAD(&ls->blocking);
+ INIT_LIST_HEAD(&ls->delayed);
+ INIT_LIST_HEAD(&ls->submit);
+ INIT_LIST_HEAD(&ls->all_locks);
+ init_waitqueue_head(&ls->thread_wait);
+ init_waitqueue_head(&ls->wait_control);
+ ls->thread1 = NULL;
+ ls->thread2 = NULL;
+ ls->drop_time = jiffies;
+ ls->jid = -1;
+
+ strncpy(buf, table_name, 256);
+ buf[255] = '\0';
+
+ p = strchr(buf, ':');
+ if (!p) {
+ log_info("invalid table_name \"%s\"", table_name);
+ kfree(ls);
+ return NULL;
+ }
+ *p = '\0';
+ p++;
+
+ strncpy(ls->clustername, buf, GDLM_NAME_LEN);
+ strncpy(ls->fsname, p, GDLM_NAME_LEN);
+
+ return ls;
+}
+
+static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
+{
+ char data[256];
+ char *options, *x, *y;
+ int error = 0;
+
+ memset(data, 0, 256);
+ strncpy(data, data_arg, 255);
+
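+ /* host_data is a colon-separated list of key=value options
+ (jid, first, id, nodir) supplied at mount time. */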
+ for (options = data; (x = strsep(&options, ":")); ) {
+ if (!*x)
+ continue;
+
+ y = strchr(x, '=');
+ if (y)
+ *y++ = 0;
+
+ if (!strcmp(x, "jid")) {
+ if (!y) {
+ log_error("need argument to jid");
+ error = -EINVAL;
+ break;
+ }
+ sscanf(y, "%u", &ls->jid);
+
+ } else if (!strcmp(x, "first")) {
+ if (!y) {
+ log_error("need argument to first");
+ error = -EINVAL;
+ break;
+ }
+ sscanf(y, "%u", &ls->first);
+
+ } else if (!strcmp(x, "id")) {
+ if (!y) {
+ log_error("need argument to id");
+ error = -EINVAL;
+ break;
+ }
+ sscanf(y, "%u", &ls->id);
+
+ } else if (!strcmp(x, "nodir")) {
+ if (!y) {
+ log_error("need argument to nodir");
+ error = -EINVAL;
+ break;
+ }
+ sscanf(y, "%u", nodir);
+
+ } else {
+ log_error("unkonwn option: %s", x);
+ error = -EINVAL;
+ break;
+ }
+ }
+
+ return error;
+}
+
+static int gdlm_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj)
+{
+ struct gdlm_ls *ls;
+ int error = -ENOMEM, nodir = 0;
+
+ if (min_lvb_size > GDLM_LVB_SIZE)
+ goto out;
+
+ ls = init_gdlm(cb, cb_data, flags, table_name);
+ if (!ls)
+ goto out;
+
+ error = make_args(ls, host_data, &nodir);
+ if (error)
+ goto out_free;
+
+ error = gdlm_init_threads(ls);
+ if (error)
+ goto out_free;
+
+ error = gdlm_kobject_setup(ls, fskobj);
+ if (error)
+ goto out_thread;
+
+ error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
+ &ls->dlm_lockspace,
+ nodir ? DLM_LSFL_NODIR : 0,
+ GDLM_LVB_SIZE);
+ if (error) {
+ log_error("dlm_new_lockspace error %d", error);
+ goto out_kobj;
+ }
+
+ lockstruct->ls_jid = ls->jid;
+ lockstruct->ls_first = ls->first;
+ lockstruct->ls_lockspace = ls;
+ lockstruct->ls_ops = &gdlm_ops;
+ lockstruct->ls_flags = 0;
+ lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
+ return 0;
+
+out_kobj:
+ gdlm_kobject_release(ls);
+out_thread:
+ gdlm_release_threads(ls);
+out_free:
+ kfree(ls);
+out:
+ return error;
+}
+
+static void gdlm_unmount(void *lockspace)
+{
+ struct gdlm_ls *ls = lockspace;
+ int rv;
+
+ log_debug("unmount flags %lx", ls->flags);
+
+ /* FIXME: serialize unmount and withdraw in case they
+ happen at once. Also, if unmount follows withdraw,
+ wait for withdraw to finish. */
+
+ if (test_bit(DFL_WITHDRAW, &ls->flags))
+ goto out;
+
+ gdlm_kobject_release(ls);
+ dlm_release_lockspace(ls->dlm_lockspace, 2);
+ gdlm_release_threads(ls);
+ rv = gdlm_release_all_locks(ls);
+ if (rv)
+ log_info("gdlm_unmount: %d stray locks freed", rv);
+out:
+ kfree(ls);
+}
+
+static void gdlm_recovery_done(void *lockspace, unsigned int jid,
+ unsigned int message)
+{
+ struct gdlm_ls *ls = lockspace;
+ ls->recover_jid_done = jid;
+ ls->recover_jid_status = message;
+ kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+}
+
+static void gdlm_others_may_mount(void *lockspace)
+{
+ struct gdlm_ls *ls = lockspace;
+ ls->first_done = 1;
+ kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+}
+
+/* Userspace gets the offline uevent, blocks new gfs locks on
+ other mounters, and lets us know (sets WITHDRAW flag). Then,
+ userspace leaves the mount group while we leave the lockspace. */
+
+static void gdlm_withdraw(void *lockspace)
+{
+ struct gdlm_ls *ls = lockspace;
+
+ kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
+
+ wait_event_interruptible(ls->wait_control,
+ test_bit(DFL_WITHDRAW, &ls->flags));
+
+ dlm_release_lockspace(ls->dlm_lockspace, 2);
+ gdlm_release_threads(ls);
+ gdlm_release_all_locks(ls);
+ gdlm_kobject_release(ls);
+}
+
+const struct lm_lockops gdlm_ops = {
+ .lm_proto_name = "lock_dlm",
+ .lm_mount = gdlm_mount,
+ .lm_others_may_mount = gdlm_others_may_mount,
+ .lm_unmount = gdlm_unmount,
+ .lm_withdraw = gdlm_withdraw,
+ .lm_get_lock = gdlm_get_lock,
+ .lm_put_lock = gdlm_put_lock,
+ .lm_lock = gdlm_lock,
+ .lm_unlock = gdlm_unlock,
+ .lm_plock = gdlm_plock,
+ .lm_punlock = gdlm_punlock,
+ .lm_plock_get = gdlm_plock_get,
+ .lm_cancel = gdlm_cancel,
+ .lm_hold_lvb = gdlm_hold_lvb,
+ .lm_unhold_lvb = gdlm_unhold_lvb,
+ .lm_recovery_done = gdlm_recovery_done,
+ .lm_owner = THIS_MODULE,
+};
+
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..7365aec9511b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/lock_dlm_plock.h>
+
+#include "lock_dlm.h"
+
+
+static spinlock_t ops_lock;
+static struct list_head send_list;
+static struct list_head recv_list;
+static wait_queue_head_t send_wq;
+static wait_queue_head_t recv_wq;
+
+struct plock_op {
+ struct list_head list;
+ int done;
+ struct gdlm_plock_info info;
+};
+
+static inline void set_version(struct gdlm_plock_info *info)
+{
+ info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
+ info->version[1] = GDLM_PLOCK_VERSION_MINOR;
+ info->version[2] = GDLM_PLOCK_VERSION_PATCH;
+}
+
+static int check_version(struct gdlm_plock_info *info)
+{
+ if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
+ (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
+ log_error("plock device version mismatch: "
+ "kernel (%u.%u.%u), user (%u.%u.%u)",
+ GDLM_PLOCK_VERSION_MAJOR,
+ GDLM_PLOCK_VERSION_MINOR,
+ GDLM_PLOCK_VERSION_PATCH,
+ info->version[0],
+ info->version[1],
+ info->version[2]);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void send_op(struct plock_op *op)
+{
+ set_version(&op->info);
+ INIT_LIST_HEAD(&op->list);
+ spin_lock(&ops_lock);
+ list_add_tail(&op->list, &send_list);
+ spin_unlock(&ops_lock);
+ wake_up(&send_wq);
+}
+
+int gdlm_plock(void *lockspace, struct lm_lockname *name,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ struct gdlm_ls *ls = lockspace;
+ struct plock_op *op;
+ int rv;
+
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ op->info.optype = GDLM_PLOCK_OP_LOCK;
+ op->info.pid = fl->fl_pid;
+ op->info.ex = (fl->fl_type == F_WRLCK);
+ op->info.wait = IS_SETLKW(cmd);
+ op->info.fsid = ls->id;
+ op->info.number = name->ln_number;
+ op->info.start = fl->fl_start;
+ op->info.end = fl->fl_end;
+ op->info.owner = (__u64)(long) fl->fl_owner;
+
+ send_op(op);
+ wait_event(recv_wq, (op->done != 0));
+
+ spin_lock(&ops_lock);
+ if (!list_empty(&op->list)) {
+ printk(KERN_INFO "plock op on list\n");
+ list_del(&op->list);
+ }
+ spin_unlock(&ops_lock);
+
+ rv = op->info.rv;
+
+ if (!rv) {
+ if (posix_lock_file_wait(file, fl) < 0)
+ log_error("gdlm_plock: vfs lock error %x,%llx",
+ name->ln_type,
+ (unsigned long long)name->ln_number);
+ }
+
+ kfree(op);
+ return rv;
+}
+
+int gdlm_punlock(void *lockspace, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ struct gdlm_ls *ls = lockspace;
+ struct plock_op *op;
+ int rv;
+
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ if (posix_lock_file_wait(file, fl) < 0)
+ log_error("gdlm_punlock: vfs unlock error %x,%llx",
+ name->ln_type, (unsigned long long)name->ln_number);
+
+ op->info.optype = GDLM_PLOCK_OP_UNLOCK;
+ op->info.pid = fl->fl_pid;
+ op->info.fsid = ls->id;
+ op->info.number = name->ln_number;
+ op->info.start = fl->fl_start;
+ op->info.end = fl->fl_end;
+ op->info.owner = (__u64)(long) fl->fl_owner;
+
+ send_op(op);
+ wait_event(recv_wq, (op->done != 0));
+
+ spin_lock(&ops_lock);
+ if (!list_empty(&op->list)) {
+ printk(KERN_INFO "punlock op on list\n");
+ list_del(&op->list);
+ }
+ spin_unlock(&ops_lock);
+
+ rv = op->info.rv;
+
+ kfree(op);
+ return rv;
+}
+
+int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ struct gdlm_ls *ls = lockspace;
+ struct plock_op *op;
+ int rv;
+
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op)
+ return -ENOMEM;
+
+ op->info.optype = GDLM_PLOCK_OP_GET;
+ op->info.pid = fl->fl_pid;
+ op->info.ex = (fl->fl_type == F_WRLCK);
+ op->info.fsid = ls->id;
+ op->info.number = name->ln_number;
+ op->info.start = fl->fl_start;
+ op->info.end = fl->fl_end;
+
+ send_op(op);
+ wait_event(recv_wq, (op->done != 0));
+
+ spin_lock(&ops_lock);
+ if (!list_empty(&op->list)) {
+ printk(KERN_INFO "plock_get op on list\n");
+ list_del(&op->list);
+ }
+ spin_unlock(&ops_lock);
+
+ rv = op->info.rv;
+
+ if (rv == 0)
+ fl->fl_type = F_UNLCK;
+ else if (rv > 0) {
+ fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+ fl->fl_pid = op->info.pid;
+ fl->fl_start = op->info.start;
+ fl->fl_end = op->info.end;
+ }
+
+ kfree(op);
+ return rv;
+}
+
+/* a read copies out one plock request from the send list */
+static ssize_t dev_read(struct file *file, char __user *u, size_t count,
+ loff_t *ppos)
+{
+ struct gdlm_plock_info info;
+ struct plock_op *op = NULL;
+
+ if (count < sizeof(info))
+ return -EINVAL;
+
+ spin_lock(&ops_lock);
+ if (!list_empty(&send_list)) {
+ op = list_entry(send_list.next, struct plock_op, list);
+ list_move(&op->list, &recv_list);
+ memcpy(&info, &op->info, sizeof(info));
+ }
+ spin_unlock(&ops_lock);
+
+ if (!op)
+ return -EAGAIN;
+
+ if (copy_to_user(u, &info, sizeof(info)))
+ return -EFAULT;
+ return sizeof(info);
+}
+
+/* a write copies in one plock result that should match a plock_op
+ on the recv list */
+static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
+ loff_t *ppos)
+{
+ struct gdlm_plock_info info;
+ struct plock_op *op;
+ int found = 0;
+
+ if (count != sizeof(info))
+ return -EINVAL;
+
+ if (copy_from_user(&info, u, sizeof(info)))
+ return -EFAULT;
+
+ if (check_version(&info))
+ return -EINVAL;
+
+ spin_lock(&ops_lock);
+ list_for_each_entry(op, &recv_list, list) {
+ if (op->info.fsid == info.fsid && op->info.number == info.number &&
+ op->info.owner == info.owner) {
+ list_del_init(&op->list);
+ found = 1;
+ op->done = 1;
+ memcpy(&op->info, &info, sizeof(info));
+ break;
+ }
+ }
+ spin_unlock(&ops_lock);
+
+ if (found)
+ wake_up(&recv_wq);
+ else
+ printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
+ (unsigned long long)info.number);
+ return count;
+}
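+
+/* Hedged usage sketch of the userspace side of this device (the device
+   path and the resolve_plock() helper are illustrative, not part of this
+   patch):
+
+	struct gdlm_plock_info info;
+	int fd = open("/dev/" GDLM_PLOCK_MISC_NAME, O_RDWR);
+
+	while (read(fd, &info, sizeof(info)) == sizeof(info)) {
+		info.rv = resolve_plock(&info);	// decide grant/deny cluster-wide
+		write(fd, &info, sizeof(info));
+	}
+
+   Each read() pops one request from send_list; each write() is matched
+   back to the waiting op on recv_list by (fsid, number, owner). */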
+
+static unsigned int dev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &send_wq, wait);
+
+ spin_lock(&ops_lock);
+ if (!list_empty(&send_list)) {
+ spin_unlock(&ops_lock);
+ return POLLIN | POLLRDNORM;
+ }
+ spin_unlock(&ops_lock);
+ return 0;
+}
+
+static struct file_operations dev_fops = {
+ .read = dev_read,
+ .write = dev_write,
+ .poll = dev_poll,
+ .owner = THIS_MODULE
+};
+
+static struct miscdevice plock_dev_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = GDLM_PLOCK_MISC_NAME,
+ .fops = &dev_fops
+};
+
+int gdlm_plock_init(void)
+{
+ int rv;
+
+ spin_lock_init(&ops_lock);
+ INIT_LIST_HEAD(&send_list);
+ INIT_LIST_HEAD(&recv_list);
+ init_waitqueue_head(&send_wq);
+ init_waitqueue_head(&recv_wq);
+
+ rv = misc_register(&plock_dev_misc);
+ if (rv)
+ printk(KERN_INFO "gdlm_plock_init: misc_register failed %d",
+ rv);
+ return rv;
+}
+
+void gdlm_plock_exit(void)
+{
+ if (misc_deregister(&plock_dev_misc) < 0)
+ printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed");
+}
+
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..29ae06f94944
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/ctype.h>
+#include <linux/stat.h>
+
+#include "lock_dlm.h"
+
+extern const struct lm_lockops gdlm_ops;
+
+static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
+}
+
+static ssize_t block_show(struct gdlm_ls *ls, char *buf)
+{
+ ssize_t ret;
+ int val = 0;
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
+ val = 1;
+ ret = sprintf(buf, "%d\n", val);
+ return ret;
+}
+
+static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+ ssize_t ret = len;
+ int val;
+
+ val = simple_strtol(buf, NULL, 0);
+
+ if (val == 1)
+ set_bit(DFL_BLOCK_LOCKS, &ls->flags);
+ else if (val == 0) {
+ clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
+ gdlm_submit_delayed(ls);
+ } else {
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
+{
+ ssize_t ret;
+ int val = 0;
+
+ if (test_bit(DFL_WITHDRAW, &ls->flags))
+ val = 1;
+ ret = sprintf(buf, "%d\n", val);
+ return ret;
+}
+
+static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+ ssize_t ret = len;
+ int val;
+
+ val = simple_strtol(buf, NULL, 0);
+
+ if (val == 1)
+ set_bit(DFL_WITHDRAW, &ls->flags);
+ else
+ ret = -EINVAL;
+ wake_up(&ls->wait_control);
+ return ret;
+}
+
+static ssize_t id_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%u\n", ls->id);
+}
+
+static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%d\n", ls->jid);
+}
+
+static ssize_t first_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%d\n", ls->first);
+}
+
+static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%d\n", ls->first_done);
+}
+
+static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%d\n", ls->recover_jid);
+}
+
+static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+ ls->recover_jid = simple_strtol(buf, NULL, 0);
+ ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
+ return len;
+}
+
+static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%d\n", ls->recover_jid_done);
+}
+
+static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
+{
+ return sprintf(buf, "%d\n", ls->recover_jid_status);
+}
+
+struct gdlm_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct gdlm_ls *, char *);
+ ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
+};
+
+#define GDLM_ATTR(_name,_mode,_show,_store) \
+static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+
+GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
+GDLM_ATTR(block, 0644, block_show, block_store);
+GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
+GDLM_ATTR(id, 0444, id_show, NULL);
+GDLM_ATTR(jid, 0444, jid_show, NULL);
+GDLM_ATTR(first, 0444, first_show, NULL);
+GDLM_ATTR(first_done, 0444, first_done_show, NULL);
+GDLM_ATTR(recover, 0644, recover_show, recover_store);
+GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+
+static struct attribute *gdlm_attrs[] = {
+ &gdlm_attr_proto_name.attr,
+ &gdlm_attr_block.attr,
+ &gdlm_attr_withdraw.attr,
+ &gdlm_attr_id.attr,
+ &gdlm_attr_jid.attr,
+ &gdlm_attr_first.attr,
+ &gdlm_attr_first_done.attr,
+ &gdlm_attr_recover.attr,
+ &gdlm_attr_recover_done.attr,
+ &gdlm_attr_recover_status.attr,
+ NULL,
+};
+
+static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+ struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
+ return a->show ? a->show(ls, buf) : 0;
+}
+
+static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+ struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
+ return a->store ? a->store(ls, buf, len) : len;
+}
+
+static struct sysfs_ops gdlm_attr_ops = {
+ .show = gdlm_attr_show,
+ .store = gdlm_attr_store,
+};
+
+static struct kobj_type gdlm_ktype = {
+ .default_attrs = gdlm_attrs,
+ .sysfs_ops = &gdlm_attr_ops,
+};
+
+static struct kset gdlm_kset = {
+ .subsys = &kernel_subsys,
+ .kobj = {.name = "lock_dlm",},
+ .ktype = &gdlm_ktype,
+};
+
+int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
+{
+ int error;
+
+ error = kobject_set_name(&ls->kobj, "%s", "lock_module");
+ if (error) {
+ log_error("can't set kobj name %d", error);
+ return error;
+ }
+
+ ls->kobj.kset = &gdlm_kset;
+ ls->kobj.ktype = &gdlm_ktype;
+ ls->kobj.parent = fskobj;
+
+ error = kobject_register(&ls->kobj);
+ if (error)
+ log_error("can't register kobj %d", error);
+
+ return error;
+}
+
+void gdlm_kobject_release(struct gdlm_ls *ls)
+{
+ kobject_unregister(&ls->kobj);
+}
+
+int gdlm_sysfs_init(void)
+{
+ int error;
+
+ error = kset_register(&gdlm_kset);
+ if (error)
+ printk("lock_dlm: cannot register kset %d\n", error);
+
+ return error;
+}
+
+void gdlm_sysfs_exit(void)
+{
+ kset_unregister(&gdlm_kset);
+}
+
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..9cf1f168eaf8
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include "lock_dlm.h"
+
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+ thread gets to it. */
+
+static void queue_submit(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+
+ spin_lock(&ls->async_lock);
+ list_add_tail(&lp->delay_list, &ls->submit);
+ spin_unlock(&ls->async_lock);
+ wake_up(&ls->thread_wait);
+}
+
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+ struct gdlm_ls *ls = lp->ls;
+ unsigned int cb = 0;
+
+ switch (gdlm_make_lmstate(bast_mode)) {
+ case LM_ST_EXCLUSIVE:
+ cb = LM_CB_NEED_E;
+ break;
+ case LM_ST_DEFERRED:
+ cb = LM_CB_NEED_D;
+ break;
+ case LM_ST_SHARED:
+ cb = LM_CB_NEED_S;
+ break;
+ default:
+ gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
+ }
+
+ ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+
+static void process_complete(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+ struct lm_async_cb acb;
+ s16 prev_mode = lp->cur;
+
+ memset(&acb, 0, sizeof(acb));
+
+ if (lp->lksb.sb_status == -DLM_ECANCEL) {
+ log_info("complete dlm cancel %x,%llx flags %lx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+ if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+ log_info("unlock sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ lp->cur = DLM_LOCK_IV;
+ lp->req = DLM_LOCK_IV;
+ lp->lksb.sb_lkid = 0;
+
+ if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+ gdlm_delete_lp(lp);
+ return;
+ }
+ goto out;
+ }
+
+ if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+ memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+
+ if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+ if (lp->req == DLM_LOCK_PR)
+ lp->req = DLM_LOCK_CW;
+ else if (lp->req == DLM_LOCK_CW)
+ lp->req = DLM_LOCK_PR;
+ }
+
+ /*
+ * A canceled lock request. The lock was just taken off the delayed
+ * list and was never even submitted to dlm.
+ */
+
+ if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+ log_info("complete internal cancel %x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ goto out;
+ }
+
+ /*
+ * An error occurred.
+ */
+
+ if (lp->lksb.sb_status) {
+ /* a "normal" error */
+ if ((lp->lksb.sb_status == -EAGAIN) &&
+ (lp->lkf & DLM_LKF_NOQUEUE)) {
+ lp->req = lp->cur;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ /* this could only happen with cancels I think */
+ log_info("ast sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ /*
+ * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+ */
+
+ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+ complete(&lp->ast_wait);
+ return;
+ }
+
+ /*
+ * A lock has been demoted to NL because it initially completed during
+ * BLOCK_LOCKS. Now it must be requested in the originally requested
+ * mode.
+ */
+
+ if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+ gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+
+ lp->cur = DLM_LOCK_NL;
+ lp->req = lp->prev_req;
+ lp->prev_req = DLM_LOCK_IV;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags))
+ gdlm_queue_delayed(lp);
+ else
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * A request is granted during dlm recovery. It may be granted
+ * because the locks of a failed node were cleared. In that case,
+ * there may be inconsistent data beneath this lock and we must wait
+ * for recovery to complete to use it. When gfs recovery is done this
+ * granted lock will be converted to NL and then reacquired in this
+ * granted state.
+ */
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags) &&
+ lp->req != DLM_LOCK_NL) {
+
+ lp->cur = lp->req;
+ lp->prev_req = lp->req;
+ lp->req = DLM_LOCK_NL;
+ lp->lkf |= DLM_LKF_CONVERT;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ log_debug("rereq %x,%llx id %x %d,%d",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->lksb.sb_lkid, lp->cur, lp->req);
+
+ set_bit(LFL_REREQUEST, &lp->flags);
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * DLM demoted the lock to NL before it was granted so GFS must be
+ * told it cannot cache data for this lock.
+ */
+
+ if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+out:
+ /*
+ * This is an internal lock_dlm lock
+ */
+
+ if (test_bit(LFL_INLOCK, &lp->flags)) {
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+ complete(&lp->ast_wait);
+ return;
+ }
+
+ /*
+ * Normal completion of a lock request. Tell GFS it now has the lock.
+ */
+
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+
+ acb.lc_name = lp->lockname;
+ acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+
+ if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
+ (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
+ acb.lc_ret |= LM_OUT_CACHEABLE;
+
+ ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+
+static inline int no_work(struct gdlm_ls *ls, int blocking)
+{
+ int ret;
+
+ spin_lock(&ls->async_lock);
+ ret = list_empty(&ls->complete) && list_empty(&ls->submit);
+ if (ret && blocking)
+ ret = list_empty(&ls->blocking);
+ spin_unlock(&ls->async_lock);
+
+ return ret;
+}
+
+static inline int check_drop(struct gdlm_ls *ls)
+{
+ if (!ls->drop_locks_count)
+ return 0;
+
+ if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
+ ls->drop_time = jiffies;
+ if (ls->all_locks_count >= ls->drop_locks_count)
+ return 1;
+ }
+ return 0;
+}
+
+static int gdlm_thread(void *data)
+{
+ struct gdlm_ls *ls = (struct gdlm_ls *) data;
+ struct gdlm_lock *lp = NULL;
+ int blist = 0;
+ uint8_t complete, blocking, submit, drop;
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* Only thread1 is allowed to do blocking callbacks since gfs
+ may wait for a completion callback within a blocking cb. */
+
+ if (current == ls->thread1)
+ blist = 1;
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&ls->thread_wait, &wait);
+ if (no_work(ls, blist))
+ schedule();
+ remove_wait_queue(&ls->thread_wait, &wait);
+ set_current_state(TASK_RUNNING);
+
+ complete = blocking = submit = drop = 0;
+
+ spin_lock(&ls->async_lock);
+
+ if (blist && !list_empty(&ls->blocking)) {
+ lp = list_entry(ls->blocking.next, struct gdlm_lock,
+ blist);
+ list_del_init(&lp->blist);
+ blocking = lp->bast_mode;
+ lp->bast_mode = 0;
+ } else if (!list_empty(&ls->complete)) {
+ lp = list_entry(ls->complete.next, struct gdlm_lock,
+ clist);
+ list_del_init(&lp->clist);
+ complete = 1;
+ } else if (!list_empty(&ls->submit)) {
+ lp = list_entry(ls->submit.next, struct gdlm_lock,
+ delay_list);
+ list_del_init(&lp->delay_list);
+ submit = 1;
+ }
+
+ drop = check_drop(ls);
+ spin_unlock(&ls->async_lock);
+
+ if (complete)
+ process_complete(lp);
+
+ else if (blocking)
+ process_blocking(lp, blocking);
+
+ else if (submit)
+ gdlm_do_lock(lp);
+
+ if (drop)
+ ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
+
+ schedule();
+ }
+
+ return 0;
+}
+
+int gdlm_init_threads(struct gdlm_ls *ls)
+{
+ struct task_struct *p;
+ int error;
+
+ p = kthread_run(gdlm_thread, ls, "lock_dlm1");
+ error = IS_ERR(p);
+ if (error) {
+ log_error("can't start lock_dlm1 thread %d", error);
+ return error;
+ }
+ ls->thread1 = p;
+
+ p = kthread_run(gdlm_thread, ls, "lock_dlm2");
+ error = IS_ERR(p);
+ if (error) {
+ log_error("can't start lock_dlm2 thread %d", error);
+ kthread_stop(ls->thread1);
+ return error;
+ }
+ ls->thread2 = p;
+
+ return 0;
+}
+
+void gdlm_release_threads(struct gdlm_ls *ls)
+{
+ kthread_stop(ls->thread1);
+ kthread_stop(ls->thread2);
+}
+
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..35e9730bc3a8
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
+lock_nolock-y := main.o
+
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..acfbc941f319
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/lm_interface.h>
+
+struct nolock_lockspace {
+ unsigned int nl_lvb_size;
+};
+
+static const struct lm_lockops nolock_ops;
+
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj)
+{
+ char *c;
+ unsigned int jid;
+ struct nolock_lockspace *nl;
+
+ c = strstr(host_data, "jid=");
+ if (!c)
+ jid = 0;
+ else {
+ c += 4;
+ sscanf(c, "%u", &jid);
+ }
+
+ nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
+ if (!nl)
+ return -ENOMEM;
+
+ nl->nl_lvb_size = min_lvb_size;
+
+ lockstruct->ls_jid = jid;
+ lockstruct->ls_first = 1;
+ lockstruct->ls_lvb_size = min_lvb_size;
+ lockstruct->ls_lockspace = nl;
+ lockstruct->ls_ops = &nolock_ops;
+ lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+
+ return 0;
+}
+
+static void nolock_others_may_mount(void *lockspace)
+{
+}
+
+static void nolock_unmount(void *lockspace)
+{
+ struct nolock_lockspace *nl = lockspace;
+ kfree(nl);
+}
+
+static void nolock_withdraw(void *lockspace)
+{
+}
+
+/**
+ * nolock_get_lock - get a lm_lock_t given a description of the lock
+ * @lockspace: the lockspace the lock lives in
+ * @name: the name of the lock
+ * @lockp: return the lm_lock_t here
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
+ void **lockp)
+{
+ *lockp = lockspace;
+ return 0;
+}
+
+/**
+ * nolock_put_lock - get rid of a lock structure
+ * @lock: the lock to throw away
+ *
+ */
+
+static void nolock_put_lock(void *lock)
+{
+}
+
+/**
+ * nolock_lock - acquire a lock
+ * @lock: the lock to manipulate
+ * @cur_state: the current state
+ * @req_state: the requested state
+ * @flags: modifier flags
+ *
+ * Returns: A bitmap of LM_OUT_*
+ */
+
+static unsigned int nolock_lock(void *lock, unsigned int cur_state,
+ unsigned int req_state, unsigned int flags)
+{
+ return req_state | LM_OUT_CACHEABLE;
+}
+
+/**
+ * nolock_unlock - unlock a lock
+ * @lock: the lock to manipulate
+ * @cur_state: the current state
+ *
+ * Returns: 0
+ */
+
+static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
+{
+ return 0;
+}
+
+static void nolock_cancel(void *lock)
+{
+}
+
+/**
+ * nolock_hold_lvb - hold on to a lock value block
+ * @lock: the lock the LVB is associated with
+ * @lvbp: return the lm_lvb_t here
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+static int nolock_hold_lvb(void *lock, char **lvbp)
+{
+ struct nolock_lockspace *nl = lock;
+ int error = 0;
+
+ *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
+ if (!*lvbp)
+ error = -ENOMEM;
+
+ return error;
+}
+
+/**
+ * nolock_unhold_lvb - release a LVB
+ * @lock: the lock the LVB is associated with
+ * @lvb: the lock value block
+ *
+ */
+
+static void nolock_unhold_lvb(void *lock, char *lvb)
+{
+ kfree(lvb);
+}
+
+static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ struct file_lock tmp;
+ int ret;
+
+ ret = posix_test_lock(file, fl, &tmp);
+ fl->fl_type = F_UNLCK;
+ if (ret)
+ memcpy(fl, &tmp, sizeof(struct file_lock));
+
+ return 0;
+}
+
+static int nolock_plock(void *lockspace, struct lm_lockname *name,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ int error;
+ error = posix_lock_file_wait(file, fl);
+ return error;
+}
+
+static int nolock_punlock(void *lockspace, struct lm_lockname *name,
+ struct file *file, struct file_lock *fl)
+{
+ int error;
+ error = posix_lock_file_wait(file, fl);
+ return error;
+}
+
+static void nolock_recovery_done(void *lockspace, unsigned int jid,
+ unsigned int message)
+{
+}
+
+static const struct lm_lockops nolock_ops = {
+ .lm_proto_name = "lock_nolock",
+ .lm_mount = nolock_mount,
+ .lm_others_may_mount = nolock_others_may_mount,
+ .lm_unmount = nolock_unmount,
+ .lm_withdraw = nolock_withdraw,
+ .lm_get_lock = nolock_get_lock,
+ .lm_put_lock = nolock_put_lock,
+ .lm_lock = nolock_lock,
+ .lm_unlock = nolock_unlock,
+ .lm_cancel = nolock_cancel,
+ .lm_hold_lvb = nolock_hold_lvb,
+ .lm_unhold_lvb = nolock_unhold_lvb,
+ .lm_plock_get = nolock_plock_get,
+ .lm_plock = nolock_plock,
+ .lm_punlock = nolock_punlock,
+ .lm_recovery_done = nolock_recovery_done,
+ .lm_owner = THIS_MODULE,
+};
+
+static int __init init_nolock(void)
+{
+ int error;
+
+ error = gfs2_register_lockproto(&nolock_ops);
+ if (error) {
+ printk(KERN_WARNING
+ "lock_nolock: can't register protocol: %d\n", error);
+ return error;
+ }
+
+ printk(KERN_INFO
+ "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
+ return 0;
+}
+
+static void __exit exit_nolock(void)
+{
+ gfs2_unregister_lockproto(&nolock_ops);
+}
+
+module_init(init_nolock);
+module_exit(exit_nolock);
+
+MODULE_DESCRIPTION("GFS Nolock Locking Module");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..0cace3da9dbb
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "util.h"
+#include "dir.h"
+
+#define PULL 1
+
+/**
+ * gfs2_struct2blk - compute the number of log descriptor blocks needed
+ * @sdp: the filesystem
+ * @nstruct: the number of structures
+ * @ssize: the size of the structures
+ *
+ * Compute the number of log descriptor blocks needed to hold a certain number
+ * of structures of a certain size.
+ *
+ * Returns: the number of blocks needed (minimum is always 1)
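+ *
+ * For example, with 8-byte entries (as used for revokes) the first log
+ * block holds (sb_bsize - sizeof(struct gfs2_log_descriptor)) / 8 entries
+ * and each continuation block holds (sb_bsize - sizeof(struct gfs2_meta_header)) / 8,
+ * so nstruct entries need 1 + DIV_ROUND_UP(nstruct - first, second) blocks
+ * whenever nstruct exceeds the first block's capacity.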
+ */
+
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+ unsigned int ssize)
+{
+ unsigned int blks;
+ unsigned int first, second;
+
+ blks = 1;
+ first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;
+
+ if (nstruct > first) {
+ second = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_meta_header)) / ssize;
+ blks += DIV_ROUND_UP(nstruct - first, second);
+ }
+
+ return blks;
+}
+
+/**
+ * gfs2_ail1_start_one - Start I/O on a part of the AIL
+ * @sdp: the filesystem
+ * @ai: the AIL entry to start I/O on
+ *
+ */
+
+static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ struct gfs2_bufdata *bd, *s;
+ struct buffer_head *bh;
+ int retry;
+
+ BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
+
+ do {
+ retry = 0;
+
+ list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
+ bd_ail_st_list) {
+ bh = bd->bd_bh;
+
+ gfs2_assert(sdp, bd->bd_ail == ai);
+
+ if (!buffer_busy(bh)) {
+ if (!buffer_uptodate(bh)) {
+ gfs2_log_unlock(sdp);
+ gfs2_io_error_bh(sdp, bh);
+ gfs2_log_lock(sdp);
+ }
+ list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+ continue;
+ }
+
+ if (!buffer_dirty(bh))
+ continue;
+
+ list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+
+ gfs2_log_unlock(sdp);
+ wait_on_buffer(bh);
+ ll_rw_block(WRITE, 1, &bh);
+ gfs2_log_lock(sdp);
+
+ retry = 1;
+ break;
+ }
+ } while (retry);
+}
+
+/**
+ * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ *
+ */
+
+static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
+{
+ struct gfs2_bufdata *bd, *s;
+ struct buffer_head *bh;
+
+ list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
+ bd_ail_st_list) {
+ bh = bd->bd_bh;
+
+ gfs2_assert(sdp, bd->bd_ail == ai);
+
+ if (buffer_busy(bh)) {
+ if (flags & DIO_ALL)
+ continue;
+ else
+ break;
+ }
+
+ if (!buffer_uptodate(bh))
+ gfs2_io_error_bh(sdp, bh);
+
+ list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+ }
+
+ return list_empty(&ai->ai_ail1_list);
+}
+
+void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+{
+ struct list_head *head = &sdp->sd_ail1_list;
+ u64 sync_gen;
+ struct list_head *first;
+ struct gfs2_ail *first_ai, *ai, *tmp;
+ int done = 0;
+
+ gfs2_log_lock(sdp);
+ if (list_empty(head)) {
+ gfs2_log_unlock(sdp);
+ return;
+ }
+ sync_gen = sdp->sd_ail_sync_gen++;
+
+ first = head->prev;
+ first_ai = list_entry(first, struct gfs2_ail, ai_list);
+ first_ai->ai_sync_gen = sync_gen;
+ gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
+
+ if (flags & DIO_ALL)
+ first = NULL;
+
+ while(!done) {
+ if (first && (head->prev != first ||
+ gfs2_ail1_empty_one(sdp, first_ai, 0)))
+ break;
+
+ done = 1;
+ list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
+ if (ai->ai_sync_gen >= sync_gen)
+ continue;
+ ai->ai_sync_gen = sync_gen;
+ gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
+ done = 0;
+ break;
+ }
+ }
+
+ gfs2_log_unlock(sdp);
+}
+
+int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+{
+ struct gfs2_ail *ai, *s;
+ int ret;
+
+ gfs2_log_lock(sdp);
+
+ list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
+ if (gfs2_ail1_empty_one(sdp, ai, flags))
+ list_move(&ai->ai_list, &sdp->sd_ail2_list);
+ else if (!(flags & DIO_ALL))
+ break;
+ }
+
+ ret = list_empty(&sdp->sd_ail1_list);
+
+ gfs2_log_unlock(sdp);
+
+ return ret;
+}
+
+
+/**
+ * gfs2_ail2_empty_one - Release all the buffers held by an AIL2 entry
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ *
+ */
+
+static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ struct list_head *head = &ai->ai_ail2_list;
+ struct gfs2_bufdata *bd;
+
+ while (!list_empty(head)) {
+ bd = list_entry(head->prev, struct gfs2_bufdata,
+ bd_ail_st_list);
+ gfs2_assert(sdp, bd->bd_ail == ai);
+ bd->bd_ail = NULL;
+ list_del(&bd->bd_ail_st_list);
+ list_del(&bd->bd_ail_gl_list);
+ atomic_dec(&bd->bd_gl->gl_ail_count);
+ brelse(bd->bd_bh);
+ }
+}
+
+static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
+{
+ struct gfs2_ail *ai, *safe;
+ unsigned int old_tail = sdp->sd_log_tail;
+ int wrap = (new_tail < old_tail);
+ int a, b, rm;
+
+ gfs2_log_lock(sdp);
+
+ list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
+ a = (old_tail <= ai->ai_first);
+ b = (ai->ai_first < new_tail);
+ rm = (wrap) ? (a || b) : (a && b);
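+ /* e.g. old_tail 100, new_tail 900: remove entries with
+    100 <= ai_first < 900; wrapped case (old_tail 900, new_tail 100):
+    remove entries with ai_first >= 900 or ai_first < 100 */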
+ if (!rm)
+ continue;
+
+ gfs2_ail2_empty_one(sdp, ai);
+ list_del(&ai->ai_list);
+ gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
+ gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
+ kfree(ai);
+ }
+
+ gfs2_log_unlock(sdp);
+}
+
+/**
+ * gfs2_log_reserve - Make a log reservation
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks to reserve
+ *
+ * Returns: errno
+ */
+
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
+{
+ unsigned int try = 0;
+
+ if (gfs2_assert_warn(sdp, blks) ||
+ gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
+ return -EINVAL;
+
+ mutex_lock(&sdp->sd_log_reserve_mutex);
+ gfs2_log_lock(sdp);
+ while(sdp->sd_log_blks_free <= blks) {
+ gfs2_log_unlock(sdp);
+ gfs2_ail1_empty(sdp, 0);
+ gfs2_log_flush(sdp, NULL);
+
+ if (try++)
+ gfs2_ail1_start(sdp, 0);
+ gfs2_log_lock(sdp);
+ }
+ sdp->sd_log_blks_free -= blks;
+ gfs2_log_unlock(sdp);
+ mutex_unlock(&sdp->sd_log_reserve_mutex);
+
+ down_read(&sdp->sd_log_flush_lock);
+
+ return 0;
+}
+
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+ gfs2_log_lock(sdp);
+ sdp->sd_log_blks_free += blks;
+ gfs2_assert_withdraw(sdp,
+ sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+ gfs2_log_unlock(sdp);
+ up_read(&sdp->sd_log_flush_lock);
+}
+
+static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
+{
+ struct inode *inode = sdp->sd_jdesc->jd_inode;
+ int error;
+ struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+ bh_map.b_size = 1 << inode->i_blkbits;
+ error = gfs2_block_map(inode, lbn, 0, &bh_map);
+ if (error || !bh_map.b_blocknr)
+ printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, bh_map.b_blocknr, lbn);
+ gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
+
+ return bh_map.b_blocknr;
+}
+
+/**
+ * log_distance - Compute distance between two journal blocks
+ * @sdp: The GFS2 superblock
+ * @newer: The most recent journal block of the pair
+ * @older: The older journal block of the pair
+ *
+ * Compute the distance (in the journal direction) between two
+ * blocks in the journal
+ *
+ * Returns: the distance in blocks
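+ *
+ * For example, with jd_blocks = 1000, log_distance(sdp, 5, 990) is 15
+ * because the journal has wrapped.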
+ */
+
+static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
+ unsigned int older)
+{
+ int dist;
+
+ dist = newer - older;
+ if (dist < 0)
+ dist += sdp->sd_jdesc->jd_blocks;
+
+ return dist;
+}
+
+static unsigned int current_tail(struct gfs2_sbd *sdp)
+{
+ struct gfs2_ail *ai;
+ unsigned int tail;
+
+ gfs2_log_lock(sdp);
+
+ if (list_empty(&sdp->sd_ail1_list)) {
+ tail = sdp->sd_log_head;
+ } else {
+ ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
+ tail = ai->ai_first;
+ }
+
+ gfs2_log_unlock(sdp);
+
+ return tail;
+}
+
+static inline void log_incr_head(struct gfs2_sbd *sdp)
+{
+ if (sdp->sd_log_flush_head == sdp->sd_log_tail)
+ gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head);
+
+ if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
+ sdp->sd_log_flush_head = 0;
+ sdp->sd_log_flush_wrapped = 1;
+ }
+}
+
+/**
+ * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: the buffer_head
+ */
+
+struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
+{
+ u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+ struct gfs2_log_buf *lb;
+ struct buffer_head *bh;
+
+ lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
+ list_add(&lb->lb_list, &sdp->sd_log_flush_list);
+
+ bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ log_incr_head(sdp);
+
+ return bh;
+}
+
+/**
+ * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
+ * @sdp: the filesystem
+ * @real: the buffer_head whose data the fake buffer should point to
+ *
+ * Returns: the fake buffer_head
+ */
+
+struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+ struct buffer_head *real)
+{
+ u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+ struct gfs2_log_buf *lb;
+ struct buffer_head *bh;
+
+ lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
+ list_add(&lb->lb_list, &sdp->sd_log_flush_list);
+ lb->lb_real = real;
+
+ bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
+ atomic_set(&bh->b_count, 1);
+ bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
+ set_bh_page(bh, real->b_page, bh_offset(real));
+ bh->b_blocknr = blkno;
+ bh->b_size = sdp->sd_sb.sb_bsize;
+ bh->b_bdev = sdp->sd_vfs->s_bdev;
+
+ log_incr_head(sdp);
+
+ return bh;
+}
+
+static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
+{
+ unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
+
+ ail2_empty(sdp, new_tail);
+
+ gfs2_log_lock(sdp);
+ sdp->sd_log_blks_free += dist - (pull ? 1 : 0);
+ gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+ gfs2_log_unlock(sdp);
+
+ sdp->sd_log_tail = new_tail;
+}
+
+/**
+ * log_write_header - Get and initialize a journal header buffer
+ * @sdp: The GFS2 superblock
+ */
+
+static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
+{
+ u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+ struct buffer_head *bh;
+ struct gfs2_log_header *lh;
+ unsigned int tail;
+ u32 hash;
+
+ bh = sb_getblk(sdp->sd_vfs, blkno);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ gfs2_ail1_empty(sdp, 0);
+ tail = current_tail(sdp);
+
+ lh = (struct gfs2_log_header *)bh->b_data;
+ memset(lh, 0, sizeof(struct gfs2_log_header));
+ lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+ lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+ lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+ lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
+ lh->lh_flags = cpu_to_be32(flags);
+ lh->lh_tail = cpu_to_be32(tail);
+ lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
+ hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
+ lh->lh_hash = cpu_to_be32(hash);
+
+ set_buffer_dirty(bh);
+ if (sync_dirty_buffer(bh))
+ gfs2_io_error_bh(sdp, bh);
+ brelse(bh);
+
+ if (sdp->sd_log_tail != tail)
+ log_pull_tail(sdp, tail, pull);
+ else
+ gfs2_assert_withdraw(sdp, !pull);
+
+ sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
+ log_incr_head(sdp);
+}
+
+static void log_flush_commit(struct gfs2_sbd *sdp)
+{
+ struct list_head *head = &sdp->sd_log_flush_list;
+ struct gfs2_log_buf *lb;
+ struct buffer_head *bh;
+
+ while (!list_empty(head)) {
+ lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
+ list_del(&lb->lb_list);
+ bh = lb->lb_bh;
+
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ gfs2_io_error_bh(sdp, bh);
+ if (lb->lb_real) {
+ while (atomic_read(&bh->b_count) != 1) /* Grrrr... */
+ schedule();
+ free_buffer_head(bh);
+ } else
+ brelse(bh);
+ kfree(lb);
+ }
+
+ log_write_header(sdp, 0, 0);
+}
+
+/**
+ * gfs2_log_flush - flush incore transaction(s)
+ * @sdp: the filesystem
+ * @gl: The glock structure to flush. If NULL, flush the whole incore log
+ *
+ */
+
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+{
+ struct gfs2_ail *ai;
+
+ down_write(&sdp->sd_log_flush_lock);
+
+ if (gl) {
+ gfs2_log_lock(sdp);
+ if (list_empty(&gl->gl_le.le_list)) {
+ gfs2_log_unlock(sdp);
+ up_write(&sdp->sd_log_flush_lock);
+ return;
+ }
+ gfs2_log_unlock(sdp);
+ }
+
+ ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
+ INIT_LIST_HEAD(&ai->ai_ail1_list);
+ INIT_LIST_HEAD(&ai->ai_ail2_list);
+
+ gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
+ gfs2_assert_withdraw(sdp,
+ sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
+
+ sdp->sd_log_flush_head = sdp->sd_log_head;
+ sdp->sd_log_flush_wrapped = 0;
+ ai->ai_first = sdp->sd_log_flush_head;
+
+ lops_before_commit(sdp);
+ if (!list_empty(&sdp->sd_log_flush_list))
+ log_flush_commit(sdp);
+ else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
+ log_write_header(sdp, 0, PULL);
+ lops_after_commit(sdp, ai);
+
+ gfs2_log_lock(sdp);
+ sdp->sd_log_head = sdp->sd_log_flush_head;
+ sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
+ sdp->sd_log_blks_reserved = 0;
+ sdp->sd_log_commited_buf = 0;
+ sdp->sd_log_num_hdrs = 0;
+ sdp->sd_log_commited_revoke = 0;
+
+ if (!list_empty(&ai->ai_ail1_list)) {
+ list_add(&ai->ai_list, &sdp->sd_ail1_list);
+ ai = NULL;
+ }
+ gfs2_log_unlock(sdp);
+
+ sdp->sd_vfs->s_dirt = 0;
+ up_write(&sdp->sd_log_flush_lock);
+
+ kfree(ai);
+}
+
+static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+ unsigned int reserved = 0;
+ unsigned int old;
+
+ gfs2_log_lock(sdp);
+
+ sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
+ gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
+ sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
+ gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
+
+ if (sdp->sd_log_commited_buf)
+ reserved += sdp->sd_log_commited_buf;
+ if (sdp->sd_log_commited_revoke)
+ reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
+ sizeof(u64));
+ if (reserved)
+ reserved++;
+
+ old = sdp->sd_log_blks_free;
+ sdp->sd_log_blks_free += tr->tr_reserved -
+ (reserved - sdp->sd_log_blks_reserved);
+
+ gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
+ gfs2_assert_withdraw(sdp,
+ sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
+ sdp->sd_log_num_hdrs);
+
+ sdp->sd_log_blks_reserved = reserved;
+
+ gfs2_log_unlock(sdp);
+}
+
+/**
+ * gfs2_log_commit - Commit a transaction to the log
+ * @sdp: the filesystem
+ * @tr: the transaction
+ */
+
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+ log_refund(sdp, tr);
+ lops_incore_commit(sdp, tr);
+
+ sdp->sd_vfs->s_dirt = 1;
+ up_read(&sdp->sd_log_flush_lock);
+
+ gfs2_log_lock(sdp);
+ if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
+ gfs2_log_unlock(sdp);
+ gfs2_log_flush(sdp, NULL);
+ } else {
+ gfs2_log_unlock(sdp);
+ }
+}
+
+/**
+ * gfs2_log_shutdown - write a shutdown header into a journal
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_log_shutdown(struct gfs2_sbd *sdp)
+{
+ down_write(&sdp->sd_log_flush_lock);
+
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
+ gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
+
+ sdp->sd_log_flush_head = sdp->sd_log_head;
+ sdp->sd_log_flush_wrapped = 0;
+
+ log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
+
+ gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
+ gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
+ gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
+
+ sdp->sd_log_head = sdp->sd_log_flush_head;
+ sdp->sd_log_tail = sdp->sd_log_head;
+
+ up_write(&sdp->sd_log_flush_lock);
+}
+
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..7f5737d55612
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LOG_DOT_H__
+#define __LOG_DOT_H__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include "incore.h"
+
+/**
+ * gfs2_log_lock - acquire the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ */
+
+static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+{
+ spin_lock(&sdp->sd_log_lock);
+}
+
+/**
+ * gfs2_log_unlock - release the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ */
+
+static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+{
+ spin_unlock(&sdp->sd_log_lock);
+}
+
+static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
+ unsigned int value)
+{
+ if (++value == sdp->sd_jdesc->jd_blocks) {
+ value = 0;
+ }
+ sdp->sd_log_head = sdp->sd_log_tail = value;
+}
+
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+ unsigned int ssize);
+
+void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
+int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
+
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+
+struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+ struct buffer_head *real);
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+
+void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+
+#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..ab6d1115f95d
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,809 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+ struct gfs2_glock *gl;
+ struct gfs2_trans *tr = current->journal_info;
+
+ tr->tr_touched = 1;
+
+ if (!list_empty(&le->le_list))
+ return;
+
+ gl = container_of(le, struct gfs2_glock, gl_le);
+ if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
+ return;
+ gfs2_glock_hold(gl);
+ set_bit(GLF_DIRTY, &gl->gl_flags);
+
+ gfs2_log_lock(sdp);
+ sdp->sd_log_num_gl++;
+ list_add(&le->le_list, &sdp->sd_log_le_gl);
+ gfs2_log_unlock(sdp);
+}
+
+static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ struct list_head *head = &sdp->sd_log_le_gl;
+ struct gfs2_glock *gl;
+
+ while (!list_empty(head)) {
+ gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
+ list_del_init(&gl->gl_le.le_list);
+ sdp->sd_log_num_gl--;
+
+ gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
+ gfs2_glock_put(gl);
+ }
+ gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
+}
+
+static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+ struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+ struct gfs2_trans *tr;
+
+ if (!list_empty(&bd->bd_list_tr))
+ return;
+
+ tr = current->journal_info;
+ tr->tr_touched = 1;
+ tr->tr_num_buf++;
+ list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+
+ if (!list_empty(&le->le_list))
+ return;
+
+ gfs2_trans_add_gl(bd->bd_gl);
+
+ gfs2_meta_check(sdp, bd->bd_bh);
+ gfs2_pin(sdp, bd->bd_bh);
+
+ gfs2_log_lock(sdp);
+ sdp->sd_log_num_buf++;
+ list_add(&le->le_list, &sdp->sd_log_le_buf);
+ gfs2_log_unlock(sdp);
+
+ tr->tr_num_buf_new++;
+}
+
+static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+ struct list_head *head = &tr->tr_list_buf;
+ struct gfs2_bufdata *bd;
+
+ while (!list_empty(head)) {
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
+ list_del_init(&bd->bd_list_tr);
+ tr->tr_num_buf--;
+ }
+ gfs2_assert_warn(sdp, !tr->tr_num_buf);
+}
+
+static void buf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+ struct buffer_head *bh;
+ struct gfs2_log_descriptor *ld;
+ struct gfs2_bufdata *bd1 = NULL, *bd2;
+ unsigned int total = sdp->sd_log_num_buf;
+ unsigned int offset = sizeof(struct gfs2_log_descriptor);
+ unsigned int limit;
+ unsigned int num;
+ unsigned n;
+ __be64 *ptr;
+
+ offset += sizeof(__be64) - 1;
+ offset &= ~(sizeof(__be64) - 1);
+ limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
+ /* for 4k blocks, limit = 503 */
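+ /* e.g. for a 4096-byte block the descriptor header rounds up to 72
+    bytes, giving limit = (4096 - 72) / 8 = 503 as noted above; the exact
+    figure depends on sizeof(struct gfs2_log_descriptor) */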
+
+ bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
+ while(total) {
+ num = total;
+ if (total > limit)
+ num = limit;
+ bh = gfs2_log_get_buf(sdp);
+ sdp->sd_log_num_hdrs++;
+ ld = (struct gfs2_log_descriptor *)bh->b_data;
+ ptr = (__be64 *)(bh->b_data + offset);
+ ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+ ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+ ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+ ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
+ ld->ld_length = cpu_to_be32(num + 1);
+ ld->ld_data1 = cpu_to_be32(num);
+ ld->ld_data2 = cpu_to_be32(0);
+ memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+
+ n = 0;
+ list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
+ bd_le.le_list) {
+ *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+ if (++n >= num)
+ break;
+ }
+
+ set_buffer_dirty(bh);
+ ll_rw_block(WRITE, 1, &bh);
+
+ n = 0;
+ list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
+ bd_le.le_list) {
+ bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
+ set_buffer_dirty(bh);
+ ll_rw_block(WRITE, 1, &bh);
+ if (++n >= num)
+ break;
+ }
+
+ total -= num;
+ }
+}
+
+static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ struct list_head *head = &sdp->sd_log_le_buf;
+ struct gfs2_bufdata *bd;
+
+ while (!list_empty(head)) {
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
+ list_del_init(&bd->bd_le.le_list);
+ sdp->sd_log_num_buf--;
+
+ gfs2_unpin(sdp, bd->bd_bh, ai);
+ }
+ gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
+}
+
+static void buf_lo_before_scan(struct gfs2_jdesc *jd,
+ struct gfs2_log_header *head, int pass)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+ if (pass != 0)
+ return;
+
+ sdp->sd_found_blocks = 0;
+ sdp->sd_replayed_blocks = 0;
+}
+
+static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+ struct gfs2_log_descriptor *ld, __be64 *ptr,
+ int pass)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ unsigned int blks = be32_to_cpu(ld->ld_data1);
+ struct buffer_head *bh_log, *bh_ip;
+ u64 blkno;
+ int error = 0;
+
+ if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
+ return 0;
+
+ gfs2_replay_incr_blk(sdp, &start);
+
+ for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+ blkno = be64_to_cpu(*ptr++);
+
+ sdp->sd_found_blocks++;
+
+ if (gfs2_revoke_check(sdp, blkno, start))
+ continue;
+
+ error = gfs2_replay_read_block(jd, start, &bh_log);
+ if (error)
+ return error;
+
+ bh_ip = gfs2_meta_new(gl, blkno);
+ memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
+
+ if (gfs2_meta_check(sdp, bh_ip))
+ error = -EIO;
+ else
+ mark_buffer_dirty(bh_ip);
+
+ brelse(bh_log);
+ brelse(bh_ip);
+
+ if (error)
+ break;
+
+ sdp->sd_replayed_blocks++;
+ }
+
+ return error;
+}
+
+static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+ if (error) {
+ gfs2_meta_sync(ip->i_gl);
+ return;
+ }
+ if (pass != 1)
+ return;
+
+ gfs2_meta_sync(ip->i_gl);
+
+ fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
+ jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+}
+
+static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+ struct gfs2_trans *tr;
+
+ tr = current->journal_info;
+ tr->tr_touched = 1;
+ tr->tr_num_revoke++;
+
+ gfs2_log_lock(sdp);
+ sdp->sd_log_num_revoke++;
+ list_add(&le->le_list, &sdp->sd_log_le_revoke);
+ gfs2_log_unlock(sdp);
+}
+
+static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
+{
+ struct gfs2_log_descriptor *ld;
+ struct gfs2_meta_header *mh;
+ struct buffer_head *bh;
+ unsigned int offset;
+ struct list_head *head = &sdp->sd_log_le_revoke;
+ struct gfs2_revoke *rv;
+
+ if (!sdp->sd_log_num_revoke)
+ return;
+
+ bh = gfs2_log_get_buf(sdp);
+ ld = (struct gfs2_log_descriptor *)bh->b_data;
+ ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+ ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+ ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+ ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
+ ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
+ sizeof(u64)));
+ ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
+ ld->ld_data2 = cpu_to_be32(0);
+ memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+ offset = sizeof(struct gfs2_log_descriptor);
+
+ while (!list_empty(head)) {
+ rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
+ list_del_init(&rv->rv_le.le_list);
+ sdp->sd_log_num_revoke--;
+
+ if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
+ set_buffer_dirty(bh);
+ ll_rw_block(WRITE, 1, &bh);
+
+ bh = gfs2_log_get_buf(sdp);
+ mh = (struct gfs2_meta_header *)bh->b_data;
+ mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
+ mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
+ mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
+ offset = sizeof(struct gfs2_meta_header);
+ }
+
+ *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
+ kfree(rv);
+
+ offset += sizeof(u64);
+ }
+ gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+
+ set_buffer_dirty(bh);
+ ll_rw_block(WRITE, 1, &bh);
+}
+
+static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
+ struct gfs2_log_header *head, int pass)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+ if (pass != 0)
+ return;
+
+ sdp->sd_found_revokes = 0;
+ sdp->sd_replay_tail = head->lh_tail;
+}
+
+static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+ struct gfs2_log_descriptor *ld, __be64 *ptr,
+ int pass)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ unsigned int blks = be32_to_cpu(ld->ld_length);
+ unsigned int revokes = be32_to_cpu(ld->ld_data1);
+ struct buffer_head *bh;
+ unsigned int offset;
+ u64 blkno;
+ int first = 1;
+ int error;
+
+ if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
+ return 0;
+
+ offset = sizeof(struct gfs2_log_descriptor);
+
+ for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+ error = gfs2_replay_read_block(jd, start, &bh);
+ if (error)
+ return error;
+
+ if (!first)
+ gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
+
+ while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
+ blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
+
+ error = gfs2_revoke_add(sdp, blkno, start);
+ if (error < 0)
+ return error;
+ else if (error)
+ sdp->sd_found_revokes++;
+
+ if (!--revokes)
+ break;
+ offset += sizeof(u64);
+ }
+
+ brelse(bh);
+ offset = sizeof(struct gfs2_meta_header);
+ first = 0;
+ }
+
+ return 0;
+}
+
+static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+ if (error) {
+ gfs2_revoke_clean(sdp);
+ return;
+ }
+ if (pass != 1)
+ return;
+
+ fs_info(sdp, "jid=%u: Found %u revoke tags\n",
+ jd->jd_jid, sdp->sd_found_revokes);
+
+ gfs2_revoke_clean(sdp);
+}
+
+static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_trans *tr = current->journal_info;
+
+ tr->tr_touched = 1;
+
+ if (!list_empty(&le->le_list))
+ return;
+
+ rgd = container_of(le, struct gfs2_rgrpd, rd_le);
+ gfs2_rgrp_bh_hold(rgd);
+
+ gfs2_log_lock(sdp);
+ sdp->sd_log_num_rg++;
+ list_add(&le->le_list, &sdp->sd_log_le_rg);
+ gfs2_log_unlock(sdp);
+}
+
+static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ struct list_head *head = &sdp->sd_log_le_rg;
+ struct gfs2_rgrpd *rgd;
+
+ while (!list_empty(head)) {
+ rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
+ list_del_init(&rgd->rd_le.le_list);
+ sdp->sd_log_num_rg--;
+
+ gfs2_rgrp_repolish_clones(rgd);
+ gfs2_rgrp_bh_put(rgd);
+ }
+ gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
+}
+
+/**
+ * databuf_lo_add - Add a databuf to the transaction.
+ *
+ * This is used in two distinct cases:
+ * i) In ordered write mode
+ * We put the data buffer on a list so that we can ensure that it is
+ * synced to disk at the right time
+ * ii) In journaled data mode
+ * We need to journal the data block in the same way as metadata in
+ * the functions above. The difference is that here we have a tag
+ * which is two __be64's being the block number (as per metadata)
+ * and a flag which says whether the data block needs escaping or
+ * not. This means we need a new log entry for each 251 or so data
+ * blocks, which isn't an enormous overhead but twice as much as
+ * for normal metadata blocks.
+ */
+static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+ struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+ struct gfs2_trans *tr = current->journal_info;
+ struct address_space *mapping = bd->bd_bh->b_page->mapping;
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+
+ tr->tr_touched = 1;
+ if (list_empty(&bd->bd_list_tr) &&
+ (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
+ tr->tr_num_buf++;
+ list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+ gfs2_pin(sdp, bd->bd_bh);
+ tr->tr_num_buf_new++;
+ }
+ gfs2_trans_add_gl(bd->bd_gl);
+ gfs2_log_lock(sdp);
+ if (list_empty(&le->le_list)) {
+ if (ip->i_di.di_flags & GFS2_DIF_JDATA)
+ sdp->sd_log_num_jdata++;
+ sdp->sd_log_num_databuf++;
+ list_add(&le->le_list, &sdp->sd_log_le_databuf);
+ }
+ gfs2_log_unlock(sdp);
+}
+
+static int gfs2_check_magic(struct buffer_head *bh)
+{
+ struct page *page = bh->b_page;
+ void *kaddr;
+ __be32 *ptr;
+ int rv = 0;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ ptr = kaddr + bh_offset(bh);
+ if (*ptr == cpu_to_be32(GFS2_MAGIC))
+ rv = 1;
+ kunmap_atomic(kaddr, KM_USER0);
+
+ return rv;
+}
+
+/**
+ * databuf_lo_before_commit - Scan the data buffers, writing as we go
+ *
+ * Here we scan through the lists of buffers and make the assumption
+ * that any buffer that's been pinned is being journaled, and that
+ * any unpinned buffer is an ordered write data buffer and therefore
+ * will be written back rather than journaled.
+ */
+static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+ LIST_HEAD(started);
+ struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
+ struct buffer_head *bh = NULL;
+ unsigned int offset = sizeof(struct gfs2_log_descriptor);
+ struct gfs2_log_descriptor *ld;
+ unsigned int limit;
+ unsigned int total_dbuf = sdp->sd_log_num_databuf;
+ unsigned int total_jdata = sdp->sd_log_num_jdata;
+ unsigned int num, n;
+ __be64 *ptr = NULL;
+
+ offset += 2*sizeof(__be64) - 1;
+ offset &= ~(2*sizeof(__be64) - 1);
+ limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
+
+ /*
+ * Start writing ordered buffers, write journaled buffers
+ * into the log along with a header
+ */
+ gfs2_log_lock(sdp);
+ bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
+ bd_le.le_list);
+ while(total_dbuf) {
+ num = total_jdata;
+ if (num > limit)
+ num = limit;
+ n = 0;
+ list_for_each_entry_safe_continue(bd1, bdt,
+ &sdp->sd_log_le_databuf,
+ bd_le.le_list) {
+ /* An ordered write buffer */
+ if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
+ list_move(&bd1->bd_le.le_list, &started);
+ if (bd1 == bd2) {
+ bd2 = NULL;
+ bd2 = list_prepare_entry(bd2,
+ &sdp->sd_log_le_databuf,
+ bd_le.le_list);
+ }
+ total_dbuf--;
+ if (bd1->bd_bh) {
+ get_bh(bd1->bd_bh);
+ if (buffer_dirty(bd1->bd_bh)) {
+ gfs2_log_unlock(sdp);
+ wait_on_buffer(bd1->bd_bh);
+ ll_rw_block(WRITE, 1,
+ &bd1->bd_bh);
+ gfs2_log_lock(sdp);
+ }
+ brelse(bd1->bd_bh);
+ continue;
+ }
+ continue;
+ } else if (bd1->bd_bh) { /* A journaled buffer */
+ int magic;
+ gfs2_log_unlock(sdp);
+ if (!bh) {
+ bh = gfs2_log_get_buf(sdp);
+ sdp->sd_log_num_hdrs++;
+ ld = (struct gfs2_log_descriptor *)
+ bh->b_data;
+ ptr = (__be64 *)(bh->b_data + offset);
+ ld->ld_header.mh_magic =
+ cpu_to_be32(GFS2_MAGIC);
+ ld->ld_header.mh_type =
+ cpu_to_be32(GFS2_METATYPE_LD);
+ ld->ld_header.mh_format =
+ cpu_to_be32(GFS2_FORMAT_LD);
+ ld->ld_type =
+ cpu_to_be32(GFS2_LOG_DESC_JDATA);
+ ld->ld_length = cpu_to_be32(num + 1);
+ ld->ld_data1 = cpu_to_be32(num);
+ ld->ld_data2 = cpu_to_be32(0);
+ memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+ }
+ magic = gfs2_check_magic(bd1->bd_bh);
+ *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+ *ptr++ = cpu_to_be64((__u64)magic);
+ clear_buffer_escaped(bd1->bd_bh);
+ if (unlikely(magic != 0))
+ set_buffer_escaped(bd1->bd_bh);
+ gfs2_log_lock(sdp);
+ if (n++ > num)
+ break;
+ } else if (!bd1->bd_bh) {
+ total_dbuf--;
+ sdp->sd_log_num_databuf--;
+ list_del_init(&bd1->bd_le.le_list);
+ if (bd1 == bd2) {
+ bd2 = NULL;
+ bd2 = list_prepare_entry(bd2,
+ &sdp->sd_log_le_databuf,
+ bd_le.le_list);
+ }
+ kmem_cache_free(gfs2_bufdata_cachep, bd1);
+ }
+ }
+ gfs2_log_unlock(sdp);
+ if (bh) {
+ set_buffer_dirty(bh);
+ ll_rw_block(WRITE, 1, &bh);
+ bh = NULL;
+ }
+ n = 0;
+ gfs2_log_lock(sdp);
+ list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
+ bd_le.le_list) {
+ if (!bd2->bd_bh)
+ continue;
+ /* copy buffer if it needs escaping */
+ gfs2_log_unlock(sdp);
+ if (unlikely(buffer_escaped(bd2->bd_bh))) {
+ void *kaddr;
+ struct page *page = bd2->bd_bh->b_page;
+ bh = gfs2_log_get_buf(sdp);
+ kaddr = kmap_atomic(page, KM_USER0);
+ memcpy(bh->b_data,
+ kaddr + bh_offset(bd2->bd_bh),
+ sdp->sd_sb.sb_bsize);
+ kunmap_atomic(kaddr, KM_USER0);
+ *(__be32 *)bh->b_data = 0;
+ } else {
+ bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
+ }
+ set_buffer_dirty(bh);
+ ll_rw_block(WRITE, 1, &bh);
+ gfs2_log_lock(sdp);
+ if (++n >= num)
+ break;
+ }
+ bh = NULL;
+ total_dbuf -= num;
+ total_jdata -= num;
+ }
+ gfs2_log_unlock(sdp);
+
+ /* Wait on all ordered buffers */
+ while (!list_empty(&started)) {
+ gfs2_log_lock(sdp);
+ bd1 = list_entry(started.next, struct gfs2_bufdata,
+ bd_le.le_list);
+ list_del_init(&bd1->bd_le.le_list);
+ sdp->sd_log_num_databuf--;
+ bh = bd1->bd_bh;
+ if (bh) {
+ bh->b_private = NULL;
+ get_bh(bh);
+ gfs2_log_unlock(sdp);
+ wait_on_buffer(bh);
+ brelse(bh);
+ } else
+ gfs2_log_unlock(sdp);
+
+ kmem_cache_free(gfs2_bufdata_cachep, bd1);
+ }
+
+ /* We've removed all the ordered write bufs here, so only jdata left */
+ gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
+}
+
+static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+ struct gfs2_log_descriptor *ld,
+ __be64 *ptr, int pass)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ unsigned int blks = be32_to_cpu(ld->ld_data1);
+ struct buffer_head *bh_log, *bh_ip;
+ u64 blkno;
+ u64 esc;
+ int error = 0;
+
+ if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
+ return 0;
+
+ gfs2_replay_incr_blk(sdp, &start);
+ for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+ blkno = be64_to_cpu(*ptr++);
+ esc = be64_to_cpu(*ptr++);
+
+ sdp->sd_found_blocks++;
+
+ if (gfs2_revoke_check(sdp, blkno, start))
+ continue;
+
+ error = gfs2_replay_read_block(jd, start, &bh_log);
+ if (error)
+ return error;
+
+ bh_ip = gfs2_meta_new(gl, blkno);
+ memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
+
+ /* Unescape */
+ if (esc) {
+ __be32 *eptr = (__be32 *)bh_ip->b_data;
+ *eptr = cpu_to_be32(GFS2_MAGIC);
+ }
+ mark_buffer_dirty(bh_ip);
+
+ brelse(bh_log);
+ brelse(bh_ip);
+ if (error)
+ break;
+
+ sdp->sd_replayed_blocks++;
+ }
+
+ return error;
+}
+
+/* FIXME: sort out accounting for log blocks etc. */
+
+static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+ if (error) {
+ gfs2_meta_sync(ip->i_gl);
+ return;
+ }
+ if (pass != 1)
+ return;
+
+ /* data sync? */
+ gfs2_meta_sync(ip->i_gl);
+
+ fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
+ jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+}
+
+static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ struct list_head *head = &sdp->sd_log_le_databuf;
+ struct gfs2_bufdata *bd;
+
+ while (!list_empty(head)) {
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
+ list_del_init(&bd->bd_le.le_list);
+ sdp->sd_log_num_databuf--;
+ sdp->sd_log_num_jdata--;
+ gfs2_unpin(sdp, bd->bd_bh, ai);
+ }
+ gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
+ gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
+}
+
+
+const struct gfs2_log_operations gfs2_glock_lops = {
+ .lo_add = glock_lo_add,
+ .lo_after_commit = glock_lo_after_commit,
+ .lo_name = "glock",
+};
+
+const struct gfs2_log_operations gfs2_buf_lops = {
+ .lo_add = buf_lo_add,
+ .lo_incore_commit = buf_lo_incore_commit,
+ .lo_before_commit = buf_lo_before_commit,
+ .lo_after_commit = buf_lo_after_commit,
+ .lo_before_scan = buf_lo_before_scan,
+ .lo_scan_elements = buf_lo_scan_elements,
+ .lo_after_scan = buf_lo_after_scan,
+ .lo_name = "buf",
+};
+
+const struct gfs2_log_operations gfs2_revoke_lops = {
+ .lo_add = revoke_lo_add,
+ .lo_before_commit = revoke_lo_before_commit,
+ .lo_before_scan = revoke_lo_before_scan,
+ .lo_scan_elements = revoke_lo_scan_elements,
+ .lo_after_scan = revoke_lo_after_scan,
+ .lo_name = "revoke",
+};
+
+const struct gfs2_log_operations gfs2_rg_lops = {
+ .lo_add = rg_lo_add,
+ .lo_after_commit = rg_lo_after_commit,
+ .lo_name = "rg",
+};
+
+const struct gfs2_log_operations gfs2_databuf_lops = {
+ .lo_add = databuf_lo_add,
+ .lo_incore_commit = buf_lo_incore_commit,
+ .lo_before_commit = databuf_lo_before_commit,
+ .lo_after_commit = databuf_lo_after_commit,
+ .lo_scan_elements = databuf_lo_scan_elements,
+ .lo_after_scan = databuf_lo_after_scan,
+ .lo_name = "databuf",
+};
+
+const struct gfs2_log_operations *gfs2_log_ops[] = {
+ &gfs2_glock_lops,
+ &gfs2_buf_lops,
+ &gfs2_revoke_lops,
+ &gfs2_rg_lops,
+ &gfs2_databuf_lops,
+ NULL,
+};
+
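[Editor's note] The replay path above records revoke tags on pass 0 (revoke_lo_scan_elements) and later skips revoked blocks when replaying data on pass 1. As a rough illustration of that two-pass idea, here is a self-contained userspace sketch; the flat array, the field names, and the simple comparison (which ignores journal wrap, unlike the real gfs2_revoke_check()) are all invented for the example and are not part of the patch.

	#include <stdio.h>
	#include <stdint.h>

	/* Illustrative revoke table: block number plus the journal position
	 * ("where") at which the revoke tag was seen.  A real implementation
	 * would use a hash table or tree rather than a flat array. */
	struct revoke { uint64_t blkno; unsigned int where; };

	static struct revoke table[128];
	static unsigned int nr_revokes;

	/* Pass 0: remember that blkno was revoked at journal offset 'where'. */
	static void revoke_add(uint64_t blkno, unsigned int where)
	{
		table[nr_revokes].blkno = blkno;
		table[nr_revokes].where = where;
		nr_revokes++;
	}

	/* Pass 1: skip replaying a block if a revoke for it appears later in
	 * the journal than the copy of the block being replayed. */
	static int revoke_check(uint64_t blkno, unsigned int where)
	{
		unsigned int i;
		for (i = 0; i < nr_revokes; i++)
			if (table[i].blkno == blkno && where < table[i].where)
				return 1;	/* revoked: do not replay */
		return 0;
	}

	int main(void)
	{
		revoke_add(1234, 50);	/* revoke tag found at journal offset 50 */
		printf("replay blk 1234 seen at 10? %s\n",
		       revoke_check(1234, 10) ? "no" : "yes");
		printf("replay blk 1234 seen at 60? %s\n",
		       revoke_check(1234, 60) ? "no" : "yes");
		return 0;
	}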
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..5839c05ae6be
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LOPS_DOT_H__
+#define __LOPS_DOT_H__
+
+#include <linux/list.h>
+#include "incore.h"
+
+extern const struct gfs2_log_operations gfs2_glock_lops;
+extern const struct gfs2_log_operations gfs2_buf_lops;
+extern const struct gfs2_log_operations gfs2_revoke_lops;
+extern const struct gfs2_log_operations gfs2_rg_lops;
+extern const struct gfs2_log_operations gfs2_databuf_lops;
+
+extern const struct gfs2_log_operations *gfs2_log_ops[];
+
+static inline void lops_init_le(struct gfs2_log_element *le,
+ const struct gfs2_log_operations *lops)
+{
+ INIT_LIST_HEAD(&le->le_list);
+ le->le_ops = lops;
+}
+
+static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+ if (le->le_ops->lo_add)
+ le->le_ops->lo_add(sdp, le);
+}
+
+static inline void lops_incore_commit(struct gfs2_sbd *sdp,
+ struct gfs2_trans *tr)
+{
+ int x;
+ for (x = 0; gfs2_log_ops[x]; x++)
+ if (gfs2_log_ops[x]->lo_incore_commit)
+ gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
+}
+
+static inline void lops_before_commit(struct gfs2_sbd *sdp)
+{
+ int x;
+ for (x = 0; gfs2_log_ops[x]; x++)
+ if (gfs2_log_ops[x]->lo_before_commit)
+ gfs2_log_ops[x]->lo_before_commit(sdp);
+}
+
+static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ int x;
+ for (x = 0; gfs2_log_ops[x]; x++)
+ if (gfs2_log_ops[x]->lo_after_commit)
+ gfs2_log_ops[x]->lo_after_commit(sdp, ai);
+}
+
+static inline void lops_before_scan(struct gfs2_jdesc *jd,
+ struct gfs2_log_header *head,
+ unsigned int pass)
+{
+ int x;
+ for (x = 0; gfs2_log_ops[x]; x++)
+ if (gfs2_log_ops[x]->lo_before_scan)
+ gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
+}
+
+static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+ struct gfs2_log_descriptor *ld,
+ __be64 *ptr,
+ unsigned int pass)
+{
+ int x, error;
+ for (x = 0; gfs2_log_ops[x]; x++)
+ if (gfs2_log_ops[x]->lo_scan_elements) {
+ error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
+ ld, ptr, pass);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
+ unsigned int pass)
+{
+ int x;
+ for (x = 0; gfs2_log_ops[x]; x++)
+ if (gfs2_log_ops[x]->lo_after_scan)
+ gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
+}
+
+#endif /* __LOPS_DOT_H__ */
+
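[Editor's note] The header above dispatches through a NULL-terminated table of pointers to const operation structures, calling each hook only when it is non-NULL. A minimal standalone sketch of that pattern follows; the struct name, the single hook, and the two example tables are invented for illustration.

	#include <stdio.h>

	/* Illustrative operations vtable; hooks are optional (may be NULL),
	 * mirroring how the gfs2_log_operations hooks are dispatched above. */
	struct ops {
		void (*before_commit)(void);
		const char *name;
	};

	static void buf_before_commit(void) { printf("buf: flush pinned buffers\n"); }

	static const struct ops buf_ops = { .before_commit = buf_before_commit, .name = "buf" };
	static const struct ops rg_ops  = { .name = "rg" };	/* no before_commit hook */

	static const struct ops *all_ops[] = { &buf_ops, &rg_ops, NULL };

	/* Same shape as lops_before_commit(): walk the NULL-terminated table
	 * and call each hook only if it is present. */
	static void ops_before_commit(void)
	{
		int x;
		for (x = 0; all_ops[x]; x++)
			if (all_ops[x]->before_commit)
				all_ops[x]->before_commit();
	}

	int main(void)
	{
		ops_before_commit();
		return 0;
	}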
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..21508a13bb78
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/atomic.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "ops_fstype.h"
+#include "sys.h"
+#include "util.h"
+#include "glock.h"
+
+static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+ struct gfs2_inode *ip = foo;
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR) {
+ inode_init_once(&ip->i_inode);
+ spin_lock_init(&ip->i_spin);
+ init_rwsem(&ip->i_rw_mutex);
+ memset(ip->i_cache, 0, sizeof(ip->i_cache));
+ }
+}
+
+static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+ struct gfs2_glock *gl = foo;
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR) {
+ INIT_HLIST_NODE(&gl->gl_list);
+ spin_lock_init(&gl->gl_spin);
+ INIT_LIST_HEAD(&gl->gl_holders);
+ INIT_LIST_HEAD(&gl->gl_waiters1);
+ INIT_LIST_HEAD(&gl->gl_waiters2);
+ INIT_LIST_HEAD(&gl->gl_waiters3);
+ gl->gl_lvb = NULL;
+ atomic_set(&gl->gl_lvb_count, 0);
+ INIT_LIST_HEAD(&gl->gl_reclaim);
+ INIT_LIST_HEAD(&gl->gl_ail_list);
+ atomic_set(&gl->gl_ail_count, 0);
+ }
+}
+
+/**
+ * init_gfs2_fs - Register GFS2 as a filesystem
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int __init init_gfs2_fs(void)
+{
+ int error;
+
+ error = gfs2_sys_init();
+ if (error)
+ return error;
+
+ error = gfs2_glock_init();
+ if (error)
+ goto fail;
+
+ error = -ENOMEM;
+ gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
+ sizeof(struct gfs2_glock),
+ 0, 0,
+ gfs2_init_glock_once, NULL);
+ if (!gfs2_glock_cachep)
+ goto fail;
+
+ gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
+ sizeof(struct gfs2_inode),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_PANIC|SLAB_MEM_SPREAD),
+ gfs2_init_inode_once, NULL);
+ if (!gfs2_inode_cachep)
+ goto fail;
+
+ gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
+ sizeof(struct gfs2_bufdata),
+ 0, 0, NULL, NULL);
+ if (!gfs2_bufdata_cachep)
+ goto fail;
+
+ error = register_filesystem(&gfs2_fs_type);
+ if (error)
+ goto fail;
+
+ error = register_filesystem(&gfs2meta_fs_type);
+ if (error)
+ goto fail_unregister;
+
+ printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
+
+ return 0;
+
+fail_unregister:
+ unregister_filesystem(&gfs2_fs_type);
+fail:
+ if (gfs2_bufdata_cachep)
+ kmem_cache_destroy(gfs2_bufdata_cachep);
+
+ if (gfs2_inode_cachep)
+ kmem_cache_destroy(gfs2_inode_cachep);
+
+ if (gfs2_glock_cachep)
+ kmem_cache_destroy(gfs2_glock_cachep);
+
+ gfs2_sys_uninit();
+ return error;
+}
+
+/**
+ * exit_gfs2_fs - Unregister the file system
+ *
+ */
+
+static void __exit exit_gfs2_fs(void)
+{
+ unregister_filesystem(&gfs2_fs_type);
+ unregister_filesystem(&gfs2meta_fs_type);
+
+ kmem_cache_destroy(gfs2_bufdata_cachep);
+ kmem_cache_destroy(gfs2_inode_cachep);
+ kmem_cache_destroy(gfs2_glock_cachep);
+
+ gfs2_sys_uninit();
+}
+
+MODULE_DESCRIPTION("Global File System");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+module_init(init_gfs2_fs);
+module_exit(exit_gfs2_fs);
+
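[Editor's note] init_gfs2_fs() above acquires its resources in order and, on any failure, jumps to a single unwind label that releases whatever was already set up. A tiny userspace sketch of that acquire-in-order/unwind-on-failure pattern, with invented resources standing in for the slab caches:

	#include <stdlib.h>

	/* Invented resources standing in for the slab caches above. */
	static void *glock_cache, *inode_cache, *bufdata_cache;

	static int setup(void)
	{
		glock_cache = calloc(1, 64);	/* stands in for kmem_cache_create() */
		if (!glock_cache)
			goto fail;

		inode_cache = calloc(1, 64);
		if (!inode_cache)
			goto fail;

		bufdata_cache = calloc(1, 64);
		if (!bufdata_cache)
			goto fail;

		return 0;			/* everything acquired */

	fail:
		/* Single unwind label: release whatever did get allocated, as
		 * init_gfs2_fs() does with its cache pointers.  free(NULL) is a
		 * no-op, which is why no explicit NULL checks are needed here. */
		free(bufdata_cache);
		free(inode_cache);
		free(glock_cache);
		return -1;
	}

	int main(void)
	{
		if (setup())
			return 1;
		/* Release in reverse order, as exit_gfs2_fs() does. */
		free(bufdata_cache);
		free(inode_cache);
		free(glock_cache);
		return 0;
	}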
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..3912d6a4b1e6
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/delay.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "ops_address.h"
+
+static int aspace_get_block(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh_result, int create)
+{
+ gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
+ return -EOPNOTSUPP;
+}
+
+static int gfs2_aspace_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ return block_write_full_page(page, aspace_get_block, wbc);
+}
+
+static const struct address_space_operations aspace_aops = {
+ .writepage = gfs2_aspace_writepage,
+ .releasepage = gfs2_releasepage,
+};
+
+/**
+ * gfs2_aspace_get - Create and initialize a struct inode
+ * @sdp: the filesystem the aspace is in
+ *
+ * Right now a struct inode is just a struct inode. Maybe Linux
+ * will supply a more lightweight address space construct (that works)
+ * in the future.
+ *
+ * Make sure pages/buffers in this aspace aren't in high memory.
+ *
+ * Returns: the aspace
+ */
+
+struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
+{
+ struct inode *aspace;
+
+ aspace = new_inode(sdp->sd_vfs);
+ if (aspace) {
+ mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
+ aspace->i_mapping->a_ops = &aspace_aops;
+ aspace->i_size = ~0ULL;
+ aspace->i_private = NULL;
+ insert_inode_hash(aspace);
+ }
+ return aspace;
+}
+
+void gfs2_aspace_put(struct inode *aspace)
+{
+ remove_inode_hash(aspace);
+ iput(aspace);
+}
+
+/**
+ * gfs2_meta_inval - Invalidate all buffers associated with a glock
+ * @gl: the glock
+ *
+ */
+
+void gfs2_meta_inval(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct inode *aspace = gl->gl_aspace;
+ struct address_space *mapping = gl->gl_aspace->i_mapping;
+
+ gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+
+ atomic_inc(&aspace->i_writecount);
+ truncate_inode_pages(mapping, 0);
+ atomic_dec(&aspace->i_writecount);
+
+ gfs2_assert_withdraw(sdp, !mapping->nrpages);
+}
+
+/**
+ * gfs2_meta_sync - Sync all buffers associated with a glock
+ * @gl: The glock
+ *
+ */
+
+void gfs2_meta_sync(struct gfs2_glock *gl)
+{
+ struct address_space *mapping = gl->gl_aspace->i_mapping;
+ int error;
+
+ filemap_fdatawrite(mapping);
+ error = filemap_fdatawait(mapping);
+
+ if (error)
+ gfs2_io_error(gl->gl_sbd);
+}
+
+/**
+ * getbuf - Get a buffer with a given address space
+ * @sdp: the filesystem
+ * @aspace: the address space
+ * @blkno: the block number (filesystem scope)
+ * @create: 1 if the buffer should be created
+ *
+ * Returns: the buffer
+ */
+
+static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
+ u64 blkno, int create)
+{
+ struct page *page;
+ struct buffer_head *bh;
+ unsigned int shift;
+ unsigned long index;
+ unsigned int bufnum;
+
+ shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+ index = blkno >> shift; /* convert block to page */
+ bufnum = blkno - (index << shift); /* block buf index within page */
+
+ if (create) {
+ for (;;) {
+ page = grab_cache_page(aspace->i_mapping, index);
+ if (page)
+ break;
+ yield();
+ }
+ } else {
+ page = find_lock_page(aspace->i_mapping, index);
+ if (!page)
+ return NULL;
+ }
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
+
+ /* Locate header for our buffer within our page */
+ for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
+ /* Do nothing */;
+ get_bh(bh);
+
+ if (!buffer_mapped(bh))
+ map_bh(bh, sdp->sd_vfs, blkno);
+
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
+
+ return bh;
+}
+
+static void meta_prep_new(struct buffer_head *bh)
+{
+ struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
+}
+
+/**
+ * gfs2_meta_new - Get a block
+ * @gl: The glock associated with this block
+ * @blkno: The block number
+ *
+ * Returns: The buffer
+ */
+
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
+{
+ struct buffer_head *bh;
+ bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
+ meta_prep_new(bh);
+ return bh;
+}
+
+/**
+ * gfs2_meta_read - Read a block from disk
+ * @gl: The glock covering the block
+ * @blkno: The block number
+ * @flags: flags
+ * @bhp: the place where the buffer is returned (NULL on failure)
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+ struct buffer_head **bhp)
+{
+ *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
+ if (!buffer_uptodate(*bhp))
+ ll_rw_block(READ_META, 1, bhp);
+ if (flags & DIO_WAIT) {
+ int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
+ if (error) {
+ brelse(*bhp);
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_meta_wait - Wait for a buffer read to complete
+ * @sdp: the filesystem
+ * @bh: The block to wait for
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ return -EIO;
+
+ wait_on_buffer(bh);
+
+ if (!buffer_uptodate(bh)) {
+ struct gfs2_trans *tr = current->journal_info;
+ if (tr && tr->tr_touched)
+ gfs2_io_error_bh(sdp, bh);
+ return -EIO;
+ }
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ return -EIO;
+
+ return 0;
+}
+
+/**
+ * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
+ * @gl: the glock the buffer belongs to
+ * @bh: The buffer to be attached to
+ * @meta: Flag to indicate whether it is metadata or not
+ */
+
+void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
+ int meta)
+{
+ struct gfs2_bufdata *bd;
+
+ if (meta)
+ lock_page(bh->b_page);
+
+ if (bh->b_private) {
+ if (meta)
+ unlock_page(bh->b_page);
+ return;
+ }
+
+ bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL);
+ memset(bd, 0, sizeof(struct gfs2_bufdata));
+ bd->bd_bh = bh;
+ bd->bd_gl = gl;
+
+ INIT_LIST_HEAD(&bd->bd_list_tr);
+ if (meta)
+ lops_init_le(&bd->bd_le, &gfs2_buf_lops);
+ else
+ lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
+ bh->b_private = bd;
+
+ if (meta)
+ unlock_page(bh->b_page);
+}
+
+/**
+ * gfs2_pin - Pin a buffer in memory
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to be pinned
+ *
+ */
+
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+ struct gfs2_bufdata *bd = bh->b_private;
+
+ gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+
+ if (test_set_buffer_pinned(bh))
+ gfs2_assert_withdraw(sdp, 0);
+
+ wait_on_buffer(bh);
+
+ /* If this buffer is in the AIL and it has already been written
+ to its in-place disk block, remove it from the AIL. */
+
+ gfs2_log_lock(sdp);
+ if (bd->bd_ail && !buffer_in_io(bh))
+ list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+ gfs2_log_unlock(sdp);
+
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+
+ if (!buffer_uptodate(bh))
+ gfs2_io_error_bh(sdp, bh);
+
+ get_bh(bh);
+}
+
+/**
+ * gfs2_unpin - Unpin a buffer
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to unpin
+ * @ai: the AIL entry this buffer should be added to
+ *
+ */
+
+void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ struct gfs2_ail *ai)
+{
+ struct gfs2_bufdata *bd = bh->b_private;
+
+ gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
+
+ if (!buffer_pinned(bh))
+ gfs2_assert_withdraw(sdp, 0);
+
+ mark_buffer_dirty(bh);
+ clear_buffer_pinned(bh);
+
+ gfs2_log_lock(sdp);
+ if (bd->bd_ail) {
+ list_del(&bd->bd_ail_st_list);
+ brelse(bh);
+ } else {
+ struct gfs2_glock *gl = bd->bd_gl;
+ list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
+ atomic_inc(&gl->gl_ail_count);
+ }
+ bd->bd_ail = ai;
+ list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+ gfs2_log_unlock(sdp);
+}
+
+/**
+ * gfs2_meta_wipe - make sure an inode's buffers are no longer dirty or pinned
+ * @ip: the inode who owns the buffers
+ * @bstart: the first buffer in the run
+ * @blen: the number of buffers in the run
+ *
+ */
+
+void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *aspace = ip->i_gl->gl_aspace;
+ struct buffer_head *bh;
+
+ while (blen) {
+ bh = getbuf(sdp, aspace, bstart, NO_CREATE);
+ if (bh) {
+ struct gfs2_bufdata *bd = bh->b_private;
+
+ if (test_clear_buffer_pinned(bh)) {
+ struct gfs2_trans *tr = current->journal_info;
+ gfs2_log_lock(sdp);
+ list_del_init(&bd->bd_le.le_list);
+ gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
+ sdp->sd_log_num_buf--;
+ gfs2_log_unlock(sdp);
+ tr->tr_num_buf_rm++;
+ brelse(bh);
+ }
+ if (bd) {
+ gfs2_log_lock(sdp);
+ if (bd->bd_ail) {
+ u64 blkno = bh->b_blocknr;
+ bd->bd_ail = NULL;
+ list_del(&bd->bd_ail_st_list);
+ list_del(&bd->bd_ail_gl_list);
+ atomic_dec(&bd->bd_gl->gl_ail_count);
+ brelse(bh);
+ gfs2_log_unlock(sdp);
+ gfs2_trans_add_revoke(sdp, blkno);
+ } else
+ gfs2_log_unlock(sdp);
+ }
+
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ clear_buffer_uptodate(bh);
+ unlock_buffer(bh);
+
+ brelse(bh);
+ }
+
+ bstart++;
+ blen--;
+ }
+}
+
+/**
+ * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
+ * @ip: The GFS2 inode
+ *
+ * This releases buffers that are in the most-recently-used array of
+ * blocks used for indirect block addressing for this inode.
+ */
+
+void gfs2_meta_cache_flush(struct gfs2_inode *ip)
+{
+ struct buffer_head **bh_slot;
+ unsigned int x;
+
+ spin_lock(&ip->i_spin);
+
+ for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
+ bh_slot = &ip->i_cache[x];
+ if (!*bh_slot)
+ break;
+ brelse(*bh_slot);
+ *bh_slot = NULL;
+ }
+
+ spin_unlock(&ip->i_spin);
+}
+
+/**
+ * gfs2_meta_indirect_buffer - Get a metadata buffer
+ * @ip: The GFS2 inode
+ * @height: The level of this buf in the metadata (indir addr) tree (if any)
+ * @num: The block number (device relative) of the buffer
+ * @new: Non-zero if we may create a new buffer
+ * @bhp: the buffer is returned here
+ *
+ * Try to use the gfs2_inode's MRU metadata tree cache.
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+ int new, struct buffer_head **bhp)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
+ int in_cache = 0;
+
+ spin_lock(&ip->i_spin);
+ if (*bh_slot && (*bh_slot)->b_blocknr == num) {
+ bh = *bh_slot;
+ get_bh(bh);
+ in_cache = 1;
+ }
+ spin_unlock(&ip->i_spin);
+
+ if (!bh)
+ bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE);
+
+ if (!bh)
+ return -ENOBUFS;
+
+ if (new) {
+ if (gfs2_assert_warn(sdp, height))
+ goto err;
+ meta_prep_new(bh);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+ gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+ } else {
+ u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+ if (!buffer_uptodate(bh)) {
+ ll_rw_block(READ_META, 1, &bh);
+ if (gfs2_meta_wait(sdp, bh))
+ goto err;
+ }
+ if (gfs2_metatype_check(sdp, bh, mtype))
+ goto err;
+ }
+
+ if (!in_cache) {
+ spin_lock(&ip->i_spin);
+ if (*bh_slot)
+ brelse(*bh_slot);
+ *bh_slot = bh;
+ get_bh(bh);
+ spin_unlock(&ip->i_spin);
+ }
+
+ *bhp = bh;
+ return 0;
+err:
+ brelse(bh);
+ return -EIO;
+}
+
+/**
+ * gfs2_meta_ra - start readahead on an extent of a file
+ * @gl: the glock the blocks belong to
+ * @dblock: the starting disk block
+ * @extlen: the number of blocks in the extent
+ *
+ * Returns: the first buffer in the extent
+ */
+
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct inode *aspace = gl->gl_aspace;
+ struct buffer_head *first_bh, *bh;
+ u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
+ sdp->sd_sb.sb_bsize_shift;
+
+ BUG_ON(!extlen);
+
+ if (max_ra < 1)
+ max_ra = 1;
+ if (extlen > max_ra)
+ extlen = max_ra;
+
+ first_bh = getbuf(sdp, aspace, dblock, CREATE);
+
+ if (buffer_uptodate(first_bh))
+ goto out;
+ if (!buffer_locked(first_bh))
+ ll_rw_block(READ_META, 1, &first_bh);
+
+ dblock++;
+ extlen--;
+
+ while (extlen) {
+ bh = getbuf(sdp, aspace, dblock, CREATE);
+
+ if (!buffer_uptodate(bh) && !buffer_locked(bh))
+ ll_rw_block(READA, 1, &bh);
+ brelse(bh);
+ dblock++;
+ extlen--;
+ if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
+ goto out;
+ }
+
+ wait_on_buffer(first_bh);
+out:
+ return first_bh;
+}
+
+/**
+ * gfs2_meta_syncfs - sync all the buffers in a filesystem
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
+{
+ gfs2_log_flush(sdp, NULL);
+ for (;;) {
+ gfs2_ail1_start(sdp, DIO_ALL);
+ if (gfs2_ail1_empty(sdp, DIO_ALL))
+ break;
+ msleep(10);
+ }
+}
+
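[Editor's note] getbuf() above converts a filesystem block number into a page index plus a buffer index within that page, using the difference between the page shift and the block-size shift. A small standalone sketch of that arithmetic (the 4096-byte page size is an assumption made for the example):

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT 12		/* 4096-byte pages, assumed for the example */

	/* Mirror the index arithmetic in getbuf(): with bsize_shift the log2 of
	 * the filesystem block size, a page holds 2^(PAGE_SHIFT - bsize_shift)
	 * blocks. */
	static void locate(uint64_t blkno, unsigned int bsize_shift)
	{
		unsigned int shift = PAGE_SHIFT - bsize_shift;
		unsigned long index = blkno >> shift;		/* page in the address space */
		unsigned int bufnum = blkno - (index << shift);	/* buffer within that page */

		printf("blkno %llu (bsize %u): page %lu, buffer %u\n",
		       (unsigned long long)blkno, 1u << bsize_shift, index, bufnum);
	}

	int main(void)
	{
		locate(13, 10);	/* 1KiB blocks: 4 per page -> page 3, buffer 1 */
		locate(13, 12);	/* 4KiB blocks: 1 per page -> page 13, buffer 0 */
		return 0;
	}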
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..3ec939e20dff
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DIO_DOT_H__
+#define __DIO_DOT_H__
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include "incore.h"
+
+static inline void gfs2_buffer_clear(struct buffer_head *bh)
+{
+ memset(bh->b_data, 0, bh->b_size);
+}
+
+static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
+{
+ BUG_ON(head > bh->b_size);
+ memset(bh->b_data + head, 0, bh->b_size - head);
+}
+
+static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
+ int to_head,
+ struct buffer_head *from_bh,
+ int from_head)
+{
+ BUG_ON(from_head < to_head);
+ memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
+ from_bh->b_size - from_head);
+ memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
+ 0, from_head - to_head);
+}
+
+struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
+void gfs2_aspace_put(struct inode *aspace);
+
+void gfs2_meta_inval(struct gfs2_glock *gl);
+void gfs2_meta_sync(struct gfs2_glock *gl);
+
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
+ int flags, struct buffer_head **bhp);
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+
+void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
+ int meta);
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ struct gfs2_ail *ai);
+
+void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+
+void gfs2_meta_cache_flush(struct gfs2_inode *ip);
+int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+ int new, struct buffer_head **bhp);
+
+static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
+ struct buffer_head **bhp)
+{
+ return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
+}
+
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+
+#define buffer_busy(bh) \
+((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
+#define buffer_in_io(bh) \
+((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
+
+#endif /* __DIO_DOT_H__ */
+
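[Editor's note] The buffer_busy() and buffer_in_io() macros at the end of the header above test several buffer-state bits at once with a composed mask over b_state. A standalone sketch of that idiom; the bit positions below are invented and do not correspond to the real BH_* values.

	#include <stdio.h>

	/* Illustrative state bits, standing in for BH_Dirty, BH_Lock and
	 * BH_Pinned in the macros above; the numeric values are made up. */
	enum { BIT_DIRTY = 0, BIT_LOCK = 1, BIT_PINNED = 2 };

	#define STATE_BUSY  ((1ul << BIT_DIRTY) | (1ul << BIT_LOCK) | (1ul << BIT_PINNED))
	#define STATE_IN_IO ((1ul << BIT_DIRTY) | (1ul << BIT_LOCK))

	static int busy(unsigned long state)  { return (state & STATE_BUSY) != 0; }
	static int in_io(unsigned long state) { return (state & STATE_IN_IO) != 0; }

	int main(void)
	{
		unsigned long pinned_only = 1ul << BIT_PINNED;

		/* A pinned but otherwise clean buffer counts as busy, yet not as in I/O. */
		printf("busy=%d in_io=%d\n", busy(pinned_only), in_io(pinned_only));
		return 0;
	}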
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..ef3092e29607
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "mount.h"
+#include "sys.h"
+#include "util.h"
+
+/**
+ * gfs2_mount_args - Parse mount options
+ * @sdp: the filesystem
+ * @data_arg: the mount option string
+ * @remount: non-zero if this is a remount rather than an initial mount
+ *
+ * Returns: errno
+ */
+
+int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
+{
+ struct gfs2_args *args = &sdp->sd_args;
+ char *data = data_arg;
+ char *options, *o, *v;
+ int error = 0;
+
+ if (!remount) {
+ /* If someone preloaded options, use those instead */
+ spin_lock(&gfs2_sys_margs_lock);
+ if (gfs2_sys_margs) {
+ data = gfs2_sys_margs;
+ gfs2_sys_margs = NULL;
+ }
+ spin_unlock(&gfs2_sys_margs_lock);
+
+ /* Set some defaults */
+ args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
+ args->ar_quota = GFS2_QUOTA_DEFAULT;
+ args->ar_data = GFS2_DATA_DEFAULT;
+ }
+
+ /* Split the options into tokens with the "," character and
+ process them */
+
+ for (options = data; (o = strsep(&options, ",")); ) {
+ if (!*o)
+ continue;
+
+ v = strchr(o, '=');
+ if (v)
+ *v++ = 0;
+
+ if (!strcmp(o, "lockproto")) {
+ if (!v)
+ goto need_value;
+ if (remount && strcmp(v, args->ar_lockproto))
+ goto cant_remount;
+ strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
+ args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
+ }
+
+ else if (!strcmp(o, "locktable")) {
+ if (!v)
+ goto need_value;
+ if (remount && strcmp(v, args->ar_locktable))
+ goto cant_remount;
+ strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
+ args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
+ }
+
+ else if (!strcmp(o, "hostdata")) {
+ if (!v)
+ goto need_value;
+ if (remount && strcmp(v, args->ar_hostdata))
+ goto cant_remount;
+ strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
+ args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
+ }
+
+ else if (!strcmp(o, "spectator")) {
+ if (remount && !args->ar_spectator)
+ goto cant_remount;
+ args->ar_spectator = 1;
+ sdp->sd_vfs->s_flags |= MS_RDONLY;
+ }
+
+ else if (!strcmp(o, "ignore_local_fs")) {
+ if (remount && !args->ar_ignore_local_fs)
+ goto cant_remount;
+ args->ar_ignore_local_fs = 1;
+ }
+
+ else if (!strcmp(o, "localflocks")) {
+ if (remount && !args->ar_localflocks)
+ goto cant_remount;
+ args->ar_localflocks = 1;
+ }
+
+ else if (!strcmp(o, "localcaching")) {
+ if (remount && !args->ar_localcaching)
+ goto cant_remount;
+ args->ar_localcaching = 1;
+ }
+
+ else if (!strcmp(o, "debug"))
+ args->ar_debug = 1;
+
+ else if (!strcmp(o, "nodebug"))
+ args->ar_debug = 0;
+
+ else if (!strcmp(o, "upgrade")) {
+ if (remount && !args->ar_upgrade)
+ goto cant_remount;
+ args->ar_upgrade = 1;
+ }
+
+ else if (!strcmp(o, "num_glockd")) {
+ unsigned int x;
+ if (!v)
+ goto need_value;
+ sscanf(v, "%u", &x);
+ if (remount && x != args->ar_num_glockd)
+ goto cant_remount;
+ if (!x || x > GFS2_GLOCKD_MAX) {
+ fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
+ GFS2_GLOCKD_MAX, x);
+ error = -EINVAL;
+ break;
+ }
+ args->ar_num_glockd = x;
+ }
+
+ else if (!strcmp(o, "acl")) {
+ args->ar_posix_acl = 1;
+ sdp->sd_vfs->s_flags |= MS_POSIXACL;
+ }
+
+ else if (!strcmp(o, "noacl")) {
+ args->ar_posix_acl = 0;
+ sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
+ }
+
+ else if (!strcmp(o, "quota")) {
+ if (!v)
+ goto need_value;
+ if (!strcmp(v, "off"))
+ args->ar_quota = GFS2_QUOTA_OFF;
+ else if (!strcmp(v, "account"))
+ args->ar_quota = GFS2_QUOTA_ACCOUNT;
+ else if (!strcmp(v, "on"))
+ args->ar_quota = GFS2_QUOTA_ON;
+ else {
+ fs_info(sdp, "invalid value for quota\n");
+ error = -EINVAL;
+ break;
+ }
+ }
+
+ else if (!strcmp(o, "suiddir"))
+ args->ar_suiddir = 1;
+
+ else if (!strcmp(o, "nosuiddir"))
+ args->ar_suiddir = 0;
+
+ else if (!strcmp(o, "data")) {
+ if (!v)
+ goto need_value;
+ if (!strcmp(v, "writeback"))
+ args->ar_data = GFS2_DATA_WRITEBACK;
+ else if (!strcmp(v, "ordered"))
+ args->ar_data = GFS2_DATA_ORDERED;
+ else {
+ fs_info(sdp, "invalid value for data\n");
+ error = -EINVAL;
+ break;
+ }
+ }
+
+ else {
+ fs_info(sdp, "unknown option: %s\n", o);
+ error = -EINVAL;
+ break;
+ }
+ }
+
+ if (error)
+ fs_info(sdp, "invalid mount option(s)\n");
+
+ if (data != data_arg)
+ kfree(data);
+
+ return error;
+
+need_value:
+ fs_info(sdp, "need value for option %s\n", o);
+ return -EINVAL;
+
+cant_remount:
+ fs_info(sdp, "can't remount with option %s\n", o);
+ return -EINVAL;
+}
+
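[Editor's note] gfs2_mount_args() above walks the option string with strsep(), splitting on ',' and then each token on '=' into option and value. A minimal userspace sketch of that tokenizing loop, with a made-up pair of options rather than the real GFS2 ones:

	#define _DEFAULT_SOURCE	/* for strsep() on glibc */
	#include <stdio.h>
	#include <string.h>

	/* Split a writable option string on ',' and each token on '='. */
	static int parse(char *data)
	{
		char *options, *o, *v;

		for (options = data; (o = strsep(&options, ",")); ) {
			if (!*o)
				continue;	/* skip empty tokens such as ",," */

			v = strchr(o, '=');
			if (v)
				*v++ = 0;	/* terminate the option name */

			if (!strcmp(o, "debug"))
				printf("debug on\n");
			else if (!strcmp(o, "quota")) {
				if (!v) {
					printf("need value for option %s\n", o);
					return -1;
				}
				printf("quota = %s\n", v);
			} else {
				printf("unknown option: %s\n", o);
				return -1;
			}
		}
		return 0;
	}

	int main(void)
	{
		char opts[] = "debug,quota=on";
		return parse(opts);
	}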
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..401288acfdf3
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __MOUNT_DOT_H__
+#define __MOUNT_DOT_H__
+
+struct gfs2_sbd;
+
+int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
+
+#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..1025960b0e6e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+
+#include "gfs2.h"
+#include <linux/gfs2_ondisk.h>
+
+#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \
+ struct->member);
+
+/*
+ * gfs2_xxx_in - read in an xxx struct
+ * first arg: the cpu-order structure
+ * buf: the disk-order buffer
+ *
+ * gfs2_xxx_out - write out an xxx struct
+ * first arg: the cpu-order structure
+ * buf: the disk-order buffer
+ *
+ * gfs2_xxx_print - print out an xxx struct
+ * first arg: the cpu-order structure
+ */
+
+void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
+{
+ const struct gfs2_inum *str = buf;
+
+ no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
+ no->no_addr = be64_to_cpu(str->no_addr);
+}
+
+void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
+{
+ struct gfs2_inum *str = buf;
+
+ str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
+ str->no_addr = cpu_to_be64(no->no_addr);
+}
+
+static void gfs2_inum_print(const struct gfs2_inum *no)
+{
+ printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
+ printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr);
+}
+
+static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
+{
+ const struct gfs2_meta_header *str = buf;
+
+ mh->mh_magic = be32_to_cpu(str->mh_magic);
+ mh->mh_type = be32_to_cpu(str->mh_type);
+ mh->mh_format = be32_to_cpu(str->mh_format);
+}
+
+static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
+{
+ struct gfs2_meta_header *str = buf;
+
+ str->mh_magic = cpu_to_be32(mh->mh_magic);
+ str->mh_type = cpu_to_be32(mh->mh_type);
+ str->mh_format = cpu_to_be32(mh->mh_format);
+}
+
+static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
+{
+ pv(mh, mh_magic, "0x%.8X");
+ pv(mh, mh_type, "%u");
+ pv(mh, mh_format, "%u");
+}
+
+void gfs2_sb_in(struct gfs2_sb *sb, const void *buf)
+{
+ const struct gfs2_sb *str = buf;
+
+ gfs2_meta_header_in(&sb->sb_header, buf);
+
+ sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+ sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+ sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+ sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+
+ gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
+ gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
+
+ memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
+ memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+}
+
+void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
+{
+ const struct gfs2_rindex *str = buf;
+
+ ri->ri_addr = be64_to_cpu(str->ri_addr);
+ ri->ri_length = be32_to_cpu(str->ri_length);
+ ri->ri_data0 = be64_to_cpu(str->ri_data0);
+ ri->ri_data = be32_to_cpu(str->ri_data);
+ ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
+
+}
+
+void gfs2_rindex_print(const struct gfs2_rindex *ri)
+{
+ printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
+ pv(ri, ri_length, "%u");
+
+ printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
+ pv(ri, ri_data, "%u");
+
+ pv(ri, ri_bitbytes, "%u");
+}
+
+void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
+{
+ const struct gfs2_rgrp *str = buf;
+
+ gfs2_meta_header_in(&rg->rg_header, buf);
+ rg->rg_flags = be32_to_cpu(str->rg_flags);
+ rg->rg_free = be32_to_cpu(str->rg_free);
+ rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
+ rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
+}
+
+void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf)
+{
+ struct gfs2_rgrp *str = buf;
+
+ gfs2_meta_header_out(&rg->rg_header, buf);
+ str->rg_flags = cpu_to_be32(rg->rg_flags);
+ str->rg_free = cpu_to_be32(rg->rg_free);
+ str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
+ str->__pad = cpu_to_be32(0);
+ str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
+ memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
+}
+
+void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
+{
+ const struct gfs2_quota *str = buf;
+
+ qu->qu_limit = be64_to_cpu(str->qu_limit);
+ qu->qu_warn = be64_to_cpu(str->qu_warn);
+ qu->qu_value = be64_to_cpu(str->qu_value);
+}
+
+void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
+{
+ const struct gfs2_dinode *str = buf;
+
+ gfs2_meta_header_in(&di->di_header, buf);
+ gfs2_inum_in(&di->di_num, &str->di_num);
+
+ di->di_mode = be32_to_cpu(str->di_mode);
+ di->di_uid = be32_to_cpu(str->di_uid);
+ di->di_gid = be32_to_cpu(str->di_gid);
+ di->di_nlink = be32_to_cpu(str->di_nlink);
+ di->di_size = be64_to_cpu(str->di_size);
+ di->di_blocks = be64_to_cpu(str->di_blocks);
+ di->di_atime = be64_to_cpu(str->di_atime);
+ di->di_mtime = be64_to_cpu(str->di_mtime);
+ di->di_ctime = be64_to_cpu(str->di_ctime);
+ di->di_major = be32_to_cpu(str->di_major);
+ di->di_minor = be32_to_cpu(str->di_minor);
+
+ di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
+ di->di_goal_data = be64_to_cpu(str->di_goal_data);
+ di->di_generation = be64_to_cpu(str->di_generation);
+
+ di->di_flags = be32_to_cpu(str->di_flags);
+ di->di_payload_format = be32_to_cpu(str->di_payload_format);
+ di->di_height = be16_to_cpu(str->di_height);
+
+ di->di_depth = be16_to_cpu(str->di_depth);
+ di->di_entries = be32_to_cpu(str->di_entries);
+
+ di->di_eattr = be64_to_cpu(str->di_eattr);
+
+}
+
+void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
+{
+ struct gfs2_dinode *str = buf;
+
+ gfs2_meta_header_out(&di->di_header, buf);
+ gfs2_inum_out(&di->di_num, (char *)&str->di_num);
+
+ str->di_mode = cpu_to_be32(di->di_mode);
+ str->di_uid = cpu_to_be32(di->di_uid);
+ str->di_gid = cpu_to_be32(di->di_gid);
+ str->di_nlink = cpu_to_be32(di->di_nlink);
+ str->di_size = cpu_to_be64(di->di_size);
+ str->di_blocks = cpu_to_be64(di->di_blocks);
+ str->di_atime = cpu_to_be64(di->di_atime);
+ str->di_mtime = cpu_to_be64(di->di_mtime);
+ str->di_ctime = cpu_to_be64(di->di_ctime);
+ str->di_major = cpu_to_be32(di->di_major);
+ str->di_minor = cpu_to_be32(di->di_minor);
+
+ str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
+ str->di_goal_data = cpu_to_be64(di->di_goal_data);
+ str->di_generation = cpu_to_be64(di->di_generation);
+
+ str->di_flags = cpu_to_be32(di->di_flags);
+ str->di_payload_format = cpu_to_be32(di->di_payload_format);
+ str->di_height = cpu_to_be16(di->di_height);
+
+ str->di_depth = cpu_to_be16(di->di_depth);
+ str->di_entries = cpu_to_be32(di->di_entries);
+
+ str->di_eattr = cpu_to_be64(di->di_eattr);
+
+}
+
+void gfs2_dinode_print(const struct gfs2_dinode *di)
+{
+ gfs2_meta_header_print(&di->di_header);
+ gfs2_inum_print(&di->di_num);
+
+ pv(di, di_mode, "0%o");
+ pv(di, di_uid, "%u");
+ pv(di, di_gid, "%u");
+ pv(di, di_nlink, "%u");
+ printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
+ printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks);
+ printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime);
+ printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime);
+ printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime);
+ pv(di, di_major, "%u");
+ pv(di, di_minor, "%u");
+
+ printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
+ printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
+
+ pv(di, di_flags, "0x%.8X");
+ pv(di, di_payload_format, "%u");
+ pv(di, di_height, "%u");
+
+ pv(di, di_depth, "%u");
+ pv(di, di_entries, "%u");
+
+ printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr);
+}
+
+void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
+{
+ const struct gfs2_log_header *str = buf;
+
+ gfs2_meta_header_in(&lh->lh_header, buf);
+ lh->lh_sequence = be64_to_cpu(str->lh_sequence);
+ lh->lh_flags = be32_to_cpu(str->lh_flags);
+ lh->lh_tail = be32_to_cpu(str->lh_tail);
+ lh->lh_blkno = be32_to_cpu(str->lh_blkno);
+ lh->lh_hash = be32_to_cpu(str->lh_hash);
+}
+
+void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
+{
+ const struct gfs2_inum_range *str = buf;
+
+ ir->ir_start = be64_to_cpu(str->ir_start);
+ ir->ir_length = be64_to_cpu(str->ir_length);
+}
+
+void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf)
+{
+ struct gfs2_inum_range *str = buf;
+
+ str->ir_start = cpu_to_be64(ir->ir_start);
+ str->ir_length = cpu_to_be64(ir->ir_length);
+}
+
+void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
+{
+ const struct gfs2_statfs_change *str = buf;
+
+ sc->sc_total = be64_to_cpu(str->sc_total);
+ sc->sc_free = be64_to_cpu(str->sc_free);
+ sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
+}
+
+void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf)
+{
+ struct gfs2_statfs_change *str = buf;
+
+ str->sc_total = cpu_to_be64(sc->sc_total);
+ str->sc_free = cpu_to_be64(sc->sc_free);
+ str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
+}
+
+void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf)
+{
+ const struct gfs2_quota_change *str = buf;
+
+ qc->qc_change = be64_to_cpu(str->qc_change);
+ qc->qc_flags = be32_to_cpu(str->qc_flags);
+ qc->qc_id = be32_to_cpu(str->qc_id);
+}
+
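[Editor's note] ondisk.c above converts each on-disk structure between big-endian disk order and CPU byte order with paired _in/_out helpers. A userspace sketch of the same pattern, using htobe64()/be64toh() as stand-ins for cpu_to_be64()/be64_to_cpu() and an invented two-field structure:

	#define _DEFAULT_SOURCE	/* for htobe64()/be64toh() on glibc */
	#include <stdio.h>
	#include <stdint.h>
	#include <endian.h>

	/* Illustrative on-disk vs in-core pair, mirroring the _in/_out helpers
	 * above.  The type and field names are invented for the example. */
	struct ondisk_inum { uint64_t formal_ino; uint64_t addr; };	/* big-endian on disk */
	struct incore_inum { uint64_t formal_ino; uint64_t addr; };	/* CPU byte order */

	static void inum_in(struct incore_inum *no, const void *buf)
	{
		const struct ondisk_inum *str = buf;
		no->formal_ino = be64toh(str->formal_ino);
		no->addr = be64toh(str->addr);
	}

	static void inum_out(const struct incore_inum *no, void *buf)
	{
		struct ondisk_inum *str = buf;
		str->formal_ino = htobe64(no->formal_ino);
		str->addr = htobe64(no->addr);
	}

	int main(void)
	{
		struct incore_inum in = { .formal_ino = 7, .addr = 123456 }, back;
		struct ondisk_inum disk;

		inum_out(&in, &disk);	/* CPU order -> disk order */
		inum_in(&back, &disk);	/* disk order -> CPU order */
		printf("roundtrip: %llu %llu\n",
		       (unsigned long long)back.formal_ino,
		       (unsigned long long)back.addr);
		return 0;
	}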
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..8d5963c7e123
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_address.h"
+#include "quota.h"
+#include "trans.h"
+#include "rgrp.h"
+#include "ops_file.h"
+#include "util.h"
+#include "glops.h"
+
+
+static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+ unsigned int from, unsigned int to)
+{
+ struct buffer_head *head = page_buffers(page);
+ unsigned int bsize = head->b_size;
+ struct buffer_head *bh;
+ unsigned int start, end;
+
+ for (bh = head, start = 0; bh != head || !start;
+ bh = bh->b_this_page, start = end) {
+ end = start + bsize;
+ if (end <= from || start >= to)
+ continue;
+ gfs2_trans_add_bh(ip->i_gl, bh, 0);
+ }
+}
+
+/**
+ * gfs2_get_block - Fills in a buffer head with details about a block
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Non-zero if we may add block to the file
+ *
+ * Returns: errno
+ */
+
+int gfs2_get_block(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh_result, int create)
+{
+ return gfs2_block_map(inode, lblock, create, bh_result);
+}
+
+/**
+ * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Non-zero if we may add block to the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh_result, int create)
+{
+ int error;
+
+ error = gfs2_block_map(inode, lblock, 0, bh_result);
+ if (error)
+ return error;
+ if (bh_result->b_blocknr == 0)
+ return -EIO;
+ return 0;
+}
+
+static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh_result, int create)
+{
+ return gfs2_block_map(inode, lblock, 0, bh_result);
+}
+
+/**
+ * gfs2_writepage - Write complete page
+ * @page: Page to write
+ *
+ * Returns: errno
+ *
+ * Some of this is copied from block_write_full_page() although we still
+ * call it to do most of the work.
+ */
+
+static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ loff_t i_size = i_size_read(inode);
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+ int error;
+ int done_trans = 0;
+
+ if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
+ unlock_page(page);
+ return -EIO;
+ }
+ if (current->journal_info)
+ goto out_ignore;
+
+ /* Is the page fully outside i_size? (truncate in progress) */
+ offset = i_size & (PAGE_CACHE_SIZE-1);
+ if (page->index > end_index || (page->index == end_index && !offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0);
+ unlock_page(page);
+ return 0; /* don't care */
+ }
+
+ if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
+ error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+ if (error)
+ goto out_ignore;
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, inode->i_sb->s_blocksize,
+ (1 << BH_Dirty)|(1 << BH_Uptodate));
+ }
+ gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+ done_trans = 1;
+ }
+ error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+ if (done_trans)
+ gfs2_trans_end(sdp);
+ gfs2_meta_cache_flush(ip);
+ return error;
+
+out_ignore:
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+}
+
+static int zero_readpage(struct page *page)
+{
+ void *kaddr;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr, 0, PAGE_CACHE_SIZE);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ SetPageUptodate(page);
+
+ return 0;
+}
+
+/**
+ * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * @ip: the inode
+ * @page: the page
+ *
+ * Returns: errno
+ */
+
+static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+{
+ struct buffer_head *dibh;
+ void *kaddr;
+ int error;
+
+ /* Only the first page of a stuffed file might contain data */
+ if (unlikely(page->index))
+ return zero_readpage(page);
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+ ip->i_di.di_size);
+ memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ brelse(dibh);
+
+ SetPageUptodate(page);
+
+ return 0;
+}
+
+
+/**
+ * gfs2_readpage - readpage with locking
+ * @file: The file to read a page for. N.B. This may be NULL if we are
+ * reading an internal file.
+ * @page: The page to read
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+ struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+ struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ struct gfs2_file *gf = NULL;
+ struct gfs2_holder gh;
+ int error;
+ int do_unlock = 0;
+
+ if (likely(file != &gfs2_internal_file_sentinel)) {
+ if (file) {
+ gf = file->private_data;
+ if (test_bit(GFF_EXLOCK, &gf->f_flags))
+ /* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
+ goto skip_lock;
+ }
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
+ do_unlock = 1;
+ error = gfs2_glock_nq_m_atime(1, &gh);
+ if (unlikely(error))
+ goto out_unlock;
+ }
+
+skip_lock:
+ if (gfs2_is_stuffed(ip)) {
+ error = stuffed_readpage(ip, page);
+ unlock_page(page);
+ } else
+ error = mpage_readpage(page, gfs2_get_block);
+
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ error = -EIO;
+
+ if (do_unlock) {
+ gfs2_glock_dq_m(1, &gh);
+ gfs2_holder_uninit(&gh);
+ }
+out:
+ return error;
+out_unlock:
+ unlock_page(page);
+ if (do_unlock)
+ gfs2_holder_uninit(&gh);
+ goto out;
+}
+
+/**
+ * gfs2_readpages - Read a bunch of pages at once
+ *
+ * Some notes:
+ * 1. This is only for readahead, so we can simply ignore any things
+ * which are slightly inconvenient (such as locking conflicts between
+ * the page lock and the glock) and return having done no I/O. It's
+ * obviously not something we'd want to do on too regular a basis.
+ * Any I/O we ignore at this time will be done via readpage later.
+ * 2. We have to handle stuffed files here too.
+ * 3. mpage_readpages() does most of the heavy lifting in the common case.
+ * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
+ * 5. We use LM_FLAG_TRY_1CB here, so that, effectively, we get lock-ahead
+ * as well as read-ahead.
+ */
+static int gfs2_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct inode *inode = mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_holder gh;
+ unsigned page_idx;
+ int ret;
+ int do_unlock = 0;
+
+ if (likely(file != &gfs2_internal_file_sentinel)) {
+ if (file) {
+ struct gfs2_file *gf = file->private_data;
+ if (test_bit(GFF_EXLOCK, &gf->f_flags))
+ goto skip_lock;
+ }
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
+ LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
+ do_unlock = 1;
+ ret = gfs2_glock_nq_m_atime(1, &gh);
+ if (ret == GLR_TRYFAILED)
+ goto out_noerror;
+ if (unlikely(ret))
+ goto out_unlock;
+ }
+skip_lock:
+ if (gfs2_is_stuffed(ip)) {
+ struct pagevec lru_pvec;
+ pagevec_init(&lru_pvec, 0);
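+ /* mpage_readpages() can't handle stuffed files, so add each page
+ to the page cache and fill it from the dinode by hand */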
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page = list_entry(pages->prev, struct page, lru);
+ prefetchw(&page->flags);
+ list_del(&page->lru);
+ if (!add_to_page_cache(page, mapping,
+ page->index, GFP_KERNEL)) {
+ ret = stuffed_readpage(ip, page);
+ unlock_page(page);
+ if (!pagevec_add(&lru_pvec, page))
+ __pagevec_lru_add(&lru_pvec);
+ } else {
+ page_cache_release(page);
+ }
+ }
+ pagevec_lru_add(&lru_pvec);
+ ret = 0;
+ } else {
+ /* What we really want to do .... */
+ ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
+ }
+
+ if (do_unlock) {
+ gfs2_glock_dq_m(1, &gh);
+ gfs2_holder_uninit(&gh);
+ }
+out:
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = -EIO;
+ return ret;
+out_noerror:
+ ret = 0;
+out_unlock:
+ /* unlock all pages, we can't do any I/O right now */
+ for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+ struct page *page = list_entry(pages->prev, struct page, lru);
+ list_del(&page->lru);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ if (do_unlock)
+ gfs2_holder_uninit(&gh);
+ goto out;
+}
+
+/**
+ * gfs2_prepare_write - Prepare to write a page to a file
+ * @file: The file to write to
+ * @page: The page which is to be prepared for writing
+ * @from: From (byte range within page)
+ * @to: To (byte range within page)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_prepare_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+ struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ unsigned int data_blocks, ind_blocks, rblocks;
+ int alloc_required;
+ int error = 0;
+ loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
+ loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ struct gfs2_alloc *al;
+ unsigned int write_len = to - from;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
+ error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
+ if (error)
+ goto out_uninit;
+
+ gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks);
+
+ error = gfs2_write_alloc_required(ip, pos, write_len, &alloc_required);
+ if (error)
+ goto out_unlock;
+
+ ip->i_alloc.al_requested = 0;
+ if (alloc_required) {
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out_alloc_put;
+
+ error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+ if (error)
+ goto out_qunlock;
+
+ al->al_requested = data_blocks + ind_blocks;
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_qunlock;
+ }
+
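+ /* Reserve journal space for the dinode, any new indirect blocks,
+ the data blocks themselves for jdata writes, plus statfs and
+ quota changes */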
+ rblocks = RES_DINODE + ind_blocks;
+ if (gfs2_is_jdata(ip))
+ rblocks += data_blocks ? data_blocks : 1;
+ if (ind_blocks || data_blocks)
+ rblocks += RES_STATFS + RES_QUOTA;
+
+ error = gfs2_trans_begin(sdp, rblocks, 0);
+ if (error)
+ goto out;
+
+ if (gfs2_is_stuffed(ip)) {
+ if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+ error = gfs2_unstuff_dinode(ip, page);
+ if (error == 0)
+ goto prepare_write;
+ } else if (!PageUptodate(page))
+ error = stuffed_readpage(ip, page);
+ goto out;
+ }
+
+prepare_write:
+ error = block_prepare_write(page, from, to, gfs2_get_block);
+
+out:
+ if (error) {
+ gfs2_trans_end(sdp);
+ if (alloc_required) {
+ gfs2_inplace_release(ip);
+out_qunlock:
+ gfs2_quota_unlock(ip);
+out_alloc_put:
+ gfs2_alloc_put(ip);
+ }
+out_unlock:
+ gfs2_glock_dq_m(1, &ip->i_gh);
+out_uninit:
+ gfs2_holder_uninit(&ip->i_gh);
+ }
+
+ return error;
+}
+
+/**
+ * gfs2_commit_write - Commit write to a file
+ * @file: The file to write to
+ * @page: The page containing the data
+ * @from: From (byte range within page)
+ * @to: To (byte range within page)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+{
+ struct inode *inode = page->mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ int error = -EOPNOTSUPP;
+ struct buffer_head *dibh;
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_dinode *di;
+
+ if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
+ goto fail_nounlock;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto fail_endtrans;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ di = (struct gfs2_dinode *)dibh->b_data;
+
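+ /* For a stuffed file, copy the data straight back into the dinode
+ block; otherwise let the generic path write out the buffers */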
+ if (gfs2_is_stuffed(ip)) {
+ u64 file_size;
+ void *kaddr;
+
+ file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
+ kaddr + from, to - from);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ SetPageUptodate(page);
+
+ if (inode->i_size < file_size)
+ i_size_write(inode, file_size);
+ } else {
+ if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
+ gfs2_is_jdata(ip))
+ gfs2_page_add_databufs(ip, page, from, to);
+ error = generic_commit_write(file, page, from, to);
+ if (error)
+ goto fail;
+ }
+
+ if (ip->i_di.di_size < inode->i_size) {
+ ip->i_di.di_size = inode->i_size;
+ di->di_size = cpu_to_be64(inode->i_size);
+ }
+
+ di->di_mode = cpu_to_be32(inode->i_mode);
+ di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
+ di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
+ di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
+
+ brelse(dibh);
+ gfs2_trans_end(sdp);
+ if (al->al_requested) {
+ gfs2_inplace_release(ip);
+ gfs2_quota_unlock(ip);
+ gfs2_alloc_put(ip);
+ }
+ gfs2_glock_dq_m(1, &ip->i_gh);
+ gfs2_holder_uninit(&ip->i_gh);
+ return 0;
+
+fail:
+ brelse(dibh);
+fail_endtrans:
+ gfs2_trans_end(sdp);
+ if (al->al_requested) {
+ gfs2_inplace_release(ip);
+ gfs2_quota_unlock(ip);
+ gfs2_alloc_put(ip);
+ }
+ gfs2_glock_dq_m(1, &ip->i_gh);
+ gfs2_holder_uninit(&ip->i_gh);
+fail_nounlock:
+ ClearPageUptodate(page);
+ return error;
+}
+
+/**
+ * gfs2_bmap - Block map function
+ * @mapping: Address space info
+ * @lblock: The block to map
+ *
+ * Returns: The disk address for the block or 0 on hole or error
+ */
+
+static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
+{
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_holder i_gh;
+ sector_t dblock = 0;
+ int error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return 0;
+
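+ /* A stuffed file keeps its data in the dinode, so there are
+ no separate data blocks to map */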
+ if (!gfs2_is_stuffed(ip))
+ dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return dblock;
+}
+
+static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+ struct gfs2_bufdata *bd;
+
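+ /* Detach any gfs2_bufdata from the buffer under the log lock,
+ then reset the buffer's state bits */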
+ gfs2_log_lock(sdp);
+ bd = bh->b_private;
+ if (bd) {
+ bd->bd_bh = NULL;
+ bh->b_private = NULL;
+ }
+ gfs2_log_unlock(sdp);
+
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ bh->b_bdev = NULL;
+ clear_buffer_mapped(bh);
+ clear_buffer_req(bh);
+ clear_buffer_new(bh);
+ clear_buffer_delay(bh);
+ unlock_buffer(bh);
+}
+
+static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+ struct buffer_head *head, *bh, *next;
+ unsigned int curr_off = 0;
+
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ return;
+
+ bh = head = page_buffers(page);
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+ next = bh->b_this_page;
+
+ if (offset <= curr_off)
+ discard_buffer(sdp, bh);
+
+ curr_off = next_off;
+ bh = next;
+ } while (bh != head);
+
+ if (!offset)
+ try_to_release_page(page, 0);
+
+ return;
+}
+
+static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int rv;
+
+ if (rw == READ)
+ mutex_lock(&inode->i_mutex);
+ /*
+ * Shared lock, even if it's a write, since we do no allocation
+ * on this path. All we need to change is the atime.
+ */
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+ rv = gfs2_glock_nq_m_atime(1, &gh);
+ if (rv)
+ goto out;
+
+ if (offset > i_size_read(inode))
+ goto out;
+
+ /*
+ * Should we return an error here? I can't see that O_DIRECT for
+ * a journaled file makes any sense. For now we'll silently fall
+ * back to buffered I/O; we do the same for stuffed files, since
+ * they are (a) small and (b) unaligned.
+ */
+ if (gfs2_is_jdata(ip))
+ goto out;
+
+ if (gfs2_is_stuffed(ip))
+ goto out;
+
+ rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
+ inode->i_sb->s_bdev,
+ iov, offset, nr_segs,
+ gfs2_get_block_direct, NULL);
+out:
+ gfs2_glock_dq_m(1, &gh);
+ gfs2_holder_uninit(&gh);
+ if (rw == READ)
+ mutex_unlock(&inode->i_mutex);
+
+ return rv;
+}
+
+/**
+ * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out.
+ * @bh: the buffer we're stuck on
+ *
+ */
+
+static void stuck_releasepage(struct buffer_head *bh)
+{
+ struct inode *inode = bh->b_page->mapping->host;
+ struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+ struct gfs2_bufdata *bd = bh->b_private;
+ struct gfs2_glock *gl;
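+ /* Limit how many times this diagnostic is printed */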
+ static unsigned limit = 0;
+
+ if (limit > 3)
+ return;
+ limit++;
+
+ fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
+ fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
+ (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
+ fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
+ fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
+
+ if (!bd)
+ return;
+
+ gl = bd->bd_gl;
+
+ fs_warn(sdp, "gl = (%u, %llu)\n",
+ gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
+
+ fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
+ (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
+ (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
+
+ if (gl->gl_ops == &gfs2_inode_glops) {
+ struct gfs2_inode *ip = gl->gl_object;
+ unsigned int x;
+
+ if (!ip)
+ return;
+
+ fs_warn(sdp, "ip = %llu %llu\n",
+ (unsigned long long)ip->i_num.no_formal_ino,
+ (unsigned long long)ip->i_num.no_addr);
+
+ for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
+ fs_warn(sdp, "ip->i_cache[%u] = %s\n",
+ x, (ip->i_cache[x]) ? "!NULL" : "NULL");
+ }
+}
+
+/**
+ * gfs2_releasepage - free the metadata associated with a page
+ * @page: the page that's being released
+ * @gfp_mask: passed from Linux VFS, ignored by us
+ *
+ * Call try_to_free_buffers() if the buffers in this page can be
+ * released.
+ *
+ * Returns: 1 if the page's buffers were released, 0 otherwise
+ */
+
+int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
+{
+ struct inode *aspace = page->mapping->host;
+ struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
+ struct buffer_head *bh, *head;
+ struct gfs2_bufdata *bd;
+ unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
+
+ if (!page_has_buffers(page))
+ goto out;
+
+ head = bh = page_buffers(page);
+ do {
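+ /* Wait for any other users of the buffer to drop their references;
+ if we seem to be stuck, print a diagnostic and give up */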
+ while (atomic_read(&bh->b_count)) {
+ if (!atomic_read(&aspace->i_writecount))
+ return 0;
+
+ if (time_after_eq(jiffies, t)) {
+ stuck_releasepage(bh);
+ /* should we withdraw here? */
+ return 0;
+ }
+
+ yield();
+ }
+
+ gfs2_assert_warn(sdp, !buffer_pinned(bh));
+ gfs2_assert_warn(sdp, !buffer_dirty(bh));
+
+ gfs2_log_lock(sdp);
+ bd = bh->b_private;
+ if (bd) {
+ gfs2_assert_warn(sdp, bd->bd_bh == bh);
+ gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
+ gfs2_assert_warn(sdp, !bd->bd_ail);
+ bd->bd_bh = NULL;
+ if (!list_empty(&bd->bd_le.le_list))
+ bd = NULL;
+ bh->b_private = NULL;
+ }
+ gfs2_log_unlock(sdp);
+ if (bd)
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+out:
+ return try_to_free_buffers(page);
+}
+
+const struct address_space_operations gfs2_file_aops = {
+ .writepage = gfs2_writepage,
+ .readpage = gfs2_readpage,
+ .readpages = gfs2_readpages,
+ .sync_page = block_sync_page,
+ .prepare_write = gfs2_prepare_write,
+ .commit_write = gfs2_commit_write,
+ .bmap = gfs2_bmap,
+ .invalidatepage = gfs2_invalidatepage,
+ .releasepage = gfs2_releasepage,
+ .direct_IO = gfs2_direct_IO,
+};
+
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..35aaee4aa7e1
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_ADDRESS_DOT_H__
+#define __OPS_ADDRESS_DOT_H__
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+
+extern const struct address_space_operations gfs2_file_aops;
+extern int gfs2_get_block(struct inode *inode, sector_t lblock,
+ struct buffer_head *bh_result, int create);
+extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+
+#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..00041b1b8025
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "ops_dentry.h"
+#include "util.h"
+
+/**
+ * gfs2_drevalidate - Check directory lookup consistency
+ * @dentry: the dentry to check
+ * @nd: lookup nameidata (unused)
+ *
+ * Check to make sure the lookup necessary to arrive at this inode from its
+ * parent is still good.
+ *
+ * Returns: 1 if the dentry is ok, 0 if it isn't
+ */
+
+static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ struct dentry *parent = dget_parent(dentry);
+ struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
+ struct gfs2_inode *dip = GFS2_I(parent->d_inode);
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_holder d_gh;
+ struct gfs2_inode *ip;
+ struct gfs2_inum inum;
+ unsigned int type;
+ int error;
+
+ if (inode && is_bad_inode(inode))
+ goto invalid;
+
+ if (sdp->sd_args.ar_localcaching)
+ goto valid;
+
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+ if (error)
+ goto fail;
+
+ error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
+ switch (error) {
+ case 0:
+ if (!inode)
+ goto invalid_gunlock;
+ break;
+ case -ENOENT:
+ if (!inode)
+ goto valid_gunlock;
+ goto invalid_gunlock;
+ default:
+ goto fail_gunlock;
+ }
+
+ ip = GFS2_I(inode);
+
+ if (!gfs2_inum_equal(&ip->i_num, &inum))
+ goto invalid_gunlock;
+
+ if (IF2DT(ip->i_di.di_mode) != type) {
+ gfs2_consist_inode(dip);
+ goto fail_gunlock;
+ }
+
+valid_gunlock:
+ gfs2_glock_dq_uninit(&d_gh);
+valid:
+ dput(parent);
+ return 1;
+
+invalid_gunlock:
+ gfs2_glock_dq_uninit(&d_gh);
+invalid:
+ if (inode && S_ISDIR(inode->i_mode)) {
+ if (have_submounts(dentry))
+ goto valid;
+ shrink_dcache_parent(dentry);
+ }
+ d_drop(dentry);
+ dput(parent);
+ return 0;
+
+fail_gunlock:
+ gfs2_glock_dq_uninit(&d_gh);
+fail:
+ dput(parent);
+ return 0;
+}
+
+static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
+{
+ str->hash = gfs2_disk_hash(str->name, str->len);
+ return 0;
+}
+
+struct dentry_operations gfs2_dops = {
+ .d_revalidate = gfs2_drevalidate,
+ .d_hash = gfs2_dhash,
+};
+
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..5caa3db4d3f5
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_DENTRY_DOT_H__
+#define __OPS_DENTRY_DOT_H__
+
+#include <linux/dcache.h>
+
+extern struct dentry_operations gfs2_dops;
+
+#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..86127d93bd35
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "ops_export.h"
+#include "rgrp.h"
+#include "util.h"
+
+static struct dentry *gfs2_decode_fh(struct super_block *sb,
+ __u32 *fh,
+ int fh_len,
+ int fh_type,
+ int (*acceptable)(void *context,
+ struct dentry *dentry),
+ void *context)
+{
+ struct gfs2_fh_obj fh_obj;
+ struct gfs2_inum *this, parent;
+
+ if (fh_type != fh_len)
+ return NULL;
+
+ this = &fh_obj.this;
+ fh_obj.imode = DT_UNKNOWN;
+ memset(&parent, 0, sizeof(struct gfs2_inum));
+
+ switch (fh_type) {
+ case GFS2_LARGE_FH_SIZE:
+ parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
+ parent.no_formal_ino |= be32_to_cpu(fh[5]);
+ parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
+ parent.no_addr |= be32_to_cpu(fh[7]);
+ fh_obj.imode = be32_to_cpu(fh[8]);
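+ /* fall through */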
+ case GFS2_SMALL_FH_SIZE:
+ this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
+ this->no_formal_ino |= be32_to_cpu(fh[1]);
+ this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
+ this->no_addr |= be32_to_cpu(fh[3]);
+ break;
+ default:
+ return NULL;
+ }
+
+ return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
+ acceptable, context);
+}
+
+static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
+ int connectable)
+{
+ struct inode *inode = dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct gfs2_inode *ip = GFS2_I(inode);
+
+ if (*len < GFS2_SMALL_FH_SIZE ||
+ (connectable && *len < GFS2_LARGE_FH_SIZE))
+ return 255;
+
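+ /* Pack the 64-bit inode numbers as big-endian 32-bit halves */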
+ fh[0] = ip->i_num.no_formal_ino >> 32;
+ fh[0] = cpu_to_be32(fh[0]);
+ fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
+ fh[1] = cpu_to_be32(fh[1]);
+ fh[2] = ip->i_num.no_addr >> 32;
+ fh[2] = cpu_to_be32(fh[2]);
+ fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
+ fh[3] = cpu_to_be32(fh[3]);
+ *len = GFS2_SMALL_FH_SIZE;
+
+ if (!connectable || inode == sb->s_root->d_inode)
+ return *len;
+
+ spin_lock(&dentry->d_lock);
+ inode = dentry->d_parent->d_inode;
+ ip = GFS2_I(inode);
+ igrab(inode);
+ spin_unlock(&dentry->d_lock);
+
+ fh[4] = ip->i_num.no_formal_ino >> 32;
+ fh[4] = cpu_to_be32(fh[4]);
+ fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
+ fh[5] = cpu_to_be32(fh[5]);
+ fh[6] = ip->i_num.no_addr >> 32;
+ fh[6] = cpu_to_be32(fh[6]);
+ fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
+ fh[7] = cpu_to_be32(fh[7]);
+
+ fh[8] = cpu_to_be32(inode->i_mode);
+ fh[9] = 0; /* pad to double word */
+ *len = GFS2_LARGE_FH_SIZE;
+
+ iput(inode);
+
+ return *len;
+}
+
+struct get_name_filldir {
+ struct gfs2_inum inum;
+ char *name;
+};
+
+static int get_name_filldir(void *opaque, const char *name, unsigned int length,
+ u64 offset, struct gfs2_inum *inum,
+ unsigned int type)
+{
+ struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
+
+ if (!gfs2_inum_equal(inum, &gnfd->inum))
+ return 0;
+
+ memcpy(gnfd->name, name, length);
+ gnfd->name[length] = 0;
+
+ return 1;
+}
+
+static int gfs2_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct inode *dir = parent->d_inode;
+ struct inode *inode = child->d_inode;
+ struct gfs2_inode *dip, *ip;
+ struct get_name_filldir gnfd;
+ struct gfs2_holder gh;
+ u64 offset = 0;
+ int error;
+
+ if (!dir)
+ return -EINVAL;
+
+ if (!S_ISDIR(dir->i_mode) || !inode)
+ return -EINVAL;
+
+ dip = GFS2_I(dir);
+ ip = GFS2_I(inode);
+
+ *name = 0;
+ gnfd.inum = ip->i_num;
+ gnfd.name = name;
+
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
+ if (error)
+ return error;
+
+ error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+
+ gfs2_glock_dq_uninit(&gh);
+
+ if (!error && !*name)
+ error = -ENOENT;
+
+ return error;
+}
+
+static struct dentry *gfs2_get_parent(struct dentry *child)
+{
+ struct qstr dotdot;
+ struct inode *inode;
+ struct dentry *dentry;
+
+ gfs2_str2qstr(&dotdot, "..");
+ inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
+
+ if (!inode)
+ return ERR_PTR(-ENOENT);
+ /*
+ * In case of an error, @inode carries the error value, and we
+ * have to return that as a(n invalid) pointer to dentry.
+ */
+ if (IS_ERR(inode))
+ return ERR_PTR(PTR_ERR(inode));
+
+ dentry = d_alloc_anon(inode);
+ if (!dentry) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return dentry;
+}
+
+static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
+ struct gfs2_inum *inum = &fh_obj->this;
+ struct gfs2_holder i_gh, ri_gh, rgd_gh;
+ struct gfs2_rgrpd *rgd;
+ struct inode *inode;
+ struct dentry *dentry;
+ int error;
+
+ /* System files? */
+
+ inode = gfs2_ilookup(sb, inum);
+ if (inode) {
+ if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ goto out_inode;
+ }
+
+ error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
+ LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
+ &i_gh);
+ if (error)
+ return ERR_PTR(error);
+
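+ /* Check the resource group bitmap to make sure this block really
+ is an in-use dinode before trying to read it in as an inode */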
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto fail;
+
+ error = -EINVAL;
+ rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
+ if (!rgd)
+ goto fail_rindex;
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+ if (error)
+ goto fail_rindex;
+
+ error = -ESTALE;
+ if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
+ goto fail_rgd;
+
+ gfs2_glock_dq_uninit(&rgd_gh);
+ gfs2_glock_dq_uninit(&ri_gh);
+
+ inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
+ if (!inode)
+ goto fail;
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
+ goto fail;
+ }
+
+ error = gfs2_inode_refresh(GFS2_I(inode));
+ if (error) {
+ iput(inode);
+ goto fail;
+ }
+
+ error = -EIO;
+ if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
+ iput(inode);
+ goto fail;
+ }
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+out_inode:
+ dentry = d_alloc_anon(inode);
+ if (!dentry) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return dentry;
+
+fail_rgd:
+ gfs2_glock_dq_uninit(&rgd_gh);
+
+fail_rindex:
+ gfs2_glock_dq_uninit(&ri_gh);
+
+fail:
+ gfs2_glock_dq_uninit(&i_gh);
+ return ERR_PTR(error);
+}
+
+struct export_operations gfs2_export_ops = {
+ .decode_fh = gfs2_decode_fh,
+ .encode_fh = gfs2_encode_fh,
+ .get_name = gfs2_get_name,
+ .get_parent = gfs2_get_parent,
+ .get_dentry = gfs2_get_dentry,
+};
+
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09aca5046fb1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_EXPORT_DOT_H__
+#define __OPS_EXPORT_DOT_H__
+
+#define GFS2_SMALL_FH_SIZE 4
+#define GFS2_LARGE_FH_SIZE 10
+
+extern struct export_operations gfs2_export_ops;
+struct gfs2_fh_obj {
+ struct gfs2_inum this;
+ __u32 imode;
+};
+
+#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..3064f133bf3c
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,661 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/ext2_fs.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_file.h"
+#include "ops_vm.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "eaops.h"
+
+/* Filldir state for a regular readdir, as opposed to the NFS get_name path */
+struct filldir_reg {
+ struct gfs2_sbd *fdr_sbd;
+ int fdr_prefetch;
+
+ filldir_t fdr_filldir;
+ void *fdr_opaque;
+};
+
+/*
+ * Most fields are left uninitialised to catch anybody who tries to
+ * use them. f_flags is set to prevent file_accessed() from touching
+ * any other part of this. Its use is purely as a flag so that we
+ * know (in readpage()) whether or not to do locking.
+ */
+struct file gfs2_internal_file_sentinel = {
+ .f_flags = O_NOATIME|O_RDONLY,
+};
+
+static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
+ unsigned long offset, unsigned long size)
+{
+ char *kaddr;
+ unsigned long count = desc->count;
+
+ if (size > count)
+ size = count;
+
+ kaddr = kmap(page);
+ memcpy(desc->arg.buf, kaddr + offset, size);
+ kunmap(page);
+
+ desc->count = count - size;
+ desc->written += size;
+ desc->arg.buf += size;
+ return size;
+}
+
+int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
+ char *buf, loff_t *pos, unsigned size)
+{
+ struct inode *inode = &ip->i_inode;
+ read_descriptor_t desc;
+ desc.written = 0;
+ desc.arg.buf = buf;
+ desc.count = size;
+ desc.error = 0;
+ do_generic_mapping_read(inode->i_mapping, ra_state,
+ &gfs2_internal_file_sentinel, pos, &desc,
+ gfs2_read_actor);
+ return desc.written ? desc.written : desc.error;
+}
+
+/**
+ * gfs2_llseek - seek to a location in a file
+ * @file: the file
+ * @offset: the offset
+ * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
+ *
+ * SEEK_END requires the glock for the file because it references the
+ * file's size.
+ *
+ * Returns: The new offset, or errno
+ */
+
+static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+ struct gfs2_holder i_gh;
+ loff_t error;
+
+ if (origin == 2) {
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+ &i_gh);
+ if (!error) {
+ error = remote_llseek(file, offset, origin);
+ gfs2_glock_dq_uninit(&i_gh);
+ }
+ } else
+ error = remote_llseek(file, offset, origin);
+
+ return error;
+}
+
+/**
+ * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+
+static int filldir_func(void *opaque, const char *name, unsigned int length,
+ u64 offset, struct gfs2_inum *inum,
+ unsigned int type)
+{
+ struct filldir_reg *fdr = (struct filldir_reg *)opaque;
+ struct gfs2_sbd *sdp = fdr->fdr_sbd;
+ int error;
+
+ error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
+ inum->no_addr, type);
+ if (error)
+ return 1;
+
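+ /* Prefetch the inode and iopen glocks of each entry (other than
+ ".") to speed up any lookups that follow */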
+ if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
+ gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
+ LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
+ gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
+ LM_ST_SHARED, LM_FLAG_TRY);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_readdir - Read directory entries from a directory
+ * @file: The directory to read from
+ * @dirent: Buffer for dirents
+ * @filldir: Function used to do the copying
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+ struct inode *dir = file->f_mapping->host;
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct filldir_reg fdr;
+ struct gfs2_holder d_gh;
+ u64 offset = file->f_pos;
+ int error;
+
+ fdr.fdr_sbd = GFS2_SB(dir);
+ fdr.fdr_prefetch = 1;
+ fdr.fdr_filldir = filldir;
+ fdr.fdr_opaque = dirent;
+
+ gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
+ error = gfs2_glock_nq_atime(&d_gh);
+ if (error) {
+ gfs2_holder_uninit(&d_gh);
+ return error;
+ }
+
+ error = gfs2_dir_read(dir, &offset, &fdr, filldir_func);
+
+ gfs2_glock_dq_uninit(&d_gh);
+
+ file->f_pos = offset;
+
+ return error;
+}
+
+/**
+ * fsflags_cvt
+ * @table: A table of 32 u32 flags
+ * @val: a 32 bit value to convert
+ *
+ * This function can be used to convert between fsflags values and
+ * GFS2's own flags values.
+ *
+ * Returns: the converted flags
+ */
+static u32 fsflags_cvt(const u32 *table, u32 val)
+{
+ u32 res = 0;
+ while(val) {
+ if (val & 1)
+ res |= *table;
+ table++;
+ val >>= 1;
+ }
+ return res;
+}
+
+static const u32 fsflags_to_gfs2[32] = {
+ [3] = GFS2_DIF_SYNC,
+ [4] = GFS2_DIF_IMMUTABLE,
+ [5] = GFS2_DIF_APPENDONLY,
+ [7] = GFS2_DIF_NOATIME,
+ [12] = GFS2_DIF_EXHASH,
+ [14] = GFS2_DIF_JDATA,
+ [20] = GFS2_DIF_DIRECTIO,
+};
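+/*
+ * For example, FS_IMMUTABLE_FL is bit 4 of the user flags, so
+ * fsflags_cvt(fsflags_to_gfs2, FS_IMMUTABLE_FL) yields GFS2_DIF_IMMUTABLE.
+ */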
+
+static const u32 gfs2_to_fsflags[32] = {
+ [gfs2fl_Sync] = FS_SYNC_FL,
+ [gfs2fl_Immutable] = FS_IMMUTABLE_FL,
+ [gfs2fl_AppendOnly] = FS_APPEND_FL,
+ [gfs2fl_NoAtime] = FS_NOATIME_FL,
+ [gfs2fl_ExHash] = FS_INDEX_FL,
+ [gfs2fl_Jdata] = FS_JOURNAL_DATA_FL,
+ [gfs2fl_Directio] = FS_DIRECTIO_FL,
+ [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
+ [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
+};
+
+static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int error;
+ u32 fsflags;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+ error = gfs2_glock_nq_m_atime(1, &gh);
+ if (error)
+ return error;
+
+ fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
+ if (put_user(fsflags, ptr))
+ error = -EFAULT;
+
+ gfs2_glock_dq_m(1, &gh);
+ gfs2_holder_uninit(&gh);
+ return error;
+}
+
+/* Flags that can be set by user space */
+#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
+ GFS2_DIF_DIRECTIO| \
+ GFS2_DIF_IMMUTABLE| \
+ GFS2_DIF_APPENDONLY| \
+ GFS2_DIF_NOATIME| \
+ GFS2_DIF_SYNC| \
+ GFS2_DIF_SYSTEM| \
+ GFS2_DIF_INHERIT_DIRECTIO| \
+ GFS2_DIF_INHERIT_JDATA)
+
+/**
+ * do_gfs2_set_flags - set flags on an inode
+ * @filp: file pointer for the inode whose flags are to be changed
+ * @reqflags: The flags to set
+ * @mask: Indicates which flags are valid
+ *
+ */
+static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct buffer_head *bh;
+ struct gfs2_holder gh;
+ int error;
+ u32 new_flags, flags;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (error)
+ return error;
+
+ flags = ip->i_di.di_flags;
+ new_flags = (flags & ~mask) | (reqflags & mask);
+ if ((new_flags ^ flags) == 0)
+ goto out;
+
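+ /* On a directory, JDATA and DIRECTIO requests are converted into
+ their inheritable equivalents */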
+ if (S_ISDIR(inode->i_mode)) {
+ if ((new_flags ^ flags) & GFS2_DIF_JDATA)
+ new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
+ if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
+ new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
+ }
+
+ error = -EINVAL;
+ if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
+ goto out;
+
+ error = -EPERM;
+ if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
+ goto out;
+ if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
+ goto out;
+ if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ goto out;
+ if (!IS_IMMUTABLE(inode)) {
+ error = permission(inode, MAY_WRITE, NULL);
+ if (error)
+ goto out;
+ }
+
+ error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (error)
+ goto out;
+ error = gfs2_meta_inode_buffer(ip, &bh);
+ if (error)
+ goto out_trans_end;
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ ip->i_di.di_flags = new_flags;
+ gfs2_dinode_out(&ip->i_di, bh->b_data);
+ brelse(bh);
+out_trans_end:
+ gfs2_trans_end(sdp);
+out:
+ gfs2_glock_dq_uninit(&gh);
+ return error;
+}
+
+static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
+{
+ u32 fsflags, gfsflags;
+ if (get_user(fsflags, ptr))
+ return -EFAULT;
+ gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
+ return do_gfs2_set_flags(filp, gfsflags, ~0);
+}
+
+static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch(cmd) {
+ case FS_IOC_GETFLAGS:
+ return gfs2_get_flags(filp, (u32 __user *)arg);
+ case FS_IOC_SETFLAGS:
+ return gfs2_set_flags(filp, (u32 __user *)arg);
+ }
+ return -ENOTTY;
+}
+
+
+/**
+ * gfs2_mmap - set up a memory mapping of a file
+ * @file: The file to map
+ * @vma: The VMA which described the mapping
+ *
+ * Returns: 0 or error code
+ */
+
+static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+ struct gfs2_holder i_gh;
+ int error;
+
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+ error = gfs2_glock_nq_atime(&i_gh);
+ if (error) {
+ gfs2_holder_uninit(&i_gh);
+ return error;
+ }
+
+ /* This is VM_MAYWRITE instead of VM_WRITE because a call
+ to mprotect() can turn on VM_WRITE later. */
+
+ if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
+ (VM_MAYSHARE | VM_MAYWRITE))
+ vma->vm_ops = &gfs2_vm_ops_sharewrite;
+ else
+ vma->vm_ops = &gfs2_vm_ops_private;
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ struct gfs2_file *fp;
+ int error;
+
+ fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ mutex_init(&fp->f_fl_mutex);
+
+ gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
+ file->private_data = fp;
+
+ if (S_ISREG(ip->i_di.di_mode)) {
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+ &i_gh);
+ if (error)
+ goto fail;
+
+ if (!(file->f_flags & O_LARGEFILE) &&
+ ip->i_di.di_size > MAX_NON_LFS) {
+ error = -EFBIG;
+ goto fail_gunlock;
+ }
+
+ /* Listen to the Direct I/O flag */
+
+ if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
+ file->f_flags |= O_DIRECT;
+
+ gfs2_glock_dq_uninit(&i_gh);
+ }
+
+ return 0;
+
+fail_gunlock:
+ gfs2_glock_dq_uninit(&i_gh);
+fail:
+ file->private_data = NULL;
+ kfree(fp);
+ return error;
+}
+
+/**
+ * gfs2_close - called to close a struct file
+ * @inode: the inode the struct file belongs to
+ * @file: the struct file being closed
+ *
+ * Returns: errno
+ */
+
+static int gfs2_close(struct inode *inode, struct file *file)
+{
+ struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+ struct gfs2_file *fp;
+
+ fp = file->private_data;
+ file->private_data = NULL;
+
+ if (gfs2_assert_warn(sdp, fp))
+ return -EIO;
+
+ kfree(fp);
+
+ return 0;
+}
+
+/**
+ * gfs2_fsync - sync the dirty data for a file (across the cluster)
+ * @file: the file that points to the dentry (we ignore this)
+ * @dentry: the dentry that points to the inode to sync
+ *
+ * Returns: errno
+ */
+
+static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+
+ gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+
+ return 0;
+}
+
+/**
+ * gfs2_lock - acquire/release a posix lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+ struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
+ struct lm_lockname name =
+ { .ln_number = ip->i_num.no_addr,
+ .ln_type = LM_TYPE_PLOCK };
+
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
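+ /* Setgid with group-execute clear means mandatory locking,
+ which is not supported */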
+ if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ return -ENOLCK;
+
+ if (sdp->sd_args.ar_localflocks) {
+ if (IS_GETLK(cmd)) {
+ struct file_lock tmp;
+ int ret;
+ ret = posix_test_lock(file, fl, &tmp);
+ fl->fl_type = F_UNLCK;
+ if (ret)
+ memcpy(fl, &tmp, sizeof(struct file_lock));
+ return 0;
+ } else {
+ return posix_lock_file_wait(file, fl);
+ }
+ }
+
+ if (IS_GETLK(cmd))
+ return gfs2_lm_plock_get(sdp, &name, file, fl);
+ else if (fl->fl_type == F_UNLCK)
+ return gfs2_lm_punlock(sdp, &name, file, fl);
+ else
+ return gfs2_lm_plock(sdp, &name, file, cmd, fl);
+}
+
+static int do_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct gfs2_file *fp = file->private_data;
+ struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+ struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
+ struct gfs2_glock *gl;
+ unsigned int state;
+ int flags;
+ int error = 0;
+
+ state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
+
+ mutex_lock(&fp->f_fl_mutex);
+
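+ /* If we already hold a flock glock in the wrong state, drop the
+ existing flock and holder before requesting the new state */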
+ gl = fl_gh->gh_gl;
+ if (gl) {
+ if (fl_gh->gh_state == state)
+ goto out;
+ gfs2_glock_hold(gl);
+ flock_lock_file_wait(file,
+ &(struct file_lock){.fl_type = F_UNLCK});
+ gfs2_glock_dq_uninit(fl_gh);
+ } else {
+ error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
+ ip->i_num.no_addr, &gfs2_flock_glops,
+ CREATE, &gl);
+ if (error)
+ goto out;
+ }
+
+ gfs2_holder_init(gl, state, flags, fl_gh);
+ gfs2_glock_put(gl);
+
+ error = gfs2_glock_nq(fl_gh);
+ if (error) {
+ gfs2_holder_uninit(fl_gh);
+ if (error == GLR_TRYFAILED)
+ error = -EAGAIN;
+ } else {
+ error = flock_lock_file_wait(file, fl);
+ gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+ }
+
+out:
+ mutex_unlock(&fp->f_fl_mutex);
+ return error;
+}
+
+static void do_unflock(struct file *file, struct file_lock *fl)
+{
+ struct gfs2_file *fp = file->private_data;
+ struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+
+ mutex_lock(&fp->f_fl_mutex);
+ flock_lock_file_wait(file, fl);
+ if (fl_gh->gh_gl)
+ gfs2_glock_dq_uninit(fl_gh);
+ mutex_unlock(&fp->f_fl_mutex);
+}
+
+/**
+ * gfs2_flock - acquire/release a flock lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+ struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
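+ /* Refuse mandatory locks (setgid set, group-execute clear) */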
+ if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ return -ENOLCK;
+
+ if (sdp->sd_args.ar_localflocks)
+ return flock_lock_file_wait(file, fl);
+
+ if (fl->fl_type == F_UNLCK) {
+ do_unflock(file, fl);
+ return 0;
+ } else {
+ return do_flock(file, cmd, fl);
+ }
+}
+
+const struct file_operations gfs2_file_fops = {
+ .llseek = gfs2_llseek,
+ .read = do_sync_read,
+ .aio_read = generic_file_aio_read,
+ .write = do_sync_write,
+ .aio_write = generic_file_aio_write,
+ .unlocked_ioctl = gfs2_ioctl,
+ .mmap = gfs2_mmap,
+ .open = gfs2_open,
+ .release = gfs2_close,
+ .fsync = gfs2_fsync,
+ .lock = gfs2_lock,
+ .sendfile = generic_file_sendfile,
+ .flock = gfs2_flock,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+};
+
+const struct file_operations gfs2_dir_fops = {
+ .readdir = gfs2_readdir,
+ .unlocked_ioctl = gfs2_ioctl,
+ .open = gfs2_open,
+ .release = gfs2_close,
+ .fsync = gfs2_fsync,
+ .lock = gfs2_lock,
+ .flock = gfs2_flock,
+};
+
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..ce319f89ec8e
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_FILE_DOT_H__
+#define __OPS_FILE_DOT_H__
+
+#include <linux/fs.h>
+struct gfs2_inode;
+
+extern struct file gfs2_internal_file_sentinel;
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+ struct file_ra_state *ra_state,
+ char *buf, loff_t *pos, unsigned size);
+
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
+#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..882873a6bd69
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,925 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "daemon.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "mount.h"
+#include "ops_export.h"
+#include "ops_fstype.h"
+#include "ops_super.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+
+#define DO 0
+#define UNDO 1
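+/* Each init_* function below either sets things up (DO) or tears
+ them down again on the error paths (UNDO) */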
+
+extern struct dentry_operations gfs2_dops;
+
+static struct gfs2_sbd *init_sbd(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp;
+
+ sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
+ if (!sdp)
+ return NULL;
+
+ sb->s_fs_info = sdp;
+ sdp->sd_vfs = sb;
+
+ gfs2_tune_init(&sdp->sd_tune);
+
+ INIT_LIST_HEAD(&sdp->sd_reclaim_list);
+ spin_lock_init(&sdp->sd_reclaim_lock);
+ init_waitqueue_head(&sdp->sd_reclaim_wq);
+
+ mutex_init(&sdp->sd_inum_mutex);
+ spin_lock_init(&sdp->sd_statfs_spin);
+ mutex_init(&sdp->sd_statfs_mutex);
+
+ spin_lock_init(&sdp->sd_rindex_spin);
+ mutex_init(&sdp->sd_rindex_mutex);
+ INIT_LIST_HEAD(&sdp->sd_rindex_list);
+ INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
+ INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
+
+ INIT_LIST_HEAD(&sdp->sd_jindex_list);
+ spin_lock_init(&sdp->sd_jindex_spin);
+ mutex_init(&sdp->sd_jindex_mutex);
+
+ INIT_LIST_HEAD(&sdp->sd_quota_list);
+ spin_lock_init(&sdp->sd_quota_spin);
+ mutex_init(&sdp->sd_quota_mutex);
+
+ spin_lock_init(&sdp->sd_log_lock);
+
+ INIT_LIST_HEAD(&sdp->sd_log_le_gl);
+ INIT_LIST_HEAD(&sdp->sd_log_le_buf);
+ INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
+ INIT_LIST_HEAD(&sdp->sd_log_le_rg);
+ INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
+
+ mutex_init(&sdp->sd_log_reserve_mutex);
+ INIT_LIST_HEAD(&sdp->sd_ail1_list);
+ INIT_LIST_HEAD(&sdp->sd_ail2_list);
+
+ init_rwsem(&sdp->sd_log_flush_lock);
+ INIT_LIST_HEAD(&sdp->sd_log_flush_list);
+
+ INIT_LIST_HEAD(&sdp->sd_revoke_list);
+
+ mutex_init(&sdp->sd_freeze_lock);
+
+ return sdp;
+}
+
+static void init_vfs(struct super_block *sb, unsigned noatime)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ sb->s_magic = GFS2_MAGIC;
+ sb->s_op = &gfs2_super_ops;
+ sb->s_export_op = &gfs2_export_ops;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+ if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
+ set_bit(noatime, &sdp->sd_flags);
+
+ /* Don't let the VFS update atimes. GFS2 handles this itself. */
+ sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
+}
+
+static int init_names(struct gfs2_sbd *sdp, int silent)
+{
+ struct page *page;
+ char *proto, *table;
+ int error = 0;
+
+ proto = sdp->sd_args.ar_lockproto;
+ table = sdp->sd_args.ar_locktable;
+
+ /* Try to autodetect */
+
+ if (!proto[0] || !table[0]) {
+ struct gfs2_sb *sb;
+ page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+ if (!page)
+ return -ENOBUFS;
+ sb = kmap(page);
+ gfs2_sb_in(&sdp->sd_sb, sb);
+ kunmap(page);
+ __free_page(page);
+
+ error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+ if (error)
+ goto out;
+
+ if (!proto[0])
+ proto = sdp->sd_sb.sb_lockproto;
+ if (!table[0])
+ table = sdp->sd_sb.sb_locktable;
+ }
+
+ if (!table[0])
+ table = sdp->sd_vfs->s_id;
+
+ snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
+ snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
+
+out:
+ return error;
+}
+
+static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
+ int undo)
+{
+ struct task_struct *p;
+ int error = 0;
+
+ if (undo)
+ goto fail_trans;
+
+ p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
+ error = IS_ERR(p);
+ if (error) {
+ fs_err(sdp, "can't start scand thread: %d\n", error);
+ return error;
+ }
+ sdp->sd_scand_process = p;
+
+ for (sdp->sd_glockd_num = 0;
+ sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
+ sdp->sd_glockd_num++) {
+ p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
+ error = IS_ERR(p);
+ if (error) {
+ fs_err(sdp, "can't start glockd thread: %d\n", error);
+ goto fail;
+ }
+ sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
+ }
+
+ error = gfs2_glock_nq_num(sdp,
+ GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
+ LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
+ mount_gh);
+ if (error) {
+ fs_err(sdp, "can't acquire mount glock: %d\n", error);
+ goto fail;
+ }
+
+ error = gfs2_glock_nq_num(sdp,
+ GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
+ LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
+ &sdp->sd_live_gh);
+ if (error) {
+ fs_err(sdp, "can't acquire live glock: %d\n", error);
+ goto fail_mount;
+ }
+
+ error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
+ CREATE, &sdp->sd_rename_gl);
+ if (error) {
+ fs_err(sdp, "can't create rename glock: %d\n", error);
+ goto fail_live;
+ }
+
+ error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
+ CREATE, &sdp->sd_trans_gl);
+ if (error) {
+ fs_err(sdp, "can't create transaction glock: %d\n", error);
+ goto fail_rename;
+ }
+ set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
+
+ return 0;
+
+fail_trans:
+ gfs2_glock_put(sdp->sd_trans_gl);
+fail_rename:
+ gfs2_glock_put(sdp->sd_rename_gl);
+fail_live:
+ gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+fail_mount:
+ gfs2_glock_dq_uninit(mount_gh);
+fail:
+ while (sdp->sd_glockd_num--)
+ kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
+
+ kthread_stop(sdp->sd_scand_process);
+ return error;
+}
+
+static struct inode *gfs2_lookup_root(struct super_block *sb,
+ struct gfs2_inum *inum)
+{
+ return gfs2_inode_lookup(sb, inum, DT_DIR);
+}
+
+static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ struct gfs2_holder sb_gh;
+ struct gfs2_inum *inum;
+ struct inode *inode;
+ int error = 0;
+
+ if (undo) {
+ if (sb->s_root) {
+ dput(sb->s_root);
+ sb->s_root = NULL;
+ }
+ return 0;
+ }
+
+ error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+ LM_ST_SHARED, 0, &sb_gh);
+ if (error) {
+ fs_err(sdp, "can't acquire superblock glock: %d\n", error);
+ return error;
+ }
+
+ error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+ if (error) {
+ fs_err(sdp, "can't read superblock: %d\n", error);
+ goto out;
+ }
+
+ /* Set up the buffer cache and SB for real */
+ if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
+ error = -EINVAL;
+ fs_err(sdp, "FS block size (%u) is too small for device "
+ "block size (%u)\n",
+ sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
+ goto out;
+ }
+ if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
+ error = -EINVAL;
+ fs_err(sdp, "FS block size (%u) is too big for machine "
+ "page size (%u)\n",
+ sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
+ goto out;
+ }
+ sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
+
+ /* Get the root inode */
+ inum = &sdp->sd_sb.sb_root_dir;
+ if (sb->s_type == &gfs2meta_fs_type)
+ inum = &sdp->sd_sb.sb_master_dir;
+ inode = gfs2_lookup_root(sb, inum);
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
+ fs_err(sdp, "can't read in root inode: %d\n", error);
+ goto out;
+ }
+
+ sb->s_root = d_alloc_root(inode);
+ if (!sb->s_root) {
+ fs_err(sdp, "can't get root dentry\n");
+ error = -ENOMEM;
+ iput(inode);
+ }
+ sb->s_root->d_op = &gfs2_dops;
+out:
+ gfs2_glock_dq_uninit(&sb_gh);
+ return error;
+}
+
+static int init_journal(struct gfs2_sbd *sdp, int undo)
+{
+ struct gfs2_holder ji_gh;
+ struct task_struct *p;
+ struct gfs2_inode *ip;
+ int jindex = 1;
+ int error = 0;
+
+ if (undo) {
+ jindex = 0;
+ goto fail_recoverd;
+ }
+
+ sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
+ if (IS_ERR(sdp->sd_jindex)) {
+ fs_err(sdp, "can't lookup journal index: %d\n", error);
+ return PTR_ERR(sdp->sd_jindex);
+ }
+ ip = GFS2_I(sdp->sd_jindex);
+ set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+
+ /* Load in the journal index special file */
+
+ error = gfs2_jindex_hold(sdp, &ji_gh);
+ if (error) {
+ fs_err(sdp, "can't read journal index: %d\n", error);
+ goto fail;
+ }
+
+ error = -EINVAL;
+ if (!gfs2_jindex_size(sdp)) {
+ fs_err(sdp, "no journals!\n");
+ goto fail_jindex;
+ }
+
+ if (sdp->sd_args.ar_spectator) {
+ sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
+ sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+ } else {
+ if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
+ fs_err(sdp, "can't mount journal #%u\n",
+ sdp->sd_lockstruct.ls_jid);
+ fs_err(sdp, "there are only %u journals (0 - %u)\n",
+ gfs2_jindex_size(sdp),
+ gfs2_jindex_size(sdp) - 1);
+ goto fail_jindex;
+ }
+ sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
+
+ error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
+ &gfs2_journal_glops,
+ LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
+ &sdp->sd_journal_gh);
+ if (error) {
+ fs_err(sdp, "can't acquire journal glock: %d\n", error);
+ goto fail_jindex;
+ }
+
+ ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | GL_EXACT,
+ &sdp->sd_jinode_gh);
+ if (error) {
+ fs_err(sdp, "can't acquire journal inode glock: %d\n",
+ error);
+ goto fail_journal_gh;
+ }
+
+ error = gfs2_jdesc_check(sdp->sd_jdesc);
+ if (error) {
+ fs_err(sdp, "my journal (%u) is bad: %d\n",
+ sdp->sd_jdesc->jd_jid, error);
+ goto fail_jinode_gh;
+ }
+ sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+ }
+
+ if (sdp->sd_lockstruct.ls_first) {
+ unsigned int x;
+ for (x = 0; x < sdp->sd_journals; x++) {
+ error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
+ if (error) {
+ fs_err(sdp, "error recovering journal %u: %d\n",
+ x, error);
+ goto fail_jinode_gh;
+ }
+ }
+
+ gfs2_lm_others_may_mount(sdp);
+ } else if (!sdp->sd_args.ar_spectator) {
+ error = gfs2_recover_journal(sdp->sd_jdesc);
+ if (error) {
+ fs_err(sdp, "error recovering my journal: %d\n", error);
+ goto fail_jinode_gh;
+ }
+ }
+
+ set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
+ gfs2_glock_dq_uninit(&ji_gh);
+ jindex = 0;
+
+ p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
+ error = IS_ERR(p);
+ if (error) {
+ fs_err(sdp, "can't start recoverd thread: %d\n", error);
+ goto fail_jinode_gh;
+ }
+ sdp->sd_recoverd_process = p;
+
+ return 0;
+
+fail_recoverd:
+ kthread_stop(sdp->sd_recoverd_process);
+fail_jinode_gh:
+ if (!sdp->sd_args.ar_spectator)
+ gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+fail_journal_gh:
+ if (!sdp->sd_args.ar_spectator)
+ gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+fail_jindex:
+ gfs2_jindex_free(sdp);
+ if (jindex)
+ gfs2_glock_dq_uninit(&ji_gh);
+fail:
+ iput(sdp->sd_jindex);
+ return error;
+}
+
+
+static int init_inodes(struct gfs2_sbd *sdp, int undo)
+{
+ int error = 0;
+ struct gfs2_inode *ip;
+ struct inode *inode;
+
+ if (undo)
+ goto fail_qinode;
+
+ inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
+ if (IS_ERR(inode)) {
+ error = PTR_ERR(inode);
+ fs_err(sdp, "can't read in master directory: %d\n", error);
+ goto fail;
+ }
+ sdp->sd_master_dir = inode;
+
+ error = init_journal(sdp, undo);
+ if (error)
+ goto fail_master;
+
+ /* Read in the master inode number inode */
+ sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
+ if (IS_ERR(sdp->sd_inum_inode)) {
+ error = PTR_ERR(sdp->sd_inum_inode);
+ fs_err(sdp, "can't read in inum inode: %d\n", error);
+ goto fail_journal;
+ }
+
+
+ /* Read in the master statfs inode */
+ sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
+ if (IS_ERR(sdp->sd_statfs_inode)) {
+ error = PTR_ERR(sdp->sd_statfs_inode);
+ fs_err(sdp, "can't read in statfs inode: %d\n", error);
+ goto fail_inum;
+ }
+
+ /* Read in the resource index inode */
+ sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
+ if (IS_ERR(sdp->sd_rindex)) {
+ error = PTR_ERR(sdp->sd_rindex);
+ fs_err(sdp, "can't get resource index inode: %d\n", error);
+ goto fail_statfs;
+ }
+ ip = GFS2_I(sdp->sd_rindex);
+ set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+ sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
+
+ /* Read in the quota inode */
+ sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
+ if (IS_ERR(sdp->sd_quota_inode)) {
+ error = PTR_ERR(sdp->sd_quota_inode);
+ fs_err(sdp, "can't get quota file inode: %d\n", error);
+ goto fail_rindex;
+ }
+ return 0;
+
+fail_qinode:
+ iput(sdp->sd_quota_inode);
+fail_rindex:
+ gfs2_clear_rgrpd(sdp);
+ iput(sdp->sd_rindex);
+fail_statfs:
+ iput(sdp->sd_statfs_inode);
+fail_inum:
+ iput(sdp->sd_inum_inode);
+fail_journal:
+ init_journal(sdp, UNDO);
+fail_master:
+ iput(sdp->sd_master_dir);
+fail:
+ return error;
+}
+
+static int init_per_node(struct gfs2_sbd *sdp, int undo)
+{
+ struct inode *pn = NULL;
+ char buf[30];
+ int error = 0;
+ struct gfs2_inode *ip;
+
+ if (sdp->sd_args.ar_spectator)
+ return 0;
+
+ if (undo)
+ goto fail_qc_gh;
+
+ pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
+ if (IS_ERR(pn)) {
+ error = PTR_ERR(pn);
+ fs_err(sdp, "can't find per_node directory: %d\n", error);
+ return error;
+ }
+
+ sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
+ sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
+ if (IS_ERR(sdp->sd_ir_inode)) {
+ error = PTR_ERR(sdp->sd_ir_inode);
+ fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
+ goto fail;
+ }
+
+ sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
+ sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
+ if (IS_ERR(sdp->sd_sc_inode)) {
+ error = PTR_ERR(sdp->sd_sc_inode);
+ fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
+ goto fail_ir_i;
+ }
+
+ sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
+ sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
+ if (IS_ERR(sdp->sd_qc_inode)) {
+ error = PTR_ERR(sdp->sd_qc_inode);
+ fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
+ goto fail_ut_i;
+ }
+
+ iput(pn);
+ pn = NULL;
+
+ ip = GFS2_I(sdp->sd_ir_inode);
+ error = gfs2_glock_nq_init(ip->i_gl,
+ LM_ST_EXCLUSIVE, 0,
+ &sdp->sd_ir_gh);
+ if (error) {
+ fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
+ goto fail_qc_i;
+ }
+
+ ip = GFS2_I(sdp->sd_sc_inode);
+ error = gfs2_glock_nq_init(ip->i_gl,
+ LM_ST_EXCLUSIVE, 0,
+ &sdp->sd_sc_gh);
+ if (error) {
+ fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
+ goto fail_ir_gh;
+ }
+
+ ip = GFS2_I(sdp->sd_qc_inode);
+ error = gfs2_glock_nq_init(ip->i_gl,
+ LM_ST_EXCLUSIVE, 0,
+ &sdp->sd_qc_gh);
+ if (error) {
+ fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
+ goto fail_ut_gh;
+ }
+
+ return 0;
+
+fail_qc_gh:
+ gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+fail_ut_gh:
+ gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+fail_ir_gh:
+ gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+fail_qc_i:
+ iput(sdp->sd_qc_inode);
+fail_ut_i:
+ iput(sdp->sd_sc_inode);
+fail_ir_i:
+ iput(sdp->sd_ir_inode);
+fail:
+ if (pn)
+ iput(pn);
+ return error;
+}
+
+static int init_threads(struct gfs2_sbd *sdp, int undo)
+{
+ struct task_struct *p;
+ int error = 0;
+
+ if (undo)
+ goto fail_quotad;
+
+ sdp->sd_log_flush_time = jiffies;
+ sdp->sd_jindex_refresh_time = jiffies;
+
+ p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+ error = IS_ERR(p);
+ if (error) {
+ fs_err(sdp, "can't start logd thread: %d\n", error);
+ return error;
+ }
+ sdp->sd_logd_process = p;
+
+ sdp->sd_statfs_sync_time = jiffies;
+ sdp->sd_quota_sync_time = jiffies;
+
+ p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+ error = IS_ERR(p);
+ if (error) {
+ fs_err(sdp, "can't start quotad thread: %d\n", error);
+ goto fail;
+ }
+ sdp->sd_quotad_process = p;
+
+ return 0;
+
+fail_quotad:
+ kthread_stop(sdp->sd_quotad_process);
+fail:
+ kthread_stop(sdp->sd_logd_process);
+ return error;
+}
+
+/**
+ * fill_super - Read in superblock
+ * @sb: The VFS superblock
+ * @data: Mount options
+ * @silent: Don't complain if it's not a GFS2 filesystem
+ *
+ * Returns: errno
+ */
+
+static int fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct gfs2_sbd *sdp;
+ struct gfs2_holder mount_gh;
+ int error;
+
+ sdp = init_sbd(sb);
+ if (!sdp) {
+ printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
+ return -ENOMEM;
+ }
+
+ error = gfs2_mount_args(sdp, (char *)data, 0);
+ if (error) {
+ printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+ goto fail;
+ }
+
+ init_vfs(sb, SDF_NOATIME);
+
+ /* Set up the buffer cache and fill in some fake block size values
+ to allow us to read in the on-disk superblock. */
+ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
+ sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+ GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+
+ error = init_names(sdp, silent);
+ if (error)
+ goto fail;
+
+ error = gfs2_sys_fs_add(sdp);
+ if (error)
+ goto fail;
+
+ error = gfs2_lm_mount(sdp, silent);
+ if (error)
+ goto fail_sys;
+
+ error = init_locking(sdp, &mount_gh, DO);
+ if (error)
+ goto fail_lm;
+
+ error = init_sb(sdp, silent, DO);
+ if (error)
+ goto fail_locking;
+
+ error = init_inodes(sdp, DO);
+ if (error)
+ goto fail_sb;
+
+ error = init_per_node(sdp, DO);
+ if (error)
+ goto fail_inodes;
+
+ error = gfs2_statfs_init(sdp);
+ if (error) {
+ fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
+ goto fail_per_node;
+ }
+
+ error = init_threads(sdp, DO);
+ if (error)
+ goto fail_per_node;
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ error = gfs2_make_fs_rw(sdp);
+ if (error) {
+ fs_err(sdp, "can't make FS RW: %d\n", error);
+ goto fail_threads;
+ }
+ }
+
+ gfs2_glock_dq_uninit(&mount_gh);
+
+ return 0;
+
+fail_threads:
+ init_threads(sdp, UNDO);
+fail_per_node:
+ init_per_node(sdp, UNDO);
+fail_inodes:
+ init_inodes(sdp, UNDO);
+fail_sb:
+ init_sb(sdp, 0, UNDO);
+fail_locking:
+ init_locking(sdp, &mount_gh, UNDO);
+fail_lm:
+ gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_lm_unmount(sdp);
+ while (invalidate_inodes(sb))
+ yield();
+fail_sys:
+ gfs2_sys_fs_del(sdp);
+fail:
+ kfree(sdp);
+ sb->s_fs_info = NULL;
+ return error;
+}
+
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ struct super_block *sb;
+ struct gfs2_sbd *sdp;
+ int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+ if (error)
+ goto out;
+ sb = mnt->mnt_sb;
+ sdp = sb->s_fs_info;
+ sdp->sd_gfs2mnt = mnt;
+out:
+ return error;
+}
+
+static int fill_super_meta(struct super_block *sb, struct super_block *new,
+ void *data, int silent)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct inode *inode;
+ int error = 0;
+
+ new->s_fs_info = sdp;
+ sdp->sd_vfs_meta = sb;
+
+ init_vfs(new, SDF_NOATIME);
+
+ /* Get the master inode */
+ inode = igrab(sdp->sd_master_dir);
+
+ new->s_root = d_alloc_root(inode);
+ if (!new->s_root) {
+ fs_err(sdp, "can't get root dentry\n");
+ error = -ENOMEM;
+ iput(inode);
+ } else
+ new->s_root->d_op = &gfs2_dops;
+
+ return error;
+}
+
+static int set_bdev_super(struct super_block *s, void *data)
+{
+ s->s_bdev = data;
+ s->s_dev = s->s_bdev->bd_dev;
+ return 0;
+}
+
+static int test_bdev_super(struct super_block *s, void *data)
+{
+ return s->s_bdev == data;
+}
+
+static struct super_block* get_gfs2_sb(const char *dev_name)
+{
+ struct kstat stat;
+ struct nameidata nd;
+ struct file_system_type *fstype;
+ struct super_block *sb = NULL, *s;
+ struct list_head *l;
+ int error;
+
+ error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+ if (error) {
+ printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n",
+ dev_name);
+ goto out;
+ }
+ error = vfs_getattr(nd.mnt, nd.dentry, &stat);
+
+ fstype = get_fs_type("gfs2");
+ list_for_each(l, &fstype->fs_supers) {
+ s = list_entry(l, struct super_block, s_instances);
+ if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
+ (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
+ sb = s;
+ goto free_nd;
+ }
+ }
+
+ printk(KERN_WARNING "GFS2: Unrecognized block device or "
+ "mount point %s\n", dev_name);
+
+free_nd:
+ path_release(&nd);
+out:
+ return sb;
+}
+
+static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ int error = 0;
+ struct super_block *sb = NULL, *new;
+ struct gfs2_sbd *sdp;
+
+ sb = get_gfs2_sb(dev_name);
+ if (!sb) {
+ printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
+ error = -ENOENT;
+ goto error;
+ }
+ sdp = (struct gfs2_sbd*) sb->s_fs_info;
+ if (sdp->sd_vfs_meta) {
+ printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
+ error = -EBUSY;
+ goto error;
+ }
+ mutex_lock(&sb->s_bdev->bd_mount_mutex);
+ new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
+ mutex_unlock(&sb->s_bdev->bd_mount_mutex);
+ if (IS_ERR(new)) {
+ error = PTR_ERR(new);
+ goto error;
+ }
+ module_put(fs_type->owner);
+ new->s_flags = flags;
+ strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
+ sb_set_blocksize(new, sb->s_blocksize);
+ error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
+ if (error) {
+ up_write(&new->s_umount);
+ deactivate_super(new);
+ goto error;
+ }
+
+ new->s_flags |= MS_ACTIVE;
+
+ /* Grab a reference to the gfs2 mount point */
+ atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
+ return simple_set_mnt(mnt, new);
+error:
+ return error;
+}
+
+static void gfs2_kill_sb(struct super_block *sb)
+{
+ kill_block_super(sb);
+}
+
+static void gfs2_kill_sb_meta(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ generic_shutdown_super(sb);
+ sdp->sd_vfs_meta = NULL;
+ atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
+}
+
+struct file_system_type gfs2_fs_type = {
+ .name = "gfs2",
+ .fs_flags = FS_REQUIRES_DEV,
+ .get_sb = gfs2_get_sb,
+ .kill_sb = gfs2_kill_sb,
+ .owner = THIS_MODULE,
+};
+
+struct file_system_type gfs2meta_fs_type = {
+ .name = "gfs2meta",
+ .fs_flags = FS_REQUIRES_DEV,
+ .get_sb = gfs2_get_sb_meta,
+ .kill_sb = gfs2_kill_sb_meta,
+ .owner = THIS_MODULE,
+};
+
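The two file_system_type entries above are what make "-t gfs2" and "-t gfs2meta" resolvable at mount time; as get_gfs2_sb() shows, a gfs2meta mount reuses the block device (or mount point) of an already-mounted gfs2 filesystem. A hypothetical invocation, assuming the single-node lock_nolock module from this patch series and made-up device and mount-point names:

    mount -t gfs2 -o lockproto=lock_nolock /dev/sdb1 /mnt/gfs2
    mount -t gfs2meta /dev/sdb1 /mnt/gfs2meta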
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..7cc2c296271b
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_FSTYPE_DOT_H__
+#define __OPS_FSTYPE_DOT_H__
+
+#include <linux/fs.h>
+
+extern struct file_system_type gfs2_fs_type;
+extern struct file_system_type gfs2meta_fs_type;
+
+#endif /* __OPS_FSTYPE_DOT_H__ */
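ops_fstype.h only exports the two file_system_type objects; the registration itself is done elsewhere in this patch series (presumably fs/gfs2/main.c, not shown here). A minimal sketch of how such types are typically wired up in a module init path; the function names are illustrative, not the ones GFS2 actually uses:

    #include <linux/fs.h>
    #include <linux/init.h>
    #include <linux/module.h>

    #include "ops_fstype.h"

    /* Illustrative init/exit pair; GFS2's real ones live outside this file. */
    static int __init example_gfs2_init(void)
    {
            int error = register_filesystem(&gfs2_fs_type);
            if (error)
                    return error;

            error = register_filesystem(&gfs2meta_fs_type);
            if (error)
                    unregister_filesystem(&gfs2_fs_type);

            return error;
    }

    static void __exit example_gfs2_exit(void)
    {
            unregister_filesystem(&gfs2meta_fs_type);
            unregister_filesystem(&gfs2_fs_type);
    }

    module_init(example_gfs2_init);
    module_exit(example_gfs2_exit);
    MODULE_LICENSE("GPL");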
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..ef6e5ed70e94
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/namei.h>
+#include <linux/utsname.h>
+#include <linux/mm.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "bmap.h"
+#include "dir.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "ops_dentry.h"
+#include "ops_inode.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * gfs2_create - Create a file
+ * @dir: The directory in which to create the file
+ * @dentry: The dentry of the new file
+ * @mode: The mode of the new file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_create(struct inode *dir, struct dentry *dentry,
+ int mode, struct nameidata *nd)
+{
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct gfs2_holder ghs[2];
+ struct inode *inode;
+
+ gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+ for (;;) {
+ inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
+ if (!IS_ERR(inode)) {
+ gfs2_trans_end(sdp);
+ if (dip->i_alloc.al_rgd)
+ gfs2_inplace_release(dip);
+ gfs2_quota_unlock(dip);
+ gfs2_alloc_put(dip);
+ gfs2_glock_dq_uninit_m(2, ghs);
+ mark_inode_dirty(inode);
+ break;
+ } else if (PTR_ERR(inode) != -EEXIST ||
+ (nd->intent.open.flags & O_EXCL)) {
+ gfs2_holder_uninit(ghs);
+ return PTR_ERR(inode);
+ }
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+ if (inode) {
+ if (!IS_ERR(inode)) {
+ gfs2_holder_uninit(ghs);
+ break;
+ } else {
+ gfs2_holder_uninit(ghs);
+ return PTR_ERR(inode);
+ }
+ }
+ }
+
+ d_instantiate(dentry, inode);
+
+ return 0;
+}
+
+/**
+ * gfs2_lookup - Look up a filename in a directory and return its inode
+ * @dir: The directory inode
+ * @dentry: The dentry of the new inode
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * Called by the VFS layer. Lock dir and call gfs2_lookupi()
+ *
+ * Returns: the found dentry, NULL, or an ERR_PTR on error
+ */
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct inode *inode = NULL;
+
+ dentry->d_op = &gfs2_dops;
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+ if (inode && IS_ERR(inode))
+ return ERR_PTR(PTR_ERR(inode));
+
+ if (inode)
+ return d_splice_alias(inode, dentry);
+ d_add(dentry, inode);
+
+ return NULL;
+}
+
+/**
+ * gfs2_link - Link to a file
+ * @old_dentry: The inode to link
+ * @dir: Add link to this directory
+ * @dentry: The name of the link
+ *
+ * Link the inode in "old_dentry" into the directory "dir" with the
+ * name in "dentry".
+ *
+ * Returns: errno
+ */
+
+static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct inode *inode = old_dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder ghs[2];
+ int alloc_required;
+ int error;
+
+ if (S_ISDIR(ip->i_di.di_mode))
+ return -EPERM;
+
+ gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+ error = gfs2_glock_nq_m(2, ghs);
+ if (error)
+ goto out;
+
+ error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+ if (error)
+ goto out_gunlock;
+
+ error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
+ switch (error) {
+ case -ENOENT:
+ break;
+ case 0:
+ error = -EEXIST;
+ default:
+ goto out_gunlock;
+ }
+
+ error = -EINVAL;
+ if (!dip->i_di.di_nlink)
+ goto out_gunlock;
+ error = -EFBIG;
+ if (dip->i_di.di_entries == (u32)-1)
+ goto out_gunlock;
+ error = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ goto out_gunlock;
+ error = -EINVAL;
+ if (!ip->i_di.di_nlink)
+ goto out_gunlock;
+ error = -EMLINK;
+ if (ip->i_di.di_nlink == (u32)-1)
+ goto out_gunlock;
+
+ alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
+ if (error < 0)
+ goto out_gunlock;
+ error = 0;
+
+ if (alloc_required) {
+ struct gfs2_alloc *al = gfs2_alloc_get(dip);
+
+ error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out_alloc;
+
+ error = gfs2_quota_check(dip, dip->i_di.di_uid,
+ dip->i_di.di_gid);
+ if (error)
+ goto out_gunlock_q;
+
+ al->al_requested = sdp->sd_max_dirres;
+
+ error = gfs2_inplace_reserve(dip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+ al->al_rgd->rd_ri.ri_length +
+ 2 * RES_DINODE + RES_STATFS +
+ RES_QUOTA, 0);
+ if (error)
+ goto out_ipres;
+ } else {
+ error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
+ if (error)
+ goto out_ipres;
+ }
+
+ error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
+ IF2DT(ip->i_di.di_mode));
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_change_nlink(ip, +1);
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_ipres:
+ if (alloc_required)
+ gfs2_inplace_release(dip);
+out_gunlock_q:
+ if (alloc_required)
+ gfs2_quota_unlock(dip);
+out_alloc:
+ if (alloc_required)
+ gfs2_alloc_put(dip);
+out_gunlock:
+ gfs2_glock_dq_m(2, ghs);
+out:
+ gfs2_holder_uninit(ghs);
+ gfs2_holder_uninit(ghs + 1);
+ if (!error) {
+ atomic_inc(&inode->i_count);
+ d_instantiate(dentry, inode);
+ mark_inode_dirty(inode);
+ }
+ return error;
+}
+
+/**
+ * gfs2_unlink - Unlink a file
+ * @dir: The inode of the directory containing the file to unlink
+ * @dentry: The file itself
+ *
+ * Unlink a file. Call gfs2_unlinki()
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_holder ghs[2];
+ int error;
+
+ gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+ error = gfs2_glock_nq_m(2, ghs);
+ if (error)
+ goto out;
+
+ error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+ if (error)
+ goto out_gunlock;
+
+ error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
+ if (error)
+ goto out_gunlock;
+
+ error = gfs2_dir_del(dip, &dentry->d_name);
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_change_nlink(ip, -1);
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_gunlock:
+ gfs2_glock_dq_m(2, ghs);
+out:
+ gfs2_holder_uninit(ghs);
+ gfs2_holder_uninit(ghs + 1);
+ return error;
+}
+
+/**
+ * gfs2_symlink - Create a symlink
+ * @dir: The directory to create the symlink in
+ * @dentry: The dentry to put the symlink in
+ * @symname: The thing which the link points to
+ *
+ * Returns: errno
+ */
+
+static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct gfs2_inode *dip = GFS2_I(dir), *ip;
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct gfs2_holder ghs[2];
+ struct inode *inode;
+ struct buffer_head *dibh;
+ int size;
+ int error;
+
+ /* Must be stuffed with a null terminator for gfs2_follow_link() */
+ size = strlen(symname);
+ if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
+ return -ENAMETOOLONG;
+
+ gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+ inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
+ if (IS_ERR(inode)) {
+ gfs2_holder_uninit(ghs);
+ return PTR_ERR(inode);
+ }
+
+ ip = ghs[1].gh_gl->gl_object;
+
+ ip->i_di.di_size = size;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+
+ if (!gfs2_assert_withdraw(sdp, !error)) {
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
+ size);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+ if (dip->i_alloc.al_rgd)
+ gfs2_inplace_release(dip);
+ gfs2_quota_unlock(dip);
+ gfs2_alloc_put(dip);
+
+ gfs2_glock_dq_uninit_m(2, ghs);
+
+ d_instantiate(dentry, inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+/**
+ * gfs2_mkdir - Make a directory
+ * @dir: The parent directory of the new one
+ * @dentry: The dentry of the new directory
+ * @mode: The mode of the new directory
+ *
+ * Returns: errno
+ */
+
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct gfs2_inode *dip = GFS2_I(dir), *ip;
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct gfs2_holder ghs[2];
+ struct inode *inode;
+ struct buffer_head *dibh;
+ int error;
+
+ gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+ inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
+ if (IS_ERR(inode)) {
+ gfs2_holder_uninit(ghs);
+ return PTR_ERR(inode);
+ }
+
+ ip = ghs[1].gh_gl->gl_object;
+
+ ip->i_di.di_nlink = 2;
+ ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+ ip->i_di.di_flags |= GFS2_DIF_JDATA;
+ ip->i_di.di_payload_format = GFS2_FORMAT_DE;
+ ip->i_di.di_entries = 2;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+
+ if (!gfs2_assert_withdraw(sdp, !error)) {
+ struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+ struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
+ struct qstr str;
+
+ gfs2_str2qstr(&str, ".");
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+ dent->de_inum = di->di_num; /* already GFS2 endian */
+ dent->de_type = cpu_to_be16(DT_DIR);
+ di->di_entries = cpu_to_be32(1);
+
+ gfs2_str2qstr(&str, "..");
+ dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
+ gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+
+ gfs2_inum_out(&dip->i_num, &dent->de_inum);
+ dent->de_type = cpu_to_be16(DT_DIR);
+
+ gfs2_dinode_out(&ip->i_di, di);
+
+ brelse(dibh);
+ }
+
+ error = gfs2_change_nlink(dip, +1);
+ gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
+
+ gfs2_trans_end(sdp);
+ if (dip->i_alloc.al_rgd)
+ gfs2_inplace_release(dip);
+ gfs2_quota_unlock(dip);
+ gfs2_alloc_put(dip);
+
+ gfs2_glock_dq_uninit_m(2, ghs);
+
+ d_instantiate(dentry, inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+/**
+ * gfs2_rmdir - Remove a directory
+ * @dir: The parent directory of the directory to be removed
+ * @dentry: The dentry of the directory to remove
+ *
+ * Remove a directory. Call gfs2_rmdiri()
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ struct gfs2_holder ghs[2];
+ int error;
+
+ gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+ error = gfs2_glock_nq_m(2, ghs);
+ if (error)
+ goto out;
+
+ error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+ if (error)
+ goto out_gunlock;
+
+ if (ip->i_di.di_entries < 2) {
+ if (gfs2_consist_inode(ip))
+ gfs2_dinode_print(&ip->i_di);
+ error = -EIO;
+ goto out_gunlock;
+ }
+ if (ip->i_di.di_entries > 2) {
+ error = -ENOTEMPTY;
+ goto out_gunlock;
+ }
+
+ error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
+ if (error)
+ goto out_gunlock;
+
+ error = gfs2_rmdiri(dip, &dentry->d_name, ip);
+
+ gfs2_trans_end(sdp);
+
+out_gunlock:
+ gfs2_glock_dq_m(2, ghs);
+out:
+ gfs2_holder_uninit(ghs);
+ gfs2_holder_uninit(ghs + 1);
+ return error;
+}
+
+/**
+ * gfs2_mknod - Make a special file
+ * @dir: The directory in which the special file will reside
+ * @dentry: The dentry of the special file
+ * @mode: The mode of the special file
+ * @dev: The device specification of the special file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
+ dev_t dev)
+{
+ struct gfs2_inode *dip = GFS2_I(dir), *ip;
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct gfs2_holder ghs[2];
+ struct inode *inode;
+ struct buffer_head *dibh;
+ u32 major = 0, minor = 0;
+ int error;
+
+ switch (mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ major = MAJOR(dev);
+ minor = MINOR(dev);
+ break;
+ case S_IFIFO:
+ case S_IFSOCK:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+ inode = gfs2_createi(ghs, &dentry->d_name, mode);
+ if (IS_ERR(inode)) {
+ gfs2_holder_uninit(ghs);
+ return PTR_ERR(inode);
+ }
+
+ ip = ghs[1].gh_gl->gl_object;
+
+ ip->i_di.di_major = major;
+ ip->i_di.di_minor = minor;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+
+ if (!gfs2_assert_withdraw(sdp, !error)) {
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ gfs2_trans_end(sdp);
+ if (dip->i_alloc.al_rgd)
+ gfs2_inplace_release(dip);
+ gfs2_quota_unlock(dip);
+ gfs2_alloc_put(dip);
+
+ gfs2_glock_dq_uninit_m(2, ghs);
+
+ d_instantiate(dentry, inode);
+ mark_inode_dirty(inode);
+
+ return 0;
+}
+
+/**
+ * gfs2_rename - Rename a file
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rename(struct inode *odir, struct dentry *odentry,
+ struct inode *ndir, struct dentry *ndentry)
+{
+ struct gfs2_inode *odip = GFS2_I(odir);
+ struct gfs2_inode *ndip = GFS2_I(ndir);
+ struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
+ struct gfs2_inode *nip = NULL;
+ struct gfs2_sbd *sdp = GFS2_SB(odir);
+ struct gfs2_holder ghs[4], r_gh;
+ unsigned int num_gh;
+ int dir_rename = 0;
+ int alloc_required;
+ unsigned int x;
+ int error;
+
+ if (ndentry->d_inode) {
+ nip = GFS2_I(ndentry->d_inode);
+ if (ip == nip)
+ return 0;
+ }
+
+ /* Make sure we aren't trying to move a directory into its subdirectory */
+
+ if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
+ dir_rename = 1;
+
+ error = gfs2_glock_nq_init(sdp->sd_rename_gl,
+ LM_ST_EXCLUSIVE, 0,
+ &r_gh);
+ if (error)
+ goto out;
+
+ error = gfs2_ok_to_move(ip, ndip);
+ if (error)
+ goto out_gunlock_r;
+ }
+
+ num_gh = 1;
+ gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ if (odip != ndip) {
+ gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+ num_gh++;
+ }
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+ num_gh++;
+
+ if (nip) {
+ gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+ num_gh++;
+ }
+
+ error = gfs2_glock_nq_m(num_gh, ghs);
+ if (error)
+ goto out_uninit;
+
+ /* Check out the old directory */
+
+ error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
+ if (error)
+ goto out_gunlock;
+
+ /* Check out the new directory */
+
+ if (nip) {
+ error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+ if (error)
+ goto out_gunlock;
+
+ if (S_ISDIR(nip->i_di.di_mode)) {
+ if (nip->i_di.di_entries < 2) {
+ if (gfs2_consist_inode(nip))
+ gfs2_dinode_print(&nip->i_di);
+ error = -EIO;
+ goto out_gunlock;
+ }
+ if (nip->i_di.di_entries > 2) {
+ error = -ENOTEMPTY;
+ goto out_gunlock;
+ }
+ }
+ } else {
+ error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+ if (error)
+ goto out_gunlock;
+
+ error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
+ switch (error) {
+ case -ENOENT:
+ error = 0;
+ break;
+ case 0:
+ error = -EEXIST;
+ default:
+ goto out_gunlock;
+ }
+
+ if (odip != ndip) {
+ if (!ndip->i_di.di_nlink) {
+ error = -EINVAL;
+ goto out_gunlock;
+ }
+ if (ndip->i_di.di_entries == (u32)-1) {
+ error = -EFBIG;
+ goto out_gunlock;
+ }
+ if (S_ISDIR(ip->i_di.di_mode) &&
+ ndip->i_di.di_nlink == (u32)-1) {
+ error = -EMLINK;
+ goto out_gunlock;
+ }
+ }
+ }
+
+ /* Check out the dir to be renamed */
+
+ if (dir_rename) {
+ error = permission(odentry->d_inode, MAY_WRITE, NULL);
+ if (error)
+ goto out_gunlock;
+ }
+
+ alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
+ if (error < 0)
+ goto out_gunlock;
+ error = 0;
+
+ if (alloc_required) {
+ struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+
+ error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out_alloc;
+
+ error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
+ ndip->i_di.di_gid);
+ if (error)
+ goto out_gunlock_q;
+
+ al->al_requested = sdp->sd_max_dirres;
+
+ error = gfs2_inplace_reserve(ndip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+ al->al_rgd->rd_ri.ri_length +
+ 4 * RES_DINODE + 4 * RES_LEAF +
+ RES_STATFS + RES_QUOTA, 0);
+ if (error)
+ goto out_ipreserv;
+ } else {
+ error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
+ 5 * RES_LEAF, 0);
+ if (error)
+ goto out_gunlock;
+ }
+
+ /* Remove the target file, if it exists */
+
+ if (nip) {
+ if (S_ISDIR(nip->i_di.di_mode))
+ error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
+ else {
+ error = gfs2_dir_del(ndip, &ndentry->d_name);
+ if (error)
+ goto out_end_trans;
+ error = gfs2_change_nlink(nip, -1);
+ }
+ if (error)
+ goto out_end_trans;
+ }
+
+ if (dir_rename) {
+ struct qstr name;
+ gfs2_str2qstr(&name, "..");
+
+ error = gfs2_change_nlink(ndip, +1);
+ if (error)
+ goto out_end_trans;
+ error = gfs2_change_nlink(odip, -1);
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
+ if (error)
+ goto out_end_trans;
+ } else {
+ struct buffer_head *dibh;
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out_end_trans;
+ ip->i_di.di_ctime = get_seconds();
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+ }
+
+ error = gfs2_dir_del(odip, &odentry->d_name);
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
+ IF2DT(ip->i_di.di_mode));
+ if (error)
+ goto out_end_trans;
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_ipreserv:
+ if (alloc_required)
+ gfs2_inplace_release(ndip);
+out_gunlock_q:
+ if (alloc_required)
+ gfs2_quota_unlock(ndip);
+out_alloc:
+ if (alloc_required)
+ gfs2_alloc_put(ndip);
+out_gunlock:
+ gfs2_glock_dq_m(num_gh, ghs);
+out_uninit:
+ for (x = 0; x < num_gh; x++)
+ gfs2_holder_uninit(ghs + x);
+out_gunlock_r:
+ if (dir_rename)
+ gfs2_glock_dq_uninit(&r_gh);
+out:
+ return error;
+}
+
+/**
+ * gfs2_readlink - Read the value of a symlink
+ * @dentry: the symlink
+ * @user_buf: the buffer to read the symlink data into
+ * @user_size: the size of the buffer
+ *
+ * Returns: the number of bytes copied, or errno on failure
+ */
+
+static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
+ int user_size)
+{
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ char array[GFS2_FAST_NAME_SIZE], *buf = array;
+ unsigned int len = GFS2_FAST_NAME_SIZE;
+ int error;
+
+ error = gfs2_readlinki(ip, &buf, &len);
+ if (error)
+ return error;
+
+ if (user_size > len - 1)
+ user_size = len - 1;
+
+ if (copy_to_user(user_buf, buf, user_size))
+ error = -EFAULT;
+ else
+ error = user_size;
+
+ if (buf != array)
+ kfree(buf);
+
+ return error;
+}
+
+/**
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
+ *
+ * This can handle symlinks of any size. It is optimised for symlinks
+ * under GFS2_FAST_NAME_SIZE.
+ *
+ * Returns: 0 on success or error code
+ */
+
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+ char array[GFS2_FAST_NAME_SIZE], *buf = array;
+ unsigned int len = GFS2_FAST_NAME_SIZE;
+ int error;
+
+ error = gfs2_readlinki(ip, &buf, &len);
+ if (!error) {
+ error = vfs_follow_link(nd, buf);
+ if (buf != array)
+ kfree(buf);
+ }
+
+ return ERR_PTR(error);
+}
+
+/**
+ * gfs2_permission - Check whether access to an inode is permitted
+ * @inode: The inode to check
+ * @mask: The access being requested (MAY_READ, MAY_WRITE, MAY_EXEC)
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * Returns: errno
+ */
+
+static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ int error;
+
+ if (ip->i_vn == ip->i_gl->gl_vn)
+ return generic_permission(inode, mask, gfs2_check_acl);
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (!error) {
+ error = generic_permission(inode, mask, gfs2_check_acl_locked);
+ gfs2_glock_dq_uninit(&i_gh);
+ }
+
+ return error;
+}
+
+static int setattr_size(struct inode *inode, struct iattr *attr)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ int error;
+
+ if (attr->ia_size != ip->i_di.di_size) {
+ error = vmtruncate(inode, attr->ia_size);
+ if (error)
+ return error;
+ }
+
+ error = gfs2_truncatei(ip, attr->ia_size);
+ if (error)
+ return error;
+
+ return error;
+}
+
+static int setattr_chown(struct inode *inode, struct iattr *attr)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct buffer_head *dibh;
+ u32 ouid, ogid, nuid, ngid;
+ int error;
+
+ ouid = ip->i_di.di_uid;
+ ogid = ip->i_di.di_gid;
+ nuid = attr->ia_uid;
+ ngid = attr->ia_gid;
+
+ if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
+ ouid = nuid = NO_QUOTA_CHANGE;
+ if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
+ ogid = ngid = NO_QUOTA_CHANGE;
+
+ gfs2_alloc_get(ip);
+
+ error = gfs2_quota_lock(ip, nuid, ngid);
+ if (error)
+ goto out_alloc;
+
+ if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+ error = gfs2_quota_check(ip, nuid, ngid);
+ if (error)
+ goto out_gunlock_q;
+ }
+
+ error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out_end_trans;
+
+ error = inode_setattr(inode, attr);
+ gfs2_assert_warn(sdp, !error);
+ gfs2_inode_attr_out(ip);
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ gfs2_dinode_out(&ip->i_di, dibh->b_data);
+ brelse(dibh);
+
+ if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+ gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
+ gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
+ }
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_gunlock_q:
+ gfs2_quota_unlock(ip);
+out_alloc:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+/**
+ * gfs2_setattr - Change attributes on an inode
+ * @dentry: The dentry which is changing
+ * @attr: The structure describing the change
+ *
+ * The VFS layer wants to change one or more of an inode's attributes. Write
+ * that change out to disk.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder i_gh;
+ int error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ return error;
+
+ error = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ goto out;
+
+ error = inode_change_ok(inode, attr);
+ if (error)
+ goto out;
+
+ if (attr->ia_valid & ATTR_SIZE)
+ error = setattr_size(inode, attr);
+ else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
+ error = setattr_chown(inode, attr);
+ else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
+ error = gfs2_acl_chmod(ip, attr);
+ else
+ error = gfs2_setattr_simple(ip, attr);
+
+out:
+ gfs2_glock_dq_uninit(&i_gh);
+ if (!error)
+ mark_inode_dirty(inode);
+ return error;
+}
+
+/**
+ * gfs2_getattr - Read out an inode's attributes
+ * @mnt: The vfsmount the inode is being accessed from
+ * @dentry: The dentry to stat
+ * @stat: The inode's stats
+ *
+ * Returns: errno
+ */
+
+static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (!error) {
+ generic_fillattr(inode, stat);
+ gfs2_glock_dq_uninit(&gh);
+ }
+
+ return error;
+}
+
+static int gfs2_setxattr(struct dentry *dentry, const char *name,
+ const void *data, size_t size, int flags)
+{
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_ea_request er;
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ er.er_type = gfs2_ea_name2type(name, &er.er_name);
+ if (er.er_type == GFS2_EATYPE_UNUSED)
+ return -EOPNOTSUPP;
+ er.er_data = (char *)data;
+ er.er_name_len = strlen(er.er_name);
+ er.er_data_len = size;
+ er.er_flags = flags;
+
+ gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+
+ return gfs2_ea_set(GFS2_I(inode), &er);
+}
+
+static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
+ void *data, size_t size)
+{
+ struct gfs2_ea_request er;
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ er.er_type = gfs2_ea_name2type(name, &er.er_name);
+ if (er.er_type == GFS2_EATYPE_UNUSED)
+ return -EOPNOTSUPP;
+ er.er_data = data;
+ er.er_name_len = strlen(er.er_name);
+ er.er_data_len = size;
+
+ return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
+}
+
+static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct gfs2_ea_request er;
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ er.er_data = (size) ? buffer : NULL;
+ er.er_data_len = size;
+
+ return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+}
+
+static int gfs2_removexattr(struct dentry *dentry, const char *name)
+{
+ struct gfs2_ea_request er;
+
+ memset(&er, 0, sizeof(struct gfs2_ea_request));
+ er.er_type = gfs2_ea_name2type(name, &er.er_name);
+ if (er.er_type == GFS2_EATYPE_UNUSED)
+ return -EOPNOTSUPP;
+ er.er_name_len = strlen(er.er_name);
+
+ return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+}
+
+struct inode_operations gfs2_file_iops = {
+ .permission = gfs2_permission,
+ .setattr = gfs2_setattr,
+ .getattr = gfs2_getattr,
+ .setxattr = gfs2_setxattr,
+ .getxattr = gfs2_getxattr,
+ .listxattr = gfs2_listxattr,
+ .removexattr = gfs2_removexattr,
+};
+
+struct inode_operations gfs2_dev_iops = {
+ .permission = gfs2_permission,
+ .setattr = gfs2_setattr,
+ .getattr = gfs2_getattr,
+ .setxattr = gfs2_setxattr,
+ .getxattr = gfs2_getxattr,
+ .listxattr = gfs2_listxattr,
+ .removexattr = gfs2_removexattr,
+};
+
+struct inode_operations gfs2_dir_iops = {
+ .create = gfs2_create,
+ .lookup = gfs2_lookup,
+ .link = gfs2_link,
+ .unlink = gfs2_unlink,
+ .symlink = gfs2_symlink,
+ .mkdir = gfs2_mkdir,
+ .rmdir = gfs2_rmdir,
+ .mknod = gfs2_mknod,
+ .rename = gfs2_rename,
+ .permission = gfs2_permission,
+ .setattr = gfs2_setattr,
+ .getattr = gfs2_getattr,
+ .setxattr = gfs2_setxattr,
+ .getxattr = gfs2_getxattr,
+ .listxattr = gfs2_listxattr,
+ .removexattr = gfs2_removexattr,
+};
+
+struct inode_operations gfs2_symlink_iops = {
+ .readlink = gfs2_readlink,
+ .follow_link = gfs2_follow_link,
+ .permission = gfs2_permission,
+ .setattr = gfs2_setattr,
+ .getattr = gfs2_getattr,
+ .setxattr = gfs2_setxattr,
+ .getxattr = gfs2_getxattr,
+ .listxattr = gfs2_listxattr,
+ .removexattr = gfs2_removexattr,
+};
+
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..b15acb4fd34c
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_INODE_DOT_H__
+#define __OPS_INODE_DOT_H__
+
+#include <linux/fs.h>
+
+extern struct inode_operations gfs2_file_iops;
+extern struct inode_operations gfs2_dir_iops;
+extern struct inode_operations gfs2_symlink_iops;
+extern struct inode_operations gfs2_dev_iops;
+
+#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..06f06f7773d0
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "inode.h"
+#include "lm.h"
+#include "log.h"
+#include "mount.h"
+#include "ops_super.h"
+#include "quota.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+#include "trans.h"
+#include "dir.h"
+#include "eattr.h"
+#include "bmap.h"
+
+/**
+ * gfs2_write_inode - Make sure the inode is stable on the disk
+ * @inode: The inode
+ * @sync: synchronous write flag
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_inode(struct inode *inode, int sync)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+
+ /* Check this is a "normal" inode */
+ if (inode->i_private) {
+ if (current->flags & PF_MEMALLOC)
+ return 0;
+ if (sync)
+ gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ */
+
+static void gfs2_put_super(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ int error;
+
+ if (!sdp)
+ return;
+
+ if (!strncmp(sb->s_type->name, "gfs2meta", 8))
+ return; /* Nothing to do */
+
+ /* Unfreeze the filesystem, if we need to */
+
+ mutex_lock(&sdp->sd_freeze_lock);
+ if (sdp->sd_freeze_count)
+ gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+ mutex_unlock(&sdp->sd_freeze_lock);
+
+ kthread_stop(sdp->sd_quotad_process);
+ kthread_stop(sdp->sd_logd_process);
+ kthread_stop(sdp->sd_recoverd_process);
+ while (sdp->sd_glockd_num--)
+ kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
+ kthread_stop(sdp->sd_scand_process);
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ error = gfs2_make_fs_ro(sdp);
+ if (error)
+ gfs2_io_error(sdp);
+ }
+ /* At this point, we're through modifying the disk */
+
+ /* Release stuff */
+
+ iput(sdp->sd_master_dir);
+ iput(sdp->sd_jindex);
+ iput(sdp->sd_inum_inode);
+ iput(sdp->sd_statfs_inode);
+ iput(sdp->sd_rindex);
+ iput(sdp->sd_quota_inode);
+
+ gfs2_glock_put(sdp->sd_rename_gl);
+ gfs2_glock_put(sdp->sd_trans_gl);
+
+ if (!sdp->sd_args.ar_spectator) {
+ gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+ gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+ gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+ gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+ gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+ iput(sdp->sd_ir_inode);
+ iput(sdp->sd_sc_inode);
+ iput(sdp->sd_qc_inode);
+ }
+
+ gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+ gfs2_clear_rgrpd(sdp);
+ gfs2_jindex_free(sdp);
+ /* Take apart glock structures and buffer lists */
+ gfs2_gl_hash_clear(sdp, WAIT);
+ /* Unmount the locking protocol */
+ gfs2_lm_unmount(sdp);
+
+ /* At this point, we're through participating in the lockspace */
+ gfs2_sys_fs_del(sdp);
+ kfree(sdp);
+}
+
+/**
+ * gfs2_write_super - disk commit all incore transactions
+ * @sb: the filesystem
+ *
+ * This function is called every time sync(2) is called.
+ * After this exits, all dirty buffers are synced.
+ */
+
+static void gfs2_write_super(struct super_block *sb)
+{
+ gfs2_log_flush(sb->s_fs_info, NULL);
+}
+
+/**
+ * gfs2_write_super_lockfs - prevent further writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static void gfs2_write_super_lockfs(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ int error;
+
+ for (;;) {
+ error = gfs2_freeze_fs(sdp);
+ if (!error)
+ break;
+
+ switch (error) {
+ case -EBUSY:
+ fs_err(sdp, "waiting for recovery before freeze\n");
+ break;
+
+ default:
+ fs_err(sdp, "error freezing FS: %d\n", error);
+ break;
+ }
+
+ fs_err(sdp, "retrying...\n");
+ msleep(1000);
+ }
+}
+
+/**
+ * gfs2_unlockfs - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static void gfs2_unlockfs(struct super_block *sb)
+{
+ gfs2_unfreeze_fs(sb->s_fs_info);
+}
+
+/**
+ * gfs2_statfs - Gather and return stats about the filesystem
+ * @dentry: A dentry on the filesystem being queried
+ * @buf: The kstatfs buffer to fill in
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_inode->i_sb;
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_statfs_change sc;
+ int error;
+
+ if (gfs2_tune_get(sdp, gt_statfs_slow))
+ error = gfs2_statfs_slow(sdp, &sc);
+ else
+ error = gfs2_statfs_i(sdp, &sc);
+
+ if (error)
+ return error;
+
+ buf->f_type = GFS2_MAGIC;
+ buf->f_bsize = sdp->sd_sb.sb_bsize;
+ buf->f_blocks = sc.sc_total;
+ buf->f_bfree = sc.sc_free;
+ buf->f_bavail = sc.sc_free;
+ buf->f_files = sc.sc_dinodes + sc.sc_free;
+ buf->f_ffree = sc.sc_free;
+ buf->f_namelen = GFS2_FNAMESIZE;
+
+ return 0;
+}
+
+/**
+ * gfs2_remount_fs - called when the FS is remounted
+ * @sb: the filesystem
+ * @flags: the remount flags
+ * @data: extra data passed in (not used right now)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ int error;
+
+ error = gfs2_mount_args(sdp, data, 1);
+ if (error)
+ return error;
+
+ if (sdp->sd_args.ar_spectator)
+ *flags |= MS_RDONLY;
+ else {
+ if (*flags & MS_RDONLY) {
+ if (!(sb->s_flags & MS_RDONLY))
+ error = gfs2_make_fs_ro(sdp);
+ } else if (!(*flags & MS_RDONLY) &&
+ (sb->s_flags & MS_RDONLY)) {
+ error = gfs2_make_fs_rw(sdp);
+ }
+ }
+
+ if (*flags & (MS_NOATIME | MS_NODIRATIME))
+ set_bit(SDF_NOATIME, &sdp->sd_flags);
+ else
+ clear_bit(SDF_NOATIME, &sdp->sd_flags);
+
+ /* Don't let the VFS update atimes. GFS2 handles this itself. */
+ *flags |= MS_NOATIME | MS_NODIRATIME;
+
+ return error;
+}
+
+/**
+ * gfs2_clear_inode - Deallocate an inode when VFS is done with it
+ * @inode: The VFS inode
+ *
+ */
+
+static void gfs2_clear_inode(struct inode *inode)
+{
+ /* This tells us it's a "real" inode and not one which only
+ * serves to contain an address space (see rgrp.c, meta_io.c)
+ * which therefore doesn't have its own glocks.
+ */
+ if (inode->i_private) {
+ struct gfs2_inode *ip = GFS2_I(inode);
+ gfs2_glock_inode_squish(inode);
+ gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
+ ip->i_gl->gl_object = NULL;
+ gfs2_glock_schedule_for_reclaim(ip->i_gl);
+ gfs2_glock_put(ip->i_gl);
+ ip->i_gl = NULL;
+ if (ip->i_iopen_gh.gh_gl)
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ }
+}
+
+/**
+ * gfs2_show_options - Show mount options for /proc/mounts
+ * @s: seq_file structure
+ * @mnt: vfsmount
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+ struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+ struct gfs2_args *args = &sdp->sd_args;
+
+ if (args->ar_lockproto[0])
+ seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+ if (args->ar_locktable[0])
+ seq_printf(s, ",locktable=%s", args->ar_locktable);
+ if (args->ar_hostdata[0])
+ seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+ if (args->ar_spectator)
+ seq_printf(s, ",spectator");
+ if (args->ar_ignore_local_fs)
+ seq_printf(s, ",ignore_local_fs");
+ if (args->ar_localflocks)
+ seq_printf(s, ",localflocks");
+ if (args->ar_localcaching)
+ seq_printf(s, ",localcaching");
+ if (args->ar_debug)
+ seq_printf(s, ",debug");
+ if (args->ar_upgrade)
+ seq_printf(s, ",upgrade");
+ if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
+ seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
+ if (args->ar_posix_acl)
+ seq_printf(s, ",acl");
+ if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
+ char *state;
+ switch (args->ar_quota) {
+ case GFS2_QUOTA_OFF:
+ state = "off";
+ break;
+ case GFS2_QUOTA_ACCOUNT:
+ state = "account";
+ break;
+ case GFS2_QUOTA_ON:
+ state = "on";
+ break;
+ default:
+ state = "unknown";
+ break;
+ }
+ seq_printf(s, ",quota=%s", state);
+ }
+ if (args->ar_suiddir)
+ seq_printf(s, ",suiddir");
+ if (args->ar_data != GFS2_DATA_DEFAULT) {
+ char *state;
+ switch (args->ar_data) {
+ case GFS2_DATA_WRITEBACK:
+ state = "writeback";
+ break;
+ case GFS2_DATA_ORDERED:
+ state = "ordered";
+ break;
+ default:
+ state = "unknown";
+ break;
+ }
+ seq_printf(s, ",data=%s", state);
+ }
+
+ return 0;
+}
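For reference, the option suffix built by gfs2_show_options() is appended to the generic mount flags in /proc/mounts. A made-up example entry (device, mount point, cluster and volume names are hypothetical) could look like:

    /dev/sdb1 /mnt/gfs2 gfs2 rw,noatime,nodiratime,lockproto=lock_dlm,locktable=mycluster:gfs2vol,acl 0 0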
+
+/*
+ * We have to (at the moment) hold the inode's main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ */
+static void gfs2_delete_inode(struct inode *inode)
+{
+ struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int error;
+
+ if (!inode->i_private)
+ goto out;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
+ if (unlikely(error)) {
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ goto out;
+ }
+
+ gfs2_glock_dq(&ip->i_iopen_gh);
+ gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
+ error = gfs2_glock_nq(&ip->i_iopen_gh);
+ if (error)
+ goto out_uninit;
+
+ if (S_ISDIR(ip->i_di.di_mode) &&
+ (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+ error = gfs2_dir_exhash_dealloc(ip);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (ip->i_di.di_eattr) {
+ error = gfs2_ea_dealloc(ip);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (!gfs2_is_stuffed(ip)) {
+ error = gfs2_file_dealloc(ip);
+ if (error)
+ goto out_unlock;
+ }
+
+ error = gfs2_dinode_dealloc(ip);
+
+out_unlock:
+ gfs2_glock_dq(&ip->i_iopen_gh);
+out_uninit:
+ gfs2_holder_uninit(&ip->i_iopen_gh);
+ gfs2_glock_dq_uninit(&gh);
+ if (error)
+ fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+out:
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+}
+
+
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_inode *ip;
+
+ ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+ if (!ip)
+ return NULL;
+ ip->i_flags = 0;
+ ip->i_gl = NULL;
+ ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
+ ip->i_last_pfault = jiffies;
+ return &ip->i_inode;
+}
+
+static void gfs2_destroy_inode(struct inode *inode)
+{
+ kmem_cache_free(gfs2_inode_cachep, inode);
+}
+
+struct super_operations gfs2_super_ops = {
+ .alloc_inode = gfs2_alloc_inode,
+ .destroy_inode = gfs2_destroy_inode,
+ .write_inode = gfs2_write_inode,
+ .delete_inode = gfs2_delete_inode,
+ .put_super = gfs2_put_super,
+ .write_super = gfs2_write_super,
+ .write_super_lockfs = gfs2_write_super_lockfs,
+ .unlockfs = gfs2_unlockfs,
+ .statfs = gfs2_statfs,
+ .remount_fs = gfs2_remount_fs,
+ .clear_inode = gfs2_clear_inode,
+ .show_options = gfs2_show_options,
+};
+
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..9de73f042f78
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_SUPER_DOT_H__
+#define __OPS_SUPER_DOT_H__
+
+#include <linux/fs.h>
+
+extern struct super_operations gfs2_super_ops;
+
+#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..5453d2947ab3
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "ops_vm.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+static void pfault_be_greedy(struct gfs2_inode *ip)
+{
+ unsigned int time;
+
+ spin_lock(&ip->i_spin);
+ time = ip->i_greedy;
+ ip->i_last_pfault = jiffies;
+ spin_unlock(&ip->i_spin);
+
+ igrab(&ip->i_inode);
+ if (gfs2_glock_be_greedy(ip->i_gl, time))
+ iput(&ip->i_inode);
+}
+
+static struct page *gfs2_private_nopage(struct vm_area_struct *area,
+ unsigned long address, int *type)
+{
+ struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
+ struct page *result;
+
+ set_bit(GIF_PAGED, &ip->i_flags);
+
+ result = filemap_nopage(area, address, type);
+
+ if (result && result != NOPAGE_OOM)
+ pfault_be_greedy(ip);
+
+ return result;
+}
+
+static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ unsigned long index = page->index;
+ u64 lblock = index << (PAGE_CACHE_SHIFT -
+ sdp->sd_sb.sb_bsize_shift);
+ unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
+ struct gfs2_alloc *al;
+ unsigned int data_blocks, ind_blocks;
+ unsigned int x;
+ int error;
+
+ al = gfs2_alloc_get(ip);
+
+ error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+ if (error)
+ goto out_gunlock_q;
+
+ gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+
+ al->al_requested = data_blocks + ind_blocks;
+
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
+ ind_blocks + RES_DINODE +
+ RES_STATFS + RES_QUOTA, 0);
+ if (error)
+ goto out_ipres;
+
+ if (gfs2_is_stuffed(ip)) {
+ error = gfs2_unstuff_dinode(ip, NULL);
+ if (error)
+ goto out_trans;
+ }
+
+ for (x = 0; x < blocks; ) {
+ u64 dblock;
+ unsigned int extlen;
+ int new = 1;
+
+ error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+ if (error)
+ goto out_trans;
+
+ lblock += extlen;
+ x += extlen;
+ }
+
+ gfs2_assert_warn(sdp, al->al_alloced);
+
+out_trans:
+ gfs2_trans_end(sdp);
+out_ipres:
+ gfs2_inplace_release(ip);
+out_gunlock_q:
+ gfs2_quota_unlock(ip);
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
+static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
+ unsigned long address, int *type)
+{
+ struct file *file = area->vm_file;
+ struct gfs2_file *gf = file->private_data;
+ struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+ struct gfs2_holder i_gh;
+ struct page *result = NULL;
+ unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
+ area->vm_pgoff;
+ int alloc_required;
+ int error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ return NULL;
+
+ set_bit(GIF_PAGED, &ip->i_flags);
+ set_bit(GIF_SW_PAGED, &ip->i_flags);
+
+ error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
+ PAGE_CACHE_SIZE, &alloc_required);
+ if (error)
+ goto out;
+
+ set_bit(GFF_EXLOCK, &gf->f_flags);
+ result = filemap_nopage(area, address, type);
+ clear_bit(GFF_EXLOCK, &gf->f_flags);
+ if (!result || result == NOPAGE_OOM)
+ goto out;
+
+ if (alloc_required) {
+ error = alloc_page_backing(ip, result);
+ if (error) {
+ page_cache_release(result);
+ result = NULL;
+ goto out;
+ }
+ set_page_dirty(result);
+ }
+
+ pfault_be_greedy(ip);
+out:
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return result;
+}
+
+struct vm_operations_struct gfs2_vm_ops_private = {
+ .nopage = gfs2_private_nopage,
+};
+
+struct vm_operations_struct gfs2_vm_ops_sharewrite = {
+ .nopage = gfs2_sharewrite_nopage,
+};
+
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..4ae8f43ed5e3
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_VM_DOT_H__
+#define __OPS_VM_DOT_H__
+
+#include <linux/mm.h>
+
+extern struct vm_operations_struct gfs2_vm_ops_private;
+extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
+
+#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..a3deae7416c9
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1228 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+/*
+ * Quota change tags are associated with each transaction that allocates or
+ * deallocates space. Those changes are accumulated locally to each node (in a
+ * per-node file) and then are periodically synced to the quota file. This
+ * avoids the bottleneck of constantly touching the quota file, but introduces
+ * fuzziness in the current usage value of IDs that are being used on different
+ * nodes in the cluster simultaneously. So, it is possible for a user on
+ * multiple nodes to overrun their quota, but that overrun is controllable.
+ * Since quota tags are part of transactions, there is no need for a quota
+ * check program to be run on node crashes or anything like that.
+ *
+ * There are a couple of knobs that let the administrator manage the quota
+ * fuzziness. "quota_quantum" sets the maximum time a quota change can be
+ * sitting on one node before being synced to the quota file. (The default is
+ * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency
+ * of quota file syncs increases as the user moves closer to their limit. The
+ * more frequent the syncs, the more accurate the quota enforcement, but that
+ * means that there is more contention between the nodes for the quota file.
+ * The default value is one. This sets the maximum theoretical quota overrun
+ * (with an infinite number of nodes and infinite bandwidth) to twice the
+ * user's limit. (In practice, the maximum overrun you see should be much
+ * less.) A "quota_scale"
+ * number greater than one makes quota syncs more frequent and reduces the
+ * maximum overrun. Numbers less than one (but greater than zero) make quota
+ * syncs less frequent.
+ *
+ * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
+ * the quota file, so it is not being constantly read.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "inode.h"
+#include "ops_file.h"
+#include "ops_address.h"
+#include "util.h"
+
+#define QUOTA_USER 1
+#define QUOTA_GROUP 0
+
+static u64 qd2offset(struct gfs2_quota_data *qd)
+{
+ u64 offset;
+
+ offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
+ offset *= sizeof(struct gfs2_quota);
+
+ return offset;
+}
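As a quick worked example of the layout qd2offset() implies: user and group quotas for the same ID sit in adjacent entries (QDF_USER is set for user quotas, so the added term is 0 for users and 1 for groups). The user quota for ID 1000 therefore occupies entry 2 * 1000 = 2000 of the quota file and the group quota for ID 1000 occupies entry 2001; the byte offset returned is that entry index multiplied by sizeof(struct gfs2_quota).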
+
+static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
+ struct gfs2_quota_data **qdp)
+{
+ struct gfs2_quota_data *qd;
+ int error;
+
+ qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
+ if (!qd)
+ return -ENOMEM;
+
+ qd->qd_count = 1;
+ qd->qd_id = id;
+ if (user)
+ set_bit(QDF_USER, &qd->qd_flags);
+ qd->qd_slot = -1;
+
+ error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
+ &gfs2_quota_glops, CREATE, &qd->qd_gl);
+ if (error)
+ goto fail;
+
+ error = gfs2_lvb_hold(qd->qd_gl);
+ gfs2_glock_put(qd->qd_gl);
+ if (error)
+ goto fail;
+
+ *qdp = qd;
+
+ return 0;
+
+fail:
+ kfree(qd);
+ return error;
+}
+
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+ struct gfs2_quota_data **qdp)
+{
+ struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
+ int error, found;
+
+ *qdp = NULL;
+
+ for (;;) {
+ found = 0;
+ spin_lock(&sdp->sd_quota_spin);
+ list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+ if (qd->qd_id == id &&
+ !test_bit(QDF_USER, &qd->qd_flags) == !user) {
+ qd->qd_count++;
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ qd = NULL;
+
+ if (!qd && new_qd) {
+ qd = new_qd;
+ list_add(&qd->qd_list, &sdp->sd_quota_list);
+ atomic_inc(&sdp->sd_quota_count);
+ new_qd = NULL;
+ }
+
+ spin_unlock(&sdp->sd_quota_spin);
+
+ if (qd || !create) {
+ if (new_qd) {
+ gfs2_lvb_unhold(new_qd->qd_gl);
+ kfree(new_qd);
+ }
+ *qdp = qd;
+ return 0;
+ }
+
+ error = qd_alloc(sdp, user, id, &new_qd);
+ if (error)
+ return error;
+ }
+}
+
+static void qd_hold(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+ spin_lock(&sdp->sd_quota_spin);
+ gfs2_assert(sdp, qd->qd_count);
+ qd->qd_count++;
+ spin_unlock(&sdp->sd_quota_spin);
+}
+
+static void qd_put(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ spin_lock(&sdp->sd_quota_spin);
+ gfs2_assert(sdp, qd->qd_count);
+ if (!--qd->qd_count)
+ qd->qd_last_touched = jiffies;
+ spin_unlock(&sdp->sd_quota_spin);
+}
+
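+/*
+ * slot_get - reserve a slot in the per-node quota change file for this ID.
+ * Free slots are tracked in sd_quota_bitmap (one bit per slot); the first
+ * clear bit is claimed and remembered in qd->qd_slot.
+ */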
+static int slot_get(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ unsigned int c, o = 0, b;
+ unsigned char byte = 0;
+
+ spin_lock(&sdp->sd_quota_spin);
+
+ if (qd->qd_slot_count++) {
+ spin_unlock(&sdp->sd_quota_spin);
+ return 0;
+ }
+
+ for (c = 0; c < sdp->sd_quota_chunks; c++)
+ for (o = 0; o < PAGE_SIZE; o++) {
+ byte = sdp->sd_quota_bitmap[c][o];
+ if (byte != 0xFF)
+ goto found;
+ }
+
+ goto fail;
+
+found:
+ for (b = 0; b < 8; b++)
+ if (!(byte & (1 << b)))
+ break;
+ qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
+
+ if (qd->qd_slot >= sdp->sd_quota_slots)
+ goto fail;
+
+ sdp->sd_quota_bitmap[c][o] |= 1 << b;
+
+ spin_unlock(&sdp->sd_quota_spin);
+
+ return 0;
+
+fail:
+ qd->qd_slot_count--;
+ spin_unlock(&sdp->sd_quota_spin);
+ return -ENOSPC;
+}
+
+static void slot_hold(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+ spin_lock(&sdp->sd_quota_spin);
+ gfs2_assert(sdp, qd->qd_slot_count);
+ qd->qd_slot_count++;
+ spin_unlock(&sdp->sd_quota_spin);
+}
+
+static void slot_put(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+ spin_lock(&sdp->sd_quota_spin);
+ gfs2_assert(sdp, qd->qd_slot_count);
+ if (!--qd->qd_slot_count) {
+ gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
+ qd->qd_slot = -1;
+ }
+ spin_unlock(&sdp->sd_quota_spin);
+}
+
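+/*
+ * bh_get - read the quota change file block that holds this ID's slot and
+ * cache both the buffer head and a pointer to its gfs2_quota_change record
+ * in the quota data.  Reference counted via qd_bh_count.
+ */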
+static int bh_get(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+ unsigned int block, offset;
+ struct buffer_head *bh;
+ int error;
+ struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+ mutex_lock(&sdp->sd_quota_mutex);
+
+ if (qd->qd_bh_count++) {
+ mutex_unlock(&sdp->sd_quota_mutex);
+ return 0;
+ }
+
+ block = qd->qd_slot / sdp->sd_qc_per_block;
+ offset = qd->qd_slot % sdp->sd_qc_per_block;
+
+ bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+ error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map);
+ if (error)
+ goto fail;
+ error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+ if (error)
+ goto fail;
+ error = -EIO;
+ if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
+ goto fail_brelse;
+
+ qd->qd_bh = bh;
+ qd->qd_bh_qc = (struct gfs2_quota_change *)
+ (bh->b_data + sizeof(struct gfs2_meta_header) +
+ offset * sizeof(struct gfs2_quota_change));
+
+ mutex_unlock(&sdp->sd_quota_mutex);
+
+ return 0;
+
+fail_brelse:
+ brelse(bh);
+fail:
+ qd->qd_bh_count--;
+ mutex_unlock(&sdp->sd_quota_mutex);
+ return error;
+}
+
+static void bh_put(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+ mutex_lock(&sdp->sd_quota_mutex);
+ gfs2_assert(sdp, qd->qd_bh_count);
+ if (!--qd->qd_bh_count) {
+ brelse(qd->qd_bh);
+ qd->qd_bh = NULL;
+ qd->qd_bh_qc = NULL;
+ }
+ mutex_unlock(&sdp->sd_quota_mutex);
+}
+
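+/*
+ * qd_fish - find the next quota data with unsynced local changes
+ * (QDF_CHANGE set, not yet synced in this generation), mark it QDF_LOCKED
+ * and pin its slot and buffer so it can be written to the quota file.
+ */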
+static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
+{
+ struct gfs2_quota_data *qd = NULL;
+ int error;
+ int found = 0;
+
+ *qdp = NULL;
+
+ if (sdp->sd_vfs->s_flags & MS_RDONLY)
+ return 0;
+
+ spin_lock(&sdp->sd_quota_spin);
+
+ list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+ if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+ !test_bit(QDF_CHANGE, &qd->qd_flags) ||
+ qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
+ continue;
+
+ list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+
+ set_bit(QDF_LOCKED, &qd->qd_flags);
+ gfs2_assert_warn(sdp, qd->qd_count);
+ qd->qd_count++;
+ qd->qd_change_sync = qd->qd_change;
+ gfs2_assert_warn(sdp, qd->qd_slot_count);
+ qd->qd_slot_count++;
+ found = 1;
+
+ break;
+ }
+
+ if (!found)
+ qd = NULL;
+
+ spin_unlock(&sdp->sd_quota_spin);
+
+ if (qd) {
+ gfs2_assert_warn(sdp, qd->qd_change_sync);
+ error = bh_get(qd);
+ if (error) {
+ clear_bit(QDF_LOCKED, &qd->qd_flags);
+ slot_put(qd);
+ qd_put(qd);
+ return error;
+ }
+ }
+
+ *qdp = qd;
+
+ return 0;
+}
+
+static int qd_trylock(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+ if (sdp->sd_vfs->s_flags & MS_RDONLY)
+ return 0;
+
+ spin_lock(&sdp->sd_quota_spin);
+
+ if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+ !test_bit(QDF_CHANGE, &qd->qd_flags)) {
+ spin_unlock(&sdp->sd_quota_spin);
+ return 0;
+ }
+
+ list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+
+ set_bit(QDF_LOCKED, &qd->qd_flags);
+ gfs2_assert_warn(sdp, qd->qd_count);
+ qd->qd_count++;
+ qd->qd_change_sync = qd->qd_change;
+ gfs2_assert_warn(sdp, qd->qd_slot_count);
+ qd->qd_slot_count++;
+
+ spin_unlock(&sdp->sd_quota_spin);
+
+ gfs2_assert_warn(sdp, qd->qd_change_sync);
+ if (bh_get(qd)) {
+ clear_bit(QDF_LOCKED, &qd->qd_flags);
+ slot_put(qd);
+ qd_put(qd);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void qd_unlock(struct gfs2_quota_data *qd)
+{
+ gfs2_assert_warn(qd->qd_gl->gl_sbd,
+ test_bit(QDF_LOCKED, &qd->qd_flags));
+ clear_bit(QDF_LOCKED, &qd->qd_flags);
+ bh_put(qd);
+ slot_put(qd);
+ qd_put(qd);
+}
+
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+ struct gfs2_quota_data **qdp)
+{
+ int error;
+
+ error = qd_get(sdp, user, id, create, qdp);
+ if (error)
+ return error;
+
+ error = slot_get(*qdp);
+ if (error)
+ goto fail;
+
+ error = bh_get(*qdp);
+ if (error)
+ goto fail_slot;
+
+ return 0;
+
+fail_slot:
+ slot_put(*qdp);
+fail:
+ qd_put(*qdp);
+ return error;
+}
+
+static void qdsb_put(struct gfs2_quota_data *qd)
+{
+ bh_put(qd);
+ slot_put(qd);
+ qd_put(qd);
+}
+
+int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_quota_data **qd = al->al_qd;
+ int error;
+
+ if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
+ gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
+ return -EIO;
+
+ if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ return 0;
+
+ error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
+ if (error)
+ goto out;
+ al->al_qd_num++;
+ qd++;
+
+ error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
+ if (error)
+ goto out;
+ al->al_qd_num++;
+ qd++;
+
+ if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
+ error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+ if (error)
+ goto out;
+ al->al_qd_num++;
+ qd++;
+ }
+
+ if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
+ error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+ if (error)
+ goto out;
+ al->al_qd_num++;
+ qd++;
+ }
+
+out:
+ if (error)
+ gfs2_quota_unhold(ip);
+ return error;
+}
+
+void gfs2_quota_unhold(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ unsigned int x;
+
+ gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
+
+ for (x = 0; x < al->al_qd_num; x++) {
+ qdsb_put(al->al_qd[x]);
+ al->al_qd[x] = NULL;
+ }
+ al->al_qd_num = 0;
+}
+
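+/*
+ * sort_qd - comparison function used to sort quota data pointers (user IDs
+ * before group IDs, then by ID) so that the per-ID quota glocks are always
+ * acquired in a consistent order.
+ */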
+static int sort_qd(const void *a, const void *b)
+{
+ const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
+ const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
+
+ if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
+ !test_bit(QDF_USER, &qd_b->qd_flags)) {
+ if (test_bit(QDF_USER, &qd_a->qd_flags))
+ return -1;
+ else
+ return 1;
+ }
+ if (qd_a->qd_id < qd_b->qd_id)
+ return -1;
+ if (qd_a->qd_id > qd_b->qd_id)
+ return 1;
+
+ return 0;
+}
+
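+/*
+ * do_qc - apply a block count delta to this ID's record in the per-node
+ * quota change file (inside the current transaction) and mirror it in
+ * qd->qd_change.  Takes the QDF_CHANGE references when the accumulated
+ * change first becomes non-zero and drops them when it returns to zero.
+ */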
+static void do_qc(struct gfs2_quota_data *qd, s64 change)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+ struct gfs2_quota_change *qc = qd->qd_bh_qc;
+ s64 x;
+
+ mutex_lock(&sdp->sd_quota_mutex);
+ gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
+
+ if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
+ qc->qc_change = 0;
+ qc->qc_flags = 0;
+ if (test_bit(QDF_USER, &qd->qd_flags))
+ qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
+ qc->qc_id = cpu_to_be32(qd->qd_id);
+ }
+
+ x = qc->qc_change;
+ x = be64_to_cpu(x) + change;
+ qc->qc_change = cpu_to_be64(x);
+
+ spin_lock(&sdp->sd_quota_spin);
+ qd->qd_change = x;
+ spin_unlock(&sdp->sd_quota_spin);
+
+ if (!x) {
+ gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
+ clear_bit(QDF_CHANGE, &qd->qd_flags);
+ qc->qc_flags = 0;
+ qc->qc_id = 0;
+ slot_put(qd);
+ qd_put(qd);
+ } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
+ qd_hold(qd);
+ slot_hold(qd);
+ }
+
+ mutex_unlock(&sdp->sd_quota_mutex);
+}
+
+/**
+ * gfs2_adjust_quota
+ *
+ * This function was mostly borrowed from gfs2_block_truncate_page which was
+ * in turn mostly borrowed from ext3
+ */
+static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
+ s64 change, struct gfs2_quota_data *qd)
+{
+ struct inode *inode = &ip->i_inode;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long index = loc >> PAGE_CACHE_SHIFT;
+ unsigned offset = loc & (PAGE_CACHE_SIZE - 1);
+ unsigned blocksize, iblock, pos;
+ struct buffer_head *bh;
+ struct page *page;
+ void *kaddr;
+ __be64 *ptr;
+ s64 value;
+ int err = -EIO;
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return -ENOMEM;
+
+ blocksize = inode->i_sb->s_blocksize;
+ iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ bh = page_buffers(page);
+ pos = blocksize;
+ while (offset >= pos) {
+ bh = bh->b_this_page;
+ iblock++;
+ pos += blocksize;
+ }
+
+ if (!buffer_mapped(bh)) {
+ gfs2_get_block(inode, iblock, bh, 1);
+ if (!buffer_mapped(bh))
+ goto unlock;
+ }
+
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+
+ if (!buffer_uptodate(bh)) {
+ ll_rw_block(READ_META, 1, &bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ goto unlock;
+ }
+
+ gfs2_trans_add_bh(ip->i_gl, bh, 0);
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ ptr = kaddr + offset;
+ value = (s64)be64_to_cpu(*ptr) + change;
+ *ptr = cpu_to_be64(value);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ err = 0;
+ qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
+ qd->qd_qb.qb_value = cpu_to_be64(value);
+unlock:
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+}
+
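+/*
+ * do_sync - write a batch of locally accumulated changes into the global
+ * quota file.  All affected per-ID glocks plus the quota inode glock are
+ * taken exclusively, blocks are reserved if the write extends the file,
+ * and each local change record is zeroed again via do_qc().
+ */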
+static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
+{
+ struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+ unsigned int data_blocks, ind_blocks;
+ struct gfs2_holder *ghs, i_gh;
+ unsigned int qx, x;
+ struct gfs2_quota_data *qd;
+ loff_t offset;
+ unsigned int nalloc = 0;
+ struct gfs2_alloc *al = NULL;
+ int error;
+
+ gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+ &data_blocks, &ind_blocks);
+
+ ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
+ if (!ghs)
+ return -ENOMEM;
+
+ sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+ for (qx = 0; qx < num_qd; qx++) {
+ error = gfs2_glock_nq_init(qda[qx]->qd_gl,
+ LM_ST_EXCLUSIVE,
+ GL_NOCACHE, &ghs[qx]);
+ if (error)
+ goto out;
+ }
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+ if (error)
+ goto out;
+
+ for (x = 0; x < num_qd; x++) {
+ int alloc_required;
+
+ offset = qd2offset(qda[x]);
+ error = gfs2_write_alloc_required(ip, offset,
+ sizeof(struct gfs2_quota),
+ &alloc_required);
+ if (error)
+ goto out_gunlock;
+ if (alloc_required)
+ nalloc++;
+ }
+
+ if (nalloc) {
+ al = gfs2_alloc_get(ip);
+
+ al->al_requested = nalloc * (data_blocks + ind_blocks);
+
+ error = gfs2_inplace_reserve(ip);
+ if (error)
+ goto out_alloc;
+
+ error = gfs2_trans_begin(sdp,
+ al->al_rgd->rd_ri.ri_length +
+ num_qd * data_blocks +
+ nalloc * ind_blocks +
+ RES_DINODE + num_qd +
+ RES_STATFS, 0);
+ if (error)
+ goto out_ipres;
+ } else {
+ error = gfs2_trans_begin(sdp,
+ num_qd * data_blocks +
+ RES_DINODE + num_qd, 0);
+ if (error)
+ goto out_gunlock;
+ }
+
+ for (x = 0; x < num_qd; x++) {
+ qd = qda[x];
+ offset = qd2offset(qd);
+ error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
+ (struct gfs2_quota_data *)
+ qd->qd_gl->gl_lvb);
+ if (error)
+ goto out_end_trans;
+
+ do_qc(qd, -qd->qd_change_sync);
+ }
+
+ error = 0;
+
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_ipres:
+ if (nalloc)
+ gfs2_inplace_release(ip);
+out_alloc:
+ if (nalloc)
+ gfs2_alloc_put(ip);
+out_gunlock:
+ gfs2_glock_dq_uninit(&i_gh);
+out:
+ while (qx--)
+ gfs2_glock_dq_uninit(&ghs[qx]);
+ kfree(ghs);
+ gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+ return error;
+}
+
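+/*
+ * do_glock - acquire the per-ID quota glock in shared mode and make sure
+ * the cached LVB copy of the limit/warn/value is valid, re-reading the
+ * quota file under an exclusive glock when the LVB is stale or a refresh
+ * is forced.
+ */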
+static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
+ struct gfs2_holder *q_gh)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+ struct gfs2_holder i_gh;
+ struct gfs2_quota q;
+ char buf[sizeof(struct gfs2_quota)];
+ struct file_ra_state ra_state;
+ int error;
+ struct gfs2_quota_lvb *qlvb;
+
+ file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
+restart:
+ error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
+ if (error)
+ return error;
+
+ qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+
+ if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
+ loff_t pos;
+ gfs2_glock_dq_uninit(q_gh);
+ error = gfs2_glock_nq_init(qd->qd_gl,
+ LM_ST_EXCLUSIVE, GL_NOCACHE,
+ q_gh);
+ if (error)
+ return error;
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+ if (error)
+ goto fail;
+
+ memset(buf, 0, sizeof(struct gfs2_quota));
+ pos = qd2offset(qd);
+ error = gfs2_internal_read(ip, &ra_state, buf,
+ &pos, sizeof(struct gfs2_quota));
+ if (error < 0)
+ goto fail_gunlock;
+
+ gfs2_glock_dq_uninit(&i_gh);
+
+ gfs2_quota_in(&q, buf);
+ qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+ qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+ qlvb->__pad = 0;
+ qlvb->qb_limit = cpu_to_be64(q.qu_limit);
+ qlvb->qb_warn = cpu_to_be64(q.qu_warn);
+ qlvb->qb_value = cpu_to_be64(q.qu_value);
+ qd->qd_qb = *qlvb;
+
+ if (gfs2_glock_is_blocking(qd->qd_gl)) {
+ gfs2_glock_dq_uninit(q_gh);
+ force_refresh = 0;
+ goto restart;
+ }
+ }
+
+ return 0;
+
+fail_gunlock:
+ gfs2_glock_dq_uninit(&i_gh);
+fail:
+ gfs2_glock_dq_uninit(q_gh);
+ return error;
+}
+
+int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ unsigned int x;
+ int error = 0;
+
+ gfs2_quota_hold(ip, uid, gid);
+
+ if (capable(CAP_SYS_RESOURCE) ||
+ sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ return 0;
+
+ sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
+ sort_qd, NULL);
+
+ for (x = 0; x < al->al_qd_num; x++) {
+ error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
+ if (error)
+ break;
+ }
+
+ if (!error)
+ set_bit(GIF_QD_LOCKED, &ip->i_flags);
+ else {
+ while (x--)
+ gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+ gfs2_quota_unhold(ip);
+ }
+
+ return error;
+}
+
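+/*
+ * need_sync - decide whether this ID's unsynced local change is big enough
+ * to warrant an early sync: the change is scaled by the number of journals
+ * and by quota_scale, and a sync is requested once that estimate would
+ * push the globally known value past the limit.
+ */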
+static int need_sync(struct gfs2_quota_data *qd)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_tune *gt = &sdp->sd_tune;
+ s64 value;
+ unsigned int num, den;
+ int do_sync = 1;
+
+ if (!qd->qd_qb.qb_limit)
+ return 0;
+
+ spin_lock(&sdp->sd_quota_spin);
+ value = qd->qd_change;
+ spin_unlock(&sdp->sd_quota_spin);
+
+ spin_lock(&gt->gt_spin);
+ num = gt->gt_quota_scale_num;
+ den = gt->gt_quota_scale_den;
+ spin_unlock(&gt->gt_spin);
+
+ if (value < 0)
+ do_sync = 0;
+ else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
+ (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+ do_sync = 0;
+ else {
+ value *= gfs2_jindex_size(sdp) * num;
+ do_div(value, den);
+ value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
+ if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+ do_sync = 0;
+ }
+
+ return do_sync;
+}
+
+void gfs2_quota_unlock(struct gfs2_inode *ip)
+{
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_quota_data *qda[4];
+ unsigned int count = 0;
+ unsigned int x;
+
+ if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
+ goto out;
+
+ for (x = 0; x < al->al_qd_num; x++) {
+ struct gfs2_quota_data *qd;
+ int sync;
+
+ qd = al->al_qd[x];
+ sync = need_sync(qd);
+
+ gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+
+ if (sync && qd_trylock(qd))
+ qda[count++] = qd;
+ }
+
+ if (count) {
+ do_sync(count, qda);
+ for (x = 0; x < count; x++)
+ qd_unlock(qda[x]);
+ }
+
+out:
+ gfs2_quota_unhold(ip);
+}
+
+#define MAX_LINE 256
+
+static int print_message(struct gfs2_quota_data *qd, char *type)
+{
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+ printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+ sdp->sd_fsname, type,
+ (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
+ qd->qd_id);
+
+ return 0;
+}
+
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_quota_data *qd;
+ s64 value;
+ unsigned int x;
+ int error = 0;
+
+ if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
+ return 0;
+
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ return 0;
+
+ for (x = 0; x < al->al_qd_num; x++) {
+ qd = al->al_qd[x];
+
+ if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+ (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+ continue;
+
+ value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
+ spin_lock(&sdp->sd_quota_spin);
+ value += qd->qd_change;
+ spin_unlock(&sdp->sd_quota_spin);
+
+ if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
+ print_message(qd, "exceeded");
+ error = -EDQUOT;
+ break;
+ } else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
+ (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
+ time_after_eq(jiffies, qd->qd_last_warn +
+ gfs2_tune_get(sdp,
+ gt_quota_warn_period) * HZ)) {
+ error = print_message(qd, "warning");
+ qd->qd_last_warn = jiffies;
+ }
+ }
+
+ return error;
+}
+
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+ u32 uid, u32 gid)
+{
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_quota_data *qd;
+ unsigned int x;
+ unsigned int found = 0;
+
+ if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+ return;
+ if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
+ return;
+
+ for (x = 0; x < al->al_qd_num; x++) {
+ qd = al->al_qd[x];
+
+ if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+ (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
+ do_qc(qd, change);
+ found++;
+ }
+ }
+}
+
+int gfs2_quota_sync(struct gfs2_sbd *sdp)
+{
+ struct gfs2_quota_data **qda;
+ unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
+ unsigned int num_qd;
+ unsigned int x;
+ int error = 0;
+
+ sdp->sd_quota_sync_gen++;
+
+ qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
+ if (!qda)
+ return -ENOMEM;
+
+ do {
+ num_qd = 0;
+
+ for (;;) {
+ error = qd_fish(sdp, qda + num_qd);
+ if (error || !qda[num_qd])
+ break;
+ if (++num_qd == max_qd)
+ break;
+ }
+
+ if (num_qd) {
+ if (!error)
+ error = do_sync(num_qd, qda);
+ if (!error)
+ for (x = 0; x < num_qd; x++)
+ qda[x]->qd_sync_gen =
+ sdp->sd_quota_sync_gen;
+
+ for (x = 0; x < num_qd; x++)
+ qd_unlock(qda[x]);
+ }
+ } while (!error && num_qd == max_qd);
+
+ kfree(qda);
+
+ return error;
+}
+
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
+{
+ struct gfs2_quota_data *qd;
+ struct gfs2_holder q_gh;
+ int error;
+
+ error = qd_get(sdp, user, id, CREATE, &qd);
+ if (error)
+ return error;
+
+ error = do_glock(qd, FORCE, &q_gh);
+ if (!error)
+ gfs2_glock_dq_uninit(&q_gh);
+
+ qd_put(qd);
+
+ return error;
+}
+
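+/*
+ * gfs2_quota_init - rebuild the in-core quota state from the quota change
+ * file (typically at mount time): every non-zero gfs2_quota_change record
+ * gets an in-core quota data entry and its slot is marked in
+ * sd_quota_bitmap.
+ */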
+int gfs2_quota_init(struct gfs2_sbd *sdp)
+{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+ unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+ unsigned int x, slot = 0;
+ unsigned int found = 0;
+ u64 dblock;
+ u32 extlen = 0;
+ int error;
+
+ if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
+ ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+ sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
+ sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
+
+ error = -ENOMEM;
+
+ sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
+ sizeof(unsigned char *), GFP_KERNEL);
+ if (!sdp->sd_quota_bitmap)
+ return error;
+
+ for (x = 0; x < sdp->sd_quota_chunks; x++) {
+ sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sdp->sd_quota_bitmap[x])
+ goto fail;
+ }
+
+ for (x = 0; x < blocks; x++) {
+ struct buffer_head *bh;
+ unsigned int y;
+
+ if (!extlen) {
+ int new = 0;
+ error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
+ if (error)
+ goto fail;
+ }
+ error = -EIO;
+ bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+ if (!bh)
+ goto fail;
+ if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
+ brelse(bh);
+ goto fail;
+ }
+
+ for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
+ y++, slot++) {
+ struct gfs2_quota_change qc;
+ struct gfs2_quota_data *qd;
+
+ gfs2_quota_change_in(&qc, bh->b_data +
+ sizeof(struct gfs2_meta_header) +
+ y * sizeof(struct gfs2_quota_change));
+ if (!qc.qc_change)
+ continue;
+
+ error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
+ qc.qc_id, &qd);
+ if (error) {
+ brelse(bh);
+ goto fail;
+ }
+
+ set_bit(QDF_CHANGE, &qd->qd_flags);
+ qd->qd_change = qc.qc_change;
+ qd->qd_slot = slot;
+ qd->qd_slot_count = 1;
+ qd->qd_last_touched = jiffies;
+
+ spin_lock(&sdp->sd_quota_spin);
+ gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
+ list_add(&qd->qd_list, &sdp->sd_quota_list);
+ atomic_inc(&sdp->sd_quota_count);
+ spin_unlock(&sdp->sd_quota_spin);
+
+ found++;
+ }
+
+ brelse(bh);
+ dblock++;
+ extlen--;
+ }
+
+ if (found)
+ fs_info(sdp, "found %u quota changes\n", found);
+
+ return 0;
+
+fail:
+ gfs2_quota_cleanup(sdp);
+ return error;
+}
+
+void gfs2_quota_scan(struct gfs2_sbd *sdp)
+{
+ struct gfs2_quota_data *qd, *safe;
+ LIST_HEAD(dead);
+
+ spin_lock(&sdp->sd_quota_spin);
+ list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
+ if (!qd->qd_count &&
+ time_after_eq(jiffies, qd->qd_last_touched +
+ gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
+ list_move(&qd->qd_list, &dead);
+ gfs2_assert_warn(sdp,
+ atomic_read(&sdp->sd_quota_count) > 0);
+ atomic_dec(&sdp->sd_quota_count);
+ }
+ }
+ spin_unlock(&sdp->sd_quota_spin);
+
+ while (!list_empty(&dead)) {
+ qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
+ list_del(&qd->qd_list);
+
+ gfs2_assert_warn(sdp, !qd->qd_change);
+ gfs2_assert_warn(sdp, !qd->qd_slot_count);
+ gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+ gfs2_lvb_unhold(qd->qd_gl);
+ kfree(qd);
+ }
+}
+
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
+{
+ struct list_head *head = &sdp->sd_quota_list;
+ struct gfs2_quota_data *qd;
+ unsigned int x;
+
+ spin_lock(&sdp->sd_quota_spin);
+ while (!list_empty(head)) {
+ qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
+
+ if (qd->qd_count > 1 ||
+ (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+ list_move(&qd->qd_list, head);
+ spin_unlock(&sdp->sd_quota_spin);
+ schedule();
+ spin_lock(&sdp->sd_quota_spin);
+ continue;
+ }
+
+ list_del(&qd->qd_list);
+ atomic_dec(&sdp->sd_quota_count);
+ spin_unlock(&sdp->sd_quota_spin);
+
+ if (!qd->qd_count) {
+ gfs2_assert_warn(sdp, !qd->qd_change);
+ gfs2_assert_warn(sdp, !qd->qd_slot_count);
+ } else
+ gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
+ gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+ gfs2_lvb_unhold(qd->qd_gl);
+ kfree(qd);
+
+ spin_lock(&sdp->sd_quota_spin);
+ }
+ spin_unlock(&sdp->sd_quota_spin);
+
+ gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
+
+ if (sdp->sd_quota_bitmap) {
+ for (x = 0; x < sdp->sd_quota_chunks; x++)
+ kfree(sdp->sd_quota_bitmap[x]);
+ kfree(sdp->sd_quota_bitmap);
+ }
+}
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..a8be1417051f
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __QUOTA_DOT_H__
+#define __QUOTA_DOT_H__
+
+struct gfs2_inode;
+struct gfs2_sbd;
+
+#define NO_QUOTA_CHANGE ((u32)-1)
+
+int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_unhold(struct gfs2_inode *ip);
+
+int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_unlock(struct gfs2_inode *ip);
+
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+ u32 uid, u32 gid);
+
+int gfs2_quota_sync(struct gfs2_sbd *sdp);
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+
+int gfs2_quota_init(struct gfs2_sbd *sdp);
+void gfs2_quota_scan(struct gfs2_sbd *sdp);
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+
+#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..62cd223819b7
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+#include "dir.h"
+
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+ struct buffer_head **bh)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_glock *gl = ip->i_gl;
+ int new = 0;
+ u64 dblock;
+ u32 extlen;
+ int error;
+
+ error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
+ if (error)
+ return error;
+ if (!dblock) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ *bh = gfs2_meta_ra(gl, dblock, extlen);
+
+ return error;
+}
+
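+/*
+ * Revoke tracking for journal replay: gfs2_revoke_add() records the most
+ * recent journal location at which a block was revoked, and
+ * gfs2_revoke_check() later tells the replay code whether a metadata block
+ * found at a given location is still covered by such a revoke.
+ */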
+int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+{
+ struct list_head *head = &sdp->sd_revoke_list;
+ struct gfs2_revoke_replay *rr;
+ int found = 0;
+
+ list_for_each_entry(rr, head, rr_list) {
+ if (rr->rr_blkno == blkno) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found) {
+ rr->rr_where = where;
+ return 0;
+ }
+
+ rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
+ if (!rr)
+ return -ENOMEM;
+
+ rr->rr_blkno = blkno;
+ rr->rr_where = where;
+ list_add(&rr->rr_list, head);
+
+ return 1;
+}
+
+int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+{
+ struct gfs2_revoke_replay *rr;
+ int wrap, a, b, revoke;
+ int found = 0;
+
+ list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
+ if (rr->rr_blkno == blkno) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return 0;
+
+ wrap = (rr->rr_where < sdp->sd_replay_tail);
+ a = (sdp->sd_replay_tail < where);
+ b = (where < rr->rr_where);
+ revoke = (wrap) ? (a || b) : (a && b);
+
+ return revoke;
+}
+
+void gfs2_revoke_clean(struct gfs2_sbd *sdp)
+{
+ struct list_head *head = &sdp->sd_revoke_list;
+ struct gfs2_revoke_replay *rr;
+
+ while (!list_empty(head)) {
+ rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
+ list_del(&rr->rr_list);
+ kfree(rr);
+ }
+}
+
+/**
+ * get_log_header - read the log header for a given segment
+ * @jd: the journal
+ * @blk: the block to look at
+ * @head: the log header to return
+ *
+ * Read the log header for a given segment in a given journal. Do a few
+ * sanity checks on it.
+ *
+ * Returns: 0 on success,
+ * 1 if the header was invalid or incomplete,
+ * errno on error
+ */
+
+static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
+ struct gfs2_log_header *head)
+{
+ struct buffer_head *bh;
+ struct gfs2_log_header lh;
+ u32 hash;
+ int error;
+
+ error = gfs2_replay_read_block(jd, blk, &bh);
+ if (error)
+ return error;
+
+ memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
+ lh.lh_hash = 0;
+ hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
+ gfs2_log_header_in(&lh, bh->b_data);
+
+ brelse(bh);
+
+ if (lh.lh_header.mh_magic != GFS2_MAGIC ||
+ lh.lh_header.mh_type != GFS2_METATYPE_LH ||
+ lh.lh_blkno != blk || lh.lh_hash != hash)
+ return 1;
+
+ *head = lh;
+
+ return 0;
+}
+
+/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: the segment to start searching from
+ * @head: the log header to fill in
+ *
+ * Call get_log_header() to get a log header for a segment, but if the
+ * segment is bad, scan forward (wrapping around the journal) until we find
+ * a good one.
+ *
+ * Returns: errno
+ */
+
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+ struct gfs2_log_header *head)
+{
+ unsigned int orig_blk = *blk;
+ int error;
+
+ for (;;) {
+ error = get_log_header(jd, *blk, head);
+ if (error <= 0)
+ return error;
+
+ if (++*blk == jd->jd_blocks)
+ *blk = 0;
+
+ if (*blk == orig_blk) {
+ gfs2_consist_inode(GFS2_I(jd->jd_inode));
+ return -EIO;
+ }
+ }
+}
+
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: this is filled in with the log descriptor of the head
+ *
+ * At this point, seg and lh should be either the head of the log or just
+ * before. Scan forward until we find the head.
+ *
+ * Returns: errno
+ */
+
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+ unsigned int blk = head->lh_blkno;
+ struct gfs2_log_header lh;
+ int error;
+
+ for (;;) {
+ if (++blk == jd->jd_blocks)
+ blk = 0;
+
+ error = get_log_header(jd, blk, &lh);
+ if (error < 0)
+ return error;
+ if (error == 1)
+ continue;
+
+ if (lh.lh_sequence == head->lh_sequence) {
+ gfs2_consist_inode(GFS2_I(jd->jd_inode));
+ return -EIO;
+ }
+ if (lh.lh_sequence < head->lh_sequence)
+ break;
+
+ *head = lh;
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number. (i.e. the log head)
+ *
+ * Returns: errno
+ */
+
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+ struct gfs2_log_header lh_1, lh_m;
+ u32 blk_1, blk_2, blk_m;
+ int error;
+
+ blk_1 = 0;
+ blk_2 = jd->jd_blocks - 1;
+
+ for (;;) {
+ blk_m = (blk_1 + blk_2) / 2;
+
+ error = find_good_lh(jd, &blk_1, &lh_1);
+ if (error)
+ return error;
+
+ error = find_good_lh(jd, &blk_m, &lh_m);
+ if (error)
+ return error;
+
+ if (blk_1 == blk_m || blk_m == blk_2)
+ break;
+
+ if (lh_1.lh_sequence <= lh_m.lh_sequence)
+ blk_1 = blk_m;
+ else
+ blk_2 = blk_m;
+ }
+
+ error = jhead_scan(jd, &lh_1);
+ if (error)
+ return error;
+
+ *head = lh_1;
+
+ return error;
+}
+
+/**
+ * foreach_descriptor - go through the active part of the log
+ * @jd: the journal
+ * @start: the first log header in the active region
+ * @end: the last log header (don't process the contents of this entry)
+ * @pass: which pass of the replay this is
+ *
+ * Call a given function once for every log descriptor in the active
+ * portion of the log.
+ *
+ * Returns: errno
+ */
+
+static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
+ unsigned int end, int pass)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ struct buffer_head *bh;
+ struct gfs2_log_descriptor *ld;
+ int error = 0;
+ u32 length;
+ __be64 *ptr;
+ unsigned int offset = sizeof(struct gfs2_log_descriptor);
+ offset += sizeof(__be64) - 1;
+ offset &= ~(sizeof(__be64) - 1);
+
+ while (start != end) {
+ error = gfs2_replay_read_block(jd, start, &bh);
+ if (error)
+ return error;
+ if (gfs2_meta_check(sdp, bh)) {
+ brelse(bh);
+ return -EIO;
+ }
+ ld = (struct gfs2_log_descriptor *)bh->b_data;
+ length = be32_to_cpu(ld->ld_length);
+
+ if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
+ struct gfs2_log_header lh;
+ error = get_log_header(jd, start, &lh);
+ if (!error) {
+ gfs2_replay_incr_blk(sdp, &start);
+ brelse(bh);
+ continue;
+ }
+ if (error == 1) {
+ gfs2_consist_inode(GFS2_I(jd->jd_inode));
+ error = -EIO;
+ }
+ brelse(bh);
+ return error;
+ } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
+ brelse(bh);
+ return -EIO;
+ }
+ ptr = (__be64 *)(bh->b_data + offset);
+ error = lops_scan_elements(jd, start, ld, ptr, pass);
+ if (error) {
+ brelse(bh);
+ return error;
+ }
+
+ while (length--)
+ gfs2_replay_incr_blk(sdp, &start);
+
+ brelse(bh);
+ }
+
+ return 0;
+}
+
+/**
+ * clean_journal - mark a dirty journal as being clean
+ * @jd: the journal
+ * @head: the head journal to start from
+ *
+ * Returns: errno
+ */
+
+static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ unsigned int lblock;
+ struct gfs2_log_header *lh;
+ u32 hash;
+ struct buffer_head *bh;
+ int error;
+ struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+ lblock = head->lh_blkno;
+ gfs2_replay_incr_blk(sdp, &lblock);
+ bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+ error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map);
+ if (error)
+ return error;
+ if (!bh_map.b_blocknr) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ lh = (struct gfs2_log_header *)bh->b_data;
+ memset(lh, 0, sizeof(struct gfs2_log_header));
+ lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+ lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+ lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+ lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
+ lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
+ lh->lh_blkno = cpu_to_be32(lblock);
+ hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
+ lh->lh_hash = cpu_to_be32(hash);
+
+ set_buffer_dirty(bh);
+ if (sync_dirty_buffer(bh))
+ gfs2_io_error_bh(sdp, bh);
+ brelse(bh);
+
+ return error;
+}
+
+/**
+ * gfs2_recover_journal - recover a given journal
+ * @jd: the struct gfs2_jdesc describing the journal
+ *
+ * Acquire the journal's lock, check to see if the journal is clean, and
+ * do recovery if necessary.
+ *
+ * Returns: errno
+ */
+
+int gfs2_recover_journal(struct gfs2_jdesc *jd)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ struct gfs2_log_header head;
+ struct gfs2_holder j_gh, ji_gh, t_gh;
+ unsigned long t;
+ int ro = 0;
+ unsigned int pass;
+ int error;
+
+ if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+ fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
+ jd->jd_jid);
+
+ /* Acquire the journal lock so we can do recovery */
+
+ error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
+ LM_ST_EXCLUSIVE,
+ LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
+ &j_gh);
+ switch (error) {
+ case 0:
+ break;
+
+ case GLR_TRYFAILED:
+ fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
+ error = 0;
+
+ default:
+ goto fail;
+ };
+
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP, &ji_gh);
+ if (error)
+ goto fail_gunlock_j;
+ } else {
+ fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
+ }
+
+ fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
+
+ error = gfs2_jdesc_check(jd);
+ if (error)
+ goto fail_gunlock_ji;
+
+ error = gfs2_find_jhead(jd, &head);
+ if (error)
+ goto fail_gunlock_ji;
+
+ if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+ fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
+ jd->jd_jid);
+
+ t = jiffies;
+
+ /* Acquire a shared hold on the transaction lock */
+
+ error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
+ GL_NOCANCEL | GL_NOCACHE, &t_gh);
+ if (error)
+ goto fail_gunlock_ji;
+
+ if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+ if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ ro = 1;
+ } else {
+ if (sdp->sd_vfs->s_flags & MS_RDONLY)
+ ro = 1;
+ }
+
+ if (ro) {
+ fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
+ jd->jd_jid);
+ error = -EROFS;
+ goto fail_gunlock_tr;
+ }
+
+ fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
+
+ for (pass = 0; pass < 2; pass++) {
+ lops_before_scan(jd, &head, pass);
+ error = foreach_descriptor(jd, head.lh_tail,
+ head.lh_blkno, pass);
+ lops_after_scan(jd, error, pass);
+ if (error)
+ goto fail_gunlock_tr;
+ }
+
+ error = clean_journal(jd, &head);
+ if (error)
+ goto fail_gunlock_tr;
+
+ gfs2_glock_dq_uninit(&t_gh);
+ t = DIV_ROUND_UP(jiffies - t, HZ);
+ fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
+ jd->jd_jid, t);
+ }
+
+ if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+ gfs2_glock_dq_uninit(&ji_gh);
+
+ gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
+
+ if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+ gfs2_glock_dq_uninit(&j_gh);
+
+ fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
+ return 0;
+
+fail_gunlock_tr:
+ gfs2_glock_dq_uninit(&t_gh);
+fail_gunlock_ji:
+ if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+ gfs2_glock_dq_uninit(&ji_gh);
+fail_gunlock_j:
+ gfs2_glock_dq_uninit(&j_gh);
+ }
+
+ fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
+
+fail:
+ gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+ return error;
+}
+
+/**
+ * gfs2_check_journals - Recover any dirty journals
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_check_journals(struct gfs2_sbd *sdp)
+{
+ struct gfs2_jdesc *jd;
+
+ for (;;) {
+ jd = gfs2_jdesc_find_dirty(sdp);
+ if (!jd)
+ break;
+
+ if (jd != sdp->sd_jdesc)
+ gfs2_recover_journal(jd);
+ }
+}
+
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..961feedf4d8b
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __RECOVERY_DOT_H__
+#define __RECOVERY_DOT_H__
+
+#include "incore.h"
+
+static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
+{
+ if (++*blk == sdp->sd_jdesc->jd_blocks)
+ *blk = 0;
+}
+
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+ struct buffer_head **bh);
+
+int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+
+int gfs2_find_jhead(struct gfs2_jdesc *jd,
+ struct gfs2_log_header *head);
+int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+void gfs2_check_journals(struct gfs2_sbd *sdp);
+
+#endif /* __RECOVERY_DOT_H__ */
+
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..b261385c0065
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1513 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "ops_file.h"
+#include "util.h"
+
+#define BFITNOENT ((u32)~0)
+
+/*
+ * These routines are used by the resource group routines (rgrp.c)
+ * to keep track of block allocation. Each block is represented by two
+ * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks.
+ *
+ * 0 = Free
+ * 1 = Used (not metadata)
+ * 2 = Unlinked (still in use) inode
+ * 3 = Used (metadata)
+ */
+
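+/*
+ * valid_change is indexed by (new_state * 4 + current_state); a non-zero
+ * entry means the transition is legal.  gfs2_setbit() uses it to catch
+ * corrupt bitmaps, e.g. freeing a block that is already free.
+ */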
+static const char valid_change[16] = {
+ /* current */
+ /* n */ 0, 1, 1, 1,
+ /* e */ 1, 0, 0, 0,
+ /* w */ 0, 0, 0, 1,
+ 1, 0, 0, 0
+};
+
+/**
+ * gfs2_setbit - Set a bit in the bitmaps
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to set
+ * @new_state: the new state of the block
+ *
+ */
+
+static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+ unsigned int buflen, u32 block,
+ unsigned char new_state)
+{
+ unsigned char *byte, *end, cur_state;
+ unsigned int bit;
+
+ byte = buffer + (block / GFS2_NBBY);
+ bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+ end = buffer + buflen;
+
+ gfs2_assert(rgd->rd_sbd, byte < end);
+
+ cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+
+ if (valid_change[new_state * 4 + cur_state]) {
+ *byte ^= cur_state << bit;
+ *byte |= new_state << bit;
+ } else
+ gfs2_consist_rgrpd(rgd);
+}
+
+/**
+ * gfs2_testbit - test a bit in the bitmaps
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to read
+ *
+ */
+
+static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+ unsigned int buflen, u32 block)
+{
+ unsigned char *byte, *end, cur_state;
+ unsigned int bit;
+
+ byte = buffer + (block / GFS2_NBBY);
+ bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+ end = buffer + buflen;
+
+ gfs2_assert(rgd->rd_sbd, byte < end);
+
+ cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+
+ return cur_state;
+}
+
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ * a block in a given allocation state.
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buffer)
+ * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
+ * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem. @buffer will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures.
+ *
+ * Return: the block number (bitmap buffer scope) that was found
+ */
+
+static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+ unsigned int buflen, u32 goal,
+ unsigned char old_state)
+{
+ unsigned char *byte, *end, alloc;
+ u32 blk = goal;
+ unsigned int bit;
+
+ byte = buffer + (goal / GFS2_NBBY);
+ bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+ end = buffer + buflen;
+ alloc = (old_state & 1) ? 0 : 0x55;
+
+ while (byte < end) {
+ if ((*byte & 0x55) == alloc) {
+ blk += (8 - bit) >> 1;
+
+ bit = 0;
+ byte++;
+
+ continue;
+ }
+
+ if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
+ return blk;
+
+ bit += GFS2_BIT_SIZE;
+ if (bit >= 8) {
+ bit = 0;
+ byte++;
+ }
+
+ blk++;
+ }
+
+ return BFITNOENT;
+}
+
+/**
+ * gfs2_bitcount - count the number of bits in a certain state
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @state: the state of the block we're looking for
+ *
+ * Returns: The number of bits
+ */
+
+static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+ unsigned int buflen, unsigned char state)
+{
+ unsigned char *byte = buffer;
+ unsigned char *end = buffer + buflen;
+ unsigned char state1 = state << 2;
+ unsigned char state2 = state << 4;
+ unsigned char state3 = state << 6;
+ u32 count = 0;
+
+ for (; byte < end; byte++) {
+ if (((*byte) & 0x03) == state)
+ count++;
+ if (((*byte) & 0x0C) == state1)
+ count++;
+ if (((*byte) & 0x30) == state2)
+ count++;
+ if (((*byte) & 0xC0) == state3)
+ count++;
+ }
+
+ return count;
+}
+
+/**
+ * gfs2_rgrp_verify - Verify that a resource group is consistent
+ * @rgd: the rgrp
+ *
+ */
+
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct gfs2_bitmap *bi = NULL;
+ u32 length = rgd->rd_ri.ri_length;
+ u32 count[4], tmp;
+ int buf, x;
+
+ memset(count, 0, 4 * sizeof(u32));
+
+ /* Count # blocks in each of 4 possible allocation states */
+ for (buf = 0; buf < length; buf++) {
+ bi = rgd->rd_bits + buf;
+ for (x = 0; x < 4; x++)
+ count[x] += gfs2_bitcount(rgd,
+ bi->bi_bh->b_data +
+ bi->bi_offset,
+ bi->bi_len, x);
+ }
+
+ if (count[0] != rgd->rd_rg.rg_free) {
+ if (gfs2_consist_rgrpd(rgd))
+ fs_err(sdp, "free data mismatch: %u != %u\n",
+ count[0], rgd->rd_rg.rg_free);
+ return;
+ }
+
+ tmp = rgd->rd_ri.ri_data -
+ rgd->rd_rg.rg_free -
+ rgd->rd_rg.rg_dinodes;
+ if (count[1] + count[2] != tmp) {
+ if (gfs2_consist_rgrpd(rgd))
+ fs_err(sdp, "used data mismatch: %u != %u\n",
+ count[1], tmp);
+ return;
+ }
+
+ if (count[3] != rgd->rd_rg.rg_dinodes) {
+ if (gfs2_consist_rgrpd(rgd))
+ fs_err(sdp, "used metadata mismatch: %u != %u\n",
+ count[3], rgd->rd_rg.rg_dinodes);
+ return;
+ }
+
+ if (count[2] > count[3]) {
+ if (gfs2_consist_rgrpd(rgd))
+ fs_err(sdp, "unlinked inodes > inodes: %u\n",
+ count[2]);
+ return;
+ }
+
+}
+
+static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block)
+{
+ u64 first = ri->ri_data0;
+ u64 last = first + ri->ri_data;
+ return first <= block && block < last;
+}
+
+/**
+ * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
+ * @sdp: The GFS2 superblock
+ * @blk: The data block number
+ *
+ * Returns: The resource group, or NULL if not found
+ */
+
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
+{
+ struct gfs2_rgrpd *rgd;
+
+ spin_lock(&sdp->sd_rindex_spin);
+
+ list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
+ if (rgrp_contains_block(&rgd->rd_ri, blk)) {
+ list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+ spin_unlock(&sdp->sd_rindex_spin);
+ return rgd;
+ }
+ }
+
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return NULL;
+}
+
+/**
+ * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: The first rgrp in the filesystem
+ */
+
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
+{
+ gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
+ return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
+}
+
+/**
+ * gfs2_rgrpd_get_next - get the next RG
+ * @rgd: A RG
+ *
+ * Returns: The next rgrp
+ */
+
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
+{
+ if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
+ return NULL;
+ return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
+}
+
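+/*
+ * clear_rgrpdi - tear down the in-core resource group index: empty the
+ * "recent" list, then free every rgrp descriptor along with its bitmap
+ * array and its reference on the rgrp glock.
+ */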
+static void clear_rgrpdi(struct gfs2_sbd *sdp)
+{
+ struct list_head *head;
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_glock *gl;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ sdp->sd_rindex_forward = NULL;
+ head = &sdp->sd_rindex_recent_list;
+ while (!list_empty(head)) {
+ rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
+ list_del(&rgd->rd_recent);
+ }
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ head = &sdp->sd_rindex_list;
+ while (!list_empty(head)) {
+ rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
+ gl = rgd->rd_gl;
+
+ list_del(&rgd->rd_list);
+ list_del(&rgd->rd_list_mru);
+
+ if (gl) {
+ gl->gl_object = NULL;
+ gfs2_glock_put(gl);
+ }
+
+ kfree(rgd->rd_bits);
+ kfree(rgd);
+ }
+}
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+{
+ mutex_lock(&sdp->sd_rindex_mutex);
+ clear_rgrpdi(sdp);
+ mutex_unlock(&sdp->sd_rindex_mutex);
+}
+
+/**
+ * compute_bitstructs - Compute the bitmap sizes
+ * @rgd: The resource group descriptor
+ *
+ * Calculates bitmap descriptors, one for each block that contains bitmap data
+ *
+ * Returns: errno
+ */
+
+static int compute_bitstructs(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct gfs2_bitmap *bi;
+ u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
+ u32 bytes_left, bytes;
+ int x;
+
+ if (!length)
+ return -EINVAL;
+
+ rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
+ if (!rgd->rd_bits)
+ return -ENOMEM;
+
+ bytes_left = rgd->rd_ri.ri_bitbytes;
+
+ for (x = 0; x < length; x++) {
+ bi = rgd->rd_bits + x;
+
+ /* small rgrp; bitmap stored completely in header block */
+ if (length == 1) {
+ bytes = bytes_left;
+ bi->bi_offset = sizeof(struct gfs2_rgrp);
+ bi->bi_start = 0;
+ bi->bi_len = bytes;
+ /* header block */
+ } else if (x == 0) {
+ bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
+ bi->bi_offset = sizeof(struct gfs2_rgrp);
+ bi->bi_start = 0;
+ bi->bi_len = bytes;
+ /* last block */
+ } else if (x + 1 == length) {
+ bytes = bytes_left;
+ bi->bi_offset = sizeof(struct gfs2_meta_header);
+ bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
+ bi->bi_len = bytes;
+ /* other blocks */
+ } else {
+ bytes = sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_meta_header);
+ bi->bi_offset = sizeof(struct gfs2_meta_header);
+ bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
+ bi->bi_len = bytes;
+ }
+
+ bytes_left -= bytes;
+ }
+
+ if (bytes_left) {
+ gfs2_consist_rgrpd(rgd);
+ return -EIO;
+ }
+ bi = rgd->rd_bits + (length - 1);
+ if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
+ if (gfs2_consist_rgrpd(rgd)) {
+ gfs2_rindex_print(&rgd->rd_ri);
+ fs_err(sdp, "start=%u len=%u offset=%u\n",
+ bi->bi_start, bi->bi_len, bi->bi_offset);
+ }
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
+ * gfs2_ri_update - Pull in a new resource index from the disk
+ * @ip: a pointer to the rindex inode
+ *
+ * Returns: 0 on successful update, error code otherwise
+ */
+
+static int gfs2_ri_update(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_rgrpd *rgd;
+ char buf[sizeof(struct gfs2_rindex)];
+ struct file_ra_state ra_state;
+ u64 junk = ip->i_di.di_size;
+ int error;
+
+ if (do_div(junk, sizeof(struct gfs2_rindex))) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ clear_rgrpdi(sdp);
+
+ file_ra_state_init(&ra_state, inode->i_mapping);
+ for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
+ loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
+ error = gfs2_internal_read(ip, &ra_state, buf, &pos,
+ sizeof(struct gfs2_rindex));
+ if (!error)
+ break;
+ if (error != sizeof(struct gfs2_rindex)) {
+ if (error > 0)
+ error = -EIO;
+ goto fail;
+ }
+
+ rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
+ error = -ENOMEM;
+ if (!rgd)
+ goto fail;
+
+ mutex_init(&rgd->rd_mutex);
+ lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
+ rgd->rd_sbd = sdp;
+
+ list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
+ list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+
+ gfs2_rindex_in(&rgd->rd_ri, buf);
+ error = compute_bitstructs(rgd);
+ if (error)
+ goto fail;
+
+ error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
+ &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
+ if (error)
+ goto fail;
+
+ rgd->rd_gl->gl_object = rgd;
+ rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
+ }
+
+ sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+ return 0;
+
+fail:
+ clear_rgrpdi(sdp);
+ return error;
+}
+
+/**
+ * gfs2_rindex_hold - Grab a lock on the rindex
+ * @sdp: The GFS2 superblock
+ * @ri_gh: the glock holder
+ *
+ * We grab a lock on the rindex inode to make sure that it doesn't
+ * change whilst we are performing an operation. We keep this lock
+ * for quite long periods of time compared to other locks. This
+ * doesn't matter, since it is shared and it is very, very rarely
+ * accessed in the exclusive mode (i.e. only when expanding the filesystem).
+ *
+ * This makes sure that we're using the latest copy of the resource index
+ * special file, which might have been updated if someone expanded the
+ * filesystem (via gfs2_grow utility), which adds new resource groups.
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
+{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+ struct gfs2_glock *gl = ip->i_gl;
+ int error;
+
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
+ if (error)
+ return error;
+
+ /* Read new copy from disk if we don't have the latest */
+ if (sdp->sd_rindex_vn != gl->gl_vn) {
+ mutex_lock(&sdp->sd_rindex_mutex);
+ if (sdp->sd_rindex_vn != gl->gl_vn) {
+ error = gfs2_ri_update(ip);
+ if (error)
+ gfs2_glock_dq_uninit(ri_gh);
+ }
+ mutex_unlock(&sdp->sd_rindex_mutex);
+ }
+
+ return error;
+}
+
+/**
+ * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
+ * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ *
+ * Read in all of a Resource Group's header and bitmap blocks.
+ * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
+ *
+ * Returns: errno
+ */
+
+int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct gfs2_glock *gl = rgd->rd_gl;
+ unsigned int length = rgd->rd_ri.ri_length;
+ struct gfs2_bitmap *bi;
+ unsigned int x, y;
+ int error;
+
+ mutex_lock(&rgd->rd_mutex);
+
+ spin_lock(&sdp->sd_rindex_spin);
+ if (rgd->rd_bh_count) {
+ rgd->rd_bh_count++;
+ spin_unlock(&sdp->sd_rindex_spin);
+ mutex_unlock(&rgd->rd_mutex);
+ return 0;
+ }
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ for (x = 0; x < length; x++) {
+ bi = rgd->rd_bits + x;
+ error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh);
+ if (error)
+ goto fail;
+ }
+
+ for (y = length; y--;) {
+ bi = rgd->rd_bits + y;
+ error = gfs2_meta_wait(sdp, bi->bi_bh);
+ if (error)
+ goto fail;
+ if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
+ GFS2_METATYPE_RG)) {
+ error = -EIO;
+ goto fail;
+ }
+ }
+
+ if (rgd->rd_rg_vn != gl->gl_vn) {
+ gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
+ rgd->rd_rg_vn = gl->gl_vn;
+ }
+
+ spin_lock(&sdp->sd_rindex_spin);
+ rgd->rd_free_clone = rgd->rd_rg.rg_free;
+ rgd->rd_bh_count++;
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ mutex_unlock(&rgd->rd_mutex);
+
+ return 0;
+
+fail:
+ while (x--) {
+ bi = rgd->rd_bits + x;
+ brelse(bi->bi_bh);
+ bi->bi_bh = NULL;
+ gfs2_assert_warn(sdp, !bi->bi_clone);
+ }
+ mutex_unlock(&rgd->rd_mutex);
+
+ return error;
+}
+
+void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
+ rgd->rd_bh_count++;
+ spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ *
+ */
+
+void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ int x, length = rgd->rd_ri.ri_length;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
+ if (--rgd->rd_bh_count) {
+ spin_unlock(&sdp->sd_rindex_spin);
+ return;
+ }
+
+ for (x = 0; x < length; x++) {
+ struct gfs2_bitmap *bi = rgd->rd_bits + x;
+ kfree(bi->bi_clone);
+ bi->bi_clone = NULL;
+ brelse(bi->bi_bh);
+ bi->bi_bh = NULL;
+ }
+
+ spin_unlock(&sdp->sd_rindex_spin);
+}
+
+void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ unsigned int length = rgd->rd_ri.ri_length;
+ unsigned int x;
+
+ for (x = 0; x < length; x++) {
+ struct gfs2_bitmap *bi = rgd->rd_bits + x;
+ if (!bi->bi_clone)
+ continue;
+ memcpy(bi->bi_clone + bi->bi_offset,
+ bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
+ }
+
+ spin_lock(&sdp->sd_rindex_spin);
+ rgd->rd_free_clone = rgd->rd_rg.rg_free;
+ spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
+ * @ip: the incore GFS2 inode structure
+ *
+ * Returns: the struct gfs2_alloc
+ */
+
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
+{
+ struct gfs2_alloc *al = &ip->i_alloc;
+
+ /* FIXME: Should assert that the correct locks are held here... */
+ memset(al, 0, sizeof(*al));
+ return al;
+}
+
+/**
+ * try_rgrp_fit - See if a given reservation will fit in a given RG
+ * @rgd: the RG data
+ * @al: the struct gfs2_alloc structure describing the reservation
+ *
+ * If there's room for the requested blocks to be allocated from the RG,
+ * the al_rgd field in @al is set to point at the RG.
+ *
+ * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
+ */
+
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ int ret = 0;
+
+ spin_lock(&sdp->sd_rindex_spin);
+ if (rgd->rd_free_clone >= al->al_requested) {
+ al->al_rgd = rgd;
+ ret = 1;
+ }
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return ret;
+}
+
+/**
+ * recent_rgrp_first - get first RG from "recent" list
+ * @sdp: The GFS2 superblock
+ * @rglast: address of the rgrp used last
+ *
+ * Returns: The first rgrp in the recent list
+ */
+
+static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
+ u64 rglast)
+{
+ struct gfs2_rgrpd *rgd = NULL;
+
+ spin_lock(&sdp->sd_rindex_spin);
+
+ if (list_empty(&sdp->sd_rindex_recent_list))
+ goto out;
+
+ if (!rglast)
+ goto first;
+
+ list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+ if (rgd->rd_ri.ri_addr == rglast)
+ goto out;
+ }
+
+first:
+ rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
+ rd_recent);
+out:
+ spin_unlock(&sdp->sd_rindex_spin);
+ return rgd;
+}
+
+/**
+ * recent_rgrp_next - get next RG from "recent" list
+ * @cur_rgd: current rgrp
+ * @remove: If set, remove @cur_rgd from the recent list
+ *
+ * Returns: The next rgrp in the recent list
+ */
+
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
+ int remove)
+{
+ struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
+ struct list_head *head;
+ struct gfs2_rgrpd *rgd;
+
+ spin_lock(&sdp->sd_rindex_spin);
+
+ head = &sdp->sd_rindex_recent_list;
+
+ list_for_each_entry(rgd, head, rd_recent) {
+ if (rgd == cur_rgd) {
+ if (cur_rgd->rd_recent.next != head)
+ rgd = list_entry(cur_rgd->rd_recent.next,
+ struct gfs2_rgrpd, rd_recent);
+ else
+ rgd = NULL;
+
+ if (remove)
+ list_del(&cur_rgd->rd_recent);
+
+ goto out;
+ }
+ }
+
+ rgd = NULL;
+ if (!list_empty(head))
+ rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
+
+out:
+ spin_unlock(&sdp->sd_rindex_spin);
+ return rgd;
+}
+
+/**
+ * recent_rgrp_add - add an RG to tail of "recent" list
+ * @new_rgd: The rgrp to add
+ *
+ */
+
+static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
+{
+ struct gfs2_sbd *sdp = new_rgd->rd_sbd;
+ struct gfs2_rgrpd *rgd;
+ unsigned int count = 0;
+ unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
+
+ spin_lock(&sdp->sd_rindex_spin);
+
+ list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+ if (rgd == new_rgd)
+ goto out;
+
+ if (++count >= max)
+ goto out;
+ }
+ list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
+
+out:
+ spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * forward_rgrp_get - get an rgrp to try next from full list
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: The rgrp to try next
+ */
+
+static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
+{
+ struct gfs2_rgrpd *rgd;
+ unsigned int journals = gfs2_jindex_size(sdp);
+ unsigned int rg = 0, x;
+
+ spin_lock(&sdp->sd_rindex_spin);
+
+ rgd = sdp->sd_rindex_forward;
+ if (!rgd) {
+ if (sdp->sd_rgrps >= journals)
+ rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
+
+ for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
+ x++, rgd = gfs2_rgrpd_get_next(rgd))
+ /* Do Nothing */;
+
+ sdp->sd_rindex_forward = rgd;
+ }
+
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return rgd;
+}
+
+/**
+ * forward_rgrp_set - set the forward rgrp pointer
+ * @sdp: the filesystem
+ * @rgd: The new forward rgrp
+ *
+ */
+
+static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
+{
+ spin_lock(&sdp->sd_rindex_spin);
+ sdp->sd_rindex_forward = rgd;
+ spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * get_local_rgrp - Choose and lock a rgrp for allocation
+ * @ip: the inode to reserve space for
+ *
+ * Try to acquire an rgrp in a way which avoids contending with other
+ * nodes; the chosen and locked rgrp is recorded in ip->i_alloc.al_rgd.
+ *
+ * Returns: errno
+ */
+
+static int get_local_rgrp(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrpd *rgd, *begin = NULL;
+ struct gfs2_alloc *al = &ip->i_alloc;
+ int flags = LM_FLAG_TRY;
+ int skipped = 0;
+ int loops = 0;
+ int error;
+
+ /* Try recently successful rgrps */
+
+ rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
+
+ while (rgd) {
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+ LM_FLAG_TRY, &al->al_rgd_gh);
+ switch (error) {
+ case 0:
+ if (try_rgrp_fit(rgd, al))
+ goto out;
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ rgd = recent_rgrp_next(rgd, 1);
+ break;
+
+ case GLR_TRYFAILED:
+ rgd = recent_rgrp_next(rgd, 0);
+ break;
+
+ default:
+ return error;
+ }
+ }
+
+ /* Go through full list of rgrps */
+
+ begin = rgd = forward_rgrp_get(sdp);
+
+ for (;;) {
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
+ &al->al_rgd_gh);
+ switch (error) {
+ case 0:
+ if (try_rgrp_fit(rgd, al))
+ goto out;
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ break;
+
+ case GLR_TRYFAILED:
+ skipped++;
+ break;
+
+ default:
+ return error;
+ }
+
+ rgd = gfs2_rgrpd_get_next(rgd);
+ if (!rgd)
+ rgd = gfs2_rgrpd_get_first(sdp);
+
+ if (rgd == begin) {
+ if (++loops >= 2 || !skipped)
+ return -ENOSPC;
+ flags = 0;
+ }
+ }
+
+out:
+ ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
+
+ if (begin) {
+ recent_rgrp_add(rgd);
+ rgd = gfs2_rgrpd_get_next(rgd);
+ if (!rgd)
+ rgd = gfs2_rgrpd_get_first(sdp);
+ forward_rgrp_set(sdp, rgd);
+ }
+
+ return 0;
+}
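+
+/*
+ * Note on the search order above: recently successful rgrps are tried
+ * first with LM_FLAG_TRY, so a contended lock is skipped rather than
+ * waited for.  Only after the full list has been walked once without a
+ * fit are blocking requests used (flags cleared), before giving up
+ * with -ENOSPC.
+ */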
+
+/**
+ * gfs2_inplace_reserve_i - Reserve space in the filesystem
+ * @ip: the inode to reserve space for
+ * @file: the source file the reservation was requested from (for debugging)
+ * @line: the line in @file the reservation was requested from (for debugging)
+ *
+ * Returns: errno
+ */
+
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ int error;
+
+ if (gfs2_assert_warn(sdp, al->al_requested))
+ return -EINVAL;
+
+ error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+ if (error)
+ return error;
+
+ error = get_local_rgrp(ip);
+ if (error) {
+ gfs2_glock_dq_uninit(&al->al_ri_gh);
+ return error;
+ }
+
+ al->al_file = file;
+ al->al_line = line;
+
+ return 0;
+}
+
+/**
+ * gfs2_inplace_release - release an inplace reservation
+ * @ip: the inode the reservation was taken out on
+ *
+ * Release a reservation made by gfs2_inplace_reserve().
+ */
+
+void gfs2_inplace_release(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+
+ if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
+ fs_warn(sdp, "al_alloced = %u, al_requested = %u "
+ "al_file = %s, al_line = %u\n",
+ al->al_alloced, al->al_requested, al->al_file,
+ al->al_line);
+
+ al->al_rgd = NULL;
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
+ gfs2_glock_dq_uninit(&al->al_ri_gh);
+}
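+
+/*
+ * Illustrative reservation lifecycle (a sketch, not part of this change):
+ *
+ *      struct gfs2_alloc *al = gfs2_alloc_get(ip);
+ *      al->al_requested = blocks_needed;
+ *      error = gfs2_inplace_reserve(ip);
+ *      if (!error) {
+ *              ... gfs2_alloc_data()/gfs2_alloc_meta() inside a transaction ...
+ *              gfs2_inplace_release(ip);
+ *      }
+ *      gfs2_alloc_put(ip);
+ */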
+
+/**
+ * gfs2_get_block_type - Determine the type of a block within an RG
+ * @rgd: the resource group holding the block
+ * @block: the block number
+ *
+ * Returns: The block type (GFS2_BLKST_*)
+ */
+
+unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+{
+ struct gfs2_bitmap *bi = NULL;
+ u32 length, rgrp_block, buf_block;
+ unsigned int buf;
+ unsigned char type;
+
+ length = rgd->rd_ri.ri_length;
+ rgrp_block = block - rgd->rd_ri.ri_data0;
+
+ for (buf = 0; buf < length; buf++) {
+ bi = rgd->rd_bits + buf;
+ if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+ break;
+ }
+
+ gfs2_assert(rgd->rd_sbd, buf < length);
+ buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
+
+ type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+ bi->bi_len, buf_block);
+
+ return type;
+}
+
+/**
+ * rgblk_search - find a block in @old_state and change its state to @new_state
+ * @rgd: the resource group descriptor
+ * @goal: the goal block within the RG (start here to search for avail block)
+ * @old_state: GFS2_BLKST_XXX the before-allocation state to find
+ * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ *
+ * Walk rgrp's bitmap to find bits that represent a block in @old_state.
+ * Add the found bitmap buffer to the transaction.
+ * Set the found bits to @new_state to change block's allocation state.
+ *
+ * This function never fails, because we wouldn't call it unless we
+ * know (from reservation results, etc.) that a block is available.
+ *
+ * Scope of @goal and returned block is just within rgrp, not the whole
+ * filesystem.
+ *
+ * Returns: the block number allocated
+ */
+
+static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
+ unsigned char old_state, unsigned char new_state)
+{
+ struct gfs2_bitmap *bi = NULL;
+ u32 length = rgd->rd_ri.ri_length;
+ u32 blk = 0;
+ unsigned int buf, x;
+
+ /* Find bitmap block that contains bits for goal block */
+ for (buf = 0; buf < length; buf++) {
+ bi = rgd->rd_bits + buf;
+ if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+ break;
+ }
+
+ gfs2_assert(rgd->rd_sbd, buf < length);
+
+ /* Convert scope of "goal" from rgrp-wide to within found bit block */
+ goal -= bi->bi_start * GFS2_NBBY;
+
+ /* Search (up to entire) bitmap in this rgrp for allocatable block.
+ "x <= length", instead of "x < length", because we typically start
+ the search in the middle of a bit block, but if we can't find an
+ allocatable block anywhere else, we want to be able wrap around and
+ search in the first part of our first-searched bit block. */
+ for (x = 0; x <= length; x++) {
+ if (bi->bi_clone)
+ blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
+ bi->bi_len, goal, old_state);
+ else
+ blk = gfs2_bitfit(rgd,
+ bi->bi_bh->b_data + bi->bi_offset,
+ bi->bi_len, goal, old_state);
+ if (blk != BFITNOENT)
+ break;
+
+ /* Try next bitmap block (wrap back to rgrp header if at end) */
+ buf = (buf + 1) % length;
+ bi = rgd->rd_bits + buf;
+ goal = 0;
+ }
+
+ if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
+ blk = 0;
+
+ gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+ gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+ bi->bi_len, blk, new_state);
+ if (bi->bi_clone)
+ gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
+ bi->bi_len, blk, new_state);
+
+ return bi->bi_start * GFS2_NBBY + blk;
+}
+
+/**
+ * rgblk_free - Change alloc state of given block(s)
+ * @sdp: the filesystem
+ * @bstart: the start of a run of blocks to free
+ * @blen: the length of the block run (all must lie within ONE RG!)
+ * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ *
+ * Returns: Resource group containing the block(s)
+ */
+
+static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
+ u32 blen, unsigned char new_state)
+{
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_bitmap *bi = NULL;
+ u32 length, rgrp_blk, buf_blk;
+ unsigned int buf;
+
+ rgd = gfs2_blk2rgrpd(sdp, bstart);
+ if (!rgd) {
+ if (gfs2_consist(sdp))
+ fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
+ return NULL;
+ }
+
+ length = rgd->rd_ri.ri_length;
+
+ rgrp_blk = bstart - rgd->rd_ri.ri_data0;
+
+ while (blen--) {
+ for (buf = 0; buf < length; buf++) {
+ bi = rgd->rd_bits + buf;
+ if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+ break;
+ }
+
+ gfs2_assert(rgd->rd_sbd, buf < length);
+
+ buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
+ rgrp_blk++;
+
+ if (!bi->bi_clone) {
+ bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ memcpy(bi->bi_clone + bi->bi_offset,
+ bi->bi_bh->b_data + bi->bi_offset,
+ bi->bi_len);
+ }
+ gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+ gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+ bi->bi_len, buf_blk, new_state);
+ }
+
+ return rgd;
+}
+
+/**
+ * gfs2_alloc_data - Allocate a data block
+ * @ip: the inode to allocate the data block for
+ *
+ * Returns: the allocated block
+ */
+
+u64 gfs2_alloc_data(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_rgrpd *rgd = al->al_rgd;
+ u32 goal, blk;
+ u64 block;
+
+ if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
+ goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
+ else
+ goal = rgd->rd_last_alloc_data;
+
+ blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+ rgd->rd_last_alloc_data = blk;
+
+ block = rgd->rd_ri.ri_data0 + blk;
+ ip->i_di.di_goal_data = block;
+
+ gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+ rgd->rd_rg.rg_free--;
+
+ gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+ al->al_alloced++;
+
+ gfs2_statfs_change(sdp, 0, -1, 0);
+ gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+
+ spin_lock(&sdp->sd_rindex_spin);
+ rgd->rd_free_clone--;
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return block;
+}
+
+/**
+ * gfs2_alloc_meta - Allocate a metadata block
+ * @ip: the inode to allocate the metadata block for
+ *
+ * Returns: the allocated block
+ */
+
+u64 gfs2_alloc_meta(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al = &ip->i_alloc;
+ struct gfs2_rgrpd *rgd = al->al_rgd;
+ u32 goal, blk;
+ u64 block;
+
+ if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
+ goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
+ else
+ goal = rgd->rd_last_alloc_meta;
+
+ blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+ rgd->rd_last_alloc_meta = blk;
+
+ block = rgd->rd_ri.ri_data0 + blk;
+ ip->i_di.di_goal_meta = block;
+
+ gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+ rgd->rd_rg.rg_free--;
+
+ gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+ al->al_alloced++;
+
+ gfs2_statfs_change(sdp, 0, -1, 0);
+ gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+ gfs2_trans_add_unrevoke(sdp, block);
+
+ spin_lock(&sdp->sd_rindex_spin);
+ rgd->rd_free_clone--;
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return block;
+}
+
+/**
+ * gfs2_alloc_di - Allocate a dinode
+ * @dip: the directory that the inode is going in
+ *
+ * Returns: the block allocated
+ */
+
+u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct gfs2_alloc *al = &dip->i_alloc;
+ struct gfs2_rgrpd *rgd = al->al_rgd;
+ u32 blk;
+ u64 block;
+
+ blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
+ GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
+
+ rgd->rd_last_alloc_meta = blk;
+
+ block = rgd->rd_ri.ri_data0 + blk;
+
+ gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+ rgd->rd_rg.rg_free--;
+ rgd->rd_rg.rg_dinodes++;
+ *generation = rgd->rd_rg.rg_igeneration++;
+ gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+ al->al_alloced++;
+
+ gfs2_statfs_change(sdp, 0, -1, +1);
+ gfs2_trans_add_unrevoke(sdp, block);
+
+ spin_lock(&sdp->sd_rindex_spin);
+ rgd->rd_free_clone--;
+ spin_unlock(&sdp->sd_rindex_spin);
+
+ return block;
+}
+
+/**
+ * gfs2_free_data - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrpd *rgd;
+
+ rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
+ if (!rgd)
+ return;
+
+ rgd->rd_rg.rg_free += blen;
+
+ gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+ gfs2_trans_add_rg(rgd);
+
+ gfs2_statfs_change(sdp, 0, +blen, 0);
+ gfs2_quota_change(ip, -(s64)blen,
+ ip->i_di.di_uid, ip->i_di.di_gid);
+}
+
+/**
+ * gfs2_free_meta - free a contiguous run of metadata block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrpd *rgd;
+
+ rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
+ if (!rgd)
+ return;
+
+ rgd->rd_rg.rg_free += blen;
+
+ gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+ gfs2_trans_add_rg(rgd);
+
+ gfs2_statfs_change(sdp, 0, +blen, 0);
+ gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid);
+ gfs2_meta_wipe(ip, bstart, blen);
+}
+
+void gfs2_unlink_di(struct inode *inode)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_rgrpd *rgd;
+ u64 blkno = ip->i_num.no_addr;
+
+ rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
+ if (!rgd)
+ return;
+ gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+ gfs2_trans_add_rg(rgd);
+}
+
+static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ struct gfs2_rgrpd *tmp_rgd;
+
+ tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
+ if (!tmp_rgd)
+ return;
+ gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
+
+ if (!rgd->rd_rg.rg_dinodes)
+ gfs2_consist_rgrpd(rgd);
+ rgd->rd_rg.rg_dinodes--;
+ rgd->rd_rg.rg_free++;
+
+ gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+ gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+ gfs2_statfs_change(sdp, 0, +1, -1);
+ gfs2_trans_add_rg(rgd);
+}
+
+
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+ gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
+ gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
+ gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
+}
+
+/**
+ * gfs2_rlist_add - add a RG to a list of RGs
+ * @sdp: the filesystem
+ * @rlist: the list of resource groups
+ * @block: the block
+ *
+ * Figure out what RG a block belongs to and add that RG to the list
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+
+void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+ u64 block)
+{
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_rgrpd **tmp;
+ unsigned int new_space;
+ unsigned int x;
+
+ if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
+ return;
+
+ rgd = gfs2_blk2rgrpd(sdp, block);
+ if (!rgd) {
+ if (gfs2_consist(sdp))
+ fs_err(sdp, "block = %llu\n", (unsigned long long)block);
+ return;
+ }
+
+ for (x = 0; x < rlist->rl_rgrps; x++)
+ if (rlist->rl_rgd[x] == rgd)
+ return;
+
+ if (rlist->rl_rgrps == rlist->rl_space) {
+ new_space = rlist->rl_space + 10;
+
+ tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
+ GFP_NOFS | __GFP_NOFAIL);
+
+ if (rlist->rl_rgd) {
+ memcpy(tmp, rlist->rl_rgd,
+ rlist->rl_space * sizeof(struct gfs2_rgrpd *));
+ kfree(rlist->rl_rgd);
+ }
+
+ rlist->rl_space = new_space;
+ rlist->rl_rgd = tmp;
+ }
+
+ rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
+}
+
+/**
+ * gfs2_rlist_alloc - allocate and initialize glock holders for the RGs already added to the rlist
+ * @rlist: the list of resource groups
+ * @state: the lock state to acquire the RG lock in
+ * @flags: the modifier flags for the holder structures
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+ int flags)
+{
+ unsigned int x;
+
+ rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
+ GFP_NOFS | __GFP_NOFAIL);
+ for (x = 0; x < rlist->rl_rgrps; x++)
+ gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
+ state, flags,
+ &rlist->rl_ghs[x]);
+}
+
+/**
+ * gfs2_rlist_free - free a resource group list
+ * @list: the list of resource groups
+ *
+ */
+
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
+{
+ unsigned int x;
+
+ kfree(rlist->rl_rgd);
+
+ if (rlist->rl_ghs) {
+ for (x = 0; x < rlist->rl_rgrps; x++)
+ gfs2_holder_uninit(&rlist->rl_ghs[x]);
+ kfree(rlist->rl_ghs);
+ }
+}
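+
+/*
+ * Illustrative rlist usage (a sketch, not part of this change):
+ *
+ *      struct gfs2_rgrp_list rlist;
+ *      memset(&rlist, 0, sizeof(rlist));
+ *      gfs2_rlist_add(sdp, &rlist, block);     (once per block of interest)
+ *      gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+ *      ... enqueue the holders in rlist.rl_ghs[], do the work, dequeue ...
+ *      gfs2_rlist_free(&rlist);
+ */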
+
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..b01e0cfc99b5
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __RGRP_DOT_H__
+#define __RGRP_DOT_H__
+
+struct gfs2_rgrpd;
+struct gfs2_sbd;
+struct gfs2_holder;
+
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
+
+int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
+
+void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+static inline void gfs2_alloc_put(struct gfs2_inode *ip)
+{
+ return; /* So we can see where ip->i_alloc is used */
+}
+
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
+ char *file, unsigned int line);
+#define gfs2_inplace_reserve(ip) \
+gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+
+void gfs2_inplace_release(struct gfs2_inode *ip);
+
+unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
+
+u64 gfs2_alloc_data(struct gfs2_inode *ip);
+u64 gfs2_alloc_meta(struct gfs2_inode *ip);
+u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+void gfs2_unlink_di(struct inode *inode);
+
+struct gfs2_rgrp_list {
+ unsigned int rl_rgrps;
+ unsigned int rl_space;
+ struct gfs2_rgrpd **rl_rgd;
+ struct gfs2_holder *rl_ghs;
+};
+
+void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+ u64 block);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+ int flags);
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+
+#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..6a78b1b32e25
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,976 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/crc32.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "util.h"
+
+static const u32 gfs2_old_fs_formats[] = {
+ 0
+};
+
+static const u32 gfs2_old_multihost_formats[] = {
+ 0
+};
+
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: the gfs2_tune structure to fill with default values
+ *
+ */
+
+void gfs2_tune_init(struct gfs2_tune *gt)
+{
+ spin_lock_init(&gt->gt_spin);
+
+ gt->gt_ilimit = 100;
+ gt->gt_ilimit_tries = 3;
+ gt->gt_ilimit_min = 1;
+ gt->gt_demote_secs = 300;
+ gt->gt_incore_log_blocks = 1024;
+ gt->gt_log_flush_secs = 60;
+ gt->gt_jindex_refresh_secs = 60;
+ gt->gt_scand_secs = 15;
+ gt->gt_recoverd_secs = 60;
+ gt->gt_logd_secs = 1;
+ gt->gt_quotad_secs = 5;
+ gt->gt_quota_simul_sync = 64;
+ gt->gt_quota_warn_period = 10;
+ gt->gt_quota_scale_num = 1;
+ gt->gt_quota_scale_den = 1;
+ gt->gt_quota_cache_secs = 300;
+ gt->gt_quota_quantum = 60;
+ gt->gt_atime_quantum = 3600;
+ gt->gt_new_files_jdata = 0;
+ gt->gt_new_files_directio = 0;
+ gt->gt_max_atomic_write = 4 << 20;
+ gt->gt_max_readahead = 1 << 18;
+ gt->gt_lockdump_size = 131072;
+ gt->gt_stall_secs = 600;
+ gt->gt_complain_secs = 10;
+ gt->gt_reclaim_limit = 5000;
+ gt->gt_entries_per_readdir = 32;
+ gt->gt_prefetch_secs = 10;
+ gt->gt_greedy_default = HZ / 10;
+ gt->gt_greedy_quantum = HZ / 40;
+ gt->gt_greedy_max = HZ / 4;
+ gt->gt_statfs_quantum = 30;
+ gt->gt_statfs_slow = 0;
+}
+
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print a message if the check fails
+ *
+ * Checks that the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ */
+
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
+{
+ unsigned int x;
+
+ if (sb->sb_header.mh_magic != GFS2_MAGIC ||
+ sb->sb_header.mh_type != GFS2_METATYPE_SB) {
+ if (!silent)
+ printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
+ return -EINVAL;
+ }
+
+ /* If format numbers match exactly, we're done. */
+
+ if (sb->sb_fs_format == GFS2_FORMAT_FS &&
+ sb->sb_multihost_format == GFS2_FORMAT_MULTI)
+ return 0;
+
+ if (sb->sb_fs_format != GFS2_FORMAT_FS) {
+ for (x = 0; gfs2_old_fs_formats[x]; x++)
+ if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
+ break;
+
+ if (!gfs2_old_fs_formats[x]) {
+ printk(KERN_WARNING
+ "GFS2: code version (%u, %u) is incompatible "
+ "with ondisk format (%u, %u)\n",
+ GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+ sb->sb_fs_format, sb->sb_multihost_format);
+ printk(KERN_WARNING
+ "GFS2: I don't know how to upgrade this FS\n");
+ return -EINVAL;
+ }
+ }
+
+ if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+ for (x = 0; gfs2_old_multihost_formats[x]; x++)
+ if (gfs2_old_multihost_formats[x] ==
+ sb->sb_multihost_format)
+ break;
+
+ if (!gfs2_old_multihost_formats[x]) {
+ printk(KERN_WARNING
+ "GFS2: code version (%u, %u) is incompatible "
+ "with ondisk format (%u, %u)\n",
+ GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+ sb->sb_fs_format, sb->sb_multihost_format);
+ printk(KERN_WARNING
+ "GFS2: I don't know how to upgrade this FS\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!sdp->sd_args.ar_upgrade) {
+ printk(KERN_WARNING
+ "GFS2: code version (%u, %u) is incompatible "
+ "with ondisk format (%u, %u)\n",
+ GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+ sb->sb_fs_format, sb->sb_multihost_format);
+ printk(KERN_INFO
+ "GFS2: Use the \"upgrade\" mount option to upgrade "
+ "the FS\n");
+ printk(KERN_INFO "GFS2: See the manual for more details\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+
+static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
+{
+ struct page *page = bio->bi_private;
+ if (bio->bi_size)
+ return 1;
+
+ if (!error)
+ SetPageUptodate(page);
+ else
+ printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+ unlock_page(page);
+ return 0;
+}
+
+struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
+{
+ struct page *page;
+ struct bio *bio;
+
+ page = alloc_page(GFP_KERNEL);
+ if (unlikely(!page))
+ return NULL;
+
+ ClearPageUptodate(page);
+ ClearPageDirty(page);
+ lock_page(page);
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ if (unlikely(!bio)) {
+ __free_page(page);
+ return NULL;
+ }
+
+ bio->bi_sector = sector;
+ bio->bi_bdev = sb->s_bdev;
+ bio_add_page(bio, page, PAGE_SIZE, 0);
+
+ bio->bi_end_io = end_bio_io_page;
+ bio->bi_private = page;
+ submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+ wait_on_page_locked(page);
+ bio_put(bio);
+ if (!PageUptodate(page)) {
+ __free_page(page);
+ return NULL;
+ }
+ return page;
+}
+
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @gl: the glock for the superblock (assumed to be held)
+ * @silent: Don't print message if mount fails
+ *
+ */
+
+int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+{
+ u32 hash_blocks, ind_blocks, leaf_blocks;
+ u32 tmp_blocks;
+ unsigned int x;
+ int error;
+ struct page *page;
+ char *sb;
+
+ page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+ if (!page) {
+ if (!silent)
+ fs_err(sdp, "can't read superblock\n");
+ return -EIO;
+ }
+ sb = kmap(page);
+ gfs2_sb_in(&sdp->sd_sb, sb);
+ kunmap(page);
+ __free_page(page);
+
+ error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+ if (error)
+ return error;
+
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+ GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+ sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_dinode)) / sizeof(u64);
+ sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_meta_header)) / sizeof(u64);
+ sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+ sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+ sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+ sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+ sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_meta_header)) /
+ sizeof(struct gfs2_quota_change);
+
+ /* Compute maximum reservation required to add an entry to a directory */
+
+ hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+ sdp->sd_jbsize);
+
+ ind_blocks = 0;
+ for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+ tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+ ind_blocks += tmp_blocks;
+ }
+
+ leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+
+ sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+
+ sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_dinode);
+ sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+ for (x = 2;; x++) {
+ u64 space, d;
+ u32 m;
+
+ space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+ d = space;
+ m = do_div(d, sdp->sd_inptrs);
+
+ if (d != sdp->sd_heightsize[x - 1] || m)
+ break;
+ sdp->sd_heightsize[x] = space;
+ }
+ sdp->sd_max_height = x;
+ gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+
+ sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_dinode);
+ sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+ for (x = 2;; x++) {
+ u64 space, d;
+ u32 m;
+
+ space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+ d = space;
+ m = do_div(d, sdp->sd_inptrs);
+
+ if (d != sdp->sd_jheightsize[x - 1] || m)
+ break;
+ sdp->sd_jheightsize[x] = space;
+ }
+ sdp->sd_max_jheight = x;
+ gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+
+ return 0;
+}
+
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * This is very similar to the gfs2_rindex_hold() function, except that
+ * in general we hold the jindex lock for longer periods of time and
+ * we grab it far less frequently than the rgrp lock.
+ *
+ * Returns: errno
+ */
+
+int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+ struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+ struct qstr name;
+ char buf[20];
+ struct gfs2_jdesc *jd;
+ int error;
+
+ name.name = buf;
+
+ mutex_lock(&sdp->sd_jindex_mutex);
+
+ for (;;) {
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
+ GL_LOCAL_EXCL, ji_gh);
+ if (error)
+ break;
+
+ name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+ name.hash = gfs2_disk_hash(name.name, name.len);
+
+ error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
+ if (error == -ENOENT) {
+ error = 0;
+ break;
+ }
+
+ gfs2_glock_dq_uninit(ji_gh);
+
+ if (error)
+ break;
+
+ error = -ENOMEM;
+ jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+ if (!jd)
+ break;
+
+ jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
+ if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+ if (!jd->jd_inode)
+ error = -ENOENT;
+ else
+ error = PTR_ERR(jd->jd_inode);
+ kfree(jd);
+ break;
+ }
+
+ spin_lock(&sdp->sd_jindex_spin);
+ jd->jd_jid = sdp->sd_journals++;
+ list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+ spin_unlock(&sdp->sd_jindex_spin);
+ }
+
+ mutex_unlock(&sdp->sd_jindex_mutex);
+
+ return error;
+}
+
+/**
+ * gfs2_jindex_free - Clear all the journal index information
+ * @sdp: The GFS2 superblock
+ *
+ */
+
+void gfs2_jindex_free(struct gfs2_sbd *sdp)
+{
+ struct list_head list;
+ struct gfs2_jdesc *jd;
+
+ spin_lock(&sdp->sd_jindex_spin);
+ list_add(&list, &sdp->sd_jindex_list);
+ list_del_init(&sdp->sd_jindex_list);
+ sdp->sd_journals = 0;
+ spin_unlock(&sdp->sd_jindex_spin);
+
+ while (!list_empty(&list)) {
+ jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+ list_del(&jd->jd_list);
+ iput(jd->jd_inode);
+ kfree(jd);
+ }
+}
+
+static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
+{
+ struct gfs2_jdesc *jd;
+ int found = 0;
+
+ list_for_each_entry(jd, head, jd_list) {
+ if (jd->jd_jid == jid) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ jd = NULL;
+
+ return jd;
+}
+
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
+{
+ struct gfs2_jdesc *jd;
+
+ spin_lock(&sdp->sd_jindex_spin);
+ jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
+ spin_unlock(&sdp->sd_jindex_spin);
+
+ return jd;
+}
+
+void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+ struct gfs2_jdesc *jd;
+
+ spin_lock(&sdp->sd_jindex_spin);
+ jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
+ if (jd)
+ jd->jd_dirty = 1;
+ spin_unlock(&sdp->sd_jindex_spin);
+}
+
+struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+{
+ struct gfs2_jdesc *jd;
+ int found = 0;
+
+ spin_lock(&sdp->sd_jindex_spin);
+
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ if (jd->jd_dirty) {
+ jd->jd_dirty = 0;
+ found = 1;
+ break;
+ }
+ }
+ spin_unlock(&sdp->sd_jindex_spin);
+
+ if (!found)
+ jd = NULL;
+
+ return jd;
+}
+
+int gfs2_jdesc_check(struct gfs2_jdesc *jd)
+{
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+ int ar;
+ int error;
+
+ if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
+ (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+ jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+
+ error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
+ if (!error && ar) {
+ gfs2_consist_inode(ip);
+ error = -EIO;
+ }
+
+ return error;
+}
+
+/**
+ * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
+{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+ struct gfs2_glock *j_gl = ip->i_gl;
+ struct gfs2_holder t_gh;
+ struct gfs2_log_header head;
+ int error;
+
+ error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+ GL_LOCAL_EXCL, &t_gh);
+ if (error)
+ return error;
+
+ gfs2_meta_cache_flush(ip);
+ j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+
+ error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+ if (error)
+ goto fail;
+
+ if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+ gfs2_consist(sdp);
+ error = -EIO;
+ goto fail;
+ }
+
+ /* Initialize the head of the log */
+ sdp->sd_log_sequence = head.lh_sequence + 1;
+ gfs2_log_pointers_init(sdp, head.lh_blkno);
+
+ error = gfs2_quota_init(sdp);
+ if (error)
+ goto fail;
+
+ set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+ gfs2_glock_dq_uninit(&t_gh);
+
+ return 0;
+
+fail:
+ t_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&t_gh);
+
+ return error;
+}
+
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+ struct gfs2_holder t_gh;
+ int error;
+
+ gfs2_quota_sync(sdp);
+ gfs2_statfs_sync(sdp);
+
+ error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+ GL_LOCAL_EXCL | GL_NOCACHE,
+ &t_gh);
+ if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ return error;
+
+ gfs2_meta_syncfs(sdp);
+ gfs2_log_shutdown(sdp);
+
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+ if (t_gh.gh_gl)
+ gfs2_glock_dq_uninit(&t_gh);
+
+ gfs2_quota_cleanup(sdp);
+
+ return error;
+}
+
+int gfs2_statfs_init(struct gfs2_sbd *sdp)
+{
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+ struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+ struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+ struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+ struct buffer_head *m_bh, *l_bh;
+ struct gfs2_holder gh;
+ int error;
+
+ error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+ &gh);
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+ if (error)
+ goto out;
+
+ if (sdp->sd_args.ar_spectator) {
+ spin_lock(&sdp->sd_statfs_spin);
+ gfs2_statfs_change_in(m_sc, m_bh->b_data +
+ sizeof(struct gfs2_dinode));
+ spin_unlock(&sdp->sd_statfs_spin);
+ } else {
+ error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+ if (error)
+ goto out_m_bh;
+
+ spin_lock(&sdp->sd_statfs_spin);
+ gfs2_statfs_change_in(m_sc, m_bh->b_data +
+ sizeof(struct gfs2_dinode));
+ gfs2_statfs_change_in(l_sc, l_bh->b_data +
+ sizeof(struct gfs2_dinode));
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ brelse(l_bh);
+ }
+
+out_m_bh:
+ brelse(m_bh);
+out:
+ gfs2_glock_dq_uninit(&gh);
+ return error;
+}
+
+void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+ s64 dinodes)
+{
+ struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+ struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+ struct buffer_head *l_bh;
+ int error;
+
+ error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+ if (error)
+ return;
+
+ mutex_lock(&sdp->sd_statfs_mutex);
+ gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+ mutex_unlock(&sdp->sd_statfs_mutex);
+
+ spin_lock(&sdp->sd_statfs_spin);
+ l_sc->sc_total += total;
+ l_sc->sc_free += free;
+ l_sc->sc_dinodes += dinodes;
+ gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ brelse(l_bh);
+}
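+
+/*
+ * Note: per-node deltas accumulate in sd_statfs_local and are later
+ * folded into the master statfs file by gfs2_statfs_sync().  Freeing a
+ * single block, for example, is recorded as
+ * gfs2_statfs_change(sdp, 0, +1, 0).
+ */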
+
+int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+{
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+ struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+ struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+ struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+ struct gfs2_holder gh;
+ struct buffer_head *m_bh, *l_bh;
+ int error;
+
+ error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+ &gh);
+ if (error)
+ return error;
+
+ error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+ if (error)
+ goto out;
+
+ spin_lock(&sdp->sd_statfs_spin);
+ gfs2_statfs_change_in(m_sc, m_bh->b_data +
+ sizeof(struct gfs2_dinode));
+ if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
+ spin_unlock(&sdp->sd_statfs_spin);
+ goto out_bh;
+ }
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+ if (error)
+ goto out_bh;
+
+ error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
+ if (error)
+ goto out_bh2;
+
+ mutex_lock(&sdp->sd_statfs_mutex);
+ gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+ mutex_unlock(&sdp->sd_statfs_mutex);
+
+ spin_lock(&sdp->sd_statfs_spin);
+ m_sc->sc_total += l_sc->sc_total;
+ m_sc->sc_free += l_sc->sc_free;
+ m_sc->sc_dinodes += l_sc->sc_dinodes;
+ memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+ memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+ 0, sizeof(struct gfs2_statfs_change));
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+ gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+
+ gfs2_trans_end(sdp);
+
+out_bh2:
+ brelse(l_bh);
+out_bh:
+ brelse(m_bh);
+out:
+ gfs2_glock_dq_uninit(&gh);
+ return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sc: the statfs_change structure to fill in
+ *
+ * Returns: errno
+ */
+
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+{
+ struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+ struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+
+ spin_lock(&sdp->sd_statfs_spin);
+
+ *sc = *m_sc;
+ sc->sc_total += l_sc->sc_total;
+ sc->sc_free += l_sc->sc_free;
+ sc->sc_dinodes += l_sc->sc_dinodes;
+
+ spin_unlock(&sdp->sd_statfs_spin);
+
+ if (sc->sc_free < 0)
+ sc->sc_free = 0;
+ if (sc->sc_free > sc->sc_total)
+ sc->sc_free = sc->sc_total;
+ if (sc->sc_dinodes < 0)
+ sc->sc_dinodes = 0;
+
+ return 0;
+}
+
+/**
+ * statfs_slow_fill - fill in the sc for a given RG
+ * @rgd: the RG
+ * @sc: the statfs_change structure to add the RG's counts to
+ *
+ * Returns: 0 on success
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+ struct gfs2_statfs_change *sc)
+{
+ gfs2_rgrp_verify(rgd);
+ sc->sc_total += rgd->rd_ri.ri_data;
+ sc->sc_free += rgd->rd_rg.rg_free;
+ sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
+ return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+{
+ struct gfs2_holder ri_gh;
+ struct gfs2_rgrpd *rgd_next;
+ struct gfs2_holder *gha, *gh;
+ unsigned int slots = 64;
+ unsigned int x;
+ int done;
+ int error = 0, err;
+
+ memset(sc, 0, sizeof(struct gfs2_statfs_change));
+ gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+ if (!gha)
+ return -ENOMEM;
+
+ error = gfs2_rindex_hold(sdp, &ri_gh);
+ if (error)
+ goto out;
+
+ rgd_next = gfs2_rgrpd_get_first(sdp);
+
+ for (;;) {
+ done = 1;
+
+ for (x = 0; x < slots; x++) {
+ gh = gha + x;
+
+ if (gh->gh_gl && gfs2_glock_poll(gh)) {
+ err = gfs2_glock_wait(gh);
+ if (err) {
+ gfs2_holder_uninit(gh);
+ error = err;
+ } else {
+ if (!error)
+ error = statfs_slow_fill(
+ gh->gh_gl->gl_object, sc);
+ gfs2_glock_dq_uninit(gh);
+ }
+ }
+
+ if (gh->gh_gl)
+ done = 0;
+ else if (rgd_next && !error) {
+ error = gfs2_glock_nq_init(rgd_next->rd_gl,
+ LM_ST_SHARED,
+ GL_ASYNC,
+ gh);
+ rgd_next = gfs2_rgrpd_get_next(rgd_next);
+ done = 0;
+ }
+
+ if (signal_pending(current))
+ error = -ERESTARTSYS;
+ }
+
+ if (done)
+ break;
+
+ yield();
+ }
+
+ gfs2_glock_dq_uninit(&ri_gh);
+
+out:
+ kfree(gha);
+ return error;
+}
+
+struct lfcc {
+ struct list_head list;
+ struct gfs2_holder gh;
+};
+
+/**
+ * gfs2_lock_fs_check_clean - Stop all writes and check that all journals are clean
+ * @sdp: the file system
+ * @t_gh: the hold on the transaction lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
+ struct gfs2_holder *t_gh)
+{
+ struct gfs2_inode *ip;
+ struct gfs2_holder ji_gh;
+ struct gfs2_jdesc *jd;
+ struct lfcc *lfcc;
+ LIST_HEAD(list);
+ struct gfs2_log_header lh;
+ int error;
+
+ error = gfs2_jindex_hold(sdp, &ji_gh);
+ if (error)
+ return error;
+
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
+ if (!lfcc) {
+ error = -ENOMEM;
+ goto out;
+ }
+ ip = GFS2_I(jd->jd_inode);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
+ if (error) {
+ kfree(lfcc);
+ goto out;
+ }
+ list_add(&lfcc->list, &list);
+ }
+
+ error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
+ LM_FLAG_PRIORITY | GL_NOCACHE,
+ t_gh);
+
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ error = gfs2_jdesc_check(jd);
+ if (error)
+ break;
+ error = gfs2_find_jhead(jd, &lh);
+ if (error)
+ break;
+ if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+ error = -EBUSY;
+ break;
+ }
+ }
+
+ if (error)
+ gfs2_glock_dq_uninit(t_gh);
+
+out:
+ while (!list_empty(&list)) {
+ lfcc = list_entry(list.next, struct lfcc, list);
+ list_del(&lfcc->list);
+ gfs2_glock_dq_uninit(&lfcc->gh);
+ kfree(lfcc);
+ }
+ gfs2_glock_dq_uninit(&ji_gh);
+ return error;
+}
+
+/**
+ * gfs2_freeze_fs - freezes the file system
+ * @sdp: the file system
+ *
+ * This function flushes data and metadata for all machines by
+ * acquiring the transaction lock exclusively.  All journals are
+ * ensured to be in a clean state as well.
+ *
+ * Returns: errno
+ */
+
+int gfs2_freeze_fs(struct gfs2_sbd *sdp)
+{
+ int error = 0;
+
+ mutex_lock(&sdp->sd_freeze_lock);
+
+ if (!sdp->sd_freeze_count++) {
+ error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
+ if (error)
+ sdp->sd_freeze_count--;
+ }
+
+ mutex_unlock(&sdp->sd_freeze_lock);
+
+ return error;
+}
+
+/**
+ * gfs2_unfreeze_fs - unfreezes the file system
+ * @sdp: the file system
+ *
+ * This function allows the file system to proceed by unlocking
+ * the exclusively held transaction lock. Other GFS2 nodes are
+ * now free to acquire the lock shared and go on with their lives.
+ *
+ */
+
+void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
+{
+ mutex_lock(&sdp->sd_freeze_lock);
+
+ if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
+ gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+
+ mutex_unlock(&sdp->sd_freeze_lock);
+}
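+
+/*
+ * Illustrative freeze/unfreeze pairing (a sketch, not part of this change):
+ *
+ *      error = gfs2_freeze_fs(sdp);
+ *      if (!error) {
+ *              ... filesystem is quiesced and all journals are clean ...
+ *              gfs2_unfreeze_fs(sdp);
+ *      }
+ */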
+
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..5bb443ae0f59
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __SUPER_DOT_H__
+#define __SUPER_DOT_H__
+
+#include "incore.h"
+
+void gfs2_tune_init(struct gfs2_tune *gt);
+
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
+int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
+struct page *gfs2_read_super(struct super_block *sb, sector_t sector);
+
+static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
+{
+ unsigned int x;
+ spin_lock(&sdp->sd_jindex_spin);
+ x = sdp->sd_journals;
+ spin_unlock(&sdp->sd_jindex_spin);
+ return x;
+}
+
+int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
+void gfs2_jindex_free(struct gfs2_sbd *sdp);
+
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
+struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
+int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+
+int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+ struct gfs2_inode **ipp);
+
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
+
+int gfs2_statfs_init(struct gfs2_sbd *sdp);
+void gfs2_statfs_change(struct gfs2_sbd *sdp,
+ s64 total, s64 free, s64 dinodes);
+int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
+
+int gfs2_freeze_fs(struct gfs2_sbd *sdp);
+void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
+
+#endif /* __SUPER_DOT_H__ */
+
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..0e0ec988f731
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "lm.h"
+#include "sys.h"
+#include "super.h"
+#include "glock.h"
+#include "quota.h"
+#include "util.h"
+
+char *gfs2_sys_margs;
+spinlock_t gfs2_sys_margs_lock;
+
+static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
+}
+
+static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
+}
+
+static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
+{
+ unsigned int count;
+
+ mutex_lock(&sdp->sd_freeze_lock);
+ count = sdp->sd_freeze_count;
+ mutex_unlock(&sdp->sd_freeze_lock);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", count);
+}
+
+static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ ssize_t ret = len;
+ int error = 0;
+ int n = simple_strtol(buf, NULL, 0);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ switch (n) {
+ case 0:
+ gfs2_unfreeze_fs(sdp);
+ break;
+ case 1:
+ error = gfs2_freeze_fs(sdp);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (error)
+ fs_warn(sdp, "freeze %d error %d", n, error);
+
+ return ret;
+}
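+
+/*
+ * From userspace the freeze attribute is driven by writing "1" (freeze)
+ * or "0" (thaw); the exact path is illustrative here, since it depends
+ * on how the per-sb kobject is named:
+ *
+ *      echo 1 > /sys/fs/gfs2/<fsname>/freeze
+ *      echo 0 > /sys/fs/gfs2/<fsname>/freeze
+ */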
+
+static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
+{
+ unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
+ return snprintf(buf, PAGE_SIZE, "%u\n", b);
+}
+
+static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (simple_strtol(buf, NULL, 0) != 1)
+ return -EINVAL;
+
+ gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
+ sdp->sd_fsname);
+ return len;
+}
+
+static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
+ size_t len)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (simple_strtol(buf, NULL, 0) != 1)
+ return -EINVAL;
+
+ gfs2_statfs_sync(sdp);
+ return len;
+}
+
+static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (simple_strtol(buf, NULL, 0) != 1)
+ return -EINVAL;
+
+ gfs2_gl_hash_clear(sdp, NO_WAIT);
+ return len;
+}
+
+static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
+ size_t len)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (simple_strtol(buf, NULL, 0) != 1)
+ return -EINVAL;
+
+ gfs2_quota_sync(sdp);
+ return len;
+}
+
+static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
+ size_t len)
+{
+ u32 id;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ id = simple_strtoul(buf, NULL, 0);
+
+ gfs2_quota_refresh(sdp, 1, id);
+ return len;
+}
+
+static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
+ size_t len)
+{
+ u32 id;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ id = simple_strtoul(buf, NULL, 0);
+
+ gfs2_quota_refresh(sdp, 0, id);
+ return len;
+}
+
+struct gfs2_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct gfs2_sbd *, char *);
+ ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+#define GFS2_ATTR(name, mode, show, store) \
+static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
+
+GFS2_ATTR(id, 0444, id_show, NULL);
+GFS2_ATTR(fsname, 0444, fsname_show, NULL);
+GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
+GFS2_ATTR(shrink, 0200, NULL, shrink_store);
+GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
+GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
+GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
+GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store);
+GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store);
+
+static struct attribute *gfs2_attrs[] = {
+ &gfs2_attr_id.attr,
+ &gfs2_attr_fsname.attr,
+ &gfs2_attr_freeze.attr,
+ &gfs2_attr_shrink.attr,
+ &gfs2_attr_withdraw.attr,
+ &gfs2_attr_statfs_sync.attr,
+ &gfs2_attr_quota_sync.attr,
+ &gfs2_attr_quota_refresh_user.attr,
+ &gfs2_attr_quota_refresh_group.attr,
+ NULL,
+};
+
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+ struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+ return a->show ? a->show(sdp, buf) : 0;
+}
+
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+ struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+ return a->store ? a->store(sdp, buf, len) : len;
+}
+
+static struct sysfs_ops gfs2_attr_ops = {
+ .show = gfs2_attr_show,
+ .store = gfs2_attr_store,
+};
+
+static struct kobj_type gfs2_ktype = {
+ .default_attrs = gfs2_attrs,
+ .sysfs_ops = &gfs2_attr_ops,
+};
+
+static struct kset gfs2_kset = {
+ .subsys = &fs_subsys,
+ .kobj = {.name = "gfs2"},
+ .ktype = &gfs2_ktype,
+};
+
+/*
+ * display struct lm_lockstruct fields
+ */
+
+struct lockstruct_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+
+#define LOCKSTRUCT_ATTR(name, fmt) \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
+{ \
+ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
+} \
+static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
+
+LOCKSTRUCT_ATTR(jid, "%u\n");
+LOCKSTRUCT_ATTR(first, "%u\n");
+LOCKSTRUCT_ATTR(lvb_size, "%u\n");
+LOCKSTRUCT_ATTR(flags, "%d\n");
+
+static struct attribute *lockstruct_attrs[] = {
+ &lockstruct_attr_jid.attr,
+ &lockstruct_attr_first.attr,
+ &lockstruct_attr_lvb_size.attr,
+ &lockstruct_attr_flags.attr,
+ NULL,
+};
+
+/*
+ * display struct gfs2_args fields
+ */
+
+struct args_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+
+#define ARGS_ATTR(name, fmt) \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
+{ \
+ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \
+} \
+static struct args_attr args_attr_##name = __ATTR_RO(name)
+
+ARGS_ATTR(lockproto, "%s\n");
+ARGS_ATTR(locktable, "%s\n");
+ARGS_ATTR(hostdata, "%s\n");
+ARGS_ATTR(spectator, "%d\n");
+ARGS_ATTR(ignore_local_fs, "%d\n");
+ARGS_ATTR(localcaching, "%d\n");
+ARGS_ATTR(localflocks, "%d\n");
+ARGS_ATTR(debug, "%d\n");
+ARGS_ATTR(upgrade, "%d\n");
+ARGS_ATTR(num_glockd, "%u\n");
+ARGS_ATTR(posix_acl, "%d\n");
+ARGS_ATTR(quota, "%u\n");
+ARGS_ATTR(suiddir, "%d\n");
+ARGS_ATTR(data, "%d\n");
+
+/* one oddball doesn't fit the macro mold */
+static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ !!test_bit(SDF_NOATIME, &sdp->sd_flags));
+}
+static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
+
+static struct attribute *args_attrs[] = {
+ &args_attr_lockproto.attr,
+ &args_attr_locktable.attr,
+ &args_attr_hostdata.attr,
+ &args_attr_spectator.attr,
+ &args_attr_ignore_local_fs.attr,
+ &args_attr_localcaching.attr,
+ &args_attr_localflocks.attr,
+ &args_attr_debug.attr,
+ &args_attr_upgrade.attr,
+ &args_attr_num_glockd.attr,
+ &args_attr_posix_acl.attr,
+ &args_attr_quota.attr,
+ &args_attr_suiddir.attr,
+ &args_attr_data.attr,
+ &args_attr_noatime.attr,
+ NULL,
+};
+
+/*
+ * display counters from superblock
+ */
+
+struct counters_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+
+#define COUNTERS_ATTR(name, fmt) \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
+{ \
+ return snprintf(buf, PAGE_SIZE, fmt, \
+ (unsigned int)atomic_read(&sdp->sd_##name)); \
+} \
+static struct counters_attr counters_attr_##name = __ATTR_RO(name)
+
+COUNTERS_ATTR(glock_count, "%u\n");
+COUNTERS_ATTR(glock_held_count, "%u\n");
+COUNTERS_ATTR(inode_count, "%u\n");
+COUNTERS_ATTR(reclaimed, "%u\n");
+
+static struct attribute *counters_attrs[] = {
+ &counters_attr_glock_count.attr,
+ &counters_attr_glock_held_count.attr,
+ &counters_attr_inode_count.attr,
+ &counters_attr_reclaimed.attr,
+ NULL,
+};
+
+/*
+ * get and set struct gfs2_tune fields
+ */
+
+static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u %u\n",
+ sdp->sd_tune.gt_quota_scale_num,
+ sdp->sd_tune.gt_quota_scale_den);
+}
+
+static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
+ size_t len)
+{
+ struct gfs2_tune *gt = &sdp->sd_tune;
+ unsigned int x, y;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
+ return -EINVAL;
+
+ spin_lock(&gt->gt_spin);
+ gt->gt_quota_scale_num = x;
+ gt->gt_quota_scale_den = y;
+ spin_unlock(&gt->gt_spin);
+ return len;
+}
+
+static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
+ int check_zero, const char *buf, size_t len)
+{
+ struct gfs2_tune *gt = &sdp->sd_tune;
+ unsigned int x;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ x = simple_strtoul(buf, NULL, 0);
+
+ if (check_zero && !x)
+ return -EINVAL;
+
+ spin_lock(&gt->gt_spin);
+ *field = x;
+ spin_unlock(&gt->gt_spin);
+ return len;
+}
+
+struct tune_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct gfs2_sbd *, char *);
+ ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+#define TUNE_ATTR_3(name, show, store) \
+static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+
+#define TUNE_ATTR_2(name, store) \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
+{ \
+ return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \
+} \
+TUNE_ATTR_3(name, name##_show, store)
+
+#define TUNE_ATTR(name, check_zero) \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{ \
+ return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
+} \
+TUNE_ATTR_2(name, name##_store)
+
+#define TUNE_ATTR_DAEMON(name, process) \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{ \
+ ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \
+ wake_up_process(sdp->sd_##process); \
+ return r; \
+} \
+TUNE_ATTR_2(name, name##_store)
+
+TUNE_ATTR(ilimit, 0);
+TUNE_ATTR(ilimit_tries, 0);
+TUNE_ATTR(ilimit_min, 0);
+TUNE_ATTR(demote_secs, 0);
+TUNE_ATTR(incore_log_blocks, 0);
+TUNE_ATTR(log_flush_secs, 0);
+TUNE_ATTR(jindex_refresh_secs, 0);
+TUNE_ATTR(quota_warn_period, 0);
+TUNE_ATTR(quota_quantum, 0);
+TUNE_ATTR(atime_quantum, 0);
+TUNE_ATTR(max_readahead, 0);
+TUNE_ATTR(complain_secs, 0);
+TUNE_ATTR(reclaim_limit, 0);
+TUNE_ATTR(prefetch_secs, 0);
+TUNE_ATTR(statfs_slow, 0);
+TUNE_ATTR(new_files_jdata, 0);
+TUNE_ATTR(new_files_directio, 0);
+TUNE_ATTR(quota_simul_sync, 1);
+TUNE_ATTR(quota_cache_secs, 1);
+TUNE_ATTR(max_atomic_write, 1);
+TUNE_ATTR(stall_secs, 1);
+TUNE_ATTR(entries_per_readdir, 1);
+TUNE_ATTR(greedy_default, 1);
+TUNE_ATTR(greedy_quantum, 1);
+TUNE_ATTR(greedy_max, 1);
+TUNE_ATTR(statfs_quantum, 1);
+TUNE_ATTR_DAEMON(scand_secs, scand_process);
+TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
+TUNE_ATTR_DAEMON(logd_secs, logd_process);
+TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
+TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
+
+static struct attribute *tune_attrs[] = {
+ &tune_attr_ilimit.attr,
+ &tune_attr_ilimit_tries.attr,
+ &tune_attr_ilimit_min.attr,
+ &tune_attr_demote_secs.attr,
+ &tune_attr_incore_log_blocks.attr,
+ &tune_attr_log_flush_secs.attr,
+ &tune_attr_jindex_refresh_secs.attr,
+ &tune_attr_quota_warn_period.attr,
+ &tune_attr_quota_quantum.attr,
+ &tune_attr_atime_quantum.attr,
+ &tune_attr_max_readahead.attr,
+ &tune_attr_complain_secs.attr,
+ &tune_attr_reclaim_limit.attr,
+ &tune_attr_prefetch_secs.attr,
+ &tune_attr_statfs_slow.attr,
+ &tune_attr_quota_simul_sync.attr,
+ &tune_attr_quota_cache_secs.attr,
+ &tune_attr_max_atomic_write.attr,
+ &tune_attr_stall_secs.attr,
+ &tune_attr_entries_per_readdir.attr,
+ &tune_attr_greedy_default.attr,
+ &tune_attr_greedy_quantum.attr,
+ &tune_attr_greedy_max.attr,
+ &tune_attr_statfs_quantum.attr,
+ &tune_attr_scand_secs.attr,
+ &tune_attr_recoverd_secs.attr,
+ &tune_attr_logd_secs.attr,
+ &tune_attr_quotad_secs.attr,
+ &tune_attr_quota_scale.attr,
+ &tune_attr_new_files_jdata.attr,
+ &tune_attr_new_files_directio.attr,
+ NULL,
+};
+
+static struct attribute_group lockstruct_group = {
+ .name = "lockstruct",
+ .attrs = lockstruct_attrs,
+};
+
+static struct attribute_group counters_group = {
+ .name = "counters",
+ .attrs = counters_attrs,
+};
+
+static struct attribute_group args_group = {
+ .name = "args",
+ .attrs = args_attrs,
+};
+
+static struct attribute_group tune_group = {
+ .name = "tune",
+ .attrs = tune_attrs,
+};
+
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
+{
+ int error;
+
+ sdp->sd_kobj.kset = &gfs2_kset;
+ sdp->sd_kobj.ktype = &gfs2_ktype;
+
+ error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
+ if (error)
+ goto fail;
+
+ error = kobject_register(&sdp->sd_kobj);
+ if (error)
+ goto fail;
+
+ error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
+ if (error)
+ goto fail_reg;
+
+ error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
+ if (error)
+ goto fail_lockstruct;
+
+ error = sysfs_create_group(&sdp->sd_kobj, &args_group);
+ if (error)
+ goto fail_counters;
+
+ error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
+ if (error)
+ goto fail_args;
+
+ return 0;
+
+fail_args:
+ sysfs_remove_group(&sdp->sd_kobj, &args_group);
+fail_counters:
+ sysfs_remove_group(&sdp->sd_kobj, &counters_group);
+fail_lockstruct:
+ sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
+fail_reg:
+ kobject_unregister(&sdp->sd_kobj);
+fail:
+ fs_err(sdp, "error %d adding sysfs files\n", error);
+ return error;
+}
+
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
+{
+ sysfs_remove_group(&sdp->sd_kobj, &tune_group);
+ sysfs_remove_group(&sdp->sd_kobj, &args_group);
+ sysfs_remove_group(&sdp->sd_kobj, &counters_group);
+ sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
+ kobject_unregister(&sdp->sd_kobj);
+}
+
+int gfs2_sys_init(void)
+{
+ gfs2_sys_margs = NULL;
+ spin_lock_init(&gfs2_sys_margs_lock);
+ return kset_register(&gfs2_kset);
+}
+
+void gfs2_sys_uninit(void)
+{
+ kfree(gfs2_sys_margs);
+ kset_unregister(&gfs2_kset);
+}
+
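
The TUNE_ATTR* macros above stamp out one sysfs show/store pair per tunable. As a worked expansion, shown only for illustration (every name comes from the macros and fields already defined in this file), TUNE_ATTR(demote_secs, 0) produces roughly:

static ssize_t demote_secs_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
	return tune_set(sdp, &sdp->sd_tune.gt_demote_secs, 0, buf, len);
}
static ssize_t demote_secs_show(struct gfs2_sbd *sdp, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_demote_secs);
}
static struct tune_attr tune_attr_demote_secs =
	__ATTR(demote_secs, 0644, demote_secs_show, demote_secs_store);
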
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..1ca8cdac5304
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __SYS_DOT_H__
+#define __SYS_DOT_H__
+
+#include <linux/spinlock.h>
+struct gfs2_sbd;
+
+/* Allow args to be passed to GFS2 when using an initial ram disk */
+extern char *gfs2_sys_margs;
+extern spinlock_t gfs2_sys_margs_lock;
+
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
+
+int gfs2_sys_init(void);
+void gfs2_sys_uninit(void);
+
+#endif /* __SYS_DOT_H__ */
+
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..f8dabf8446bb
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/kallsyms.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+ unsigned int revokes)
+{
+ struct gfs2_trans *tr;
+ int error;
+
+ BUG_ON(current->journal_info);
+ BUG_ON(blocks == 0 && revokes == 0);
+
+ tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
+ if (!tr)
+ return -ENOMEM;
+
+ tr->tr_ip = (unsigned long)__builtin_return_address(0);
+ tr->tr_blocks = blocks;
+ tr->tr_revokes = revokes;
+ tr->tr_reserved = 1;
+ if (blocks)
+ tr->tr_reserved += 6 + blocks;
+ if (revokes)
+ tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
+ sizeof(u64));
+ INIT_LIST_HEAD(&tr->tr_list_buf);
+
+ gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
+
+ error = gfs2_glock_nq(&tr->tr_t_gh);
+ if (error)
+ goto fail_holder_uninit;
+
+ if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ tr->tr_t_gh.gh_flags |= GL_NOCACHE;
+ error = -EROFS;
+ goto fail_gunlock;
+ }
+
+ error = gfs2_log_reserve(sdp, tr->tr_reserved);
+ if (error)
+ goto fail_gunlock;
+
+ current->journal_info = tr;
+
+ return 0;
+
+fail_gunlock:
+ gfs2_glock_dq(&tr->tr_t_gh);
+
+fail_holder_uninit:
+ gfs2_holder_uninit(&tr->tr_t_gh);
+ kfree(tr);
+
+ return error;
+}
+
+void gfs2_trans_end(struct gfs2_sbd *sdp)
+{
+ struct gfs2_trans *tr = current->journal_info;
+
+ BUG_ON(!tr);
+ current->journal_info = NULL;
+
+ if (!tr->tr_touched) {
+ gfs2_log_release(sdp, tr->tr_reserved);
+ gfs2_glock_dq(&tr->tr_t_gh);
+ gfs2_holder_uninit(&tr->tr_t_gh);
+ kfree(tr);
+ return;
+ }
+
+ if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
+ fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
+ tr->tr_num_buf, tr->tr_blocks);
+ print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+ }
+ if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
+ fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
+ tr->tr_num_revoke, tr->tr_revokes);
+ print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+ }
+
+ gfs2_log_commit(sdp, tr);
+ gfs2_glock_dq(&tr->tr_t_gh);
+ gfs2_holder_uninit(&tr->tr_t_gh);
+ kfree(tr);
+
+ if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
+ gfs2_log_flush(sdp, NULL);
+}
+
+void gfs2_trans_add_gl(struct gfs2_glock *gl)
+{
+ lops_add(gl->gl_sbd, &gl->gl_le);
+}
+
+/**
+ * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
+ * @gl: the glock the buffer belongs to
+ * @bh: The buffer to add
+ * @meta: True in the case of adding metadata
+ *
+ */
+
+void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_bufdata *bd;
+
+ bd = bh->b_private;
+ if (bd)
+ gfs2_assert(sdp, bd->bd_gl == gl);
+ else {
+ gfs2_attach_bufdata(gl, bh, meta);
+ bd = bh->b_private;
+ }
+ lops_add(sdp, &bd->bd_le);
+}
+
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
+{
+ struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
+ GFP_NOFS | __GFP_NOFAIL);
+ lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
+ rv->rv_blkno = blkno;
+ lops_add(sdp, &rv->rv_le);
+}
+
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
+{
+ struct gfs2_revoke *rv;
+ int found = 0;
+
+ gfs2_log_lock(sdp);
+
+ list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
+ if (rv->rv_blkno == blkno) {
+ list_del(&rv->rv_le.le_list);
+ gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
+ sdp->sd_log_num_revoke--;
+ found = 1;
+ break;
+ }
+ }
+
+ gfs2_log_unlock(sdp);
+
+ if (found) {
+ struct gfs2_trans *tr = current->journal_info;
+ kfree(rv);
+ tr->tr_num_revoke_rm++;
+ }
+}
+
+void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
+{
+ lops_add(rgd->rd_sbd, &rgd->rd_le);
+}
+
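
A hypothetical caller sketch (not part of this patch) showing the intended lifecycle of the interface above: reserve log space, add the buffers that will be dirtied, then close the transaction. The inode ip, its glock ip->i_gl and the dinode buffer dibh are assumed to come from the usual GFS2 lookup paths; locking and most error handling are trimmed.

static int example_touch_dinode(struct gfs2_inode *ip, struct buffer_head *dibh)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	int error;

	/* reserves 1 + 6 + RES_DINODE journal blocks per the math above */
	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	gfs2_trans_add_bh(ip->i_gl, dibh, 1);	/* journal the dinode block as metadata */
	/* ... modify dibh->b_data under the transaction ... */

	gfs2_trans_end(sdp);
	return 0;
}
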
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..23d4cbe1de5b
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __TRANS_DOT_H__
+#define __TRANS_DOT_H__
+
+#include <linux/buffer_head.h>
+struct gfs2_sbd;
+struct gfs2_rgrpd;
+struct gfs2_glock;
+
+#define RES_DINODE 1
+#define RES_INDIRECT 1
+#define RES_JDATA 1
+#define RES_DATA 1
+#define RES_LEAF 1
+#define RES_RG_BIT 2
+#define RES_EATTR 1
+#define RES_STATFS 1
+#define RES_QUOTA 2
+
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+ unsigned int revokes);
+
+void gfs2_trans_end(struct gfs2_sbd *sdp);
+
+void gfs2_trans_add_gl(struct gfs2_glock *gl);
+void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
+
+#endif /* __TRANS_DOT_H__ */
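
As a worked example of combining these reservation constants (the particular mix is illustrative, not taken from a specific call site): an operation that dirties one dinode, one resource-group bitmap block and the statfs change data would ask gfs2_trans_begin() for RES_DINODE + RES_RG_BIT + RES_STATFS = 1 + 2 + 1 = 4 blocks.
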
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..196c604faadc
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/crc32.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "lm.h"
+#include "util.h"
+
+kmem_cache_t *gfs2_glock_cachep __read_mostly;
+kmem_cache_t *gfs2_inode_cachep __read_mostly;
+kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
+
+void gfs2_assert_i(struct gfs2_sbd *sdp)
+{
+ printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
+ sdp->sd_fsname);
+}
+
+/**
+ * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
+ * Returns: -1 if this call withdrew the machine,
+ * -2 if it was already withdrawn
+ */
+
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line)
+{
+ int me;
+ me = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
+ dump_stack();
+ return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_assert_warn_i - Print a message to the console if @assertion is false
+ * Returns: -1 if we printed something
+ * -2 if we didn't
+ */
+
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line)
+{
+ if (time_before(jiffies,
+ sdp->sd_last_warning +
+ gfs2_tune_get(sdp, gt_complain_secs) * HZ))
+ return -2;
+
+ printk(KERN_WARNING
+ "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname, assertion,
+ sdp->sd_fsname, function, file, line);
+
+ if (sdp->sd_args.ar_debug)
+ BUG();
+ else
+ dump_stack();
+
+ sdp->sd_last_warning = jiffies;
+
+ return -1;
+}
+
+/**
+ * gfs2_consist_i - Flag a filesystem consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ * 0 if it was already withdrawn
+ */
+
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
+ char *file, unsigned int line)
+{
+ int rv;
+ rv = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: filesystem consistency error\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname,
+ sdp->sd_fsname, function, file, line);
+ return rv;
+}
+
+/**
+ * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ * 0 if it was already withdrawn
+ */
+
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+ const char *function, char *file, unsigned int line)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ int rv;
+ rv = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: filesystem consistency error\n"
+ "GFS2: fsid=%s: inode = %llu %llu\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname,
+ sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
+ (unsigned long long)ip->i_num.no_addr,
+ sdp->sd_fsname, function, file, line);
+ return rv;
+}
+
+/**
+ * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ * 0 if it was already withdrawn
+ */
+
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+ const char *function, char *file, unsigned int line)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+ int rv;
+ rv = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: filesystem consistency error\n"
+ "GFS2: fsid=%s: RG = %llu\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname,
+ sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
+ sdp->sd_fsname, function, file, line);
+ return rv;
+}
+
+/**
+ * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ * -2 if it was already withdrawn
+ */
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *type, const char *function, char *file,
+ unsigned int line)
+{
+ int me;
+ me = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: invalid metadata block\n"
+ "GFS2: fsid=%s: bh = %llu (%s)\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname,
+ sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
+ sdp->sd_fsname, function, file, line);
+ return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ * -2 if it was already withdrawn
+ */
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ u16 type, u16 t, const char *function,
+ char *file, unsigned int line)
+{
+ int me;
+ me = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: invalid metadata block\n"
+ "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname,
+ sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
+ sdp->sd_fsname, function, file, line);
+ return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_io_error_i - Flag an I/O error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ * 0 if it was already withdrawn
+ */
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
+ unsigned int line)
+{
+ int rv;
+ rv = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: I/O error\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname,
+ sdp->sd_fsname, function, file, line);
+ return rv;
+}
+
+/**
+ * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ * 0 if it was already withdrawn
+ */
+
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *function, char *file, unsigned int line)
+{
+ int rv;
+ rv = gfs2_lm_withdraw(sdp,
+ "GFS2: fsid=%s: fatal: I/O error\n"
+ "GFS2: fsid=%s: block = %llu\n"
+ "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
+ sdp->sd_fsname,
+ sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
+ sdp->sd_fsname, function, file, line);
+ return rv;
+}
+
+void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+ unsigned int bit, int new_value)
+{
+ unsigned int c, o, b = bit;
+ int old_value;
+
+ c = b / (8 * PAGE_SIZE);
+ b %= 8 * PAGE_SIZE;
+ o = b / 8;
+ b %= 8;
+
+ old_value = (bitmap[c][o] & (1 << b));
+ gfs2_assert_withdraw(sdp, !old_value != !new_value);
+
+ if (new_value)
+ bitmap[c][o] |= 1 << b;
+ else
+ bitmap[c][o] &= ~(1 << b);
+}
+
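
As a worked example of the two-level indexing in gfs2_icbit_munge() above, assuming 4096-byte pages: bit 40000 lands in page c = 40000 / 32768 = 1, byte o = (40000 mod 32768) / 8 = 904, and bit b = 40000 mod 8 = 0 within that byte.
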
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..76a50899fe9e
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+#include "incore.h"
+
+#define fs_printk(level, fs, fmt, arg...) \
+ printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
+
+#define fs_info(fs, fmt, arg...) \
+ fs_printk(KERN_INFO , fs , fmt , ## arg)
+
+#define fs_warn(fs, fmt, arg...) \
+ fs_printk(KERN_WARNING , fs , fmt , ## arg)
+
+#define fs_err(fs, fmt, arg...) \
+ fs_printk(KERN_ERR, fs , fmt , ## arg)
+
+
+void gfs2_assert_i(struct gfs2_sbd *sdp);
+
+#define gfs2_assert(sdp, assertion) \
+do { \
+ if (unlikely(!(assertion))) { \
+ gfs2_assert_i(sdp); \
+ BUG(); \
+ } \
+} while (0)
+
+
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line);
+
+#define gfs2_assert_withdraw(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
+ __FUNCTION__, __FILE__, __LINE__))
+
+
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+ const char *function, char *file, unsigned int line);
+
+#define gfs2_assert_warn(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
+ __FUNCTION__, __FILE__, __LINE__))
+
+
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
+ const char *function, char *file, unsigned int line);
+
+#define gfs2_consist(sdp) \
+gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+ const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_inode(ip) \
+gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+ const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_rgrpd(rgd) \
+gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *type, const char *function,
+ char *file, unsigned int line);
+
+static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
+ struct buffer_head *bh,
+ const char *function,
+ char *file, unsigned int line)
+{
+ struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+ u32 magic = mh->mh_magic;
+ magic = be32_to_cpu(magic);
+ if (unlikely(magic != GFS2_MAGIC))
+ return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+ file, line);
+ return 0;
+}
+
+#define gfs2_meta_check(sdp, bh) \
+gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ u16 type, u16 t,
+ const char *function,
+ char *file, unsigned int line);
+
+static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
+ struct buffer_head *bh,
+ u16 type,
+ const char *function,
+ char *file, unsigned int line)
+{
+ struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+ u32 magic = mh->mh_magic;
+ u16 t = be32_to_cpu(mh->mh_type);
+ magic = be32_to_cpu(magic);
+ if (unlikely(magic != GFS2_MAGIC))
+ return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+ file, line);
+ if (unlikely(t != type))
+ return gfs2_metatype_check_ii(sdp, bh, type, t, function,
+ file, line);
+ return 0;
+}
+
+#define gfs2_metatype_check(sdp, bh, type) \
+gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
+
+static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
+ u16 format)
+{
+ struct gfs2_meta_header *mh;
+ mh = (struct gfs2_meta_header *)bh->b_data;
+ mh->mh_type = cpu_to_be32(type);
+ mh->mh_format = cpu_to_be32(format);
+}
+
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
+ char *file, unsigned int line);
+
+#define gfs2_io_error(sdp) \
+gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__);
+
+
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+ const char *function, char *file, unsigned int line);
+
+#define gfs2_io_error_bh(sdp, bh) \
+gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
+
+
+extern kmem_cache_t *gfs2_glock_cachep;
+extern kmem_cache_t *gfs2_inode_cachep;
+extern kmem_cache_t *gfs2_bufdata_cachep;
+
+static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
+ unsigned int *p)
+{
+ unsigned int x;
+ spin_lock(&gt->gt_spin);
+ x = *p;
+ spin_unlock(&gt->gt_spin);
+ return x;
+}
+
+#define gfs2_tune_get(sdp, field) \
+gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
+
+void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+ unsigned int bit, int new_value);
+
+#endif /* __UTIL_DOT_H__ */
+
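
Illustrative only: a hypothetical caller of the helpers above. gfs2_assert_withdraw() evaluates to 0 when the assertion holds and to a negative value after withdrawing, so callers can branch on the result; gfs2_tune_get() samples a tunable under gt_spin. The gt_ilimit field is assumed here to be the counterpart of the ilimit tunable exported in sys.c.

static void example_check(struct gfs2_sbd *sdp, unsigned int inodes)
{
	unsigned int limit = gfs2_tune_get(sdp, gt_ilimit);

	if (gfs2_assert_withdraw(sdp, inodes <= limit))
		fs_err(sdp, "inode count %u exceeds ilimit %u\n", inodes, limit);
}
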
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index bcf6ee36e065..7faef8544f32 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -60,14 +60,14 @@ void hpfs_read_inode(struct inode *i)
if (hpfs_sb(i->i_sb)->sb_eas) {
if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) {
if (ea_size == 2) {
- i->i_uid = le16_to_cpu(*(u16*)ea);
+ i->i_uid = le16_to_cpu(*(__le16*)ea);
hpfs_inode->i_ea_uid = 1;
}
kfree(ea);
}
if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) {
if (ea_size == 2) {
- i->i_gid = le16_to_cpu(*(u16*)ea);
+ i->i_gid = le16_to_cpu(*(__le16*)ea);
hpfs_inode->i_ea_gid = 1;
}
kfree(ea);
@@ -87,7 +87,7 @@ void hpfs_read_inode(struct inode *i)
int rdev = 0;
umode_t mode = hpfs_sb(sb)->sb_mode;
if (ea_size == 2) {
- mode = le16_to_cpu(*(u16*)ea);
+ mode = le16_to_cpu(*(__le16*)ea);
hpfs_inode->i_ea_mode = 1;
}
kfree(ea);
@@ -95,7 +95,7 @@ void hpfs_read_inode(struct inode *i)
if (S_ISBLK(mode) || S_ISCHR(mode)) {
if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) {
if (ea_size == 4)
- rdev = le32_to_cpu(*(u32*)ea);
+ rdev = le32_to_cpu(*(__le32*)ea);
kfree(ea);
}
}
@@ -148,7 +148,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
we'd better not overwrite them
hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino);
} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
- u32 ea;
+ __le32 ea;
if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
ea = cpu_to_le32(i->i_uid);
hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2);
@@ -165,6 +165,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
&& i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333))
| (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) {
ea = cpu_to_le32(i->i_mode);
+ /* sick, but legal */
hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2);
hpfs_inode->i_ea_mode = 1;
}
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index dcb6d2e988b8..642675fc394a 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -572,7 +572,7 @@ struct hppfs_dirent {
};
static int hppfs_filldir(void *d, const char *name, int size,
- loff_t offset, ino_t inode, unsigned int type)
+ loff_t offset, u64 inode, unsigned int type)
{
struct hppfs_dirent *dirent = d;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5e03b2f67b93..4ee3f006b861 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -293,7 +293,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
if (h_vm_pgoff >= h_pgoff)
v_offset = 0;
- unmap_hugepage_range(vma,
+ __unmap_hugepage_range(vma,
vma->vm_start + v_offset, vma->vm_end);
}
}
diff --git a/fs/inode.c b/fs/inode.c
index bf6bec4e54ff..d9a21d122926 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -162,7 +162,7 @@ static struct inode *alloc_inode(struct super_block *sb)
bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
mapping->backing_dev_info = bdi;
}
- inode->i_private = 0;
+ inode->i_private = NULL;
inode->i_mapping = mapping;
}
return inode;
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 6dc6721d9e82..89e8da112a75 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -150,11 +150,6 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
- if (!ioprio_valid(aprio))
- return bprio;
- if (!ioprio_valid(bprio))
- return aprio;
-
if (aclass == IOPRIO_CLASS_NONE)
aclass = IOPRIO_CLASS_BE;
if (bclass == IOPRIO_CLASS_NONE)
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 81a90e170ac3..fb8fe7a9ddc6 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -14,9 +14,9 @@
* Convert Unicode 16 to UTF-8 or ASCII.
*/
static int
-uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
+uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
{
- wchar_t *ip, ch;
+ __be16 *ip, ch;
unsigned char *op;
ip = uni;
@@ -24,8 +24,8 @@ uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
while ((ch = get_unaligned(ip)) && len) {
int llen;
- ch = be16_to_cpu(ch);
- if ((llen = nls->uni2char(ch, op, NLS_MAX_CHARSET_SIZE)) > 0)
+ llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE);
+ if (llen > 0)
op += llen;
else
*op++ = '?';
@@ -82,7 +82,7 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
len = wcsntombs_be(outname, de->name,
de->name_len[0] >> 1, PAGE_SIZE);
} else {
- len = uni16_to_x8(outname, (u16 *) de->name,
+ len = uni16_to_x8(outname, (__be16 *) de->name,
de->name_len[0] >> 1, nls);
}
if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) {
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index e7ba0c30e071..c04b3a14a3e9 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
* (C) 1991 Linus Torvalds - minix filesystem
*/
-#include <linux/config.h> /* Joliet? */
#include <linux/smp_lock.h>
#include "isofs.h"
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index c518dd8fe60a..b85c686b60db 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -725,6 +725,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
__FUNCTION__);
kfree(journal);
journal = NULL;
+ goto out;
}
journal->j_dev = bdev;
journal->j_fs_dev = fs_dev;
@@ -735,7 +736,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
J_ASSERT(bh != NULL);
journal->j_sb_buffer = bh;
journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
+out:
return journal;
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e1b3c8af4d17..d5c63047a8b3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1314,13 +1314,14 @@ int journal_stop(handle_t *handle)
int old_handle_count, err;
pid_t pid;
- J_ASSERT(transaction->t_updates > 0);
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
- else
+ else {
+ J_ASSERT(transaction->t_updates > 0);
err = 0;
+ }
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
new file mode 100644
index 000000000000..802a3413872a
--- /dev/null
+++ b/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux journaling routines.
+#
+
+obj-$(CONFIG_JBD2) += jbd2.o
+
+jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000000..68039fa9a566
--- /dev/null
+++ b/fs/jbd2/checkpoint.c
@@ -0,0 +1,697 @@
+/*
+ * linux/fs/jbd2/checkpoint.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Checkpoint routines for the generic filesystem journaling code.
+ * Part of the ext2fs journaling system.
+ *
+ * Checkpointing is the process of ensuring that a section of the log is
+ * committed fully to disk, so that that portion of the log can be
+ * reused.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+/*
+ * Unlink a buffer from a transaction checkpoint list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink_first(struct journal_head *jh)
+{
+ transaction_t *transaction = jh->b_cp_transaction;
+
+ jh->b_cpnext->b_cpprev = jh->b_cpprev;
+ jh->b_cpprev->b_cpnext = jh->b_cpnext;
+ if (transaction->t_checkpoint_list == jh) {
+ transaction->t_checkpoint_list = jh->b_cpnext;
+ if (transaction->t_checkpoint_list == jh)
+ transaction->t_checkpoint_list = NULL;
+ }
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+ transaction_t *transaction = jh->b_cp_transaction;
+
+ __buffer_unlink_first(jh);
+ if (transaction->t_checkpoint_io_list == jh) {
+ transaction->t_checkpoint_io_list = jh->b_cpnext;
+ if (transaction->t_checkpoint_io_list == jh)
+ transaction->t_checkpoint_io_list = NULL;
+ }
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+ transaction_t *transaction = jh->b_cp_transaction;
+
+ __buffer_unlink_first(jh);
+
+ if (!transaction->t_checkpoint_io_list) {
+ jh->b_cpnext = jh->b_cpprev = jh;
+ } else {
+ jh->b_cpnext = transaction->t_checkpoint_io_list;
+ jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+ jh->b_cpprev->b_cpnext = jh;
+ jh->b_cpnext->b_cpprev = jh;
+ }
+ transaction->t_checkpoint_io_list = jh;
+}
+
+/*
+ * Try to release a checkpointed buffer from its transaction.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
+ * Requires j_list_lock
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
+ */
+static int __try_to_free_cp_buf(struct journal_head *jh)
+{
+ int ret = 0;
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ ret = __jbd2_journal_remove_checkpoint(jh) + 1;
+ jbd_unlock_bh_state(bh);
+ jbd2_journal_remove_journal_head(bh);
+ BUFFER_TRACE(bh, "release");
+ __brelse(bh);
+ } else {
+ jbd_unlock_bh_state(bh);
+ }
+ return ret;
+}
+
+/*
+ * __jbd2_log_wait_for_space: wait until there is space in the journal.
+ *
+ * Called under j-state_lock *only*. It will be unlocked if we have to wait
+ * for a checkpoint to free up some space in the log.
+ */
+void __jbd2_log_wait_for_space(journal_t *journal)
+{
+ int nblocks;
+ assert_spin_locked(&journal->j_state_lock);
+
+ nblocks = jbd_space_needed(journal);
+ while (__jbd2_log_space_left(journal) < nblocks) {
+ if (journal->j_flags & JBD2_ABORT)
+ return;
+ spin_unlock(&journal->j_state_lock);
+ mutex_lock(&journal->j_checkpoint_mutex);
+
+ /*
+ * Test again, another process may have checkpointed while we
+ * were waiting for the checkpoint lock
+ */
+ spin_lock(&journal->j_state_lock);
+ nblocks = jbd_space_needed(journal);
+ if (__jbd2_log_space_left(journal) < nblocks) {
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_do_checkpoint(journal);
+ spin_lock(&journal->j_state_lock);
+ }
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ }
+}
+
+/*
+ * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
+ * The caller must restart a list walk. Wait for someone else to run
+ * jbd_unlock_bh_state().
+ */
+static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
+ __releases(journal->j_list_lock)
+{
+ get_bh(bh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_lock_bh_state(bh);
+ jbd_unlock_bh_state(bh);
+ put_bh(bh);
+}
+
+/*
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
+ *
+ * Called with j_list_lock held.
+ */
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+{
+ struct journal_head *jh;
+ struct buffer_head *bh;
+ tid_t this_tid;
+ int released = 0;
+
+ this_tid = transaction->t_tid;
+restart:
+ /* Did somebody clean up the transaction in the meanwhile? */
+ if (journal->j_checkpoint_transactions != transaction ||
+ transaction->t_tid != this_tid)
+ return;
+ while (!released && transaction->t_checkpoint_io_list) {
+ jh = transaction->t_checkpoint_io_list;
+ bh = jh2bh(jh);
+ if (!jbd_trylock_bh_state(bh)) {
+ jbd_sync_bh(journal, bh);
+ spin_lock(&journal->j_list_lock);
+ goto restart;
+ }
+ if (buffer_locked(bh)) {
+ atomic_inc(&bh->b_count);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ spin_lock(&journal->j_list_lock);
+ goto restart;
+ }
+ /*
+ * Now in whatever state the buffer currently is, we know that
+ * it has been written out and so we can drop it from the list
+ */
+ released = __jbd2_journal_remove_checkpoint(jh);
+ jbd_unlock_bh_state(bh);
+ jbd2_journal_remove_journal_head(bh);
+ __brelse(bh);
+ }
+}
+
+#define NR_BATCH 64
+
+static void
+__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+{
+ int i;
+
+ ll_rw_block(SWRITE, *batch_count, bhs);
+ for (i = 0; i < *batch_count; i++) {
+ struct buffer_head *bh = bhs[i];
+ clear_buffer_jwrite(bh);
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ }
+ *batch_count = 0;
+}
+
+/*
+ * Try to flush one buffer from the checkpoint list to disk.
+ *
+ * Return 1 if something happened which requires us to abort the current
+ * scan of the checkpoint list.
+ *
+ * Called with j_list_lock held and drops it if 1 is returned
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
+ */
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+ struct buffer_head **bhs, int *batch_count)
+{
+ struct buffer_head *bh = jh2bh(jh);
+ int ret = 0;
+
+ if (buffer_locked(bh)) {
+ atomic_inc(&bh->b_count);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ ret = 1;
+ } else if (jh->b_transaction != NULL) {
+ transaction_t *t = jh->b_transaction;
+ tid_t tid = t->t_tid;
+
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ jbd2_log_start_commit(journal, tid);
+ jbd2_log_wait_commit(journal, tid);
+ ret = 1;
+ } else if (!buffer_dirty(bh)) {
+ J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+ BUFFER_TRACE(bh, "remove from checkpoint");
+ __jbd2_journal_remove_checkpoint(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ jbd2_journal_remove_journal_head(bh);
+ __brelse(bh);
+ ret = 1;
+ } else {
+ /*
+ * Important: we are about to write the buffer, and
+ * possibly block, while still holding the journal lock.
+ * We cannot afford to let the transaction logic start
+ * messing around with this buffer before we write it to
+ * disk, as that would break recoverability.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ set_buffer_jwrite(bh);
+ bhs[*batch_count] = bh;
+ __buffer_relink_io(jh);
+ jbd_unlock_bh_state(bh);
+ (*batch_count)++;
+ if (*batch_count == NR_BATCH) {
+ spin_unlock(&journal->j_list_lock);
+ __flush_batch(journal, bhs, batch_count);
+ ret = 1;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
+ *
+ * The journal should be locked before calling this function.
+ */
+int jbd2_log_do_checkpoint(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t this_tid;
+ int result;
+
+ jbd_debug(1, "Start checkpoint\n");
+
+ /*
+ * First thing: if there are any transactions in the log which
+ * don't need checkpointing, just eliminate them from the
+ * journal straight away.
+ */
+ result = jbd2_cleanup_journal_tail(journal);
+ jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+ if (result <= 0)
+ return result;
+
+ /*
+ * OK, we need to start writing disk blocks. Take one transaction
+ * and write it.
+ */
+ spin_lock(&journal->j_list_lock);
+ if (!journal->j_checkpoint_transactions)
+ goto out;
+ transaction = journal->j_checkpoint_transactions;
+ this_tid = transaction->t_tid;
+restart:
+ /*
+ * If someone cleaned up this transaction while we slept, we're
+ * done (maybe it's a new transaction, but it fell at the same
+ * address).
+ */
+ if (journal->j_checkpoint_transactions == transaction &&
+ transaction->t_tid == this_tid) {
+ int batch_count = 0;
+ struct buffer_head *bhs[NR_BATCH];
+ struct journal_head *jh;
+ int retry = 0;
+
+ while (!retry && transaction->t_checkpoint_list) {
+ struct buffer_head *bh;
+
+ jh = transaction->t_checkpoint_list;
+ bh = jh2bh(jh);
+ if (!jbd_trylock_bh_state(bh)) {
+ jbd_sync_bh(journal, bh);
+ retry = 1;
+ break;
+ }
+ retry = __process_buffer(journal, jh, bhs, &batch_count);
+ if (!retry && lock_need_resched(&journal->j_list_lock)) {
+ spin_unlock(&journal->j_list_lock);
+ retry = 1;
+ break;
+ }
+ }
+
+ if (batch_count) {
+ if (!retry) {
+ spin_unlock(&journal->j_list_lock);
+ retry = 1;
+ }
+ __flush_batch(journal, bhs, &batch_count);
+ }
+
+ if (retry) {
+ spin_lock(&journal->j_list_lock);
+ goto restart;
+ }
+ /*
+ * Now we have cleaned up the first transaction's checkpoint
+ * list. Let's clean up the second one
+ */
+ __wait_cp_io(journal, transaction);
+ }
+out:
+ spin_unlock(&journal->j_list_lock);
+ result = jbd2_cleanup_journal_tail(journal);
+ if (result < 0)
+ return result;
+ return 0;
+}
+
+/*
+ * Check the list of checkpoint transactions for the journal to see if
+ * we have already got rid of any since the last update of the log tail
+ * in the journal superblock. If so, we can instantly roll the
+ * superblock forward to remove those transactions from the log.
+ *
+ * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
+ *
+ * Called with the journal lock held.
+ *
+ * This is the only part of the journaling code which really needs to be
+ * aware of transaction aborts. Checkpointing involves writing to the
+ * main filesystem area rather than to the journal, so it can proceed
+ * even in abort state, but we must not update the journal superblock if
+ * we have an abort error outstanding.
+ */
+
+int jbd2_cleanup_journal_tail(journal_t *journal)
+{
+ transaction_t * transaction;
+ tid_t first_tid;
+ unsigned long blocknr, freed;
+
+ /* OK, work out the oldest transaction remaining in the log, and
+ * the log block it starts at.
+ *
+ * If the log is now empty, we need to work out which is the
+ * next transaction ID we will write, and where it will
+ * start. */
+
+ spin_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
+ transaction = journal->j_checkpoint_transactions;
+ if (transaction) {
+ first_tid = transaction->t_tid;
+ blocknr = transaction->t_log_start;
+ } else if ((transaction = journal->j_committing_transaction) != NULL) {
+ first_tid = transaction->t_tid;
+ blocknr = transaction->t_log_start;
+ } else if ((transaction = journal->j_running_transaction) != NULL) {
+ first_tid = transaction->t_tid;
+ blocknr = journal->j_head;
+ } else {
+ first_tid = journal->j_transaction_sequence;
+ blocknr = journal->j_head;
+ }
+ spin_unlock(&journal->j_list_lock);
+ J_ASSERT(blocknr != 0);
+
+ /* If the oldest pinned transaction is at the tail of the log
+ already then there's not much we can do right now. */
+ if (journal->j_tail_sequence == first_tid) {
+ spin_unlock(&journal->j_state_lock);
+ return 1;
+ }
+
+ /* OK, update the superblock to recover the freed space.
+ * Physical blocks come first: have we wrapped beyond the end of
+ * the log? */
+ freed = blocknr - journal->j_tail;
+ if (blocknr < journal->j_tail)
+ freed = freed + journal->j_last - journal->j_first;
+
+ jbd_debug(1,
+ "Cleaning journal tail from %d to %d (offset %lu), "
+ "freeing %lu\n",
+ journal->j_tail_sequence, first_tid, blocknr, freed);
+
+ journal->j_free += freed;
+ journal->j_tail_sequence = first_tid;
+ journal->j_tail = blocknr;
+ spin_unlock(&journal->j_state_lock);
+ if (!(journal->j_flags & JBD2_ABORT))
+ jbd2_journal_update_superblock(journal, 1);
+ return 0;
+}
+
+
+/* Checkpoint list management */
+
+/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of buffers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+ struct journal_head *last_jh;
+ struct journal_head *next_jh = jh;
+ int ret, freed = 0;
+
+ *released = 0;
+ if (!jh)
+ return 0;
+
+ last_jh = jh->b_cpprev;
+ do {
+ jh = next_jh;
+ next_jh = jh->b_cpnext;
+ /* Use trylock because of the ranking */
+ if (jbd_trylock_bh_state(jh2bh(jh))) {
+ ret = __try_to_free_cp_buf(jh);
+ if (ret) {
+ freed++;
+ if (ret == 2) {
+ *released = 1;
+ return freed;
+ }
+ }
+ }
+ /*
+ * This function only frees up some memory
+ * if possible so we don't have an obligation
+ * to finish processing. Bail out if preemption
+ * requested:
+ */
+ if (need_resched())
+ return freed;
+ } while (jh != last_jh);
+
+ return freed;
+}
+
+/*
+ * journal_clean_checkpoint_list
+ *
+ * Find all the written-back checkpoint buffers in the journal and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of buffers reaped (for debug)
+ */
+
+int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+{
+ transaction_t *transaction, *last_transaction, *next_transaction;
+ int ret = 0;
+ int released;
+
+ transaction = journal->j_checkpoint_transactions;
+ if (!transaction)
+ goto out;
+
+ last_transaction = transaction->t_cpprev;
+ next_transaction = transaction;
+ do {
+ transaction = next_transaction;
+ next_transaction = transaction->t_cpnext;
+ ret += journal_clean_one_cp_list(transaction->
+ t_checkpoint_list, &released);
+ /*
+ * This function only frees up some memory if possible so we
+ * don't have an obligation to finish processing. Bail out if
+ * preemption requested:
+ */
+ if (need_resched())
+ goto out;
+ if (released)
+ continue;
+ /*
+ * It is essential that we are as careful as in the case of
+ * t_checkpoint_list with removing the buffer from the list as
+ * we can possibly see not yet submitted buffers on io_list
+ */
+ ret += journal_clean_one_cp_list(transaction->
+ t_checkpoint_io_list, &released);
+ if (need_resched())
+ goto out;
+ } while (transaction != last_transaction);
+out:
+ return ret;
+}
+
+/*
+ * journal_remove_checkpoint: called after a buffer has been committed
+ * to disk (either by being write-back flushed to disk, or being
+ * committed to the log).
+ *
+ * We cannot safely clean a transaction out of the log until all of the
+ * buffer updates committed in that transaction have safely been stored
+ * elsewhere on disk. To achieve this, all of the buffers in a
+ * transaction need to be maintained on the transaction's checkpoint
+ * lists until they have been rewritten, at which point this function is
+ * called to remove the buffer from the existing transaction's
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
+ *
+ * This function is called with the journal locked.
+ * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
+ */
+
+int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
+{
+ transaction_t *transaction;
+ journal_t *journal;
+ int ret = 0;
+
+ JBUFFER_TRACE(jh, "entry");
+
+ if ((transaction = jh->b_cp_transaction) == NULL) {
+ JBUFFER_TRACE(jh, "not on transaction");
+ goto out;
+ }
+ journal = transaction->t_journal;
+
+ __buffer_unlink(jh);
+ jh->b_cp_transaction = NULL;
+
+ if (transaction->t_checkpoint_list != NULL ||
+ transaction->t_checkpoint_io_list != NULL)
+ goto out;
+ JBUFFER_TRACE(jh, "transaction has no more buffers");
+
+ /*
+ * There is one special case to worry about: if we have just pulled the
+ * buffer off a committing transaction's forget list, then even if the
+ * checkpoint list is empty, the transaction obviously cannot be
+ * dropped!
+ *
+ * The locking here around j_committing_transaction is a bit sleazy.
+ * See the comment at the end of jbd2_journal_commit_transaction().
+ */
+ if (transaction == journal->j_committing_transaction) {
+ JBUFFER_TRACE(jh, "belongs to committing transaction");
+ goto out;
+ }
+
+ /* OK, that was the last buffer for the transaction: we can now
+ safely remove this transaction from the log */
+
+ __jbd2_journal_drop_transaction(journal, transaction);
+
+ /* Just in case anybody was waiting for more transactions to be
+ checkpointed... */
+ wake_up(&journal->j_wait_logspace);
+ ret = 1;
+out:
+ JBUFFER_TRACE(jh, "exit");
+ return ret;
+}
+
+/*
+ * journal_insert_checkpoint: put a committed buffer onto a checkpoint
+ * list so that we know when it is safe to clean the transaction out of
+ * the log.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
+ transaction_t *transaction)
+{
+ JBUFFER_TRACE(jh, "entry");
+ J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
+ J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+
+ jh->b_cp_transaction = transaction;
+
+ if (!transaction->t_checkpoint_list) {
+ jh->b_cpnext = jh->b_cpprev = jh;
+ } else {
+ jh->b_cpnext = transaction->t_checkpoint_list;
+ jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+ jh->b_cpprev->b_cpnext = jh;
+ jh->b_cpnext->b_cpprev = jh;
+ }
+ transaction->t_checkpoint_list = jh;
+}
+
+/*
+ * We've finished with this transaction structure: adios...
+ *
+ * The transaction must have no links except for the checkpoint by this
+ * point.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+
+void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
+{
+ assert_spin_locked(&journal->j_list_lock);
+ if (transaction->t_cpnext) {
+ transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
+ transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+ if (journal->j_checkpoint_transactions == transaction)
+ journal->j_checkpoint_transactions =
+ transaction->t_cpnext;
+ if (journal->j_checkpoint_transactions == transaction)
+ journal->j_checkpoint_transactions = NULL;
+ }
+
+ J_ASSERT(transaction->t_state == T_FINISHED);
+ J_ASSERT(transaction->t_buffers == NULL);
+ J_ASSERT(transaction->t_sync_datalist == NULL);
+ J_ASSERT(transaction->t_forget == NULL);
+ J_ASSERT(transaction->t_iobuf_list == NULL);
+ J_ASSERT(transaction->t_shadow_list == NULL);
+ J_ASSERT(transaction->t_log_list == NULL);
+ J_ASSERT(transaction->t_checkpoint_list == NULL);
+ J_ASSERT(transaction->t_checkpoint_io_list == NULL);
+ J_ASSERT(transaction->t_updates == 0);
+ J_ASSERT(journal->j_committing_transaction != transaction);
+ J_ASSERT(journal->j_running_transaction != transaction);
+
+ jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+ kfree(transaction);
+}
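
The checkpoint lists manipulated above are circular, doubly linked through b_cpnext/b_cpprev, with the list head left pointing at the most recently inserted element. A detached sketch of the same insert-at-head discipline used by __jbd2_journal_insert_checkpoint() (generic names, illustration only):

struct cp_node { struct cp_node *next, *prev; };

static void cp_list_insert(struct cp_node **head, struct cp_node *n)
{
	if (!*head) {
		n->next = n->prev = n;		/* single element points at itself */
	} else {
		n->next = *head;
		n->prev = (*head)->prev;
		n->prev->next = n;
		n->next->prev = n;
	}
	*head = n;
}
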
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
new file mode 100644
index 000000000000..70b2ae1ef281
--- /dev/null
+++ b/fs/jbd2/commit.c
@@ -0,0 +1,920 @@
+/*
+ * linux/fs/jbd2/commit.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal commit routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+
+/*
+ * Default IO end handler for temporary BJ_IO buffer_heads.
+ */
+static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+ BUFFER_TRACE(bh, "");
+ if (uptodate)
+ set_buffer_uptodate(bh);
+ else
+ clear_buffer_uptodate(bh);
+ unlock_buffer(bh);
+}
+
+/*
+ * When an ext3-ordered file is truncated, it is possible that many pages are
+ * not successfully freed, because they are attached to a committing transaction.
+ * After the transaction commits, these pages are left on the LRU, with no
+ * ->mapping, and with attached buffers. These pages are trivially reclaimable
+ * by the VM, but their apparent absence upsets the VM accounting, and it makes
+ * the numbers in /proc/meminfo look odd.
+ *
+ * So here, we have a buffer which has just come off the forget list. Look to
+ * see if we can strip all buffers from the backing page.
+ *
+ * Called under lock_journal(), and possibly under journal_datalist_lock. The
+ * caller provided us with a ref against the buffer, and we drop that here.
+ */
+static void release_buffer_page(struct buffer_head *bh)
+{
+ struct page *page;
+
+ if (buffer_dirty(bh))
+ goto nope;
+ if (atomic_read(&bh->b_count) != 1)
+ goto nope;
+ page = bh->b_page;
+ if (!page)
+ goto nope;
+ if (page->mapping)
+ goto nope;
+
+ /* OK, it's a truncated page */
+ if (TestSetPageLocked(page))
+ goto nope;
+
+ page_cache_get(page);
+ __brelse(bh);
+ try_to_free_buffers(page);
+ unlock_page(page);
+ page_cache_release(page);
+ return;
+
+nope:
+ __brelse(bh);
+}
+
+/*
+ * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
+ * held. For ranking reasons we must trylock. If we lose, schedule away and
+ * return 0. j_list_lock is dropped in this case.
+ */
+static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+{
+ if (!jbd_trylock_bh_state(bh)) {
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ return 0;
+ }
+ return 1;
+}
+
+/* Done it all: now write the commit record. We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+static int journal_write_commit_record(journal_t *journal,
+ transaction_t *commit_transaction)
+{
+ struct journal_head *descriptor;
+ struct buffer_head *bh;
+ int i, ret;
+ int barrier_done = 0;
+
+ if (is_journal_aborted(journal))
+ return 0;
+
+ descriptor = jbd2_journal_get_descriptor_buffer(journal);
+ if (!descriptor)
+ return 1;
+
+ bh = jh2bh(descriptor);
+
+ /* AKPM: buglet - add `i' to tmp! */
+ for (i = 0; i < bh->b_size; i += 512) {
+ journal_header_t *tmp = (journal_header_t*)bh->b_data;
+ tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+ tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
+ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ }
+
+ JBUFFER_TRACE(descriptor, "write commit block");
+ set_buffer_dirty(bh);
+ if (journal->j_flags & JBD2_BARRIER) {
+ set_buffer_ordered(bh);
+ barrier_done = 1;
+ }
+ ret = sync_dirty_buffer(bh);
+ /* is it possible for another commit to fail at roughly
+ * the same time as this one? If so, we don't want to
+ * trust the barrier flag in the super, but instead want
+ * to remember if we sent a barrier request
+ */
+ if (ret == -EOPNOTSUPP && barrier_done) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_WARNING
+ "JBD: barrier-based sync failed on %s - "
+ "disabling barriers\n",
+ bdevname(journal->j_dev, b));
+ spin_lock(&journal->j_state_lock);
+ journal->j_flags &= ~JBD2_BARRIER;
+ spin_unlock(&journal->j_state_lock);
+
+ /* And try again, without the barrier */
+ clear_buffer_ordered(bh);
+ set_buffer_uptodate(bh);
+ set_buffer_dirty(bh);
+ ret = sync_dirty_buffer(bh);
+ }
+ put_bh(bh); /* One for getblk() */
+ jbd2_journal_put_journal_head(descriptor);
+
+ return (ret == -EIO);
+}
+
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+ int i;
+
+ for (i = 0; i < bufs; i++) {
+ wbuf[i]->b_end_io = end_buffer_write_sync;
+ /* We use-up our safety reference in submit_bh() */
+ submit_bh(WRITE, wbuf[i]);
+ }
+}
+
+/*
+ * Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
+{
+ struct journal_head *jh;
+ struct buffer_head *bh;
+ int locked;
+ int bufs = 0;
+ struct buffer_head **wbuf = journal->j_wbuf;
+
+ /*
+ * Whenever we unlock the journal and sleep, things can get added
+ * onto ->t_sync_datalist, so we have to keep looping back to
+ * write_out_data until we *know* that the list is empty.
+ *
+ * Cleanup any flushed data buffers from the data list. Even in
+ * abort mode, we want to flush this out as soon as possible.
+ */
+write_out_data:
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ while (commit_transaction->t_sync_datalist) {
+ jh = commit_transaction->t_sync_datalist;
+ bh = jh2bh(jh);
+ locked = 0;
+
+ /* Get reference just to make sure buffer does not disappear
+ * when we are forced to drop various locks */
+ get_bh(bh);
+ /* If the buffer is dirty, we need to submit IO and hence
+ * we need the buffer lock. We try to lock the buffer without
+ * blocking. If we fail, we need to drop j_list_lock and do
+ * blocking lock_buffer().
+ */
+ if (buffer_dirty(bh)) {
+ if (test_set_buffer_locked(bh)) {
+ BUFFER_TRACE(bh, "needs blocking lock");
+ spin_unlock(&journal->j_list_lock);
+ /* Write out all data to prevent deadlocks */
+ journal_do_submit_data(wbuf, bufs);
+ bufs = 0;
+ lock_buffer(bh);
+ spin_lock(&journal->j_list_lock);
+ }
+ locked = 1;
+ }
+ /* We have to get bh_state lock. Again out of order, sigh. */
+ if (!inverted_lock(journal, bh)) {
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+ }
+ /* Someone already cleaned up the buffer? */
+ if (!buffer_jbd(bh)
+ || jh->b_transaction != commit_transaction
+ || jh->b_jlist != BJ_SyncData) {
+ jbd_unlock_bh_state(bh);
+ if (locked)
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "already cleaned up");
+ put_bh(bh);
+ continue;
+ }
+ if (locked && test_clear_buffer_dirty(bh)) {
+ BUFFER_TRACE(bh, "needs writeout, adding to array");
+ wbuf[bufs++] = bh;
+ __jbd2_journal_file_buffer(jh, commit_transaction,
+ BJ_Locked);
+ jbd_unlock_bh_state(bh);
+ if (bufs == journal->j_wbufsize) {
+ spin_unlock(&journal->j_list_lock);
+ journal_do_submit_data(wbuf, bufs);
+ bufs = 0;
+ goto write_out_data;
+ }
+ }
+ else {
+ BUFFER_TRACE(bh, "writeout complete: unfile");
+ __jbd2_journal_unfile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ if (locked)
+ unlock_buffer(bh);
+ jbd2_journal_remove_journal_head(bh);
+ /* Once for our safety reference, once for
+ * jbd2_journal_remove_journal_head() */
+ put_bh(bh);
+ put_bh(bh);
+ }
+
+ if (lock_need_resched(&journal->j_list_lock)) {
+ spin_unlock(&journal->j_list_lock);
+ goto write_out_data;
+ }
+ }
+ spin_unlock(&journal->j_list_lock);
+ journal_do_submit_data(wbuf, bufs);
+}
+
+static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
+ unsigned long long block)
+{
+ tag->t_blocknr = cpu_to_be32(block & (u32)~0);
+ if (tag_bytes > JBD_TAG_SIZE32)
+ tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
+}
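+
+/*
+ * Editor's note (worked example, not part of the original patch): for a
+ * 48-bit block number such as 0x123456789abcULL the tag is filled in as
+ *
+ *	t_blocknr      = cpu_to_be32(0x56789abc)	(low 32 bits)
+ *	t_blocknr_high = cpu_to_be32(0x00001234)	(bits 32..63)
+ *
+ * and the high word is only written when the journal uses the larger
+ * 64-bit tags, i.e. when tag_bytes > JBD_TAG_SIZE32.
+ */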
+
+/*
+ * jbd2_journal_commit_transaction
+ *
+ * The primary function for committing a transaction to the log. This
+ * function is called by the journal thread to begin a complete commit.
+ */
+void jbd2_journal_commit_transaction(journal_t *journal)
+{
+ transaction_t *commit_transaction;
+ struct journal_head *jh, *new_jh, *descriptor;
+ struct buffer_head **wbuf = journal->j_wbuf;
+ int bufs;
+ int flags;
+ int err;
+ unsigned long long blocknr;
+ char *tagp = NULL;
+ journal_header_t *header;
+ journal_block_tag_t *tag = NULL;
+ int space_left = 0;
+ int first_tag = 0;
+ int tag_flag;
+ int i;
+ int tag_bytes = journal_tag_bytes(journal);
+
+ /*
+ * First job: lock down the current transaction and wait for
+ * all outstanding updates to complete.
+ */
+
+#ifdef COMMIT_STATS
+ spin_lock(&journal->j_list_lock);
+ summarise_journal_usage(journal);
+ spin_unlock(&journal->j_list_lock);
+#endif
+
+ /* Do we need to erase the effects of a prior jbd2_journal_flush? */
+ if (journal->j_flags & JBD2_FLUSHED) {
+ jbd_debug(3, "super block updated\n");
+ jbd2_journal_update_superblock(journal, 1);
+ } else {
+ jbd_debug(3, "superblock not updated\n");
+ }
+
+ J_ASSERT(journal->j_running_transaction != NULL);
+ J_ASSERT(journal->j_committing_transaction == NULL);
+
+ commit_transaction = journal->j_running_transaction;
+ J_ASSERT(commit_transaction->t_state == T_RUNNING);
+
+ jbd_debug(1, "JBD: starting commit of transaction %d\n",
+ commit_transaction->t_tid);
+
+ spin_lock(&journal->j_state_lock);
+ commit_transaction->t_state = T_LOCKED;
+
+ spin_lock(&commit_transaction->t_handle_lock);
+ while (commit_transaction->t_updates) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_wait_updates, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (commit_transaction->t_updates) {
+ spin_unlock(&commit_transaction->t_handle_lock);
+ spin_unlock(&journal->j_state_lock);
+ schedule();
+ spin_lock(&journal->j_state_lock);
+ spin_lock(&commit_transaction->t_handle_lock);
+ }
+ finish_wait(&journal->j_wait_updates, &wait);
+ }
+ spin_unlock(&commit_transaction->t_handle_lock);
+
+ J_ASSERT (commit_transaction->t_outstanding_credits <=
+ journal->j_max_transaction_buffers);
+
+ /*
+ * First thing we are allowed to do is to discard any remaining
+ * BJ_Reserved buffers. Note, it is _not_ permissible to assume
+ * that there are no such buffers: if a large filesystem
+ * operation like a truncate needs to split itself over multiple
+ * transactions, then it may try to do a jbd2_journal_restart() while
+ * there are still BJ_Reserved buffers outstanding. These must
+ * be released cleanly from the current transaction.
+ *
+ * In this case, the filesystem must still reserve write access
+ * again before modifying the buffer in the new transaction, but
+ * we do not require it to remember exactly which old buffers it
+ * has reserved. This is consistent with the existing behaviour
+ * that multiple jbd2_journal_get_write_access() calls to the same
+ * buffer are perfectly permissible.
+ */
+ while (commit_transaction->t_reserved_list) {
+ jh = commit_transaction->t_reserved_list;
+ JBUFFER_TRACE(jh, "reserved, unused: refile");
+ /*
+ * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
+ * leave undo-committed data.
+ */
+ if (jh->b_committed_data) {
+ struct buffer_head *bh = jh2bh(jh);
+
+ jbd_lock_bh_state(bh);
+ jbd2_slab_free(jh->b_committed_data, bh->b_size);
+ jh->b_committed_data = NULL;
+ jbd_unlock_bh_state(bh);
+ }
+ jbd2_journal_refile_buffer(journal, jh);
+ }
+
+ /*
+ * Now try to drop any written-back buffers from the journal's
+ * checkpoint lists. We do this *before* commit because it potentially
+ * frees some memory
+ */
+ spin_lock(&journal->j_list_lock);
+ __jbd2_journal_clean_checkpoint_list(journal);
+ spin_unlock(&journal->j_list_lock);
+
+ jbd_debug (3, "JBD: commit phase 1\n");
+
+ /*
+ * Switch to a new revoke table.
+ */
+ jbd2_journal_switch_revoke_table(journal);
+
+ commit_transaction->t_state = T_FLUSH;
+ journal->j_committing_transaction = commit_transaction;
+ journal->j_running_transaction = NULL;
+ commit_transaction->t_log_start = journal->j_head;
+ wake_up(&journal->j_wait_transaction_locked);
+ spin_unlock(&journal->j_state_lock);
+
+ jbd_debug (3, "JBD: commit phase 2\n");
+
+ /*
+ * First, drop modified flag: all accesses to the buffers
+ * will be tracked for a new transaction only -bzzz
+ */
+ spin_lock(&journal->j_list_lock);
+ if (commit_transaction->t_buffers) {
+ new_jh = jh = commit_transaction->t_buffers->b_tnext;
+ do {
+ J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
+ new_jh->b_modified == 0);
+ new_jh->b_modified = 0;
+ new_jh = new_jh->b_tnext;
+ } while (new_jh != jh);
+ }
+ spin_unlock(&journal->j_list_lock);
+
+ /*
+ * Now start flushing things to disk, in the order they appear
+ * on the transaction lists. Data blocks go first.
+ */
+ err = 0;
+ journal_submit_data_buffers(journal, commit_transaction);
+
+ /*
+ * Wait for all previously submitted IO to complete.
+ */
+ spin_lock(&journal->j_list_lock);
+ while (commit_transaction->t_locked_list) {
+ struct buffer_head *bh;
+
+ jh = commit_transaction->t_locked_list->b_tprev;
+ bh = jh2bh(jh);
+ get_bh(bh);
+ if (buffer_locked(bh)) {
+ spin_unlock(&journal->j_list_lock);
+ wait_on_buffer(bh);
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
+ spin_lock(&journal->j_list_lock);
+ }
+ if (!inverted_lock(journal, bh)) {
+ put_bh(bh);
+ spin_lock(&journal->j_list_lock);
+ continue;
+ }
+ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+ __jbd2_journal_unfile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ jbd2_journal_remove_journal_head(bh);
+ put_bh(bh);
+ } else {
+ jbd_unlock_bh_state(bh);
+ }
+ put_bh(bh);
+ cond_resched_lock(&journal->j_list_lock);
+ }
+ spin_unlock(&journal->j_list_lock);
+
+ if (err)
+ __jbd2_journal_abort_hard(journal);
+
+ jbd2_journal_write_revoke_records(journal, commit_transaction);
+
+ jbd_debug(3, "JBD: commit phase 2\n");
+
+ /*
+ * If we found any dirty or locked buffers, then we should have
+ * looped back up to the write_out_data label. If there weren't
+ * any then journal_clean_data_list should have wiped the list
+ * clean by now, so check that it is in fact empty.
+ */
+ J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+
+ jbd_debug (3, "JBD: commit phase 3\n");
+
+ /*
+ * Way to go: we have now written out all of the data for a
+ * transaction! Now comes the tricky part: we need to write out
+ * metadata. Loop over the transaction's entire buffer list:
+ */
+ commit_transaction->t_state = T_COMMIT;
+
+ descriptor = NULL;
+ bufs = 0;
+ while (commit_transaction->t_buffers) {
+
+ /* Find the next buffer to be journaled... */
+
+ jh = commit_transaction->t_buffers;
+
+ /* If we're in abort mode, we just un-journal the buffer and
+ release it for background writing. */
+
+ if (is_journal_aborted(journal)) {
+ JBUFFER_TRACE(jh, "journal is aborting: refile");
+ jbd2_journal_refile_buffer(journal, jh);
+ /* If that was the last one, we need to clean up
+ * any descriptor buffers which may have been
+ * already allocated, even if we are now
+ * aborting. */
+ if (!commit_transaction->t_buffers)
+ goto start_journal_io;
+ continue;
+ }
+
+ /* Make sure we have a descriptor block in which to
+ record the metadata buffer. */
+
+ if (!descriptor) {
+ struct buffer_head *bh;
+
+ J_ASSERT (bufs == 0);
+
+ jbd_debug(4, "JBD: get descriptor\n");
+
+ descriptor = jbd2_journal_get_descriptor_buffer(journal);
+ if (!descriptor) {
+ __jbd2_journal_abort_hard(journal);
+ continue;
+ }
+
+ bh = jh2bh(descriptor);
+ jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+ (unsigned long long)bh->b_blocknr, bh->b_data);
+ header = (journal_header_t *)&bh->b_data[0];
+ header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+ header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
+ header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+ space_left = bh->b_size - sizeof(journal_header_t);
+ first_tag = 1;
+ set_buffer_jwrite(bh);
+ set_buffer_dirty(bh);
+ wbuf[bufs++] = bh;
+
+ /* Record it so that we can wait for IO
+ completion later */
+ BUFFER_TRACE(bh, "ph3: file as descriptor");
+ jbd2_journal_file_buffer(descriptor, commit_transaction,
+ BJ_LogCtl);
+ }
+
+ /* Where is the buffer to be written? */
+
+ err = jbd2_journal_next_log_block(journal, &blocknr);
+ /* If the block mapping failed, just abandon the buffer
+ and repeat this loop: we'll fall into the
+ refile-on-abort condition above. */
+ if (err) {
+ __jbd2_journal_abort_hard(journal);
+ continue;
+ }
+
+ /*
+ * start_this_handle() uses t_outstanding_credits to determine
+ * the free space in the log, but this counter is changed
+ * by jbd2_journal_next_log_block() also.
+ */
+ commit_transaction->t_outstanding_credits--;
+
+ /* Bump b_count to prevent truncate from stumbling over
+ the shadowed buffer! @@@ This can go if we ever get
+ rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+ atomic_inc(&jh2bh(jh)->b_count);
+
+ /* Make a temporary IO buffer with which to write it out
+ (this will requeue both the metadata buffer and the
+ temporary IO buffer). new_bh goes on BJ_IO */
+
+ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+ /*
+ * akpm: jbd2_journal_write_metadata_buffer() sets
+ * new_bh->b_transaction to commit_transaction.
+ * We need to clean this up before we release new_bh
+ * (which is of type BJ_IO)
+ */
+ JBUFFER_TRACE(jh, "ph3: write metadata");
+ flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+ jh, &new_jh, blocknr);
+ set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+ wbuf[bufs++] = jh2bh(new_jh);
+
+ /* Record the new block's tag in the current descriptor
+ buffer */
+
+ tag_flag = 0;
+ if (flags & 1)
+ tag_flag |= JBD2_FLAG_ESCAPE;
+ if (!first_tag)
+ tag_flag |= JBD2_FLAG_SAME_UUID;
+
+ tag = (journal_block_tag_t *) tagp;
+ write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
+ tag->t_flags = cpu_to_be32(tag_flag);
+ tagp += tag_bytes;
+ space_left -= tag_bytes;
+
+ if (first_tag) {
+ memcpy (tagp, journal->j_uuid, 16);
+ tagp += 16;
+ space_left -= 16;
+ first_tag = 0;
+ }
+
+ /* If there's no more to do, or if the descriptor is full,
+ let the IO rip! */
+
+ if (bufs == journal->j_wbufsize ||
+ commit_transaction->t_buffers == NULL ||
+ space_left < tag_bytes + 16) {
+
+ jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+ /* Write an end-of-descriptor marker before
+ submitting the IOs. "tag" still points to
+ the last tag we set up. */
+
+ tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
+
+start_journal_io:
+ for (i = 0; i < bufs; i++) {
+ struct buffer_head *bh = wbuf[i];
+ lock_buffer(bh);
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ bh->b_end_io = journal_end_buffer_io_sync;
+ submit_bh(WRITE, bh);
+ }
+ cond_resched();
+
+ /* Force a new descriptor to be generated next
+ time round the loop. */
+ descriptor = NULL;
+ bufs = 0;
+ }
+ }
+
+ /* Lo and behold: we have just managed to send a transaction to
+ the log. Before we can commit it, wait for the IO so far to
+ complete. Control buffers being written are on the
+ transaction's t_log_list queue, and metadata buffers are on
+ the t_iobuf_list queue.
+
+ Wait for the buffers in reverse order. That way we are
+ less likely to be woken up until all IOs have completed, and
+ so we incur less scheduling load.
+ */
+
+ jbd_debug(3, "JBD: commit phase 4\n");
+
+ /*
+ * akpm: these are BJ_IO, and j_list_lock is not needed.
+ * See __journal_try_to_free_buffer.
+ */
+wait_for_iobuf:
+ while (commit_transaction->t_iobuf_list != NULL) {
+ struct buffer_head *bh;
+
+ jh = commit_transaction->t_iobuf_list->b_tprev;
+ bh = jh2bh(jh);
+ if (buffer_locked(bh)) {
+ wait_on_buffer(bh);
+ goto wait_for_iobuf;
+ }
+ if (cond_resched())
+ goto wait_for_iobuf;
+
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
+
+ clear_buffer_jwrite(bh);
+
+ JBUFFER_TRACE(jh, "ph4: unfile after journal write");
+ jbd2_journal_unfile_buffer(journal, jh);
+
+ /*
+ * ->t_iobuf_list should contain only dummy buffer_heads
+ * which were created by jbd2_journal_write_metadata_buffer().
+ */
+ BUFFER_TRACE(bh, "dumping temporary bh");
+ jbd2_journal_put_journal_head(jh);
+ __brelse(bh);
+ J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
+ free_buffer_head(bh);
+
+ /* We also have to unlock and free the corresponding
+ shadowed buffer */
+ jh = commit_transaction->t_shadow_list->b_tprev;
+ bh = jh2bh(jh);
+ clear_bit(BH_JWrite, &bh->b_state);
+ J_ASSERT_BH(bh, buffer_jbddirty(bh));
+
+ /* The metadata is now released for reuse, but we need
+ to remember it against this transaction so that when
+ we finally commit, we can do any checkpointing
+ required. */
+ JBUFFER_TRACE(jh, "file as BJ_Forget");
+ jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
+ /* Wake up any transactions which were waiting for this
+ IO to complete */
+ wake_up_bit(&bh->b_state, BH_Unshadow);
+ JBUFFER_TRACE(jh, "brelse shadowed buffer");
+ __brelse(bh);
+ }
+
+ J_ASSERT (commit_transaction->t_shadow_list == NULL);
+
+ jbd_debug(3, "JBD: commit phase 5\n");
+
+ /* Here we wait for the revoke record and descriptor record buffers */
+ wait_for_ctlbuf:
+ while (commit_transaction->t_log_list != NULL) {
+ struct buffer_head *bh;
+
+ jh = commit_transaction->t_log_list->b_tprev;
+ bh = jh2bh(jh);
+ if (buffer_locked(bh)) {
+ wait_on_buffer(bh);
+ goto wait_for_ctlbuf;
+ }
+ if (cond_resched())
+ goto wait_for_ctlbuf;
+
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
+
+ BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+ clear_buffer_jwrite(bh);
+ jbd2_journal_unfile_buffer(journal, jh);
+ jbd2_journal_put_journal_head(jh);
+ __brelse(bh); /* One for getblk */
+ /* AKPM: bforget here */
+ }
+
+ jbd_debug(3, "JBD: commit phase 6\n");
+
+ if (journal_write_commit_record(journal, commit_transaction))
+ err = -EIO;
+
+ if (err)
+ __jbd2_journal_abort_hard(journal);
+
+ /* End of a transaction! Finally, we can do checkpoint
+ processing: any buffers committed as a result of this
+ transaction can be removed from any checkpoint list it was on
+ before. */
+
+ jbd_debug(3, "JBD: commit phase 7\n");
+
+ J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(commit_transaction->t_buffers == NULL);
+ J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
+ J_ASSERT(commit_transaction->t_iobuf_list == NULL);
+ J_ASSERT(commit_transaction->t_shadow_list == NULL);
+ J_ASSERT(commit_transaction->t_log_list == NULL);
+
+restart_loop:
+ /*
+ * As there are other places (journal_unmap_buffer()) adding buffers
+ * to this list we have to be careful and hold the j_list_lock.
+ */
+ spin_lock(&journal->j_list_lock);
+ while (commit_transaction->t_forget) {
+ transaction_t *cp_transaction;
+ struct buffer_head *bh;
+
+ jh = commit_transaction->t_forget;
+ spin_unlock(&journal->j_list_lock);
+ bh = jh2bh(jh);
+ jbd_lock_bh_state(bh);
+ J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
+ jh->b_transaction == journal->j_running_transaction);
+
+ /*
+ * If there is undo-protected committed data against
+ * this buffer, then we can remove it now. If it is a
+ * buffer needing such protection, the old frozen_data
+ * field now points to a committed version of the
+ * buffer, so rotate that field to the new committed
+ * data.
+ *
+ * Otherwise, we can just throw away the frozen data now.
+ */
+ if (jh->b_committed_data) {
+ jbd2_slab_free(jh->b_committed_data, bh->b_size);
+ jh->b_committed_data = NULL;
+ if (jh->b_frozen_data) {
+ jh->b_committed_data = jh->b_frozen_data;
+ jh->b_frozen_data = NULL;
+ }
+ } else if (jh->b_frozen_data) {
+ jbd2_slab_free(jh->b_frozen_data, bh->b_size);
+ jh->b_frozen_data = NULL;
+ }
+
+ spin_lock(&journal->j_list_lock);
+ cp_transaction = jh->b_cp_transaction;
+ if (cp_transaction) {
+ JBUFFER_TRACE(jh, "remove from old cp transaction");
+ __jbd2_journal_remove_checkpoint(jh);
+ }
+
+ /* Only re-checkpoint the buffer_head if it is marked
+ * dirty. If the buffer was added to the BJ_Forget list
+ * by jbd2_journal_forget, it may no longer be dirty and
+ * there's no point in keeping a checkpoint record for
+ * it. */
+
+ /* A buffer which has been freed while still being
+ * journaled by a previous transaction may end up still
+ * being dirty here, but we want to avoid writing back
+ * that buffer in the future now that the last use has
+ * been committed. That's not only a performance gain,
+ * it also stops aliasing problems if the buffer is left
+ * behind for writeback and gets reallocated for another
+ * use in a different page. */
+ if (buffer_freed(bh)) {
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+ }
+
+ if (buffer_jbddirty(bh)) {
+ JBUFFER_TRACE(jh, "add to new checkpointing trans");
+ __jbd2_journal_insert_checkpoint(jh, commit_transaction);
+ JBUFFER_TRACE(jh, "refile for checkpoint writeback");
+ __jbd2_journal_refile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ } else {
+ J_ASSERT_BH(bh, !buffer_dirty(bh));
+ /* A buffer on the BJ_Forget list that is not jbddirty means
+ * it has been freed by this transaction and hence it
+ * could not have been reallocated until this
+ * transaction has committed. *BUT* it could be
+ * reallocated once we have written all the data to
+ * disk and before we process the buffer on BJ_Forget
+ * list. */
+ JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+ __jbd2_journal_refile_buffer(jh);
+ if (!jh->b_transaction) {
+ jbd_unlock_bh_state(bh);
+ /* needs a brelse */
+ jbd2_journal_remove_journal_head(bh);
+ release_buffer_page(bh);
+ } else
+ jbd_unlock_bh_state(bh);
+ }
+ cond_resched_lock(&journal->j_list_lock);
+ }
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * This is a bit sleazy. We borrow j_list_lock to protect
+ * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
+ * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
+ * it's a bit of a hassle to hold that across __jbd2_journal_remove_checkpoint
+ */
+ spin_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
+ /*
+ * Now recheck if some buffers did not get attached to the transaction
+ * while the lock was dropped...
+ */
+ if (commit_transaction->t_forget) {
+ spin_unlock(&journal->j_list_lock);
+ spin_unlock(&journal->j_state_lock);
+ goto restart_loop;
+ }
+
+ /* Done with this transaction! */
+
+ jbd_debug(3, "JBD: commit phase 8\n");
+
+ J_ASSERT(commit_transaction->t_state == T_COMMIT);
+
+ commit_transaction->t_state = T_FINISHED;
+ J_ASSERT(commit_transaction == journal->j_committing_transaction);
+ journal->j_commit_sequence = commit_transaction->t_tid;
+ journal->j_committing_transaction = NULL;
+ spin_unlock(&journal->j_state_lock);
+
+ if (commit_transaction->t_checkpoint_list == NULL) {
+ __jbd2_journal_drop_transaction(journal, commit_transaction);
+ } else {
+ if (journal->j_checkpoint_transactions == NULL) {
+ journal->j_checkpoint_transactions = commit_transaction;
+ commit_transaction->t_cpnext = commit_transaction;
+ commit_transaction->t_cpprev = commit_transaction;
+ } else {
+ commit_transaction->t_cpnext =
+ journal->j_checkpoint_transactions;
+ commit_transaction->t_cpprev =
+ commit_transaction->t_cpnext->t_cpprev;
+ commit_transaction->t_cpnext->t_cpprev =
+ commit_transaction;
+ commit_transaction->t_cpprev->t_cpnext =
+ commit_transaction;
+ }
+ }
+ spin_unlock(&journal->j_list_lock);
+
+ jbd_debug(1, "JBD: commit %d complete, head %d\n",
+ journal->j_commit_sequence, journal->j_tail_sequence);
+
+ wake_up(&journal->j_wait_done_commit);
+}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
new file mode 100644
index 000000000000..c60f378b0f76
--- /dev/null
+++ b/fs/jbd2/journal.c
@@ -0,0 +1,2084 @@
+/*
+ * linux/fs/jbd2/journal.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem journal-writing code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages journals: areas of disk reserved for logging
+ * transactional updates. This includes the kernel journaling thread
+ * which is responsible for scheduling updates to the log.
+ *
+ * We do not actually manage the physical storage of the journal in this
+ * file: that is left to a per-journal policy function, which allows us
+ * to store the journal within a filesystem-specified area for ext2
+ * journaling (ext2 can use a reserved inode for storing the log).
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/poison.h>
+#include <linux/proc_fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+
+EXPORT_SYMBOL(jbd2_journal_start);
+EXPORT_SYMBOL(jbd2_journal_restart);
+EXPORT_SYMBOL(jbd2_journal_extend);
+EXPORT_SYMBOL(jbd2_journal_stop);
+EXPORT_SYMBOL(jbd2_journal_lock_updates);
+EXPORT_SYMBOL(jbd2_journal_unlock_updates);
+EXPORT_SYMBOL(jbd2_journal_get_write_access);
+EXPORT_SYMBOL(jbd2_journal_get_create_access);
+EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_dirty_data);
+EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
+EXPORT_SYMBOL(jbd2_journal_release_buffer);
+EXPORT_SYMBOL(jbd2_journal_forget);
+#if 0
+EXPORT_SYMBOL(journal_sync_buffer);
+#endif
+EXPORT_SYMBOL(jbd2_journal_flush);
+EXPORT_SYMBOL(jbd2_journal_revoke);
+
+EXPORT_SYMBOL(jbd2_journal_init_dev);
+EXPORT_SYMBOL(jbd2_journal_init_inode);
+EXPORT_SYMBOL(jbd2_journal_update_format);
+EXPORT_SYMBOL(jbd2_journal_check_used_features);
+EXPORT_SYMBOL(jbd2_journal_check_available_features);
+EXPORT_SYMBOL(jbd2_journal_set_features);
+EXPORT_SYMBOL(jbd2_journal_create);
+EXPORT_SYMBOL(jbd2_journal_load);
+EXPORT_SYMBOL(jbd2_journal_destroy);
+EXPORT_SYMBOL(jbd2_journal_update_superblock);
+EXPORT_SYMBOL(jbd2_journal_abort);
+EXPORT_SYMBOL(jbd2_journal_errno);
+EXPORT_SYMBOL(jbd2_journal_ack_err);
+EXPORT_SYMBOL(jbd2_journal_clear_err);
+EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_journal_start_commit);
+EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
+EXPORT_SYMBOL(jbd2_journal_wipe);
+EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
+EXPORT_SYMBOL(jbd2_journal_invalidatepage);
+EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
+EXPORT_SYMBOL(jbd2_journal_force_commit);
+
+static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_jbd_slab(size_t slab_size);
+
+/*
+ * Helper function used to manage commit timeouts
+ */
+
+static void commit_timeout(unsigned long __data)
+{
+ struct task_struct * p = (struct task_struct *) __data;
+
+ wake_up_process(p);
+}
+
+/*
+ * kjournald2: The main thread function used to manage a logging device
+ * journal.
+ *
+ * This kernel thread is responsible for two things:
+ *
+ * 1) COMMIT: Every so often we need to commit the current state of the
+ * filesystem to disk. The journal thread is responsible for writing
+ * all of the metadata buffers to disk.
+ *
+ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
+ * of the data in that part of the log has been rewritten elsewhere on
+ * the disk. Flushing these old buffers to reclaim space in the log is
+ * known as checkpointing, and this thread is responsible for that job.
+ */
+
+static int kjournald2(void *arg)
+{
+ journal_t *journal = arg;
+ transaction_t *transaction;
+
+ /*
+ * Set up an interval timer which can be used to trigger a commit wakeup
+ * after the commit interval expires
+ */
+ setup_timer(&journal->j_commit_timer, commit_timeout,
+ (unsigned long)current);
+
+ /* Record that the journal thread is running */
+ journal->j_task = current;
+ wake_up(&journal->j_wait_done_commit);
+
+ printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n",
+ journal->j_commit_interval / HZ);
+
+ /*
+ * And now, wait forever for commit wakeup events.
+ */
+ spin_lock(&journal->j_state_lock);
+
+loop:
+ if (journal->j_flags & JBD2_UNMOUNT)
+ goto end_loop;
+
+ jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+ journal->j_commit_sequence, journal->j_commit_request);
+
+ if (journal->j_commit_sequence != journal->j_commit_request) {
+ jbd_debug(1, "OK, requests differ\n");
+ spin_unlock(&journal->j_state_lock);
+ del_timer_sync(&journal->j_commit_timer);
+ jbd2_journal_commit_transaction(journal);
+ spin_lock(&journal->j_state_lock);
+ goto loop;
+ }
+
+ wake_up(&journal->j_wait_done_commit);
+ if (freezing(current)) {
+ /*
+ * The simpler the better. Flushing journal isn't a
+ * good idea, because that depends on threads that may
+ * be already stopped.
+ */
+ jbd_debug(1, "Now suspending kjournald2\n");
+ spin_unlock(&journal->j_state_lock);
+ refrigerator();
+ spin_lock(&journal->j_state_lock);
+ } else {
+ /*
+ * We assume on resume that commits are already there,
+ * so we don't sleep
+ */
+ DEFINE_WAIT(wait);
+ int should_sleep = 1;
+
+ prepare_to_wait(&journal->j_wait_commit, &wait,
+ TASK_INTERRUPTIBLE);
+ if (journal->j_commit_sequence != journal->j_commit_request)
+ should_sleep = 0;
+ transaction = journal->j_running_transaction;
+ if (transaction && time_after_eq(jiffies,
+ transaction->t_expires))
+ should_sleep = 0;
+ if (journal->j_flags & JBD2_UNMOUNT)
+ should_sleep = 0;
+ if (should_sleep) {
+ spin_unlock(&journal->j_state_lock);
+ schedule();
+ spin_lock(&journal->j_state_lock);
+ }
+ finish_wait(&journal->j_wait_commit, &wait);
+ }
+
+ jbd_debug(1, "kjournald2 wakes\n");
+
+ /*
+ * Were we woken up by a commit wakeup event?
+ */
+ transaction = journal->j_running_transaction;
+ if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
+ journal->j_commit_request = transaction->t_tid;
+ jbd_debug(1, "woke because of timeout\n");
+ }
+ goto loop;
+
+end_loop:
+ spin_unlock(&journal->j_state_lock);
+ del_timer_sync(&journal->j_commit_timer);
+ journal->j_task = NULL;
+ wake_up(&journal->j_wait_done_commit);
+ jbd_debug(1, "Journal thread exiting.\n");
+ return 0;
+}
+
+static void jbd2_journal_start_thread(journal_t *journal)
+{
+ kthread_run(kjournald2, journal, "kjournald2");
+ wait_event(journal->j_wait_done_commit, journal->j_task != 0);
+}
+
+static void journal_kill_thread(journal_t *journal)
+{
+ spin_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_UNMOUNT;
+
+ while (journal->j_task) {
+ wake_up(&journal->j_wait_commit);
+ spin_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_done_commit, journal->j_task == 0);
+ spin_lock(&journal->j_state_lock);
+ }
+ spin_unlock(&journal->j_state_lock);
+}
+
+/*
+ * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
+ *
+ * Writes a metadata buffer to a given disk block. The actual IO is not
+ * performed but a new buffer_head is constructed which labels the data
+ * to be written with the correct destination disk block.
+ *
+ * Any magic-number escaping which needs to be done will cause a
+ * copy-out here. If the buffer happens to start with the
+ * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
+ * magic number is only written to the log for descriptor blocks. In
+ * this case, we copy the data and replace the first word with 0, and we
+ * return a result code which indicates that this buffer needs to be
+ * marked as an escaped buffer in the corresponding log descriptor
+ * block. The missing word can then be restored when the block is read
+ * during recovery.
+ *
+ * If the source buffer has already been modified by a new transaction
+ * since we took the last commit snapshot, we use the frozen copy of
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we *have* to lock the buffer to prevent anyone
+ * else from using and possibly modifying it while the IO is in
+ * progress.
+ *
+ * The function returns a pointer to the buffer_heads to be used for IO.
+ *
+ * We assume that the journal has already been locked in this function.
+ *
+ * Return value:
+ * <0: Error
+ * >=0: Finished OK
+ *
+ * On success:
+ * Bit 0 set == escape performed on the data
+ * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+ */
+
+int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
+ struct journal_head *jh_in,
+ struct journal_head **jh_out,
+ unsigned long long blocknr)
+{
+ int need_copy_out = 0;
+ int done_copy_out = 0;
+ int do_escape = 0;
+ char *mapped_data;
+ struct buffer_head *new_bh;
+ struct journal_head *new_jh;
+ struct page *new_page;
+ unsigned int new_offset;
+ struct buffer_head *bh_in = jh2bh(jh_in);
+
+ /*
+ * The buffer really shouldn't be locked: only the current committing
+ * transaction is allowed to write it, so nobody else is allowed
+ * to do any IO.
+ *
+ * akpm: except if we're journalling data, and write() output is
+ * also part of a shared mapping, and another thread has
+ * decided to launch a writepage() against this buffer.
+ */
+ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
+
+ new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+
+ /*
+ * If a new transaction has already done a buffer copy-out, then
+ * we use that version of the data for the commit.
+ */
+ jbd_lock_bh_state(bh_in);
+repeat:
+ if (jh_in->b_frozen_data) {
+ done_copy_out = 1;
+ new_page = virt_to_page(jh_in->b_frozen_data);
+ new_offset = offset_in_page(jh_in->b_frozen_data);
+ } else {
+ new_page = jh2bh(jh_in)->b_page;
+ new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ }
+
+ mapped_data = kmap_atomic(new_page, KM_USER0);
+ /*
+ * Check for escaping
+ */
+ if (*((__be32 *)(mapped_data + new_offset)) ==
+ cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+ need_copy_out = 1;
+ do_escape = 1;
+ }
+ kunmap_atomic(mapped_data, KM_USER0);
+
+ /*
+ * Do we need to do a data copy?
+ */
+ if (need_copy_out && !done_copy_out) {
+ char *tmp;
+
+ jbd_unlock_bh_state(bh_in);
+ tmp = jbd2_slab_alloc(bh_in->b_size, GFP_NOFS);
+ jbd_lock_bh_state(bh_in);
+ if (jh_in->b_frozen_data) {
+ jbd2_slab_free(tmp, bh_in->b_size);
+ goto repeat;
+ }
+
+ jh_in->b_frozen_data = tmp;
+ mapped_data = kmap_atomic(new_page, KM_USER0);
+ memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+ kunmap_atomic(mapped_data, KM_USER0);
+
+ new_page = virt_to_page(tmp);
+ new_offset = offset_in_page(tmp);
+ done_copy_out = 1;
+ }
+
+ /*
+ * Did we need to do any escaping?  Now that we've done all the
+ * copying, we can finally do so.
+ */
+ if (do_escape) {
+ mapped_data = kmap_atomic(new_page, KM_USER0);
+ *((unsigned int *)(mapped_data + new_offset)) = 0;
+ kunmap_atomic(mapped_data, KM_USER0);
+ }
+
+ /* keep subsequent assertions sane */
+ new_bh->b_state = 0;
+ init_buffer(new_bh, NULL, NULL);
+ atomic_set(&new_bh->b_count, 1);
+ jbd_unlock_bh_state(bh_in);
+
+ new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
+
+ set_bh_page(new_bh, new_page, new_offset);
+ new_jh->b_transaction = NULL;
+ new_bh->b_size = jh2bh(jh_in)->b_size;
+ new_bh->b_bdev = transaction->t_journal->j_dev;
+ new_bh->b_blocknr = blocknr;
+ set_buffer_mapped(new_bh);
+ set_buffer_dirty(new_bh);
+
+ *jh_out = new_jh;
+
+ /*
+ * The to-be-written buffer needs to get moved to the io queue,
+ * and the original buffer whose contents we are shadowing or
+ * copying is moved to the transaction's shadow queue.
+ */
+ JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+ jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+ JBUFFER_TRACE(new_jh, "file as BJ_IO");
+ jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
+
+ return do_escape | (done_copy_out << 1);
+}
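+
+/*
+ * Editor's sketch (not part of the original patch): callers decode the
+ * return value the way the commit path does, e.g.
+ *
+ *	flags = jbd2_journal_write_metadata_buffer(txn, jh, &new_jh, blocknr);
+ *	if (flags & 1)
+ *		tag_flag |= JBD2_FLAG_ESCAPE;	(magic number was zeroed out)
+ *	if (flags & 2)
+ *		... the frozen copy, not the live buffer, is being written ...
+ */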
+
+/*
+ * Allocation code for the journal file. Manage the space left in the
+ * journal, so that we can begin checkpointing when appropriate.
+ */
+
+/*
+ * __jbd2_log_space_left: Return the number of free blocks left in the journal.
+ *
+ * Called with the journal already locked.
+ *
+ * Called under j_state_lock
+ */
+
+int __jbd2_log_space_left(journal_t *journal)
+{
+ int left = journal->j_free;
+
+ assert_spin_locked(&journal->j_state_lock);
+
+ /*
+ * Be pessimistic here about the number of those free blocks which
+ * might be required for log descriptor control blocks.
+ */
+
+#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
+
+ left -= MIN_LOG_RESERVED_BLOCKS;
+
+ if (left <= 0)
+ return 0;
+ left -= (left >> 3);
+ return left;
+}
+
+/*
+ * Called under j_state_lock. Returns true if a transaction was started.
+ */
+int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+{
+ /*
+ * Are we already doing a recent enough commit?
+ */
+ if (!tid_geq(journal->j_commit_request, target)) {
+ /*
+ * We want a new commit: OK, mark the request and wake up the
+ * commit thread. We do _not_ do the commit ourselves.
+ */
+
+ journal->j_commit_request = target;
+ jbd_debug(1, "JBD: requesting commit %d/%d\n",
+ journal->j_commit_request,
+ journal->j_commit_sequence);
+ wake_up(&journal->j_wait_commit);
+ return 1;
+ }
+ return 0;
+}
+
+int jbd2_log_start_commit(journal_t *journal, tid_t tid)
+{
+ int ret;
+
+ spin_lock(&journal->j_state_lock);
+ ret = __jbd2_log_start_commit(journal, tid);
+ spin_unlock(&journal->j_state_lock);
+ return ret;
+}
+
+/*
+ * Force and wait upon a commit if the calling process is not within a
+ * transaction. This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * We can only force the running transaction if we don't have an active handle;
+ * otherwise, we will deadlock.
+ *
+ * Returns true if a transaction was started.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction && !current->journal_info) {
+ transaction = journal->j_running_transaction;
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ } else if (journal->j_committing_transaction)
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return 0; /* Nothing to retry */
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+ return 1;
+}
+
+/*
+ * Start a commit of the current running transaction (if any). Returns true
+ * if a transaction was started, and fills its tid in at *ptid
+ */
+int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
+{
+ int ret = 0;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction) {
+ tid_t tid = journal->j_running_transaction->t_tid;
+
+ ret = __jbd2_log_start_commit(journal, tid);
+ if (ret && ptid)
+ *ptid = tid;
+ } else if (journal->j_committing_transaction && ptid) {
+ /*
+ * If ext3_write_super() recently started a commit, then we
+ * have to wait for completion of that transaction
+ */
+ *ptid = journal->j_committing_transaction->t_tid;
+ ret = 1;
+ }
+ spin_unlock(&journal->j_state_lock);
+ return ret;
+}
+
+/*
+ * Wait for a specified commit to complete.
+ * The caller may not hold the journal lock.
+ */
+int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
+{
+ int err = 0;
+
+#ifdef CONFIG_JBD_DEBUG
+ spin_lock(&journal->j_state_lock);
+ if (!tid_geq(journal->j_commit_request, tid)) {
+ printk(KERN_EMERG
+ "%s: error: j_commit_request=%d, tid=%d\n",
+ __FUNCTION__, journal->j_commit_request, tid);
+ }
+ spin_unlock(&journal->j_state_lock);
+#endif
+ spin_lock(&journal->j_state_lock);
+ while (tid_gt(tid, journal->j_commit_sequence)) {
+ jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+ tid, journal->j_commit_sequence);
+ wake_up(&journal->j_wait_commit);
+ spin_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_done_commit,
+ !tid_gt(tid, journal->j_commit_sequence));
+ spin_lock(&journal->j_state_lock);
+ }
+ spin_unlock(&journal->j_state_lock);
+
+ if (unlikely(is_journal_aborted(journal))) {
+ printk(KERN_EMERG "journal commit I/O error\n");
+ err = -EIO;
+ }
+ return err;
+}
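+
+/*
+ * Editor's sketch (not part of the original patch): the usual
+ * "kick a commit and wait for it" sequence built from the helpers above,
+ * much as jbd2_journal_force_commit_nested() does:
+ *
+ *	tid_t tid;
+ *
+ *	if (jbd2_journal_start_commit(journal, &tid))
+ *		err = jbd2_log_wait_commit(journal, tid);
+ */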
+
+/*
+ * Log buffer allocation routines:
+ */
+
+int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
+{
+ unsigned long blocknr;
+
+ spin_lock(&journal->j_state_lock);
+ J_ASSERT(journal->j_free > 1);
+
+ blocknr = journal->j_head;
+ journal->j_head++;
+ journal->j_free--;
+ if (journal->j_head == journal->j_last)
+ journal->j_head = journal->j_first;
+ spin_unlock(&journal->j_state_lock);
+ return jbd2_journal_bmap(journal, blocknr, retp);
+}
+
+/*
+ * Conversion of logical to physical block numbers for the journal
+ *
+ * On external journals the journal blocks are identity-mapped, so
+ * this is a no-op. If needed, we can use j_blk_offset - everything is
+ * ready.
+ */
+int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
+ unsigned long long *retp)
+{
+ int err = 0;
+ unsigned long long ret;
+
+ if (journal->j_inode) {
+ ret = bmap(journal->j_inode, blocknr);
+ if (ret)
+ *retp = ret;
+ else {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_ALERT "%s: journal block not found "
+ "at offset %lu on %s\n",
+ __FUNCTION__,
+ blocknr,
+ bdevname(journal->j_dev, b));
+ err = -EIO;
+ __journal_abort_soft(journal, err);
+ }
+ } else {
+ *retp = blocknr; /* +journal->j_blk_offset */
+ }
+ return err;
+}
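+
+/*
+ * Editor's note (illustrative, not part of the original patch):
+ * jbd2_journal_init_inode() below locates the superblock of an
+ * inode-backed journal exactly this way:
+ *
+ *	err = jbd2_journal_bmap(journal, 0, &blocknr);
+ *	bh  = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ *
+ * For an external journal, *retp is simply the logical block number.
+ */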
+
+/*
+ * We play buffer_head aliasing tricks to write data/metadata blocks to
+ * the journal without copying their contents, but for journal
+ * descriptor blocks we do need to generate bona fide buffers.
+ *
+ * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
+ * the buffer's contents they really should run flush_dcache_page(bh->b_page).
+ * But we don't bother doing that, so there will be coherency problems with
+ * mmaps of blockdevs which hold live JBD-controlled filesystems.
+ */
+struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+{
+ struct buffer_head *bh;
+ unsigned long long blocknr;
+ int err;
+
+ err = jbd2_journal_next_log_block(journal, &blocknr);
+
+ if (err)
+ return NULL;
+
+ bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, journal->j_blocksize);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "return this buffer");
+ return jbd2_journal_add_journal_head(bh);
+}
+
+/*
+ * Management for journal control blocks: functions to create and
+ * destroy journal_t structures, and to initialise and read existing
+ * journal blocks from disk. */
+
+/* First: create and set up a journal_t object in memory. We initialise
+ * very few fields yet: that has to wait until we have created the
+ * journal structures from scratch, or loaded them from disk. */
+
+static journal_t * journal_init_common (void)
+{
+ journal_t *journal;
+ int err;
+
+ journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
+ if (!journal)
+ goto fail;
+ memset(journal, 0, sizeof(*journal));
+
+ init_waitqueue_head(&journal->j_wait_transaction_locked);
+ init_waitqueue_head(&journal->j_wait_logspace);
+ init_waitqueue_head(&journal->j_wait_done_commit);
+ init_waitqueue_head(&journal->j_wait_checkpoint);
+ init_waitqueue_head(&journal->j_wait_commit);
+ init_waitqueue_head(&journal->j_wait_updates);
+ mutex_init(&journal->j_barrier);
+ mutex_init(&journal->j_checkpoint_mutex);
+ spin_lock_init(&journal->j_revoke_lock);
+ spin_lock_init(&journal->j_list_lock);
+ spin_lock_init(&journal->j_state_lock);
+
+ journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
+
+ /* The journal is marked for error until we succeed with recovery! */
+ journal->j_flags = JBD2_ABORT;
+
+ /* Set up a default-sized revoke table for the new mount. */
+ err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+ if (err) {
+ kfree(journal);
+ goto fail;
+ }
+ return journal;
+fail:
+ return NULL;
+}
+
+/* jbd2_journal_init_dev and jbd2_journal_init_inode:
+ *
+ * Create a journal structure, assigning it a fixed set of disk blocks.
+ * We don't actually touch those disk blocks yet, but we
+ * need to set up all of the mapping information to tell the journaling
+ * system where the journal blocks are.
+ *
+ */
+
+/**
+ * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
+ * @bdev: Block device on which to create the journal
+ * @fs_dev: Device which holds the journalled filesystem for this journal.
+ * @start: Block nr at which the journal starts.
+ * @len: Length of the journal in blocks.
+ * @blocksize: blocksize of journalling device
+ * @returns: a newly created journal_t *
+ *
+ * jbd2_journal_init_dev creates a journal which maps a fixed contiguous
+ * range of blocks on an arbitrary block device.
+ *
+ */
+journal_t * jbd2_journal_init_dev(struct block_device *bdev,
+ struct block_device *fs_dev,
+ unsigned long long start, int len, int blocksize)
+{
+ journal_t *journal = journal_init_common();
+ struct buffer_head *bh;
+ int n;
+
+ if (!journal)
+ return NULL;
+
+ /* journal descriptor can store up to n blocks -bzzz */
+ journal->j_blocksize = blocksize;
+ n = journal->j_blocksize / sizeof(journal_block_tag_t);
+ journal->j_wbufsize = n;
+ journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+ if (!journal->j_wbuf) {
+ printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
+ __FUNCTION__);
+ kfree(journal);
+ journal = NULL;
+ goto out;
+ }
+ journal->j_dev = bdev;
+ journal->j_fs_dev = fs_dev;
+ journal->j_blk_offset = start;
+ journal->j_maxlen = len;
+
+ bh = __getblk(journal->j_dev, start, journal->j_blocksize);
+ J_ASSERT(bh != NULL);
+ journal->j_sb_buffer = bh;
+ journal->j_superblock = (journal_superblock_t *)bh->b_data;
+out:
+ return journal;
+}
+
+/**
+ * journal_t * jbd2_journal_init_inode () - creates a journal which maps to an inode.
+ * @inode: An inode to create the journal in
+ *
+ * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
+ * the journal. The inode must exist already, must support bmap() and
+ * must have all data blocks preallocated.
+ */
+journal_t * jbd2_journal_init_inode (struct inode *inode)
+{
+ struct buffer_head *bh;
+ journal_t *journal = journal_init_common();
+ int err;
+ int n;
+ unsigned long long blocknr;
+
+ if (!journal)
+ return NULL;
+
+ journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
+ journal->j_inode = inode;
+ jbd_debug(1,
+ "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+ journal, inode->i_sb->s_id, inode->i_ino,
+ (long long) inode->i_size,
+ inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+ journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ journal->j_blocksize = inode->i_sb->s_blocksize;
+
+ /* journal descriptor can store up to n blocks -bzzz */
+ n = journal->j_blocksize / sizeof(journal_block_tag_t);
+ journal->j_wbufsize = n;
+ journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+ if (!journal->j_wbuf) {
+ printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
+ __FUNCTION__);
+ kfree(journal);
+ return NULL;
+ }
+
+ err = jbd2_journal_bmap(journal, 0, &blocknr);
+ /* If that failed, give up */
+ if (err) {
+ printk(KERN_ERR "%s: Cannot locate journal superblock\n",
+ __FUNCTION__);
+ kfree(journal);
+ return NULL;
+ }
+
+ bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ J_ASSERT(bh != NULL);
+ journal->j_sb_buffer = bh;
+ journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+ return journal;
+}
+
+/*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+static void journal_fail_superblock (journal_t *journal)
+{
+ struct buffer_head *bh = journal->j_sb_buffer;
+ brelse(bh);
+ journal->j_sb_buffer = NULL;
+}
+
+/*
+ * Given a journal_t structure, initialise the various fields for
+ * startup of a new journaling session. We use this both when creating
+ * a journal, and after recovering an old journal to reset it for
+ * subsequent use.
+ */
+
+static int journal_reset(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+ unsigned long long first, last;
+
+ first = be32_to_cpu(sb->s_first);
+ last = be32_to_cpu(sb->s_maxlen);
+
+ journal->j_first = first;
+ journal->j_last = last;
+
+ journal->j_head = first;
+ journal->j_tail = first;
+ journal->j_free = last - first;
+
+ journal->j_tail_sequence = journal->j_transaction_sequence;
+ journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+ journal->j_commit_request = journal->j_commit_sequence;
+
+ journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+ /* Add the dynamic fields and write it to disk. */
+ jbd2_journal_update_superblock(journal, 1);
+ jbd2_journal_start_thread(journal);
+ return 0;
+}
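+
+/*
+ * Editor's note (worked example, not part of the original patch): a
+ * freshly created 32MB journal with 4KB blocks has s_first == 1 and
+ * s_maxlen == 8192, so journal_reset() leaves
+ *
+ *	j_free			  = 8192 - 1 = 8191
+ *	j_max_transaction_buffers = 8192 / 4 = 2048
+ *
+ * before rewriting the superblock and starting kjournald2.
+ */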
+
+/**
+ * int jbd2_journal_create() - Initialise the new journal file
+ * @journal: Journal to create. This structure must have been initialised
+ *
+ * Given a journal_t structure which tells us which disk blocks we can
+ * use, create a new journal superblock and initialise all of the
+ * journal fields from scratch.
+ **/
+int jbd2_journal_create(journal_t *journal)
+{
+ unsigned long long blocknr;
+ struct buffer_head *bh;
+ journal_superblock_t *sb;
+ int i, err;
+
+ if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
+ printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+ journal->j_maxlen);
+ journal_fail_superblock(journal);
+ return -EINVAL;
+ }
+
+ if (journal->j_inode == NULL) {
+ /*
+ * We don't know what block to start at!
+ */
+ printk(KERN_EMERG
+ "%s: creation of journal on external device!\n",
+ __FUNCTION__);
+ BUG();
+ }
+
+ /* Zero out the entire journal on disk. We cannot afford to
+ have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
+ jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+ for (i = 0; i < journal->j_maxlen; i++) {
+ err = jbd2_journal_bmap(journal, i, &blocknr);
+ if (err)
+ return err;
+ bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ lock_buffer(bh);
+ memset (bh->b_data, 0, journal->j_blocksize);
+ BUFFER_TRACE(bh, "marking dirty");
+ mark_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "marking uptodate");
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ __brelse(bh);
+ }
+
+ sync_blockdev(journal->j_dev);
+ jbd_debug(1, "JBD: journal cleared.\n");
+
+ /* OK, fill in the initial static fields in the new superblock */
+ sb = journal->j_superblock;
+
+ sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+ sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
+
+ sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
+ sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
+ sb->s_first = cpu_to_be32(1);
+
+ journal->j_transaction_sequence = 1;
+
+ journal->j_flags &= ~JBD2_ABORT;
+ journal->j_format_version = 2;
+
+ return journal_reset(journal);
+}
+
+/**
+ * void jbd2_journal_update_superblock() - Update journal sb on disk.
+ * @journal: The journal to update.
+ * @wait: Set to '0' if you don't want to wait for IO completion.
+ *
+ * Update a journal's dynamic superblock fields and write it to disk,
+ * optionally waiting for the IO to complete.
+ */
+void jbd2_journal_update_superblock(journal_t *journal, int wait)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+ struct buffer_head *bh = journal->j_sb_buffer;
+
+ /*
+ * As a special case, if the on-disk copy is already marked as needing
+ * no recovery (s_start == 0) and there are no outstanding transactions
+ * in the filesystem, then we can safely defer the superblock update
+ * until the next commit by setting JBD2_FLUSHED. This avoids
+ * attempting a write to a potentially read-only device.
+ */
+ if (sb->s_start == 0 && journal->j_tail_sequence ==
+ journal->j_transaction_sequence) {
+ jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
+ "(start %ld, seq %d, errno %d)\n",
+ journal->j_tail, journal->j_tail_sequence,
+ journal->j_errno);
+ goto out;
+ }
+
+ spin_lock(&journal->j_state_lock);
+ jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+
+ sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
+ sb->s_start = cpu_to_be32(journal->j_tail);
+ sb->s_errno = cpu_to_be32(journal->j_errno);
+ spin_unlock(&journal->j_state_lock);
+
+ BUFFER_TRACE(bh, "marking dirty");
+ mark_buffer_dirty(bh);
+ if (wait)
+ sync_dirty_buffer(bh);
+ else
+ ll_rw_block(SWRITE, 1, &bh);
+
+out:
+ /* If we have just flushed the log (by marking s_start==0), then
+ * any future commit will have to be careful to update the
+ * superblock again to re-record the true start of the log. */
+
+ spin_lock(&journal->j_state_lock);
+ if (sb->s_start)
+ journal->j_flags &= ~JBD2_FLUSHED;
+ else
+ journal->j_flags |= JBD2_FLUSHED;
+ spin_unlock(&journal->j_state_lock);
+}
+
+/*
+ * Read the superblock for a given journal, performing initial
+ * validation of the format.
+ */
+
+static int journal_get_superblock(journal_t *journal)
+{
+ struct buffer_head *bh;
+ journal_superblock_t *sb;
+ int err = -EIO;
+
+ bh = journal->j_sb_buffer;
+
+ J_ASSERT(bh != NULL);
+ if (!buffer_uptodate(bh)) {
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ printk (KERN_ERR
+ "JBD: IO error reading journal superblock\n");
+ goto out;
+ }
+ }
+
+ sb = journal->j_superblock;
+
+ err = -EINVAL;
+
+ if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
+ sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
+ printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+ goto out;
+ }
+
+ switch(be32_to_cpu(sb->s_header.h_blocktype)) {
+ case JBD2_SUPERBLOCK_V1:
+ journal->j_format_version = 1;
+ break;
+ case JBD2_SUPERBLOCK_V2:
+ journal->j_format_version = 2;
+ break;
+ default:
+ printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+ goto out;
+ }
+
+ if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
+ journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
+ else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
+ printk (KERN_WARNING "JBD: journal file too short\n");
+ goto out;
+ }
+
+ return 0;
+
+out:
+ journal_fail_superblock(journal);
+ return err;
+}
+
+/*
+ * Load the on-disk journal superblock and read the key fields into the
+ * journal_t.
+ */
+
+static int load_superblock(journal_t *journal)
+{
+ int err;
+ journal_superblock_t *sb;
+
+ err = journal_get_superblock(journal);
+ if (err)
+ return err;
+
+ sb = journal->j_superblock;
+
+ journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
+ journal->j_tail = be32_to_cpu(sb->s_start);
+ journal->j_first = be32_to_cpu(sb->s_first);
+ journal->j_last = be32_to_cpu(sb->s_maxlen);
+ journal->j_errno = be32_to_cpu(sb->s_errno);
+
+ return 0;
+}
+
+
+/**
+ * int jbd2_journal_load() - Read journal from disk.
+ * @journal: Journal to act on.
+ *
+ * Given a journal_t structure which tells us which disk blocks contain
+ * a journal, read the journal from disk to initialise the in-memory
+ * structures.
+ */
+int jbd2_journal_load(journal_t *journal)
+{
+ int err;
+ journal_superblock_t *sb;
+
+ err = load_superblock(journal);
+ if (err)
+ return err;
+
+ sb = journal->j_superblock;
+ /* If this is a V2 superblock, then we have to check the
+ * features flags on it. */
+
+ if (journal->j_format_version >= 2) {
+ if ((sb->s_feature_ro_compat &
+ ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
+ (sb->s_feature_incompat &
+ ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
+ printk (KERN_WARNING
+ "JBD: Unrecognised features on journal\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Create a slab for this blocksize
+ */
+ err = jbd2_journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
+ if (err)
+ return err;
+
+ /* Let the recovery code check whether it needs to recover any
+ * data from the journal. */
+ if (jbd2_journal_recover(journal))
+ goto recovery_error;
+
+ /* OK, we've finished with the dynamic journal bits:
+ * reinitialise the dynamic contents of the superblock in memory
+ * and reset them on disk. */
+ if (journal_reset(journal))
+ goto recovery_error;
+
+ journal->j_flags &= ~JBD2_ABORT;
+ journal->j_flags |= JBD2_LOADED;
+ return 0;
+
+recovery_error:
+ printk (KERN_WARNING "JBD: recovery failed\n");
+ return -EIO;
+}
+
+/**
+ * void jbd2_journal_destroy() - Release a journal_t structure.
+ * @journal: Journal to act on.
+ *
+ * Release a journal_t structure once it is no longer in use by the
+ * journaled object.
+ */
+void jbd2_journal_destroy(journal_t *journal)
+{
+ /* Wait for the commit thread to wake up and die. */
+ journal_kill_thread(journal);
+
+ /* Force a final log commit */
+ if (journal->j_running_transaction)
+ jbd2_journal_commit_transaction(journal);
+
+ /* Force any old transactions to disk */
+
+ /* Totally anal locking here... */
+ spin_lock(&journal->j_list_lock);
+ while (journal->j_checkpoint_transactions != NULL) {
+ spin_unlock(&journal->j_list_lock);
+ jbd2_log_do_checkpoint(journal);
+ spin_lock(&journal->j_list_lock);
+ }
+
+ J_ASSERT(journal->j_running_transaction == NULL);
+ J_ASSERT(journal->j_committing_transaction == NULL);
+ J_ASSERT(journal->j_checkpoint_transactions == NULL);
+ spin_unlock(&journal->j_list_lock);
+
+ /* We can now mark the journal as empty. */
+ journal->j_tail = 0;
+ journal->j_tail_sequence = ++journal->j_transaction_sequence;
+ if (journal->j_sb_buffer) {
+ jbd2_journal_update_superblock(journal, 1);
+ brelse(journal->j_sb_buffer);
+ }
+
+ if (journal->j_inode)
+ iput(journal->j_inode);
+ if (journal->j_revoke)
+ jbd2_journal_destroy_revoke(journal);
+ kfree(journal->j_wbuf);
+ kfree(journal);
+}
+
+
+/**
+ *int jbd2_journal_check_used_features () - Check if features specified are used.
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journal uses all of a given set of
+ * features. Return true (non-zero) if it does.
+ **/
+
+int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+ unsigned long ro, unsigned long incompat)
+{
+ journal_superblock_t *sb;
+
+ if (!compat && !ro && !incompat)
+ return 1;
+ if (journal->j_format_version == 1)
+ return 0;
+
+ sb = journal->j_superblock;
+
+ if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
+ ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
+ ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * int jbd2_journal_check_available_features() - Check feature set in journalling layer
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journaling code supports the use of
+ * all of a given set of features on this journal. Return true
+ * (non-zero) if it can. */
+
+int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
+ unsigned long ro, unsigned long incompat)
+{
+ journal_superblock_t *sb;
+
+ if (!compat && !ro && !incompat)
+ return 1;
+
+ sb = journal->j_superblock;
+
+ /* We can support any known requested features iff the
+ * superblock is in version 2. Otherwise we fail to support any
+ * extended sb features. */
+
+ if (journal->j_format_version != 2)
+ return 0;
+
+ if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
+ (ro & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
+ (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
+ return 1;
+
+ return 0;
+}
+
+/**
+ * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Mark a given journal feature as present on the
+ * superblock. Returns true if the requested features could be set.
+ *
+ */
+
+int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+ unsigned long ro, unsigned long incompat)
+{
+ journal_superblock_t *sb;
+
+ if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
+ return 1;
+
+ if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
+ return 0;
+
+ jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+ compat, ro, incompat);
+
+ sb = journal->j_superblock;
+
+ sb->s_feature_compat |= cpu_to_be32(compat);
+ sb->s_feature_ro_compat |= cpu_to_be32(ro);
+ sb->s_feature_incompat |= cpu_to_be32(incompat);
+
+ return 1;
+}
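+
+/*
+ * Illustrative sketch (the caller shown is an assumption, not part of
+ * this patch): a filesystem wanting 64-bit block numbers in its journal
+ * could combine the two helpers above after loading the journal:
+ *
+ *	if (!jbd2_journal_check_used_features(journal, 0, 0,
+ *					JBD2_FEATURE_INCOMPAT_64BIT))
+ *		jbd2_journal_set_features(journal, 0, 0,
+ *					JBD2_FEATURE_INCOMPAT_64BIT);
+ */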
+
+
+/**
+ * int jbd2_journal_update_format () - Update on-disk journal structure.
+ * @journal: Journal to act on.
+ *
+ * Given an initialised but unloaded journal struct, poke about in the
+ * on-disk structure to update it to the most recent supported version.
+ */
+int jbd2_journal_update_format (journal_t *journal)
+{
+ journal_superblock_t *sb;
+ int err;
+
+ err = journal_get_superblock(journal);
+ if (err)
+ return err;
+
+ sb = journal->j_superblock;
+
+ switch (be32_to_cpu(sb->s_header.h_blocktype)) {
+ case JBD2_SUPERBLOCK_V2:
+ return 0;
+ case JBD2_SUPERBLOCK_V1:
+ return journal_convert_superblock_v1(journal, sb);
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+static int journal_convert_superblock_v1(journal_t *journal,
+ journal_superblock_t *sb)
+{
+ int offset, blocksize;
+ struct buffer_head *bh;
+
+ printk(KERN_WARNING
+ "JBD: Converting superblock from version 1 to 2.\n");
+
+ /* Pre-initialise new fields to zero */
+ offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
+ blocksize = be32_to_cpu(sb->s_blocksize);
+ memset(&sb->s_feature_compat, 0, blocksize-offset);
+
+ sb->s_nr_users = cpu_to_be32(1);
+ sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
+ journal->j_format_version = 2;
+
+ bh = journal->j_sb_buffer;
+ BUFFER_TRACE(bh, "marking dirty");
+ mark_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ return 0;
+}
+
+
+/**
+ * int jbd2_journal_flush () - Flush journal
+ * @journal: Journal to act on.
+ *
+ * Flush all data for a given journal to disk and empty the journal.
+ * Filesystems can use this when remounting readonly to ensure that
+ * recovery does not need to happen on remount.
+ */
+
+int jbd2_journal_flush(journal_t *journal)
+{
+ int err = 0;
+ transaction_t *transaction = NULL;
+ unsigned long old_tail;
+
+ spin_lock(&journal->j_state_lock);
+
+ /* Force everything buffered to the log... */
+ if (journal->j_running_transaction) {
+ transaction = journal->j_running_transaction;
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ } else if (journal->j_committing_transaction)
+ transaction = journal->j_committing_transaction;
+
+ /* Wait for the log commit to complete... */
+ if (transaction) {
+ tid_t tid = transaction->t_tid;
+
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+ } else {
+ spin_unlock(&journal->j_state_lock);
+ }
+
+ /* ...and flush everything in the log out to disk. */
+ spin_lock(&journal->j_list_lock);
+ while (!err && journal->j_checkpoint_transactions != NULL) {
+ spin_unlock(&journal->j_list_lock);
+ err = jbd2_log_do_checkpoint(journal);
+ spin_lock(&journal->j_list_lock);
+ }
+ spin_unlock(&journal->j_list_lock);
+ jbd2_cleanup_journal_tail(journal);
+
+ /* Finally, mark the journal as really needing no recovery.
+ * This sets s_start==0 in the underlying superblock, which is
+ * the magic code for a fully-recovered superblock. Any future
+ * commits of data to the journal will restore the current
+ * s_start value. */
+ spin_lock(&journal->j_state_lock);
+ old_tail = journal->j_tail;
+ journal->j_tail = 0;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_journal_update_superblock(journal, 1);
+ spin_lock(&journal->j_state_lock);
+ journal->j_tail = old_tail;
+
+ J_ASSERT(!journal->j_running_transaction);
+ J_ASSERT(!journal->j_committing_transaction);
+ J_ASSERT(!journal->j_checkpoint_transactions);
+ J_ASSERT(journal->j_head == journal->j_tail);
+ J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
+ spin_unlock(&journal->j_state_lock);
+ return err;
+}
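+
+/*
+ * Illustrative sketch (assumed caller, not added by this patch): a
+ * filesystem remounting read-only would typically quiesce updates and
+ * then flush, so that no recovery is needed on the next mount:
+ *
+ *	jbd2_journal_lock_updates(journal);
+ *	err = jbd2_journal_flush(journal);
+ *	jbd2_journal_unlock_updates(journal);
+ */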
+
+/**
+ * int jbd2_journal_wipe() - Wipe journal contents
+ * @journal: Journal to act on.
+ * @write: flag (see below)
+ *
+ * Wipe out all of the contents of a journal, safely. This will produce
+ * a warning if the journal contains any valid recovery information.
+ * Must be called between journal_init_*() and jbd2_journal_load().
+ *
+ * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
+ * we merely suppress recovery.
+ */
+
+int jbd2_journal_wipe(journal_t *journal, int write)
+{
+ journal_superblock_t *sb;
+ int err = 0;
+
+ J_ASSERT (!(journal->j_flags & JBD2_LOADED));
+
+ err = load_superblock(journal);
+ if (err)
+ return err;
+
+ sb = journal->j_superblock;
+
+ if (!journal->j_tail)
+ goto no_recovery;
+
+ printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+ write ? "Clearing" : "Ignoring");
+
+ err = jbd2_journal_skip_recovery(journal);
+ if (write)
+ jbd2_journal_update_superblock(journal, 1);
+
+ no_recovery:
+ return err;
+}
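+
+/*
+ * Illustrative sketch (assumed caller): a filesystem that knows the
+ * journal contents are stale could wipe them before loading; the init
+ * helper named here is an assumption about the caller:
+ *
+ *	journal = jbd2_journal_init_inode(inode);
+ *	err = jbd2_journal_wipe(journal, 1);
+ *	if (!err)
+ *		err = jbd2_journal_load(journal);
+ */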
+
+/*
+ * journal_dev_name: format a character string to describe on what
+ * device this journal is present.
+ */
+
+static const char *journal_dev_name(journal_t *journal, char *buffer)
+{
+ struct block_device *bdev;
+
+ if (journal->j_inode)
+ bdev = journal->j_inode->i_sb->s_bdev;
+ else
+ bdev = journal->j_dev;
+
+ return bdevname(bdev, buffer);
+}
+
+/*
+ * Journal abort has very specific semantics, which we describe
+ * in the documentation for jbd2_journal_abort() below.
+ *
+ * Two internal functions, which provide abort to the jbd layer
+ * itself, are here.
+ */
+
+/*
+ * Quick version for internal journal use (doesn't lock the journal).
+ * Aborts hard --- we mark the abort as having occurred, but do _nothing_ else,
+ * and don't attempt to make any other journal updates.
+ */
+void __jbd2_journal_abort_hard(journal_t *journal)
+{
+ transaction_t *transaction;
+ char b[BDEVNAME_SIZE];
+
+ if (journal->j_flags & JBD2_ABORT)
+ return;
+
+ printk(KERN_ERR "Aborting journal on device %s.\n",
+ journal_dev_name(journal, b));
+
+ spin_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_ABORT;
+ transaction = journal->j_running_transaction;
+ if (transaction)
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ spin_unlock(&journal->j_state_lock);
+}
+
+/* Soft abort: record the abort error status in the journal superblock,
+ * but don't do any other IO. */
+static void __journal_abort_soft (journal_t *journal, int errno)
+{
+ if (journal->j_flags & JBD2_ABORT)
+ return;
+
+ if (!journal->j_errno)
+ journal->j_errno = errno;
+
+ __jbd2_journal_abort_hard(journal);
+
+ if (errno)
+ jbd2_journal_update_superblock(journal, 1);
+}
+
+/**
+ * void jbd2_journal_abort () - Shutdown the journal immediately.
+ * @journal: the journal to shutdown.
+ * @errno: an error number to record in the journal indicating
+ * the reason for the shutdown.
+ *
+ * Perform a complete, immediate shutdown of the ENTIRE
+ * journal (not of a single transaction). This operation cannot be
+ * undone without closing and reopening the journal.
+ *
+ * The jbd2_journal_abort function is intended to support higher level error
+ * recovery mechanisms such as the ext2/ext3 remount-readonly error
+ * mode.
+ *
+ * Journal abort has very specific semantics. Any existing dirty,
+ * unjournaled buffers in the main filesystem will still be written to
+ * disk by bdflush, but the journaling mechanism will be suspended
+ * immediately and no further transaction commits will be honoured.
+ *
+ * Any dirty, journaled buffers will be written back to disk without
+ * hitting the journal. Atomicity cannot be guaranteed on an aborted
+ * filesystem, but we _do_ attempt to leave as much data as possible
+ * behind for fsck to use for cleanup.
+ *
+ * Any attempt to get a new transaction handle on a journal which is in
+ * ABORT state will just result in an -EROFS error return. A
+ * jbd2_journal_stop on an existing handle will return -EIO if we have
+ * entered abort state during the update.
+ *
+ * Recursive transactions are not disturbed by journal abort until the
+ * final jbd2_journal_stop, which will receive the -EIO error.
+ *
+ * Finally, the jbd2_journal_abort call allows the caller to supply an errno
+ * which will be recorded (if possible) in the journal superblock. This
+ * allows a client to record failure conditions in the middle of a
+ * transaction without having to complete the transaction to record the
+ * failure to disk. ext3_error, for example, now uses this
+ * functionality.
+ *
+ * Errors which originate from within the journaling layer will NOT
+ * supply an errno; a null errno implies that absolutely no further
+ * writes are done to the journal (unless there are any already in
+ * progress).
+ *
+ */
+
+void jbd2_journal_abort(journal_t *journal, int errno)
+{
+ __journal_abort_soft(journal, errno);
+}
+
+/**
+ * int jbd2_journal_errno () - returns the journal's error state.
+ * @journal: journal to examine.
+ *
+ * This is the errno number set with jbd2_journal_abort(), the last
+ * time the journal was mounted - if the journal was stopped
+ * without calling abort this will be 0.
+ *
+ * If the journal has been aborted during this mount, -EROFS will
+ * be returned.
+ */
+int jbd2_journal_errno(journal_t *journal)
+{
+ int err;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_flags & JBD2_ABORT)
+ err = -EROFS;
+ else
+ err = journal->j_errno;
+ spin_unlock(&journal->j_state_lock);
+ return err;
+}
+
+/**
+ * int jbd2_journal_clear_err () - clears the journal's error state
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or Acked to take a FS out of readonly
+ * mode.
+ */
+int jbd2_journal_clear_err(journal_t *journal)
+{
+ int err = 0;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_flags & JBD2_ABORT)
+ err = -EROFS;
+ else
+ journal->j_errno = 0;
+ spin_unlock(&journal->j_state_lock);
+ return err;
+}
+
+/**
+ * void jbd2_journal_ack_err() - Ack journal err.
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or Acked to take a FS out of readonly
+ * mode.
+ */
+void jbd2_journal_ack_err(journal_t *journal)
+{
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_errno)
+ journal->j_flags |= JBD2_ACK_ERR;
+ spin_unlock(&journal->j_state_lock);
+}
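+
+/*
+ * Illustrative sketch of the error interface above (assumed caller): a
+ * client leaving read-only mode after repair would first query and then
+ * clear the recorded error; jbd2_journal_ack_err() is the alternative
+ * when the error should stay recorded but be acknowledged:
+ *
+ *	err = jbd2_journal_errno(journal);
+ *	if (err)
+ *		err = jbd2_journal_clear_err(journal);
+ */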
+
+int jbd2_journal_blocks_per_page(struct inode *inode)
+{
+ return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+}
+
+/*
+ * helper functions to deal with 32 or 64bit block numbers.
+ */
+size_t journal_tag_bytes(journal_t *journal)
+{
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ return JBD_TAG_SIZE64;
+ else
+ return JBD_TAG_SIZE32;
+}
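+
+/*
+ * Note: with JBD2_FEATURE_INCOMPAT_64BIT set, each on-disk tag also
+ * carries t_blocknr_high; read_tag_block() in recovery.c, added later
+ * in this patch, reassembles the full 64-bit block number from it.
+ */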
+
+/*
+ * Simple support for retrying memory allocations. Introduced to help to
+ * debug different VM deadlock avoidance strategies.
+ */
+void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
+{
+ return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
+}
+
+/*
+ * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
+ * and allocate frozen and commit buffers from these slabs.
+ *
+ * The reason for doing this is to avoid SLAB_DEBUG, since it could
+ * cause a bh to cross a page boundary.
+ */
+
+#define JBD_MAX_SLABS 5
+#define JBD_SLAB_INDEX(size) (size >> 11)
+
+static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+static const char *jbd_slab_names[JBD_MAX_SLABS] = {
+ "jbd2_1k", "jbd2_2k", "jbd2_4k", NULL, "jbd2_8k"
+};
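+
+/*
+ * JBD_SLAB_INDEX maps a blocksize to a slot: 1k -> 0, 2k -> 1, 4k -> 2
+ * and 8k -> 4.  Index 3 is unreachable for a power-of-two blocksize,
+ * hence the NULL name above.
+ */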
+
+static void jbd2_journal_destroy_jbd_slabs(void)
+{
+ int i;
+
+ for (i = 0; i < JBD_MAX_SLABS; i++) {
+ if (jbd_slab[i])
+ kmem_cache_destroy(jbd_slab[i]);
+ jbd_slab[i] = NULL;
+ }
+}
+
+static int jbd2_journal_create_jbd_slab(size_t slab_size)
+{
+ int i = JBD_SLAB_INDEX(slab_size);
+
+ BUG_ON(i >= JBD_MAX_SLABS);
+
+ /*
+ * Check if we already have a slab created for this size
+ */
+ if (jbd_slab[i])
+ return 0;
+
+ /*
+ * Create a slab and force alignment to be same as slabsize -
+ * this will make sure that allocations won't cross the page
+ * boundary.
+ */
+ jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
+ slab_size, slab_size, 0, NULL, NULL);
+ if (!jbd_slab[i]) {
+ printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void * jbd2_slab_alloc(size_t size, gfp_t flags)
+{
+ int idx;
+
+ idx = JBD_SLAB_INDEX(size);
+ BUG_ON(jbd_slab[idx] == NULL);
+ return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
+}
+
+void jbd2_slab_free(void *ptr, size_t size)
+{
+ int idx;
+
+ idx = JBD_SLAB_INDEX(size);
+ BUG_ON(jbd_slab[idx] == NULL);
+ kmem_cache_free(jbd_slab[idx], ptr);
+}
+
+/*
+ * Journal_head storage management
+ */
+static kmem_cache_t *jbd2_journal_head_cache;
+#ifdef CONFIG_JBD_DEBUG
+static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+#endif
+
+static int journal_init_jbd2_journal_head_cache(void)
+{
+ int retval;
+
+ J_ASSERT(jbd2_journal_head_cache == 0);
+ jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
+ sizeof(struct journal_head),
+ 0, /* offset */
+ 0, /* flags */
+ NULL, /* ctor */
+ NULL); /* dtor */
+ retval = 0;
+ if (jbd2_journal_head_cache == 0) {
+ retval = -ENOMEM;
+ printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+ }
+ return retval;
+}
+
+static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
+{
+ J_ASSERT(jbd2_journal_head_cache != NULL);
+ kmem_cache_destroy(jbd2_journal_head_cache);
+ jbd2_journal_head_cache = NULL;
+}
+
+/*
+ * journal_head splicing and dicing
+ */
+static struct journal_head *journal_alloc_journal_head(void)
+{
+ struct journal_head *ret;
+ static unsigned long last_warning;
+
+#ifdef CONFIG_JBD_DEBUG
+ atomic_inc(&nr_journal_heads);
+#endif
+ ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ if (ret == 0) {
+ jbd_debug(1, "out of memory for journal_head\n");
+ if (time_after(jiffies, last_warning + 5*HZ)) {
+ printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
+ __FUNCTION__);
+ last_warning = jiffies;
+ }
+ while (ret == 0) {
+ yield();
+ ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ }
+ }
+ return ret;
+}
+
+static void journal_free_journal_head(struct journal_head *jh)
+{
+#ifdef CONFIG_JBD_DEBUG
+ atomic_dec(&nr_journal_heads);
+ memset(jh, JBD_POISON_FREE, sizeof(*jh));
+#endif
+ kmem_cache_free(jbd2_journal_head_cache, jh);
+}
+
+/*
+ * A journal_head is attached to a buffer_head whenever JBD has an
+ * interest in the buffer.
+ *
+ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+ * is set. This bit is tested in core kernel code where we need to take
+ * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
+ * there.
+ *
+ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+ *
+ * When a buffer has its BH_JBD bit set it is immune from being released by
+ * core kernel code, mainly via ->b_count.
+ *
+ * A journal_head may be detached from its buffer_head when the journal_head's
+ * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
+ * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
+ * journal_head can be dropped if needed.
+ *
+ * Various places in the kernel want to attach a journal_head to a buffer_head
+ * _before_ attaching the journal_head to a transaction. To protect the
+ * journal_head in this situation, jbd2_journal_add_journal_head elevates the
+ * journal_head's b_jcount refcount by one. The caller must call
+ * jbd2_journal_put_journal_head() to undo this.
+ *
+ * So the typical usage would be:
+ *
+ * (Attach a journal_head if needed. Increments b_jcount)
+ * struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ * ...
+ * jh->b_transaction = xxx;
+ * jbd2_journal_put_journal_head(jh);
+ *
+ * Now, the journal_head's b_jcount is zero, but it is safe from being released
+ * because it has a non-zero b_transaction.
+ */
+
+/*
+ * Give a buffer_head a journal_head.
+ *
+ * Doesn't need the journal lock.
+ * May sleep.
+ */
+struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
+{
+ struct journal_head *jh;
+ struct journal_head *new_jh = NULL;
+
+repeat:
+ if (!buffer_jbd(bh)) {
+ new_jh = journal_alloc_journal_head();
+ memset(new_jh, 0, sizeof(*new_jh));
+ }
+
+ jbd_lock_bh_journal_head(bh);
+ if (buffer_jbd(bh)) {
+ jh = bh2jh(bh);
+ } else {
+ J_ASSERT_BH(bh,
+ (atomic_read(&bh->b_count) > 0) ||
+ (bh->b_page && bh->b_page->mapping));
+
+ if (!new_jh) {
+ jbd_unlock_bh_journal_head(bh);
+ goto repeat;
+ }
+
+ jh = new_jh;
+ new_jh = NULL; /* We consumed it */
+ set_buffer_jbd(bh);
+ bh->b_private = jh;
+ jh->b_bh = bh;
+ get_bh(bh);
+ BUFFER_TRACE(bh, "added journal_head");
+ }
+ jh->b_jcount++;
+ jbd_unlock_bh_journal_head(bh);
+ if (new_jh)
+ journal_free_journal_head(new_jh);
+ return bh->b_private;
+}
+
+/*
+ * Grab a ref against this buffer_head's journal_head. If it ended up not
+ * having a journal_head, return NULL
+ */
+struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
+{
+ struct journal_head *jh = NULL;
+
+ jbd_lock_bh_journal_head(bh);
+ if (buffer_jbd(bh)) {
+ jh = bh2jh(bh);
+ jh->b_jcount++;
+ }
+ jbd_unlock_bh_journal_head(bh);
+ return jh;
+}
+
+static void __journal_remove_journal_head(struct buffer_head *bh)
+{
+ struct journal_head *jh = bh2jh(bh);
+
+ J_ASSERT_JH(jh, jh->b_jcount >= 0);
+
+ get_bh(bh);
+ if (jh->b_jcount == 0) {
+ if (jh->b_transaction == NULL &&
+ jh->b_next_transaction == NULL &&
+ jh->b_cp_transaction == NULL) {
+ J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+ J_ASSERT_BH(bh, buffer_jbd(bh));
+ J_ASSERT_BH(bh, jh2bh(jh) == bh);
+ BUFFER_TRACE(bh, "remove journal_head");
+ if (jh->b_frozen_data) {
+ printk(KERN_WARNING "%s: freeing "
+ "b_frozen_data\n",
+ __FUNCTION__);
+ jbd2_slab_free(jh->b_frozen_data, bh->b_size);
+ }
+ if (jh->b_committed_data) {
+ printk(KERN_WARNING "%s: freeing "
+ "b_committed_data\n",
+ __FUNCTION__);
+ jbd2_slab_free(jh->b_committed_data, bh->b_size);
+ }
+ bh->b_private = NULL;
+ jh->b_bh = NULL; /* debug, really */
+ clear_buffer_jbd(bh);
+ __brelse(bh);
+ journal_free_journal_head(jh);
+ } else {
+ BUFFER_TRACE(bh, "journal_head was locked");
+ }
+ }
+}
+
+/*
+ * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
+ * and has a zero b_jcount then remove and release its journal_head. If we did
+ * see that the buffer is not used by any transaction we also "logically"
+ * decrement ->b_count.
+ *
+ * We in fact take an additional increment on ->b_count as a convenience,
+ * because the caller usually wants to do additional things with the bh
+ * after calling here.
+ * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
+ * time. Once the caller has run __brelse(), the buffer is eligible for
+ * reaping by try_to_free_buffers().
+ */
+void jbd2_journal_remove_journal_head(struct buffer_head *bh)
+{
+ jbd_lock_bh_journal_head(bh);
+ __journal_remove_journal_head(bh);
+ jbd_unlock_bh_journal_head(bh);
+}
+
+/*
+ * Drop a reference on the passed journal_head. If it fell to zero then try to
+ * release the journal_head from the buffer_head.
+ */
+void jbd2_journal_put_journal_head(struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ jbd_lock_bh_journal_head(bh);
+ J_ASSERT_JH(jh, jh->b_jcount > 0);
+ --jh->b_jcount;
+ if (!jh->b_jcount && !jh->b_transaction) {
+ __journal_remove_journal_head(bh);
+ __brelse(bh);
+ }
+ jbd_unlock_bh_journal_head(bh);
+}
+
+/*
+ * /proc tunables
+ */
+#if defined(CONFIG_JBD_DEBUG)
+int jbd2_journal_enable_debug;
+EXPORT_SYMBOL(jbd2_journal_enable_debug);
+#endif
+
+#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
+
+static struct proc_dir_entry *proc_jbd_debug;
+
+static int read_jbd_debug(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int ret;
+
+ ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug);
+ *eof = 1;
+ return ret;
+}
+
+static int write_jbd_debug(struct file *file, const char __user *buffer,
+ unsigned long count, void *data)
+{
+ char buf[32];
+
+ if (count > ARRAY_SIZE(buf) - 1)
+ count = ARRAY_SIZE(buf) - 1;
+ if (copy_from_user(buf, buffer, count))
+ return -EFAULT;
+ buf[ARRAY_SIZE(buf) - 1] = '\0';
+ jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
+ return count;
+}
+
+#define JBD_PROC_NAME "sys/fs/jbd2-debug"
+
+static void __init create_jbd_proc_entry(void)
+{
+ proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
+ if (proc_jbd_debug) {
+ /* Why is this so hard? */
+ proc_jbd_debug->read_proc = read_jbd_debug;
+ proc_jbd_debug->write_proc = write_jbd_debug;
+ }
+}
+
+static void __exit jbd2_remove_jbd_proc_entry(void)
+{
+ if (proc_jbd_debug)
+ remove_proc_entry(JBD_PROC_NAME, NULL);
+}
+
+#else
+
+#define create_jbd_proc_entry() do {} while (0)
+#define jbd2_remove_jbd_proc_entry() do {} while (0)
+
+#endif
+
+kmem_cache_t *jbd2_handle_cache;
+
+static int __init journal_init_handle_cache(void)
+{
+ jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
+ sizeof(handle_t),
+ 0, /* offset */
+ 0, /* flags */
+ NULL, /* ctor */
+ NULL); /* dtor */
+ if (jbd2_handle_cache == NULL) {
+ printk(KERN_EMERG "JBD: failed to create handle cache\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void jbd2_journal_destroy_handle_cache(void)
+{
+ if (jbd2_handle_cache)
+ kmem_cache_destroy(jbd2_handle_cache);
+}
+
+/*
+ * Module startup and shutdown
+ */
+
+static int __init journal_init_caches(void)
+{
+ int ret;
+
+ ret = jbd2_journal_init_revoke_caches();
+ if (ret == 0)
+ ret = journal_init_jbd2_journal_head_cache();
+ if (ret == 0)
+ ret = journal_init_handle_cache();
+ return ret;
+}
+
+static void jbd2_journal_destroy_caches(void)
+{
+ jbd2_journal_destroy_revoke_caches();
+ jbd2_journal_destroy_jbd2_journal_head_cache();
+ jbd2_journal_destroy_handle_cache();
+ jbd2_journal_destroy_jbd_slabs();
+}
+
+static int __init journal_init(void)
+{
+ int ret;
+
+ BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
+
+ ret = journal_init_caches();
+ if (ret != 0)
+ jbd2_journal_destroy_caches();
+ create_jbd_proc_entry();
+ return ret;
+}
+
+static void __exit journal_exit(void)
+{
+#ifdef CONFIG_JBD_DEBUG
+ int n = atomic_read(&nr_journal_heads);
+ if (n)
+ printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+#endif
+ jbd2_remove_jbd_proc_entry();
+ jbd2_journal_destroy_caches();
+}
+
+MODULE_LICENSE("GPL");
+module_init(journal_init);
+module_exit(journal_exit);
+
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
new file mode 100644
index 000000000000..9f10acafaf70
--- /dev/null
+++ b/fs/jbd2/recovery.c
@@ -0,0 +1,609 @@
+/*
+ * linux/fs/recovery.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal recovery routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#endif
+
+/*
+ * Maintain information about the progress of the recovery job, so that
+ * the different passes can carry information between them.
+ */
+struct recovery_info
+{
+ tid_t start_transaction;
+ tid_t end_transaction;
+
+ int nr_replays;
+ int nr_revokes;
+ int nr_revoke_hits;
+};
+
+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+static int do_one_pass(journal_t *journal,
+ struct recovery_info *info, enum passtype pass);
+static int scan_revoke_records(journal_t *, struct buffer_head *,
+ tid_t, struct recovery_info *);
+
+#ifdef __KERNEL__
+
+/* Release readahead buffers after use */
+static void journal_brelse_array(struct buffer_head *b[], int n)
+{
+ while (--n >= 0)
+ brelse (b[n]);
+}
+
+
+/*
+ * When reading from the journal, we are going through the block device
+ * layer directly and so there is no readahead being done for us. We
+ * need to implement any readahead ourselves if we want it to happen at
+ * all. Recovery is basically one long sequential read, so make sure we
+ * do the IO in reasonably large chunks.
+ *
+ * This is not so critical that we need to be enormously clever about
+ * the readahead size, though. 128K is a purely arbitrary, good-enough
+ * fixed value.
+ */
+
+#define MAXBUF 8
+static int do_readahead(journal_t *journal, unsigned int start)
+{
+ int err;
+ unsigned int max, nbufs, next;
+ unsigned long long blocknr;
+ struct buffer_head *bh;
+
+ struct buffer_head * bufs[MAXBUF];
+
+ /* Do up to 128K of readahead */
+ max = start + (128 * 1024 / journal->j_blocksize);
+ if (max > journal->j_maxlen)
+ max = journal->j_maxlen;
+
+ /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
+ * a time to the block device IO layer. */
+
+ nbufs = 0;
+
+ for (next = start; next < max; next++) {
+ err = jbd2_journal_bmap(journal, next, &blocknr);
+
+ if (err) {
+ printk (KERN_ERR "JBD: bad block at offset %u\n",
+ next);
+ goto failed;
+ }
+
+ bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ if (!bh) {
+ err = -ENOMEM;
+ goto failed;
+ }
+
+ if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+ bufs[nbufs++] = bh;
+ if (nbufs == MAXBUF) {
+ ll_rw_block(READ, nbufs, bufs);
+ journal_brelse_array(bufs, nbufs);
+ nbufs = 0;
+ }
+ } else
+ brelse(bh);
+ }
+
+ if (nbufs)
+ ll_rw_block(READ, nbufs, bufs);
+ err = 0;
+
+failed:
+ if (nbufs)
+ journal_brelse_array(bufs, nbufs);
+ return err;
+}
+
+#endif /* __KERNEL__ */
+
+
+/*
+ * Read a block from the journal
+ */
+
+static int jread(struct buffer_head **bhp, journal_t *journal,
+ unsigned int offset)
+{
+ int err;
+ unsigned long long blocknr;
+ struct buffer_head *bh;
+
+ *bhp = NULL;
+
+ if (offset >= journal->j_maxlen) {
+ printk(KERN_ERR "JBD: corrupted journal superblock\n");
+ return -EIO;
+ }
+
+ err = jbd2_journal_bmap(journal, offset, &blocknr);
+
+ if (err) {
+ printk (KERN_ERR "JBD: bad block at offset %u\n",
+ offset);
+ return err;
+ }
+
+ bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ if (!bh)
+ return -ENOMEM;
+
+ if (!buffer_uptodate(bh)) {
+ /* If this is a brand new buffer, start readahead.
+ Otherwise, we assume we are already reading it. */
+ if (!buffer_req(bh))
+ do_readahead(journal, offset);
+ wait_on_buffer(bh);
+ }
+
+ if (!buffer_uptodate(bh)) {
+ printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+ offset);
+ brelse(bh);
+ return -EIO;
+ }
+
+ *bhp = bh;
+ return 0;
+}
+
+
+/*
+ * Count the number of in-use tags in a journal descriptor block.
+ */
+
+static int count_tags(journal_t *journal, struct buffer_head *bh)
+{
+ char * tagp;
+ journal_block_tag_t * tag;
+ int nr = 0, size = journal->j_blocksize;
+ int tag_bytes = journal_tag_bytes(journal);
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+
+ while ((tagp - bh->b_data + tag_bytes) <= size) {
+ tag = (journal_block_tag_t *) tagp;
+
+ nr++;
+ tagp += tag_bytes;
+ if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
+ tagp += 16;
+
+ if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
+ break;
+ }
+
+ return nr;
+}
+
+
+/* Make sure we wrap around the log correctly! */
+#define wrap(journal, var) \
+do { \
+ if (var >= (journal)->j_last) \
+ var -= ((journal)->j_last - (journal)->j_first); \
+} while (0)
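+
+/*
+ * For example, a log offset that reaches j_last wraps straight back to
+ * j_first, so the scan below can keep incrementing next_log_block
+ * unconditionally.
+ */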
+
+/**
+ * jbd2_journal_recover - recovers an on-disk journal
+ * @journal: the journal to recover
+ *
+ * The primary function for recovering the log contents when mounting a
+ * journaled device.
+ *
+ * Recovery is done in three passes. In the first pass, we look for the
+ * end of the log. In the second, we assemble the list of revoke
+ * blocks. In the third and final pass, we replay any un-revoked blocks
+ * in the log.
+ */
+int jbd2_journal_recover(journal_t *journal)
+{
+ int err;
+ journal_superblock_t * sb;
+
+ struct recovery_info info;
+
+ memset(&info, 0, sizeof(info));
+ sb = journal->j_superblock;
+
+ /*
+ * The journal superblock's s_start field (the current log head)
+ * is always zero if, and only if, the journal was cleanly
+ * unmounted.
+ */
+
+ if (!sb->s_start) {
+ jbd_debug(1, "No recovery required, last transaction %d\n",
+ be32_to_cpu(sb->s_sequence));
+ journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
+ return 0;
+ }
+
+ err = do_one_pass(journal, &info, PASS_SCAN);
+ if (!err)
+ err = do_one_pass(journal, &info, PASS_REVOKE);
+ if (!err)
+ err = do_one_pass(journal, &info, PASS_REPLAY);
+
+ jbd_debug(0, "JBD: recovery, exit status %d, "
+ "recovered transactions %u to %u\n",
+ err, info.start_transaction, info.end_transaction);
+ jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
+ info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+
+ /* Restart the log at the next transaction ID, thus invalidating
+ * any existing commit records in the log. */
+ journal->j_transaction_sequence = ++info.end_transaction;
+
+ jbd2_journal_clear_revoke(journal);
+ sync_blockdev(journal->j_fs_dev);
+ return err;
+}
+
+/**
+ * jbd2_journal_skip_recovery - Start journal and wipe existing records
+ * @journal: journal to startup
+ *
+ * Locate any valid recovery information from the journal and set up the
+ * journal structures in memory to ignore it (presumably because the
+ * caller has evidence that it is out of date).
+ * This function doesn't appear to be exported.
+ *
+ * We perform one pass over the journal to allow us to tell the user how
+ * much recovery information is being erased, and to let us initialise
+ * the journal transaction sequence numbers to the next unused ID.
+ */
+int jbd2_journal_skip_recovery(journal_t *journal)
+{
+ int err;
+ journal_superblock_t * sb;
+
+ struct recovery_info info;
+
+ memset (&info, 0, sizeof(info));
+ sb = journal->j_superblock;
+
+ err = do_one_pass(journal, &info, PASS_SCAN);
+
+ if (err) {
+ printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+ ++journal->j_transaction_sequence;
+ } else {
+#ifdef CONFIG_JBD_DEBUG
+ int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
+#endif
+ jbd_debug(0,
+ "JBD: ignoring %d transaction%s from the journal.\n",
+ dropped, (dropped == 1) ? "" : "s");
+ journal->j_transaction_sequence = ++info.end_transaction;
+ }
+
+ journal->j_tail = 0;
+ return err;
+}
+
+static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
+{
+ unsigned long long block = be32_to_cpu(tag->t_blocknr);
+ if (tag_bytes > JBD_TAG_SIZE32)
+ block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
+ return block;
+}
+
+static int do_one_pass(journal_t *journal,
+ struct recovery_info *info, enum passtype pass)
+{
+ unsigned int first_commit_ID, next_commit_ID;
+ unsigned long next_log_block;
+ int err, success = 0;
+ journal_superblock_t * sb;
+ journal_header_t * tmp;
+ struct buffer_head * bh;
+ unsigned int sequence;
+ int blocktype;
+ int tag_bytes = journal_tag_bytes(journal);
+
+ /* Precompute the maximum metadata descriptors in a descriptor block */
+ int MAX_BLOCKS_PER_DESC;
+ MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
+ / tag_bytes);
+
+ /*
+ * First thing is to establish what we expect to find in the log
+ * (in terms of transaction IDs), and where (in terms of log
+ * block offsets): query the superblock.
+ */
+
+ sb = journal->j_superblock;
+ next_commit_ID = be32_to_cpu(sb->s_sequence);
+ next_log_block = be32_to_cpu(sb->s_start);
+
+ first_commit_ID = next_commit_ID;
+ if (pass == PASS_SCAN)
+ info->start_transaction = first_commit_ID;
+
+ jbd_debug(1, "Starting recovery pass %d\n", pass);
+
+ /*
+ * Now we walk through the log, transaction by transaction,
+ * making sure that each transaction has a commit block in the
+ * expected place. Each complete transaction gets replayed back
+ * into the main filesystem.
+ */
+
+ while (1) {
+ int flags;
+ char * tagp;
+ journal_block_tag_t * tag;
+ struct buffer_head * obh;
+ struct buffer_head * nbh;
+
+ cond_resched(); /* We're under lock_kernel() */
+
+ /* If we already know where to stop the log traversal,
+ * check right now that we haven't gone past the end of
+ * the log. */
+
+ if (pass != PASS_SCAN)
+ if (tid_geq(next_commit_ID, info->end_transaction))
+ break;
+
+ jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ next_commit_ID, next_log_block, journal->j_last);
+
+		/* Skip over each chunk of the transaction looking for
+		 * either the next descriptor block or the final commit
+		 * record. */
+
+ jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ err = jread(&bh, journal, next_log_block);
+ if (err)
+ goto failed;
+
+ next_log_block++;
+ wrap(journal, next_log_block);
+
+ /* What kind of buffer is it?
+ *
+ * If it is a descriptor block, check that it has the
+ * expected sequence number. Otherwise, we're all done
+ * here. */
+
+ tmp = (journal_header_t *)bh->b_data;
+
+ if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+ brelse(bh);
+ break;
+ }
+
+ blocktype = be32_to_cpu(tmp->h_blocktype);
+ sequence = be32_to_cpu(tmp->h_sequence);
+ jbd_debug(3, "Found magic %d, sequence %d\n",
+ blocktype, sequence);
+
+ if (sequence != next_commit_ID) {
+ brelse(bh);
+ break;
+ }
+
+ /* OK, we have a valid descriptor block which matches
+ * all of the sequence number checks. What are we going
+ * to do with it? That depends on the pass... */
+
+ switch(blocktype) {
+ case JBD2_DESCRIPTOR_BLOCK:
+ /* If it is a valid descriptor block, replay it
+ * in pass REPLAY; otherwise, just skip over the
+ * blocks it describes. */
+ if (pass != PASS_REPLAY) {
+ next_log_block += count_tags(journal, bh);
+ wrap(journal, next_log_block);
+ brelse(bh);
+ continue;
+ }
+
+ /* A descriptor block: we can now write all of
+ * the data blocks. Yay, useful work is finally
+ * getting done here! */
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+ while ((tagp - bh->b_data + tag_bytes)
+ <= journal->j_blocksize) {
+ unsigned long io_block;
+
+ tag = (journal_block_tag_t *) tagp;
+ flags = be32_to_cpu(tag->t_flags);
+
+ io_block = next_log_block++;
+ wrap(journal, next_log_block);
+ err = jread(&obh, journal, io_block);
+ if (err) {
+ /* Recover what we can, but
+ * report failure at the end. */
+ success = err;
+ printk (KERN_ERR
+ "JBD: IO error %d recovering "
+ "block %ld in log\n",
+ err, io_block);
+ } else {
+ unsigned long long blocknr;
+
+ J_ASSERT(obh != NULL);
+ blocknr = read_tag_block(tag_bytes,
+ tag);
+
+ /* If the block has been
+ * revoked, then we're all done
+ * here. */
+ if (jbd2_journal_test_revoke
+ (journal, blocknr,
+ next_commit_ID)) {
+ brelse(obh);
+ ++info->nr_revoke_hits;
+ goto skip_write;
+ }
+
+ /* Find a buffer for the new
+ * data being restored */
+ nbh = __getblk(journal->j_fs_dev,
+ blocknr,
+ journal->j_blocksize);
+ if (nbh == NULL) {
+ printk(KERN_ERR
+ "JBD: Out of memory "
+ "during recovery.\n");
+ err = -ENOMEM;
+ brelse(bh);
+ brelse(obh);
+ goto failed;
+ }
+
+ lock_buffer(nbh);
+ memcpy(nbh->b_data, obh->b_data,
+ journal->j_blocksize);
+				if (flags & JBD2_FLAG_ESCAPE) {
+					/* Restore the escaped magic in the
+					 * replayed buffer, not the
+					 * descriptor block */
+					*((__be32 *)nbh->b_data) =
+					cpu_to_be32(JBD2_MAGIC_NUMBER);
+				}
+
+ BUFFER_TRACE(nbh, "marking dirty");
+ set_buffer_uptodate(nbh);
+ mark_buffer_dirty(nbh);
+ BUFFER_TRACE(nbh, "marking uptodate");
+ ++info->nr_replays;
+ /* ll_rw_block(WRITE, 1, &nbh); */
+ unlock_buffer(nbh);
+ brelse(obh);
+ brelse(nbh);
+ }
+
+ skip_write:
+ tagp += tag_bytes;
+ if (!(flags & JBD2_FLAG_SAME_UUID))
+ tagp += 16;
+
+ if (flags & JBD2_FLAG_LAST_TAG)
+ break;
+ }
+
+ brelse(bh);
+ continue;
+
+ case JBD2_COMMIT_BLOCK:
+ /* Found an expected commit block: not much to
+ * do other than move on to the next sequence
+ * number. */
+ brelse(bh);
+ next_commit_ID++;
+ continue;
+
+ case JBD2_REVOKE_BLOCK:
+ /* If we aren't in the REVOKE pass, then we can
+ * just skip over this block. */
+ if (pass != PASS_REVOKE) {
+ brelse(bh);
+ continue;
+ }
+
+ err = scan_revoke_records(journal, bh,
+ next_commit_ID, info);
+ brelse(bh);
+ if (err)
+ goto failed;
+ continue;
+
+ default:
+ jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+ blocktype);
+ brelse(bh);
+ goto done;
+ }
+ }
+
+ done:
+ /*
+ * We broke out of the log scan loop: either we came to the
+ * known end of the log or we found an unexpected block in the
+ * log. If the latter happened, then we know that the "current"
+ * transaction marks the end of the valid log.
+ */
+
+ if (pass == PASS_SCAN)
+ info->end_transaction = next_commit_ID;
+ else {
+ /* It's really bad news if different passes end up at
+ * different places (but possible due to IO errors). */
+ if (info->end_transaction != next_commit_ID) {
+ printk (KERN_ERR "JBD: recovery pass %d ended at "
+ "transaction %u, expected %u\n",
+ pass, next_commit_ID, info->end_transaction);
+ if (!success)
+ success = -EIO;
+ }
+ }
+
+ return success;
+
+ failed:
+ return err;
+}
+
+
+/* Scan a revoke record, marking all blocks mentioned as revoked. */
+
+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
+ tid_t sequence, struct recovery_info *info)
+{
+ jbd2_journal_revoke_header_t *header;
+ int offset, max;
+ int record_len = 4;
+
+ header = (jbd2_journal_revoke_header_t *) bh->b_data;
+ offset = sizeof(jbd2_journal_revoke_header_t);
+ max = be32_to_cpu(header->r_count);
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ record_len = 8;
+
+ while (offset + record_len <= max) {
+ unsigned long long blocknr;
+ int err;
+
+ if (record_len == 4)
+ blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
+ else
+ blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
+ offset += record_len;
+ err = jbd2_journal_set_revoke(journal, blocknr, sequence);
+ if (err)
+ return err;
+ ++info->nr_revokes;
+ }
+ return 0;
+}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
new file mode 100644
index 000000000000..380d19917f37
--- /dev/null
+++ b/fs/jbd2/revoke.c
@@ -0,0 +1,712 @@
+/*
+ * linux/fs/revoke.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
+ *
+ * Copyright 2000 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal revoke routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ *
+ * Revoke is the mechanism used to prevent old log records for deleted
+ * metadata from being replayed on top of newer data using the same
+ * blocks. The revoke mechanism is used in two separate places:
+ *
+ * + Commit: during commit we write the entire list of the current
+ * transaction's revoked blocks to the journal
+ *
+ * + Recovery: during recovery we record the transaction ID of all
+ * revoked blocks. If there are multiple revoke records in the log
+ * for a single block, only the last one counts, and if there is a log
+ * entry for a block beyond the last revoke, then that log entry still
+ * gets replayed.
+ *
+ * We can get interactions between revokes and new log data within a
+ * single transaction:
+ *
+ * Block is revoked and then journaled:
+ * The desired end result is the journaling of the new block, so we
+ * cancel the revoke before the transaction commits.
+ *
+ * Block is journaled and then revoked:
+ * The revoke must take precedence over the write of the block, so we
+ * need either to cancel the journal entry or to write the revoke
+ * later in the log than the log block. In this case, we choose the
+ * latter: journaling a block cancels any revoke record for that block
+ * in the current transaction, so any revoke for that block in the
+ * transaction must have happened after the block was journaled and so
+ * the revoke must take precedence.
+ *
+ * Block is revoked and then written as data:
+ * The data write is allowed to succeed, but the revoke is _not_
+ * cancelled. We still need to prevent old log records from
+ * overwriting the new data. We don't even need to clear the revoke
+ * bit here.
+ *
+ * Revoke information on buffers is a tri-state value:
+ *
+ * RevokeValid clear: no cached revoke status, need to look it up
+ * RevokeValid set, Revoked clear:
+ * buffer has not been revoked, and cancel_revoke
+ * need do nothing.
+ * RevokeValid set, Revoked set:
+ * buffer has been revoked.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#endif
+
+static kmem_cache_t *jbd2_revoke_record_cache;
+static kmem_cache_t *jbd2_revoke_table_cache;
+
+/* Each revoke record represents one single revoked block. During
+ journal replay, this involves recording the transaction ID of the
+ last transaction to revoke this block. */
+
+struct jbd2_revoke_record_s
+{
+ struct list_head hash;
+ tid_t sequence; /* Used for recovery only */
+ unsigned long long blocknr;
+};
+
+
+/* The revoke table is just a simple hash table of revoke records. */
+struct jbd2_revoke_table_s
+{
+ /* It is conceivable that we might want a larger hash table
+ * for recovery. Must be a power of two. */
+ int hash_size;
+ int hash_shift;
+ struct list_head *hash_table;
+};
+
+
+#ifdef __KERNEL__
+static void write_one_revoke_record(journal_t *, transaction_t *,
+ struct journal_head **, int *,
+ struct jbd2_revoke_record_s *);
+static void flush_descriptor(journal_t *, struct journal_head *, int);
+#endif
+
+/* Utility functions to maintain the revoke table */
+
+/* Borrowed from buffer.c: this is a tried and tested block hash function */
+static inline int hash(journal_t *journal, unsigned long long block)
+{
+ struct jbd2_revoke_table_s *table = journal->j_revoke;
+ int hash_shift = table->hash_shift;
+ int hash = (int)block ^ (int)((block >> 31) >> 1);
+
+ return ((hash << (hash_shift - 6)) ^
+ (hash >> 13) ^
+ (hash << (hash_shift - 12))) & (table->hash_size - 1);
+}
+
+static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
+ tid_t seq)
+{
+ struct list_head *hash_list;
+ struct jbd2_revoke_record_s *record;
+
+repeat:
+ record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
+ if (!record)
+ goto oom;
+
+ record->sequence = seq;
+ record->blocknr = blocknr;
+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+ spin_lock(&journal->j_revoke_lock);
+ list_add(&record->hash, hash_list);
+ spin_unlock(&journal->j_revoke_lock);
+ return 0;
+
+oom:
+ if (!journal_oom_retry)
+ return -ENOMEM;
+ jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
+ yield();
+ goto repeat;
+}
+
+/* Find a revoke record in the journal's hash table. */
+
+static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
+ unsigned long long blocknr)
+{
+ struct list_head *hash_list;
+ struct jbd2_revoke_record_s *record;
+
+ hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+
+ spin_lock(&journal->j_revoke_lock);
+ record = (struct jbd2_revoke_record_s *) hash_list->next;
+ while (&(record->hash) != hash_list) {
+ if (record->blocknr == blocknr) {
+ spin_unlock(&journal->j_revoke_lock);
+ return record;
+ }
+ record = (struct jbd2_revoke_record_s *) record->hash.next;
+ }
+ spin_unlock(&journal->j_revoke_lock);
+ return NULL;
+}
+
+int __init jbd2_journal_init_revoke_caches(void)
+{
+ jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
+ sizeof(struct jbd2_revoke_record_s),
+ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ if (jbd2_revoke_record_cache == 0)
+ return -ENOMEM;
+
+ jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
+ sizeof(struct jbd2_revoke_table_s),
+ 0, 0, NULL, NULL);
+ if (jbd2_revoke_table_cache == 0) {
+ kmem_cache_destroy(jbd2_revoke_record_cache);
+ jbd2_revoke_record_cache = NULL;
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void jbd2_journal_destroy_revoke_caches(void)
+{
+ kmem_cache_destroy(jbd2_revoke_record_cache);
+ jbd2_revoke_record_cache = NULL;
+ kmem_cache_destroy(jbd2_revoke_table_cache);
+ jbd2_revoke_table_cache = NULL;
+}
+
+/* Initialise the revoke table for a given journal to a given size. */
+
+int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
+{
+ int shift, tmp;
+
+ J_ASSERT (journal->j_revoke_table[0] == NULL);
+
+ shift = 0;
+ tmp = hash_size;
+ while((tmp >>= 1UL) != 0UL)
+ shift++;
+
+ journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
+ if (!journal->j_revoke_table[0])
+ return -ENOMEM;
+ journal->j_revoke = journal->j_revoke_table[0];
+
+ /* Check that the hash_size is a power of two */
+ J_ASSERT ((hash_size & (hash_size-1)) == 0);
+
+ journal->j_revoke->hash_size = hash_size;
+
+ journal->j_revoke->hash_shift = shift;
+
+ journal->j_revoke->hash_table =
+ kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+ if (!journal->j_revoke->hash_table) {
+ kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
+ journal->j_revoke = NULL;
+ return -ENOMEM;
+ }
+
+ for (tmp = 0; tmp < hash_size; tmp++)
+ INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+
+ journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
+ if (!journal->j_revoke_table[1]) {
+ kfree(journal->j_revoke_table[0]->hash_table);
+ kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
+ return -ENOMEM;
+ }
+
+ journal->j_revoke = journal->j_revoke_table[1];
+
+ /* Check that the hash_size is a power of two */
+ J_ASSERT ((hash_size & (hash_size-1)) == 0);
+
+ journal->j_revoke->hash_size = hash_size;
+
+ journal->j_revoke->hash_shift = shift;
+
+ journal->j_revoke->hash_table =
+ kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+ if (!journal->j_revoke->hash_table) {
+ kfree(journal->j_revoke_table[0]->hash_table);
+ kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
+ kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]);
+ journal->j_revoke = NULL;
+ return -ENOMEM;
+ }
+
+ for (tmp = 0; tmp < hash_size; tmp++)
+ INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+
+ spin_lock_init(&journal->j_revoke_lock);
+
+ return 0;
+}
+
+/* Destroy a journal's revoke table. The table must already be empty! */
+
+void jbd2_journal_destroy_revoke(journal_t *journal)
+{
+ struct jbd2_revoke_table_s *table;
+ struct list_head *hash_list;
+ int i;
+
+ table = journal->j_revoke_table[0];
+ if (!table)
+ return;
+
+ for (i=0; i<table->hash_size; i++) {
+ hash_list = &table->hash_table[i];
+ J_ASSERT (list_empty(hash_list));
+ }
+
+ kfree(table->hash_table);
+ kmem_cache_free(jbd2_revoke_table_cache, table);
+ journal->j_revoke = NULL;
+
+ table = journal->j_revoke_table[1];
+ if (!table)
+ return;
+
+ for (i=0; i<table->hash_size; i++) {
+ hash_list = &table->hash_table[i];
+ J_ASSERT (list_empty(hash_list));
+ }
+
+ kfree(table->hash_table);
+ kmem_cache_free(jbd2_revoke_table_cache, table);
+ journal->j_revoke = NULL;
+}
+
+
+#ifdef __KERNEL__
+
+/*
+ * jbd2_journal_revoke: revoke a given buffer_head from the journal. This
+ * prevents the block from being replayed during recovery if we take a
+ * crash after this current transaction commits. Any subsequent
+ * metadata writes of the buffer in this transaction cancel the
+ * revoke.
+ *
+ * Note that this call may block --- it is up to the caller to make
+ * sure that there are no further calls to journal_write_metadata
+ * before the revoke is complete. In ext3, this implies calling the
+ * revoke before clearing the block bitmap when we are deleting
+ * metadata.
+ *
+ * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
+ * parameter, but does _not_ forget the buffer_head if the bh was only
+ * found implicitly.
+ *
+ * bh_in may not be a journalled buffer - it may have come off
+ * the hash tables without an attached journal_head.
+ *
+ * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
+ * by one.
+ */
+
+int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
+ struct buffer_head *bh_in)
+{
+ struct buffer_head *bh = NULL;
+ journal_t *journal;
+ struct block_device *bdev;
+ int err;
+
+ might_sleep();
+ if (bh_in)
+ BUFFER_TRACE(bh_in, "enter");
+
+ journal = handle->h_transaction->t_journal;
+ if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
+ J_ASSERT (!"Cannot set revoke feature!");
+ return -EINVAL;
+ }
+
+ bdev = journal->j_fs_dev;
+ bh = bh_in;
+
+ if (!bh) {
+ bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ if (bh)
+ BUFFER_TRACE(bh, "found on hash");
+ }
+#ifdef JBD_EXPENSIVE_CHECKING
+ else {
+ struct buffer_head *bh2;
+
+ /* If there is a different buffer_head lying around in
+ * memory anywhere... */
+ bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ if (bh2) {
+ /* ... and it has RevokeValid status... */
+ if (bh2 != bh && buffer_revokevalid(bh2))
+ /* ...then it better be revoked too,
+ * since it's illegal to create a revoke
+ * record against a buffer_head which is
+ * not marked revoked --- that would
+ * risk missing a subsequent revoke
+ * cancel. */
+ J_ASSERT_BH(bh2, buffer_revoked(bh2));
+ put_bh(bh2);
+ }
+ }
+#endif
+
+ /* We really ought not ever to revoke twice in a row without
+ first having the revoke cancelled: it's illegal to free a
+ block twice without allocating it in between! */
+ if (bh) {
+ if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
+ "inconsistent data on disk")) {
+ if (!bh_in)
+ brelse(bh);
+ return -EIO;
+ }
+ set_buffer_revoked(bh);
+ set_buffer_revokevalid(bh);
+ if (bh_in) {
+ BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
+ jbd2_journal_forget(handle, bh_in);
+ } else {
+ BUFFER_TRACE(bh, "call brelse");
+ __brelse(bh);
+ }
+ }
+
+	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n", blocknr, bh_in);
+ err = insert_revoke_hash(journal, blocknr,
+ handle->h_transaction->t_tid);
+ BUFFER_TRACE(bh_in, "exit");
+ return err;
+}
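+
+/*
+ * Illustrative sketch (assumed caller, not part of this file): a
+ * filesystem freeing a metadata block inside a transaction would revoke
+ * it so stale journal copies are never replayed over the freed block:
+ *
+ *	err = jbd2_journal_revoke(handle, bh->b_blocknr, bh);
+ *	if (err)
+ *		jbd2_journal_abort(handle->h_transaction->t_journal, err);
+ */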
+
+/*
+ * Cancel an outstanding revoke. For use only internally by the
+ * journaling code (called from jbd2_journal_get_write_access).
+ *
+ * We trust buffer_revoked() on the buffer if the buffer is already
+ * being journaled: if there is no revoke pending on the buffer, then we
+ * don't do anything here.
+ *
+ * This would break if it were possible for a buffer to be revoked and
+ * discarded, and then reallocated within the same transaction. In such
+ * a case we would have lost the revoked bit, but when we arrived here
+ * the second time we would still have a pending revoke to cancel. So,
+ * do not trust the Revoked bit on buffers unless RevokeValid is also
+ * set.
+ *
+ * The caller must have the journal locked.
+ */
+int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+{
+ struct jbd2_revoke_record_s *record;
+ journal_t *journal = handle->h_transaction->t_journal;
+ int need_cancel;
+ int did_revoke = 0; /* akpm: debug */
+ struct buffer_head *bh = jh2bh(jh);
+
+ jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+
+ /* Is the existing Revoke bit valid? If so, we trust it, and
+ * only perform the full cancel if the revoke bit is set. If
+ * not, we can't trust the revoke bit, and we need to do the
+ * full search for a revoke record. */
+ if (test_set_buffer_revokevalid(bh)) {
+ need_cancel = test_clear_buffer_revoked(bh);
+ } else {
+ need_cancel = 1;
+ clear_buffer_revoked(bh);
+ }
+
+ if (need_cancel) {
+ record = find_revoke_record(journal, bh->b_blocknr);
+ if (record) {
+ jbd_debug(4, "cancelled existing revoke on "
+ "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
+ spin_lock(&journal->j_revoke_lock);
+ list_del(&record->hash);
+ spin_unlock(&journal->j_revoke_lock);
+ kmem_cache_free(jbd2_revoke_record_cache, record);
+ did_revoke = 1;
+ }
+ }
+
+#ifdef JBD_EXPENSIVE_CHECKING
+ /* There better not be one left behind by now! */
+ record = find_revoke_record(journal, bh->b_blocknr);
+ J_ASSERT_JH(jh, record == NULL);
+#endif
+
+ /* Finally, have we just cleared revoke on an unhashed
+ * buffer_head? If so, we'd better make sure we clear the
+ * revoked status on any hashed alias too, otherwise the revoke
+ * state machine will get very upset later on. */
+ if (need_cancel) {
+ struct buffer_head *bh2;
+ bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ if (bh2) {
+ if (bh2 != bh)
+ clear_buffer_revoked(bh2);
+ __brelse(bh2);
+ }
+ }
+ return did_revoke;
+}
+
+/* jbd2_journal_switch_revoke_table: select j_revoke for the next transaction.
+ * We do not want to suspend any processing until all revokes are
+ * written -bzzz
+ */
+void jbd2_journal_switch_revoke_table(journal_t *journal)
+{
+ int i;
+
+ if (journal->j_revoke == journal->j_revoke_table[0])
+ journal->j_revoke = journal->j_revoke_table[1];
+ else
+ journal->j_revoke = journal->j_revoke_table[0];
+
+ for (i = 0; i < journal->j_revoke->hash_size; i++)
+ INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
+}
+
+/*
+ * Write revoke records to the journal for all entries in the current
+ * revoke hash, deleting the entries as we go.
+ *
+ * Called with the journal lock held.
+ */
+
+void jbd2_journal_write_revoke_records(journal_t *journal,
+ transaction_t *transaction)
+{
+ struct journal_head *descriptor;
+ struct jbd2_revoke_record_s *record;
+ struct jbd2_revoke_table_s *revoke;
+ struct list_head *hash_list;
+ int i, offset, count;
+
+ descriptor = NULL;
+ offset = 0;
+ count = 0;
+
+ /* select revoke table for committing transaction */
+ revoke = journal->j_revoke == journal->j_revoke_table[0] ?
+ journal->j_revoke_table[1] : journal->j_revoke_table[0];
+
+ for (i = 0; i < revoke->hash_size; i++) {
+ hash_list = &revoke->hash_table[i];
+
+ while (!list_empty(hash_list)) {
+ record = (struct jbd2_revoke_record_s *)
+ hash_list->next;
+ write_one_revoke_record(journal, transaction,
+ &descriptor, &offset,
+ record);
+ count++;
+ list_del(&record->hash);
+ kmem_cache_free(jbd2_revoke_record_cache, record);
+ }
+ }
+ if (descriptor)
+ flush_descriptor(journal, descriptor, offset);
+ jbd_debug(1, "Wrote %d revoke records\n", count);
+}
+
+/*
+ * Write out one revoke record. We need to create a new descriptor
+ * block if the old one is full or if we have not already created one.
+ */
+
+static void write_one_revoke_record(journal_t *journal,
+ transaction_t *transaction,
+ struct journal_head **descriptorp,
+ int *offsetp,
+ struct jbd2_revoke_record_s *record)
+{
+ struct journal_head *descriptor;
+ int offset;
+ journal_header_t *header;
+
+ /* If we are already aborting, this all becomes a noop. We
+ still need to go round the loop in
+ jbd2_journal_write_revoke_records in order to free all of the
+ revoke records: only the IO to the journal is omitted. */
+ if (is_journal_aborted(journal))
+ return;
+
+ descriptor = *descriptorp;
+ offset = *offsetp;
+
+ /* Make sure we have a descriptor with space left for the record */
+ if (descriptor) {
+ if (offset == journal->j_blocksize) {
+ flush_descriptor(journal, descriptor, offset);
+ descriptor = NULL;
+ }
+ }
+
+ if (!descriptor) {
+ descriptor = jbd2_journal_get_descriptor_buffer(journal);
+ if (!descriptor)
+ return;
+ header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+ header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+ header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
+ header->h_sequence = cpu_to_be32(transaction->t_tid);
+
+ /* Record it so that we can wait for IO completion later */
+ JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
+ jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+
+ offset = sizeof(jbd2_journal_revoke_header_t);
+ *descriptorp = descriptor;
+ }
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
+ * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+ cpu_to_be64(record->blocknr);
+ offset += 8;
+
+ } else {
+ * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+ cpu_to_be32(record->blocknr);
+ offset += 4;
+ }
+
+ *offsetp = offset;
+}
+
+/*
+ * Flush a revoke descriptor out to the journal. If we are aborting,
+ * this is a noop; otherwise we are generating a buffer which needs to
+ * be waited for during commit, so it has to go onto the appropriate
+ * journal buffer list.
+ */
+
+static void flush_descriptor(journal_t *journal,
+ struct journal_head *descriptor,
+ int offset)
+{
+ jbd2_journal_revoke_header_t *header;
+ struct buffer_head *bh = jh2bh(descriptor);
+
+ if (is_journal_aborted(journal)) {
+ put_bh(bh);
+ return;
+ }
+
+ header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+ header->r_count = cpu_to_be32(offset);
+ set_buffer_jwrite(bh);
+ BUFFER_TRACE(bh, "write");
+ set_buffer_dirty(bh);
+ ll_rw_block(SWRITE, 1, &bh);
+}
+#endif
+
+/*
+ * Revoke support for recovery.
+ *
+ * Recovery needs to be able to:
+ *
+ * record all revoke records, including the tid of the latest instance
+ * of each revoke in the journal
+ *
+ * check whether a given block in a given transaction should be replayed
+ * (ie. has not been revoked by a revoke record in that or a subsequent
+ * transaction)
+ *
+ * empty the revoke table after recovery.
+ */
+
+/*
+ * First, setting revoke records. We create a new revoke record for
+ * every block ever revoked in the log as we scan it for recovery, and
+ * we update the existing records if we find multiple revokes for a
+ * single block.
+ */
+
+int jbd2_journal_set_revoke(journal_t *journal,
+ unsigned long long blocknr,
+ tid_t sequence)
+{
+ struct jbd2_revoke_record_s *record;
+
+ record = find_revoke_record(journal, blocknr);
+ if (record) {
+ /* If we have multiple occurrences, only record the
+ * latest sequence number in the hashed record */
+ if (tid_gt(sequence, record->sequence))
+ record->sequence = sequence;
+ return 0;
+ }
+ return insert_revoke_hash(journal, blocknr, sequence);
+}
+
+/*
+ * Test revoke records. For a given block referenced in the log, has
+ * that block been revoked? A revoke record with a given transaction
+ * sequence number revokes all blocks in that transaction and earlier
+ * ones, but later transactions still need to be replayed.
+ */
+
+int jbd2_journal_test_revoke(journal_t *journal,
+ unsigned long long blocknr,
+ tid_t sequence)
+{
+ struct jbd2_revoke_record_s *record;
+
+ record = find_revoke_record(journal, blocknr);
+ if (!record)
+ return 0;
+ if (tid_gt(sequence, record->sequence))
+ return 0;
+ return 1;
+}
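+
+/*
+ * During replay, recovery typically combines the two helpers above for each
+ * block it is about to write (illustrative sketch only; "blocknr" and
+ * "this_tid" come from the log block currently being replayed):
+ *
+ *	if (jbd2_journal_test_revoke(journal, blocknr, this_tid))
+ *		... skip: the block was revoked in this or a later transaction ...
+ *	else
+ *		... replay the block ...
+ */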
+
+/*
+ * Finally, once recovery is over, we need to clear the revoke table so
+ * that it can be reused by the running filesystem.
+ */
+
+void jbd2_journal_clear_revoke(journal_t *journal)
+{
+ int i;
+ struct list_head *hash_list;
+ struct jbd2_revoke_record_s *record;
+ struct jbd2_revoke_table_s *revoke;
+
+ revoke = journal->j_revoke;
+
+ for (i = 0; i < revoke->hash_size; i++) {
+ hash_list = &revoke->hash_table[i];
+ while (!list_empty(hash_list)) {
+ record = (struct jbd2_revoke_record_s*) hash_list->next;
+ list_del(&record->hash);
+ kmem_cache_free(jbd2_revoke_record_cache, record);
+ }
+ }
+}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
new file mode 100644
index 000000000000..b6cf2be845a1
--- /dev/null
+++ b/fs/jbd2/transaction.c
@@ -0,0 +1,2081 @@
+/*
+ * linux/fs/jbd2/transaction.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem transaction handling code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages transactions (compound commits managed by the
+ * journaling code) and handles (individual atomic operations by the
+ * filesystem).
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/smp_lock.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+
+/*
+ * jbd2_get_transaction: obtain a new transaction_t object.
+ *
+ * Simply allocate and initialise a new transaction. Create it in
+ * RUNNING state and add it to the current journal (which should not
+ * have an existing running transaction: we only make a new transaction
+ * once we have started to commit the old one).
+ *
+ * Preconditions:
+ * The journal MUST be locked. We don't perform atomic mallocs on the
+ * new transaction and we can't block without protecting against other
+ * processes trying to touch the journal while it is in transition.
+ *
+ * Called under j_state_lock
+ */
+
+static transaction_t *
+jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+{
+ transaction->t_journal = journal;
+ transaction->t_state = T_RUNNING;
+ transaction->t_tid = journal->j_transaction_sequence++;
+ transaction->t_expires = jiffies + journal->j_commit_interval;
+ spin_lock_init(&transaction->t_handle_lock);
+
+ /* Set up the commit timer for the new transaction. */
+ journal->j_commit_timer.expires = transaction->t_expires;
+ add_timer(&journal->j_commit_timer);
+
+ J_ASSERT(journal->j_running_transaction == NULL);
+ journal->j_running_transaction = transaction;
+
+ return transaction;
+}
+
+/*
+ * Handle management.
+ *
+ * A handle_t is an object which represents a single atomic update to a
+ * filesystem, and which tracks all of the modifications which form part
+ * of that one update.
+ */
+
+/*
+ * start_this_handle: Given a handle, deal with any locking or stalling
+ * needed to make sure that there is enough journal space for the handle
+ * to begin. Attach the handle to a transaction and set up the
+ * transaction's buffer credits.
+ */
+
+static int start_this_handle(journal_t *journal, handle_t *handle)
+{
+ transaction_t *transaction;
+ int needed;
+ int nblocks = handle->h_buffer_credits;
+ transaction_t *new_transaction = NULL;
+ int ret = 0;
+
+ if (nblocks > journal->j_max_transaction_buffers) {
+ printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
+ current->comm, nblocks,
+ journal->j_max_transaction_buffers);
+ ret = -ENOSPC;
+ goto out;
+ }
+
+alloc_transaction:
+ if (!journal->j_running_transaction) {
+ new_transaction = jbd_kmalloc(sizeof(*new_transaction),
+ GFP_NOFS);
+ if (!new_transaction) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memset(new_transaction, 0, sizeof(*new_transaction));
+ }
+
+ jbd_debug(3, "New handle %p going live.\n", handle);
+
+repeat:
+
+ /*
+ * We need to hold j_state_lock until t_updates has been incremented,
+ * for proper journal barrier handling
+ */
+ spin_lock(&journal->j_state_lock);
+repeat_locked:
+ if (is_journal_aborted(journal) ||
+ (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
+ spin_unlock(&journal->j_state_lock);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /* Wait on the journal's transaction barrier if necessary */
+ if (journal->j_barrier_count) {
+ spin_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_transaction_locked,
+ journal->j_barrier_count == 0);
+ goto repeat;
+ }
+
+ if (!journal->j_running_transaction) {
+ if (!new_transaction) {
+ spin_unlock(&journal->j_state_lock);
+ goto alloc_transaction;
+ }
+ jbd2_get_transaction(journal, new_transaction);
+ new_transaction = NULL;
+ }
+
+ transaction = journal->j_running_transaction;
+
+ /*
+ * If the current transaction is locked down for commit, wait for the
+ * lock to be released.
+ */
+ if (transaction->t_state == T_LOCKED) {
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(&journal->j_wait_transaction_locked,
+ &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+ goto repeat;
+ }
+
+ /*
+ * If there is not enough space left in the log to write all potential
+ * buffers requested by this operation, we need to stall pending a log
+ * checkpoint to free some more log space.
+ */
+ spin_lock(&transaction->t_handle_lock);
+ needed = transaction->t_outstanding_credits + nblocks;
+
+ if (needed > journal->j_max_transaction_buffers) {
+ /*
+ * If the current transaction is already too large, then start
+ * to commit it: we can then go back and attach this handle to
+ * a new transaction.
+ */
+ DEFINE_WAIT(wait);
+
+ jbd_debug(2, "Handle %p starting new commit...\n", handle);
+ spin_unlock(&transaction->t_handle_lock);
+ prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ spin_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+ goto repeat;
+ }
+
+ /*
+ * The commit code assumes that it can get enough log space
+ * without forcing a checkpoint. This is *critical* for
+ * correctness: a checkpoint of a buffer which is also
+ * associated with a committing transaction creates a deadlock,
+ * so commit simply cannot force through checkpoints.
+ *
+ * We must therefore ensure the necessary space in the journal
+ * *before* starting to dirty potentially checkpointed buffers
+ * in the new transaction.
+ *
+ * The worst part is, any transaction currently committing can
+ * reduce the free space arbitrarily. Be careful to account for
+ * those buffers when checkpointing.
+ */
+
+ /*
+ * @@@ AKPM: This seems rather over-defensive. We're giving commit
+ * a _lot_ of headroom: 1/4 of the journal plus the size of
+ * the committing transaction. Really, we only need to give it
+ * committing_transaction->t_outstanding_credits plus "enough" for
+ * the log control blocks.
+ * Also, this test is inconsistent with the matching one in
+ * jbd2_journal_extend().
+ */
+ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
+ jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+ spin_unlock(&transaction->t_handle_lock);
+ __jbd2_log_wait_for_space(journal);
+ goto repeat_locked;
+ }
+
+ /* OK, account for the buffers that this operation expects to
+ * use and add the handle to the running transaction. */
+
+ handle->h_transaction = transaction;
+ transaction->t_outstanding_credits += nblocks;
+ transaction->t_updates++;
+ transaction->t_handle_count++;
+ jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+ handle, nblocks, transaction->t_outstanding_credits,
+ __jbd2_log_space_left(journal));
+ spin_unlock(&transaction->t_handle_lock);
+ spin_unlock(&journal->j_state_lock);
+out:
+ if (unlikely(new_transaction)) /* It's usually NULL */
+ kfree(new_transaction);
+ return ret;
+}
+
+/* Allocate a new handle. This should probably be in a slab... */
+static handle_t *new_handle(int nblocks)
+{
+ handle_t *handle = jbd_alloc_handle(GFP_NOFS);
+ if (!handle)
+ return NULL;
+ memset(handle, 0, sizeof(*handle));
+ handle->h_buffer_credits = nblocks;
+ handle->h_ref = 1;
+
+ return handle;
+}
+
+/**
+ * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * @journal: Journal to start transaction on.
+ * @nblocks: number of block buffers we might modify
+ *
+ * We make sure that the transaction can guarantee at least nblocks of
+ * modified buffers in the log. We block until the log can guarantee
+ * that much space.
+ *
+ * This function is visible to journal users (like ext3fs), so is not
+ * called with the journal already locked.
+ *
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value on failure.
+ */
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+ handle_t *handle = journal_current_handle();
+ int err;
+
+ if (!journal)
+ return ERR_PTR(-EROFS);
+
+ if (handle) {
+ J_ASSERT(handle->h_transaction->t_journal == journal);
+ handle->h_ref++;
+ return handle;
+ }
+
+ handle = new_handle(nblocks);
+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+
+ current->journal_info = handle;
+
+ err = start_this_handle(journal, handle);
+ if (err < 0) {
+ jbd_free_handle(handle);
+ current->journal_info = NULL;
+ handle = ERR_PTR(err);
+ }
+ return handle;
+}
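+
+/*
+ * Typical caller pattern (illustrative sketch only; "journal" and "bh" stand
+ * for a caller-supplied journal_t and buffer_head):
+ *
+ *	handle_t *handle = jbd2_journal_start(journal, 1);
+ *	if (IS_ERR(handle))
+ *		return PTR_ERR(handle);
+ *	err = jbd2_journal_get_write_access(handle, bh);
+ *	if (!err) {
+ *		... modify bh->b_data ...
+ *		err = jbd2_journal_dirty_metadata(handle, bh);
+ *	}
+ *	jbd2_journal_stop(handle);
+ */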
+
+/**
+ * int jbd2_journal_extend() - extend buffer credits.
+ * @handle: handle to 'extend'
+ * @nblocks: nr blocks to try to extend by.
+ *
+ * Some transactions, such as large extends and truncates, can be done
+ * atomically all at once or in several stages. The operation requests
+ * a credit for a number of buffer modifications in advance, but can
+ * extend its credit if it needs more.
+ *
+ * jbd2_journal_extend tries to give the running handle more buffer credits.
+ * It does not guarantee the allocation - this is best-effort only.
+ * The calling process MUST be able to deal cleanly with a failure to
+ * extend here.
+ *
+ * Return 0 on success, non-zero on failure.
+ *
+ * return code < 0 implies an error
+ * return code > 0 implies normal transaction-full status.
+ */
+int jbd2_journal_extend(handle_t *handle, int nblocks)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ int result;
+ int wanted;
+
+ result = -EIO;
+ if (is_handle_aborted(handle))
+ goto out;
+
+ result = 1;
+
+ spin_lock(&journal->j_state_lock);
+
+ /* Don't extend a locked-down transaction! */
+ if (handle->h_transaction->t_state != T_RUNNING) {
+ jbd_debug(3, "denied handle %p %d blocks: "
+ "transaction not running\n", handle, nblocks);
+ goto error_out;
+ }
+
+ spin_lock(&transaction->t_handle_lock);
+ wanted = transaction->t_outstanding_credits + nblocks;
+
+ if (wanted > journal->j_max_transaction_buffers) {
+ jbd_debug(3, "denied handle %p %d blocks: "
+ "transaction too large\n", handle, nblocks);
+ goto unlock;
+ }
+
+ if (wanted > __jbd2_log_space_left(journal)) {
+ jbd_debug(3, "denied handle %p %d blocks: "
+ "insufficient log space\n", handle, nblocks);
+ goto unlock;
+ }
+
+ handle->h_buffer_credits += nblocks;
+ transaction->t_outstanding_credits += nblocks;
+ result = 0;
+
+ jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+unlock:
+ spin_unlock(&transaction->t_handle_lock);
+error_out:
+ spin_unlock(&journal->j_state_lock);
+out:
+ return result;
+}
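+
+/*
+ * A caller whose extend request is refused usually falls back to
+ * jbd2_journal_restart() below (sketch; "needed" is the number of credits
+ * the caller wants, as in the ext3-style helpers):
+ *
+ *	err = jbd2_journal_extend(handle, needed);
+ *	if (err > 0)
+ *		err = jbd2_journal_restart(handle, needed);
+ */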
+
+
+/**
+ * int jbd2_journal_restart() - restart a handle.
+ * @handle: handle to restart
+ * @nblocks: nr credits requested
+ *
+ * Restart a handle for a multi-transaction filesystem
+ * operation.
+ *
+ * If the jbd2_journal_extend() call above fails to grant new buffer credits
+ * to a running handle, a call to jbd2_journal_restart will commit the
+ * handle's transaction so far and reattach the handle to a new
+ * transaction capable of guaranteeing the requested number of
+ * credits.
+ */
+
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ int ret;
+
+ /* If we've had an abort of any type, don't even think about
+ * actually doing the restart! */
+ if (is_handle_aborted(handle))
+ return 0;
+
+ /*
+ * First unlink the handle from its current transaction, and start the
+ * commit on that.
+ */
+ J_ASSERT(transaction->t_updates > 0);
+ J_ASSERT(journal_current_handle() == handle);
+
+ spin_lock(&journal->j_state_lock);
+ spin_lock(&transaction->t_handle_lock);
+ transaction->t_outstanding_credits -= handle->h_buffer_credits;
+ transaction->t_updates--;
+
+ if (!transaction->t_updates)
+ wake_up(&journal->j_wait_updates);
+ spin_unlock(&transaction->t_handle_lock);
+
+ jbd_debug(2, "restarting handle %p\n", handle);
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ spin_unlock(&journal->j_state_lock);
+
+ handle->h_buffer_credits = nblocks;
+ ret = start_this_handle(journal, handle);
+ return ret;
+}
+
+
+/**
+ * void jbd2_journal_lock_updates () - establish a transaction barrier.
+ * @journal: Journal to establish a barrier on.
+ *
+ * This locks out any further updates from being started, and blocks
+ * until all existing updates have completed, returning only once the
+ * journal is in a quiescent state with no updates running.
+ *
+ * The journal lock should not be held on entry.
+ */
+void jbd2_journal_lock_updates(journal_t *journal)
+{
+ DEFINE_WAIT(wait);
+
+ spin_lock(&journal->j_state_lock);
+ ++journal->j_barrier_count;
+
+ /* Wait until there are no running updates */
+ while (1) {
+ transaction_t *transaction = journal->j_running_transaction;
+
+ if (!transaction)
+ break;
+
+ spin_lock(&transaction->t_handle_lock);
+ if (!transaction->t_updates) {
+ spin_unlock(&transaction->t_handle_lock);
+ break;
+ }
+ prepare_to_wait(&journal->j_wait_updates, &wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&transaction->t_handle_lock);
+ spin_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_updates, &wait);
+ spin_lock(&journal->j_state_lock);
+ }
+ spin_unlock(&journal->j_state_lock);
+
+ /*
+ * We have now established a barrier against other normal updates, but
+ * we also need to barrier against other jbd2_journal_lock_updates() calls
+ * to make sure that we serialise special journal-locked operations
+ * too.
+ */
+ mutex_lock(&journal->j_barrier);
+}
+
+/**
+ * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
+ * @journal: Journal to release the barrier on.
+ *
+ * Release a transaction barrier obtained with jbd2_journal_lock_updates().
+ *
+ * Should be called without the journal lock held.
+ */
+void jbd2_journal_unlock_updates (journal_t *journal)
+{
+ J_ASSERT(journal->j_barrier_count != 0);
+
+ mutex_unlock(&journal->j_barrier);
+ spin_lock(&journal->j_state_lock);
+ --journal->j_barrier_count;
+ spin_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_wait_transaction_locked);
+}
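+
+/*
+ * The barrier pair brackets operations that must see a quiescent journal,
+ * for example a journal-wide flush (illustrative sketch):
+ *
+ *	jbd2_journal_lock_updates(journal);
+ *	... perform the journal-wide operation ...
+ *	jbd2_journal_unlock_updates(journal);
+ */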
+
+/*
+ * Report any unexpected dirty buffers which turn up. Normally those
+ * indicate an error, but they can occur if the user is running (say)
+ * tune2fs to modify the live filesystem, so we need the option of
+ * continuing as gracefully as possible.
+ *
+ * The caller should already hold the journal lock and
+ * j_list_lock spinlock: most callers will need those anyway
+ * in order to probe the buffer's journaling state safely.
+ */
+static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+{
+ int jlist;
+
+ /* If this buffer is one which might reasonably be dirty
+ * --- ie. data, or not part of this journal --- then
+ * we're OK to leave it alone, but otherwise we need to
+ * move the dirty bit to the journal's own internal
+ * JBDDirty bit. */
+ jlist = jh->b_jlist;
+
+ if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+ jlist == BJ_Shadow || jlist == BJ_Forget) {
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (test_clear_buffer_dirty(bh))
+ set_buffer_jbddirty(bh);
+ }
+}
+
+/*
+ * If the buffer is already part of the current transaction, then there
+ * is nothing we need to do. If it is already part of a prior
+ * transaction which we are still committing to disk, then we need to
+ * make sure that we do not overwrite the old copy: we do copy-out to
+ * preserve the copy going to disk. We also account the buffer against
+ * the handle's metadata buffer credits (unless the buffer is already
+ * part of the transaction, that is).
+ *
+ */
+static int
+do_get_write_access(handle_t *handle, struct journal_head *jh,
+ int force_copy)
+{
+ struct buffer_head *bh;
+ transaction_t *transaction;
+ journal_t *journal;
+ int error;
+ char *frozen_buffer = NULL;
+ int need_copy = 0;
+
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
+ transaction = handle->h_transaction;
+ journal = transaction->t_journal;
+
+ jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+
+ JBUFFER_TRACE(jh, "entry");
+repeat:
+ bh = jh2bh(jh);
+
+ /* @@@ Need to check for errors here at some point. */
+
+ lock_buffer(bh);
+ jbd_lock_bh_state(bh);
+
+ /* We now hold the buffer lock so it is safe to query the buffer
+ * state. Is the buffer dirty?
+ *
+ * If so, there are two possibilities. The buffer may be
+ * non-journaled, and undergoing a quite legitimate writeback.
+ * Otherwise, it is journaled, and we don't expect dirty buffers
+ * in that state (the buffers should be marked JBD_Dirty
+ * instead.) So either the IO is being done under our own
+ * control and this is a bug, or it's a third party IO such as
+ * dump(8) (which may leave the buffer scheduled for read ---
+ * ie. locked but not dirty) or tune2fs (which may actually have
+ * the buffer dirtied, ugh.) */
+
+ if (buffer_dirty(bh)) {
+ /*
+ * First question: is this buffer already part of the current
+ * transaction or the existing committing transaction?
+ */
+ if (jh->b_transaction) {
+ J_ASSERT_JH(jh,
+ jh->b_transaction == transaction ||
+ jh->b_transaction ==
+ journal->j_committing_transaction);
+ if (jh->b_next_transaction)
+ J_ASSERT_JH(jh, jh->b_next_transaction ==
+ transaction);
+ }
+ /*
+ * In any case we need to clean the dirty flag and we must
+ * do it under the buffer lock to be sure we don't race
+ * with running write-out.
+ */
+ JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+ jbd_unexpected_dirty_buffer(jh);
+ }
+
+ unlock_buffer(bh);
+
+ error = -EROFS;
+ if (is_handle_aborted(handle)) {
+ jbd_unlock_bh_state(bh);
+ goto out;
+ }
+ error = 0;
+
+ /*
+ * The buffer is already part of this transaction if b_transaction or
+ * b_next_transaction points to it
+ */
+ if (jh->b_transaction == transaction ||
+ jh->b_next_transaction == transaction)
+ goto done;
+
+ /*
+ * If there is already a copy-out version of this buffer, then we don't
+ * need to make another one
+ */
+ if (jh->b_frozen_data) {
+ JBUFFER_TRACE(jh, "has frozen data");
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ jh->b_next_transaction = transaction;
+ goto done;
+ }
+
+ /* Is there data here we need to preserve? */
+
+ if (jh->b_transaction && jh->b_transaction != transaction) {
+ JBUFFER_TRACE(jh, "owned by older transaction");
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ J_ASSERT_JH(jh, jh->b_transaction ==
+ journal->j_committing_transaction);
+
+ /* There is one case we have to be very careful about.
+ * If the committing transaction is currently writing
+ * this buffer out to disk and has NOT made a copy-out,
+ * then we cannot modify the buffer contents at all
+ * right now. The essence of copy-out is that it is the
+ * extra copy, not the primary copy, which gets
+ * journaled. If the primary copy is already going to
+ * disk then we cannot do copy-out here. */
+
+ if (jh->b_jlist == BJ_Shadow) {
+ DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
+ wait_queue_head_t *wqh;
+
+ wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
+
+ JBUFFER_TRACE(jh, "on shadow: sleep");
+ jbd_unlock_bh_state(bh);
+ /* commit wakes up all shadow buffers after IO */
+ for ( ; ; ) {
+ prepare_to_wait(wqh, &wait.wait,
+ TASK_UNINTERRUPTIBLE);
+ if (jh->b_jlist != BJ_Shadow)
+ break;
+ schedule();
+ }
+ finish_wait(wqh, &wait.wait);
+ goto repeat;
+ }
+
+ /* Only do the copy if the currently-owning transaction
+ * still needs it. If it is on the Forget list, the
+ * committing transaction is past that stage. The
+ * buffer had better remain locked during the kmalloc,
+ * but that should be true --- we hold the journal lock
+ * still and the buffer is already on the BUF_JOURNAL
+ * list so won't be flushed.
+ *
+ * Subtle point, though: if this is a get_undo_access,
+ * then we will be relying on the frozen_data to contain
+ * the new value of the committed_data record after the
+ * transaction, so we HAVE to force the frozen_data copy
+ * in that case. */
+
+ if (jh->b_jlist != BJ_Forget || force_copy) {
+ JBUFFER_TRACE(jh, "generate frozen data");
+ if (!frozen_buffer) {
+ JBUFFER_TRACE(jh, "allocate memory for buffer");
+ jbd_unlock_bh_state(bh);
+ frozen_buffer =
+ jbd2_slab_alloc(jh2bh(jh)->b_size,
+ GFP_NOFS);
+ if (!frozen_buffer) {
+ printk(KERN_EMERG
+ "%s: OOM for frozen_buffer\n",
+ __FUNCTION__);
+ JBUFFER_TRACE(jh, "oom!");
+ error = -ENOMEM;
+ jbd_lock_bh_state(bh);
+ goto done;
+ }
+ goto repeat;
+ }
+ jh->b_frozen_data = frozen_buffer;
+ frozen_buffer = NULL;
+ need_copy = 1;
+ }
+ jh->b_next_transaction = transaction;
+ }
+
+
+ /*
+ * Finally, if the buffer is not journaled right now, we need to make
+ * sure it doesn't get written to disk before the caller actually
+ * commits the new data
+ */
+ if (!jh->b_transaction) {
+ JBUFFER_TRACE(jh, "no transaction");
+ J_ASSERT_JH(jh, !jh->b_next_transaction);
+ jh->b_transaction = transaction;
+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ spin_lock(&journal->j_list_lock);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+ spin_unlock(&journal->j_list_lock);
+ }
+
+done:
+ if (need_copy) {
+ struct page *page;
+ int offset;
+ char *source;
+
+ J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
+ "Possible IO failure.\n");
+ page = jh2bh(jh)->b_page;
+ offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+ source = kmap_atomic(page, KM_USER0);
+ memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
+ kunmap_atomic(source, KM_USER0);
+ }
+ jbd_unlock_bh_state(bh);
+
+ /*
+ * If we are about to journal a buffer, then any revoke pending on it is
+ * no longer valid
+ */
+ jbd2_journal_cancel_revoke(handle, jh);
+
+out:
+ if (unlikely(frozen_buffer)) /* It's usually NULL */
+ jbd2_slab_free(frozen_buffer, bh->b_size);
+
+ JBUFFER_TRACE(jh, "exit");
+ return error;
+}
+
+/**
+ * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * @handle: transaction to add buffer modifications to
+ * @bh: bh to be used for metadata writes
+ *
+ * Returns an error code or 0 on success.
+ *
+ * In full data journalling mode the buffer may be of type BJ_AsyncData,
+ * because we're write()ing a buffer which is also part of a shared mapping.
+ */
+
+int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
+{
+ struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ int rc;
+
+ /* We do not want to get caught playing with fields which the
+ * log thread also manipulates. Make sure that the buffer
+ * completes any outstanding IO before proceeding. */
+ rc = do_get_write_access(handle, jh, 0);
+ jbd2_journal_put_journal_head(jh);
+ return rc;
+}
+
+
+/*
+ * When the user wants to journal a newly created buffer_head
+ * (ie. getblk() returned a new buffer and we are going to populate it
+ * manually rather than reading off disk), then we need to keep the
+ * buffer_head locked until it has been completely filled with new
+ * data. In this case, we should be able to make the assertion that
+ * the bh is not already part of an existing transaction.
+ *
+ * The buffer should already be locked by the caller by this point.
+ * There is no lock ranking violation: it was a newly created,
+ * unlocked buffer beforehand. */
+
+/**
+ * int jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * @handle: transaction to add the new buffer to
+ * @bh: new buffer.
+ *
+ * Call this if you create a new bh.
+ */
+int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ int err;
+
+ jbd_debug(5, "journal_head %p\n", jh);
+ err = -EROFS;
+ if (is_handle_aborted(handle))
+ goto out;
+ err = 0;
+
+ JBUFFER_TRACE(jh, "entry");
+ /*
+ * The buffer may already belong to this transaction due to pre-zeroing
+ * in the filesystem's new_block code. It may also be on the previous,
+ * committing transaction's lists, but it HAS to be in Forget state in
+ * that case: the transaction must have deleted the buffer for it to be
+ * reused here.
+ */
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+ jh->b_transaction == NULL ||
+ (jh->b_transaction == journal->j_committing_transaction &&
+ jh->b_jlist == BJ_Forget)));
+
+ J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+ if (jh->b_transaction == NULL) {
+ jh->b_transaction = transaction;
+ JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+ } else if (jh->b_transaction == journal->j_committing_transaction) {
+ JBUFFER_TRACE(jh, "set next transaction");
+ jh->b_next_transaction = transaction;
+ }
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+
+ /*
+ * akpm: I added this. ext3_alloc_branch can pick up new indirect
+ * blocks which contain freed but then revoked metadata. We need
+ * to cancel the revoke in case we end up freeing it yet again
+ * and then reallocating it as data - this would cause a second revoke,
+ * which hits an assertion error.
+ */
+ JBUFFER_TRACE(jh, "cancelling revoke");
+ jbd2_journal_cancel_revoke(handle, jh);
+ jbd2_journal_put_journal_head(jh);
+out:
+ return err;
+}
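+
+/*
+ * Sketch of how a filesystem typically initialises a freshly allocated
+ * metadata block with this call ("sb" and "blocknr" are caller-supplied;
+ * the real callers live in the filesystem, not here):
+ *
+ *	bh = sb_getblk(sb, blocknr);
+ *	lock_buffer(bh);
+ *	err = jbd2_journal_get_create_access(handle, bh);
+ *	if (!err) {
+ *		memset(bh->b_data, 0, sb->s_blocksize);
+ *		set_buffer_uptodate(bh);
+ *	}
+ *	unlock_buffer(bh);
+ *	if (!err)
+ *		err = jbd2_journal_dirty_metadata(handle, bh);
+ */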
+
+/**
+ * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
+ * non-rewindable consequences
+ * @handle: transaction
+ * @bh: buffer to undo
+ *
+ * Sometimes there is a need to distinguish between metadata which has
+ * been committed to disk and that which has not. The ext3fs code uses
+ * this for freeing and allocating space, we have to make sure that we
+ * do not reuse freed space until the deallocation has been committed,
+ * since if we overwrote that space we would make the delete
+ * un-rewindable in case of a crash.
+ *
+ * To deal with that, jbd2_journal_get_undo_access requests write access to a
+ * buffer for parts of non-rewindable operations such as delete
+ * operations on the bitmaps. The journaling code must keep a copy of
+ * the buffer's contents prior to the undo_access call until such time
+ * as we know that the buffer has definitely been committed to disk.
+ *
+ * We never need to know which transaction the committed data is part
+ * of, buffers touched here are guaranteed to be dirtied later and so
+ * will be committed to a new transaction in due course, at which point
+ * we can discard the old committed data pointer.
+ *
+ * Returns error number or 0 on success.
+ */
+int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
+{
+ int err;
+ struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ char *committed_data = NULL;
+
+ JBUFFER_TRACE(jh, "entry");
+
+ /*
+ * Do this first --- it can drop the journal lock, so we want to
+ * make sure that obtaining the committed_data is done
+ * atomically wrt. completion of any outstanding commits.
+ */
+ err = do_get_write_access(handle, jh, 1);
+ if (err)
+ goto out;
+
+repeat:
+ if (!jh->b_committed_data) {
+ committed_data = jbd2_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
+ if (!committed_data) {
+ printk(KERN_EMERG "%s: No memory for committed data\n",
+ __FUNCTION__);
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ jbd_lock_bh_state(bh);
+ if (!jh->b_committed_data) {
+ /* Copy out the current buffer contents into the
+ * preserved, committed copy. */
+ JBUFFER_TRACE(jh, "generate b_committed data");
+ if (!committed_data) {
+ jbd_unlock_bh_state(bh);
+ goto repeat;
+ }
+
+ jh->b_committed_data = committed_data;
+ committed_data = NULL;
+ memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
+ }
+ jbd_unlock_bh_state(bh);
+out:
+ jbd2_journal_put_journal_head(jh);
+ if (unlikely(committed_data))
+ jbd2_slab_free(committed_data, bh->b_size);
+ return err;
+}
+
+/**
+ * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
+ * needs to be flushed before we can commit the
+ * current transaction.
+ * @handle: transaction
+ * @bh: bufferhead to mark
+ *
+ * The buffer is placed on the transaction's data list and is marked as
+ * belonging to the transaction.
+ *
+ * Returns error number or 0 on success.
+ *
+ * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
+ * by kswapd.
+ */
+int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+{
+ journal_t *journal = handle->h_transaction->t_journal;
+ int need_brelse = 0;
+ struct journal_head *jh;
+
+ if (is_handle_aborted(handle))
+ return 0;
+
+ jh = jbd2_journal_add_journal_head(bh);
+ JBUFFER_TRACE(jh, "entry");
+
+ /*
+ * The buffer could *already* be dirty. Writeout can start
+ * at any time.
+ */
+ jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
+
+ /*
+ * What if the buffer is already part of a running transaction?
+ *
+ * There are two cases:
+ * 1) It is part of the current running transaction. Refile it,
+ * just in case we have allocated it as metadata, deallocated
+ * it, then reallocated it as data.
+ * 2) It is part of the previous, still-committing transaction.
+ * If all we want to do is to guarantee that the buffer will be
+ * written to disk before this new transaction commits, then
+ * being sure that the *previous* transaction has this same
+ * property is sufficient for us! Just leave it on its old
+ * transaction.
+ *
+ * In case (2), the buffer must not already exist as metadata
+ * --- that would violate write ordering (a transaction is free
+ * to write its data at any point, even before the previous
+ * committing transaction has committed). The caller must
+ * never, ever allow this to happen: there's nothing we can do
+ * about it in this layer.
+ */
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+ if (jh->b_transaction) {
+ JBUFFER_TRACE(jh, "has transaction");
+ if (jh->b_transaction != handle->h_transaction) {
+ JBUFFER_TRACE(jh, "belongs to older transaction");
+ J_ASSERT_JH(jh, jh->b_transaction ==
+ journal->j_committing_transaction);
+
+ /* @@@ IS THIS TRUE ? */
+ /*
+ * Not any more. Scenario: someone does a write()
+ * in data=journal mode. The buffer's transaction has
+ * moved into commit. Then someone does another
+ * write() to the file. We do the frozen data copyout
+ * and set b_next_transaction to point to j_running_t.
+ * And while we're in that state, someone does a
+ * writepage() in an attempt to pageout the same area
+ * of the file via a shared mapping. At present that
+ * calls jbd2_journal_dirty_data(), and we get right here.
+ * It may be too late to journal the data. Simply
+ * falling through to the next test will suffice: the
+ * data will be dirty and will be checkpointed. The
+ * ordering comments in the next comment block still
+ * apply.
+ */
+ //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
+ /*
+ * If we're journalling data, and this buffer was
+ * subject to a write(), it could be metadata, forget
+ * or shadow against the committing transaction. Now,
+ * someone has dirtied the same darn page via a mapping
+ * and it is being writepage()'d.
+ * We *could* just steal the page from commit, with some
+ * fancy locking there. Instead, we just skip it -
+ * don't tie the page's buffers to the new transaction
+ * at all.
+ * Implication: if we crash before the writepage() data
+ * is written into the filesystem, recovery will replay
+ * the write() data.
+ */
+ if (jh->b_jlist != BJ_None &&
+ jh->b_jlist != BJ_SyncData &&
+ jh->b_jlist != BJ_Locked) {
+ JBUFFER_TRACE(jh, "Not stealing");
+ goto no_journal;
+ }
+
+ /*
+ * This buffer may be undergoing writeout in commit. We
+ * can't return from here and let the caller dirty it
+ * again because that can cause the write-out loop in
+ * commit to never terminate.
+ */
+ if (buffer_dirty(bh)) {
+ get_bh(bh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ need_brelse = 1;
+ sync_dirty_buffer(bh);
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+ /* The buffer may become locked again at any
+ time if it is redirtied */
+ }
+
+ /* journal_clean_data_list() may have got there first */
+ if (jh->b_transaction != NULL) {
+ JBUFFER_TRACE(jh, "unfile from commit");
+ __jbd2_journal_temp_unlink_buffer(jh);
+ /* It still points to the committing
+ * transaction; move it to this one so
+ * that the refile assert checks are
+ * happy. */
+ jh->b_transaction = handle->h_transaction;
+ }
+ /* The buffer will be refiled below */
+
+ }
+ /*
+ * Special case --- the buffer might actually have been
+ * allocated and then immediately deallocated in the previous,
+ * committing transaction, so might still be left on that
+ * transaction's metadata lists.
+ */
+ if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
+ JBUFFER_TRACE(jh, "not on correct data list: unfile");
+ J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
+ __jbd2_journal_temp_unlink_buffer(jh);
+ jh->b_transaction = handle->h_transaction;
+ JBUFFER_TRACE(jh, "file as data");
+ __jbd2_journal_file_buffer(jh, handle->h_transaction,
+ BJ_SyncData);
+ }
+ } else {
+ JBUFFER_TRACE(jh, "not on a transaction");
+ __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
+ }
+no_journal:
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ if (need_brelse) {
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ }
+ JBUFFER_TRACE(jh, "exit");
+ jbd2_journal_put_journal_head(jh);
+ return 0;
+}
+
+/**
+ * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * @handle: transaction to add buffer to.
+ * @bh: buffer to mark
+ *
+ * mark dirty metadata which needs to be journaled as part of the current
+ * transaction.
+ *
+ * The buffer is placed on the transaction's metadata list and is marked
+ * as belonging to the transaction.
+ *
+ * Returns error number or 0 on success.
+ *
+ * Special care needs to be taken if the buffer already belongs to the
+ * current committing transaction (in which case we should have frozen
+ * data present for that commit). In that case, we don't relink the
+ * buffer: that only gets done when the old transaction finally
+ * completes its commit.
+ */
+int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ struct journal_head *jh = bh2jh(bh);
+
+ jbd_debug(5, "journal_head %p\n", jh);
+ JBUFFER_TRACE(jh, "entry");
+ if (is_handle_aborted(handle))
+ goto out;
+
+ jbd_lock_bh_state(bh);
+
+ if (jh->b_modified == 0) {
+ /*
+ * This buffer has been modified and is becoming part
+ * of the transaction. This needs to be done
+ * once per transaction -bzzz
+ */
+ jh->b_modified = 1;
+ J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+ handle->h_buffer_credits--;
+ }
+
+ /*
+ * fastpath, to avoid expensive locking. If this buffer is already
+ * on the running transaction's metadata list there is nothing to do.
+ * Nobody can take it off again because there is a handle open.
+ * I _think_ we're OK here with SMP barriers - a mistaken decision will
+ * result in this test being false, so we go in and take the locks.
+ */
+ if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
+ JBUFFER_TRACE(jh, "fastpath");
+ J_ASSERT_JH(jh, jh->b_transaction ==
+ journal->j_running_transaction);
+ goto out_unlock_bh;
+ }
+
+ set_buffer_jbddirty(bh);
+
+ /*
+ * Metadata already on the current transaction list doesn't
+ * need to be filed. Metadata on another transaction's list must
+ * be committing, and will be refiled once the commit completes:
+ * leave it alone for now.
+ */
+ if (jh->b_transaction != transaction) {
+ JBUFFER_TRACE(jh, "already on other transaction");
+ J_ASSERT_JH(jh, jh->b_transaction ==
+ journal->j_committing_transaction);
+ J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+ /* And this case is illegal: we can't reuse another
+ * transaction's data buffer, ever. */
+ goto out_unlock_bh;
+ }
+
+ /* That test should have eliminated the following case: */
+ J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+
+ JBUFFER_TRACE(jh, "file as BJ_Metadata");
+ spin_lock(&journal->j_list_lock);
+ __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+ spin_unlock(&journal->j_list_lock);
+out_unlock_bh:
+ jbd_unlock_bh_state(bh);
+out:
+ JBUFFER_TRACE(jh, "exit");
+ return 0;
+}
+
+/*
+ * jbd2_journal_release_buffer: undo a get_write_access without any buffer
+ * updates, if the update decided in the end that it didn't need access.
+ *
+ */
+void
+jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+{
+ BUFFER_TRACE(bh, "entry");
+}
+
+/**
+ * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
+ * @handle: transaction handle
+ * @bh: bh to 'forget'
+ *
+ * We can only do the bforget if there are no commits pending against the
+ * buffer. If the buffer is dirty in the current running transaction we
+ * can safely unlink it.
+ *
+ * bh may not be a journalled buffer at all - it may be a non-JBD
+ * buffer which came off the hashtable. Check for this.
+ *
+ * Decrements bh->b_count by one.
+ *
+ * Allow this call even if the handle has aborted --- it may be part of
+ * the caller's cleanup after an abort.
+ */
+int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ struct journal_head *jh;
+ int drop_reserve = 0;
+ int err = 0;
+
+ BUFFER_TRACE(bh, "entry");
+
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+
+ if (!buffer_jbd(bh))
+ goto not_jbd;
+ jh = bh2jh(bh);
+
+ /* Critical error: attempting to delete a bitmap buffer, maybe?
+ * Don't do any jbd operations, and return an error. */
+ if (!J_EXPECT_JH(jh, !jh->b_committed_data,
+ "inconsistent data on disk")) {
+ err = -EIO;
+ goto not_jbd;
+ }
+
+ /*
+ * The buffer's going from the transaction, we must drop
+ * all references -bzzz
+ */
+ jh->b_modified = 0;
+
+ if (jh->b_transaction == handle->h_transaction) {
+ J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+ /* If we are forgetting a buffer which is already part
+ * of this transaction, then we can just drop it from
+ * the transaction immediately. */
+ clear_buffer_dirty(bh);
+ clear_buffer_jbddirty(bh);
+
+ JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+
+ drop_reserve = 1;
+
+ /*
+ * We are no longer going to journal this buffer.
+ * However, the commit of this transaction is still
+ * important to the buffer: the delete that we are now
+ * processing might obsolete an old log entry, so by
+ * committing, we can satisfy the buffer's checkpoint.
+ *
+ * So, if we have a checkpoint on the buffer, we should
+ * now refile the buffer on our BJ_Forget list so that
+ * we know to remove the checkpoint after we commit.
+ */
+
+ if (jh->b_cp_transaction) {
+ __jbd2_journal_temp_unlink_buffer(jh);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+ } else {
+ __jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_remove_journal_head(bh);
+ __brelse(bh);
+ if (!buffer_jbd(bh)) {
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ __bforget(bh);
+ goto drop;
+ }
+ }
+ } else if (jh->b_transaction) {
+ J_ASSERT_JH(jh, (jh->b_transaction ==
+ journal->j_committing_transaction));
+ /* However, if the buffer is still owned by a prior
+ * (committing) transaction, we can't drop it yet... */
+ JBUFFER_TRACE(jh, "belongs to older transaction");
+ /* ... but we CAN drop it from the new transaction if we
+ * have also modified it since the original commit. */
+
+ if (jh->b_next_transaction) {
+ J_ASSERT(jh->b_next_transaction == transaction);
+ jh->b_next_transaction = NULL;
+ drop_reserve = 1;
+ }
+ }
+
+not_jbd:
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ __brelse(bh);
+drop:
+ if (drop_reserve) {
+ /* no need to reserve log space for this block -bzzz */
+ handle->h_buffer_credits++;
+ }
+ return err;
+}
+
+/**
+ * int jbd2_journal_stop() - complete a transaction
+ * @handle: transaction to complete.
+ *
+ * All done for a particular handle.
+ *
+ * There is not much action needed here. We just return any remaining
+ * buffer credits to the transaction and remove the handle. The only
+ * complication is that we need to start a commit operation if the
+ * filesystem is marked for synchronous update.
+ *
+ * jbd2_journal_stop itself will not usually return an error, but it may
+ * do so in unusual circumstances. In particular, expect it to
+ * return -EIO if a jbd2_journal_abort has been executed since the
+ * transaction began.
+ */
+int jbd2_journal_stop(handle_t *handle)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+ int old_handle_count, err;
+ pid_t pid;
+
+ J_ASSERT(journal_current_handle() == handle);
+
+ if (is_handle_aborted(handle))
+ err = -EIO;
+ else {
+ J_ASSERT(transaction->t_updates > 0);
+ err = 0;
+ }
+
+ if (--handle->h_ref > 0) {
+ jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ handle->h_ref);
+ return err;
+ }
+
+ jbd_debug(4, "Handle %p going down\n", handle);
+
+ /*
+ * Implement synchronous transaction batching. If the handle
+ * was synchronous, don't force a commit immediately. Let's
+ * yield and let another thread piggyback onto this transaction.
+ * Keep doing that while new threads continue to arrive.
+ * It doesn't cost much - we're about to run a commit and sleep
+ * on IO anyway. Speeds up many-threaded, many-dir operations
+ * by 30x or more...
+ *
+ * But don't do this if this process was the most recent one to
+ * perform a synchronous write. We do this to detect the case where a
+ * single process is doing a stream of sync writes. No point in waiting
+ * for joiners in that case.
+ */
+ pid = current->pid;
+ if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ journal->j_last_sync_writer = pid;
+ do {
+ old_handle_count = transaction->t_handle_count;
+ schedule_timeout_uninterruptible(1);
+ } while (old_handle_count != transaction->t_handle_count);
+ }
+
+ current->journal_info = NULL;
+ spin_lock(&journal->j_state_lock);
+ spin_lock(&transaction->t_handle_lock);
+ transaction->t_outstanding_credits -= handle->h_buffer_credits;
+ transaction->t_updates--;
+ if (!transaction->t_updates) {
+ wake_up(&journal->j_wait_updates);
+ if (journal->j_barrier_count)
+ wake_up(&journal->j_wait_transaction_locked);
+ }
+
+ /*
+ * If the handle is marked SYNC, we need to set another commit
+ * going! We also want to force a commit if the current
+ * transaction is occupying too much of the log, or if the
+ * transaction is too old now.
+ */
+ if (handle->h_sync ||
+ transaction->t_outstanding_credits >
+ journal->j_max_transaction_buffers ||
+ time_after_eq(jiffies, transaction->t_expires)) {
+ /* Do this even for aborted journals: an abort still
+ * completes the commit thread, it just doesn't write
+ * anything to disk. */
+ tid_t tid = transaction->t_tid;
+
+ spin_unlock(&transaction->t_handle_lock);
+ jbd_debug(2, "transaction too old, requesting commit for "
+ "handle %p\n", handle);
+ /* This is non-blocking */
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ spin_unlock(&journal->j_state_lock);
+
+ /*
+ * Special case: JBD2_SYNC synchronous updates require us
+ * to wait for the commit to complete.
+ */
+ if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+ err = jbd2_log_wait_commit(journal, tid);
+ } else {
+ spin_unlock(&transaction->t_handle_lock);
+ spin_unlock(&journal->j_state_lock);
+ }
+
+ jbd_free_handle(handle);
+ return err;
+}
+
+/**
+ * int jbd2_journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * For synchronous operations: force any uncommitted transactions
+ * to disk. May seem kludgy, but it reuses all the handle batching
+ * code in a very simple manner.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+ handle_t *handle;
+ int ret;
+
+ handle = jbd2_journal_start(journal, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ } else {
+ handle->h_sync = 1;
+ ret = jbd2_journal_stop(handle);
+ }
+ return ret;
+}
+
+/*
+ *
+ * List management code snippets: various functions for manipulating the
+ * transaction buffer lists.
+ *
+ */
+
+/*
+ * Append a buffer to a transaction list, given the transaction's list head
+ * pointer.
+ *
+ * j_list_lock is held.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+{
+ if (!*list) {
+ jh->b_tnext = jh->b_tprev = jh;
+ *list = jh;
+ } else {
+ /* Insert at the tail of the list to preserve order */
+ struct journal_head *first = *list, *last = first->b_tprev;
+ jh->b_tprev = last;
+ jh->b_tnext = first;
+ last->b_tnext = first->b_tprev = jh;
+ }
+}
+
+/*
+ * Remove a buffer from a transaction list, given the transaction's list
+ * head pointer.
+ *
+ * Called with j_list_lock held, and the journal may not be locked.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+{
+ if (*list == jh) {
+ *list = jh->b_tnext;
+ if (*list == jh)
+ *list = NULL;
+ }
+ jh->b_tprev->b_tnext = jh->b_tnext;
+ jh->b_tnext->b_tprev = jh->b_tprev;
+}
+
+/*
+ * Remove a buffer from the appropriate transaction list.
+ *
+ * Note that this function can *change* the value of
+ * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
+ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
+ * is holding onto a copy of one of these pointers, it could go bad.
+ * Generally the caller needs to re-read the pointer from the transaction_t.
+ *
+ * Called under j_list_lock. The journal may not be locked.
+ */
+void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
+{
+ struct journal_head **list = NULL;
+ transaction_t *transaction;
+ struct buffer_head *bh = jh2bh(jh);
+
+ J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ transaction = jh->b_transaction;
+ if (transaction)
+ assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+ if (jh->b_jlist != BJ_None)
+ J_ASSERT_JH(jh, transaction != 0);
+
+ switch (jh->b_jlist) {
+ case BJ_None:
+ return;
+ case BJ_SyncData:
+ list = &transaction->t_sync_datalist;
+ break;
+ case BJ_Metadata:
+ transaction->t_nr_buffers--;
+ J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+ list = &transaction->t_buffers;
+ break;
+ case BJ_Forget:
+ list = &transaction->t_forget;
+ break;
+ case BJ_IO:
+ list = &transaction->t_iobuf_list;
+ break;
+ case BJ_Shadow:
+ list = &transaction->t_shadow_list;
+ break;
+ case BJ_LogCtl:
+ list = &transaction->t_log_list;
+ break;
+ case BJ_Reserved:
+ list = &transaction->t_reserved_list;
+ break;
+ case BJ_Locked:
+ list = &transaction->t_locked_list;
+ break;
+ }
+
+ __blist_del_buffer(list, jh);
+ jh->b_jlist = BJ_None;
+ if (test_clear_buffer_jbddirty(bh))
+ mark_buffer_dirty(bh); /* Expose it to the VM */
+}
+
+void __jbd2_journal_unfile_buffer(struct journal_head *jh)
+{
+ __jbd2_journal_temp_unlink_buffer(jh);
+ jh->b_transaction = NULL;
+}
+
+void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
+{
+ jbd_lock_bh_state(jh2bh(jh));
+ spin_lock(&journal->j_list_lock);
+ __jbd2_journal_unfile_buffer(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(jh2bh(jh));
+}
+
+/*
+ * Called from jbd2_journal_try_to_free_buffers().
+ *
+ * Called under jbd_lock_bh_state(bh)
+ */
+static void
+__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
+{
+ struct journal_head *jh;
+
+ jh = bh2jh(bh);
+
+ if (buffer_locked(bh) || buffer_dirty(bh))
+ goto out;
+
+ if (jh->b_next_transaction != 0)
+ goto out;
+
+ spin_lock(&journal->j_list_lock);
+ if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+ if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
+ /* A written-back ordered data buffer */
+ JBUFFER_TRACE(jh, "release data");
+ __jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_remove_journal_head(bh);
+ __brelse(bh);
+ }
+ } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+ /* written-back checkpointed metadata buffer */
+ if (jh->b_jlist == BJ_None) {
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ __jbd2_journal_remove_checkpoint(jh);
+ jbd2_journal_remove_journal_head(bh);
+ __brelse(bh);
+ }
+ }
+ spin_unlock(&journal->j_list_lock);
+out:
+ return;
+}
+
+
+/**
+ * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * @journal: journal for operation
+ * @page: to try and free
+ * @unused_gfp_mask: unused
+ *
+ *
+ * For all the buffers on this page,
+ * if they are fully written out ordered data, move them onto BUF_CLEAN
+ * so try_to_free_buffers() can reap them.
+ *
+ * This function returns non-zero if we wish try_to_free_buffers()
+ * to be called. We do this if the page is releasable by try_to_free_buffers().
+ * We also do it if the page has locked or dirty buffers and the caller wants
+ * us to perform sync or async writeout.
+ *
+ * This complicates JBD locking somewhat. We aren't protected by the
+ * BKL here. We wish to remove the buffer from its committing or
+ * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
+ *
+ * This may *change* the value of transaction_t->t_datalist, so anyone
+ * who looks at t_datalist needs to lock against this function.
+ *
+ * Even worse, someone may be doing a jbd2_journal_dirty_data on this
+ * buffer. So we need to lock against that. jbd2_journal_dirty_data()
+ * will come out of the lock with the buffer dirty, which makes it
+ * ineligible for release here.
+ *
+ * Who else is affected by this? hmm... Really the only contender
+ * is do_get_write_access() - it could be looking at the buffer while
+ * journal_try_to_free_buffer() is changing its state. But that
+ * cannot happen because we never reallocate freed data as metadata
+ * while the data is part of a transaction. Yes?
+ */
+int jbd2_journal_try_to_free_buffers(journal_t *journal,
+ struct page *page, gfp_t unused_gfp_mask)
+{
+ struct buffer_head *head;
+ struct buffer_head *bh;
+ int ret = 0;
+
+ J_ASSERT(PageLocked(page));
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ struct journal_head *jh;
+
+ /*
+ * We take our own ref against the journal_head here to avoid
+ * having to add tons of locking around each instance of
+ * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ */
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh)
+ continue;
+
+ jbd_lock_bh_state(bh);
+ __journal_try_to_free_buffer(journal, bh);
+ jbd2_journal_put_journal_head(jh);
+ jbd_unlock_bh_state(bh);
+ if (buffer_jbd(bh))
+ goto busy;
+ } while ((bh = bh->b_this_page) != head);
+ ret = try_to_free_buffers(page);
+busy:
+ return ret;
+}
+
+/*
+ * This buffer is no longer needed. If it is on an older transaction's
+ * checkpoint list we need to record it on this transaction's forget list
+ * to pin this buffer (and hence its checkpointing transaction) down until
+ * this transaction commits. If the buffer isn't on a checkpoint list, we
+ * release it.
+ * Returns non-zero if JBD no longer has an interest in the buffer.
+ *
+ * Called under j_list_lock.
+ *
+ * Called under jbd_lock_bh_state(bh).
+ */
+static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
+{
+ int may_free = 1;
+ struct buffer_head *bh = jh2bh(jh);
+
+ __jbd2_journal_unfile_buffer(jh);
+
+ if (jh->b_cp_transaction) {
+ JBUFFER_TRACE(jh, "on running+cp transaction");
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+ clear_buffer_jbddirty(bh);
+ may_free = 0;
+ } else {
+ JBUFFER_TRACE(jh, "on running transaction");
+ jbd2_journal_remove_journal_head(bh);
+ __brelse(bh);
+ }
+ return may_free;
+}
+
+/*
+ * jbd2_journal_invalidatepage
+ *
+ * This code is tricky. It has a number of cases to deal with.
+ *
+ * There are two invariants which this code relies on:
+ *
+ * i_size must be updated on disk before we start calling invalidatepage on the
+ * data.
+ *
+ * This is done in ext3 by defining an ext3_setattr method which
+ * updates i_size before truncate gets going. By maintaining this
+ * invariant, we can be sure that it is safe to throw away any buffers
+ * attached to the current transaction: once the transaction commits,
+ * we know that the data will not be needed.
+ *
+ * Note however that we can *not* throw away data belonging to the
+ * previous, committing transaction!
+ *
+ * Any disk blocks which *are* part of the previous, committing
+ * transaction (and which therefore cannot be discarded immediately) are
+ * not going to be reused in the new running transaction.
+ *
+ * The bitmap committed_data images guarantee this: any block which is
+ * allocated in one transaction and removed in the next will be marked
+ * as in-use in the committed_data bitmap, so cannot be reused until
+ * the next transaction to delete the block commits. This means that
+ * leaving committing buffers dirty is quite safe: the disk blocks
+ * cannot be reallocated to a different file and so buffer aliasing is
+ * not possible.
+ *
+ *
+ * The above applies mainly to ordered data mode. In writeback mode we
+ * don't make guarantees about the order in which data hits disk --- in
+ * particular we don't guarantee that new dirty data is flushed before
+ * transaction commit --- so it is always safe just to discard data
+ * immediately in that mode. --sct
+ */
+
+/*
+ * The journal_unmap_buffer helper function returns zero if the buffer
+ * concerned remains pinned as an anonymous buffer belonging to an older
+ * transaction.
+ *
+ * We're outside-transaction here. Either or both of j_running_transaction
+ * and j_committing_transaction may be NULL.
+ */
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+{
+ transaction_t *transaction;
+ struct journal_head *jh;
+ int may_free = 1;
+ int ret;
+
+ BUFFER_TRACE(bh, "entry");
+
+ /*
+ * It is safe to proceed here without the j_list_lock because the
+ * buffers cannot be stolen by try_to_free_buffers as long as we are
+ * holding the page lock. --sct
+ */
+
+ if (!buffer_jbd(bh))
+ goto zap_buffer_unlocked;
+
+ spin_lock(&journal->j_state_lock);
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh)
+ goto zap_buffer_no_jh;
+
+ transaction = jh->b_transaction;
+ if (transaction == NULL) {
+ /* First case: not on any transaction. If it
+ * has no checkpoint link, then we can zap it:
+ * it's a writeback-mode buffer so we don't care
+ * if it hits disk safely. */
+ if (!jh->b_cp_transaction) {
+ JBUFFER_TRACE(jh, "not on any transaction: zap");
+ goto zap_buffer;
+ }
+
+ if (!buffer_dirty(bh)) {
+ /* bdflush has written it. We can drop it now */
+ goto zap_buffer;
+ }
+
+ /* OK, it must be in the journal but still not
+ * written fully to disk: it's metadata or
+ * journaled data... */
+
+ if (journal->j_running_transaction) {
+ /* ... and once the current transaction has
+ * committed, the buffer won't be needed any
+ * longer. */
+ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+ ret = __dispose_buffer(jh,
+ journal->j_running_transaction);
+ jbd2_journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ spin_unlock(&journal->j_state_lock);
+ return ret;
+ } else {
+ /* There is no currently-running transaction. So the
+ * orphan record which we wrote for this file must have
+ * passed into commit. We must attach this buffer to
+ * the committing transaction, if it exists. */
+ if (journal->j_committing_transaction) {
+ JBUFFER_TRACE(jh, "give to committing trans");
+ ret = __dispose_buffer(jh,
+ journal->j_committing_transaction);
+ jbd2_journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ spin_unlock(&journal->j_state_lock);
+ return ret;
+ } else {
+ /* The orphan record's transaction has
+ * committed. We can cleanse this buffer */
+ clear_buffer_jbddirty(bh);
+ goto zap_buffer;
+ }
+ }
+ } else if (transaction == journal->j_committing_transaction) {
+ if (jh->b_jlist == BJ_Locked) {
+ /*
+ * The buffer is on the committing transaction's locked
+ * list. We have the buffer locked, so I/O has
+ * completed. So we can nail the buffer now.
+ */
+ may_free = __dispose_buffer(jh, transaction);
+ goto zap_buffer;
+ }
+ /*
+ * If it is committing, we simply cannot touch it. We
+ * can remove its next_transaction pointer from the
+ * running transaction if that is set, but nothing
+ * else. */
+ JBUFFER_TRACE(jh, "on committing transaction");
+ set_buffer_freed(bh);
+ if (jh->b_next_transaction) {
+ J_ASSERT(jh->b_next_transaction ==
+ journal->j_running_transaction);
+ jh->b_next_transaction = NULL;
+ }
+ jbd2_journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ spin_unlock(&journal->j_state_lock);
+ return 0;
+ } else {
+ /* Good, the buffer belongs to the running transaction.
+ * We are writing our own transaction's data, not any
+ * previous one's, so it is safe to throw it away
+ * (remember that we expect the filesystem to have set
+ * i_size already for this truncate so recovery will not
+ * expose the disk blocks we are discarding here.) */
+ J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+ may_free = __dispose_buffer(jh, transaction);
+ }
+
+zap_buffer:
+ jbd2_journal_put_journal_head(jh);
+zap_buffer_no_jh:
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ spin_unlock(&journal->j_state_lock);
+zap_buffer_unlocked:
+ clear_buffer_dirty(bh);
+ J_ASSERT_BH(bh, !buffer_jbddirty(bh));
+ clear_buffer_mapped(bh);
+ clear_buffer_req(bh);
+ clear_buffer_new(bh);
+ bh->b_bdev = NULL;
+ return may_free;
+}
+
+/**
+ * void jbd2_journal_invalidatepage()
+ * @journal: journal to use for flush...
+ * @page: page to flush
+ * @offset: start of the range to invalidate.
+ *
+ * Reap page buffers containing data after offset in page.
+ *
+ */
+void jbd2_journal_invalidatepage(journal_t *journal,
+ struct page *page,
+ unsigned long offset)
+{
+ struct buffer_head *head, *bh, *next;
+ unsigned int curr_off = 0;
+ int may_free = 1;
+
+ if (!PageLocked(page))
+ BUG();
+ if (!page_has_buffers(page))
+ return;
+
+ /* We will potentially be playing with lists other than just the
+ * data lists (especially for journaled data mode), so be
+ * cautious in our locking. */
+
+ head = bh = page_buffers(page);
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+ next = bh->b_this_page;
+
+ if (offset <= curr_off) {
+ /* This block is wholly outside the truncation point */
+ lock_buffer(bh);
+ may_free &= journal_unmap_buffer(journal, bh);
+ unlock_buffer(bh);
+ }
+ curr_off = next_off;
+ bh = next;
+
+ } while (bh != head);
+
+ if (!offset) {
+ if (may_free && try_to_free_buffers(page))
+ J_ASSERT(!page_has_buffers(page));
+ }
+}
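
A sketch (not from this patch) of the calling convention: the filesystem's invalidate hook passes the truncation offset straight through, and jbd2_journal_invalidatepage() decides per buffer whether it can be zapped or must stay pinned to a transaction. example_get_journal() is again a hypothetical accessor.

static void example_invalidatepage(struct page *page, unsigned long offset)
{
	journal_t *journal = example_get_journal(page->mapping->host);

	/* Buffers wholly beyond 'offset' are unmapped; offset 0 drops them all. */
	jbd2_journal_invalidatepage(journal, page, offset);
}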
+
+/*
+ * File a buffer on the given transaction list.
+ */
+void __jbd2_journal_file_buffer(struct journal_head *jh,
+ transaction_t *transaction, int jlist)
+{
+ struct journal_head **list = NULL;
+ int was_dirty = 0;
+ struct buffer_head *bh = jh2bh(jh);
+
+ J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+ J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+ J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+ jh->b_transaction == 0);
+
+ if (jh->b_transaction && jh->b_jlist == jlist)
+ return;
+
+ /* The following list of buffer states needs to be consistent
+ * with __jbd_unexpected_dirty_buffer()'s handling of dirty
+ * state. */
+
+ if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+ jlist == BJ_Shadow || jlist == BJ_Forget) {
+ if (test_clear_buffer_dirty(bh) ||
+ test_clear_buffer_jbddirty(bh))
+ was_dirty = 1;
+ }
+
+ if (jh->b_transaction)
+ __jbd2_journal_temp_unlink_buffer(jh);
+ jh->b_transaction = transaction;
+
+ switch (jlist) {
+ case BJ_None:
+ J_ASSERT_JH(jh, !jh->b_committed_data);
+ J_ASSERT_JH(jh, !jh->b_frozen_data);
+ return;
+ case BJ_SyncData:
+ list = &transaction->t_sync_datalist;
+ break;
+ case BJ_Metadata:
+ transaction->t_nr_buffers++;
+ list = &transaction->t_buffers;
+ break;
+ case BJ_Forget:
+ list = &transaction->t_forget;
+ break;
+ case BJ_IO:
+ list = &transaction->t_iobuf_list;
+ break;
+ case BJ_Shadow:
+ list = &transaction->t_shadow_list;
+ break;
+ case BJ_LogCtl:
+ list = &transaction->t_log_list;
+ break;
+ case BJ_Reserved:
+ list = &transaction->t_reserved_list;
+ break;
+ case BJ_Locked:
+ list = &transaction->t_locked_list;
+ break;
+ }
+
+ __blist_add_buffer(list, jh);
+ jh->b_jlist = jlist;
+
+ if (was_dirty)
+ set_buffer_jbddirty(bh);
+}
+
+void jbd2_journal_file_buffer(struct journal_head *jh,
+ transaction_t *transaction, int jlist)
+{
+ jbd_lock_bh_state(jh2bh(jh));
+ spin_lock(&transaction->t_journal->j_list_lock);
+ __jbd2_journal_file_buffer(jh, transaction, jlist);
+ spin_unlock(&transaction->t_journal->j_list_lock);
+ jbd_unlock_bh_state(jh2bh(jh));
+}
+
+/*
+ * Remove a buffer from its current buffer list in preparation for
+ * dropping it from its current transaction entirely. If the buffer has
+ * already started to be used by a subsequent transaction, refile the
+ * buffer on that transaction's metadata list.
+ *
+ * Called under journal->j_list_lock
+ *
+ * Called under jbd_lock_bh_state(jh2bh(jh))
+ */
+void __jbd2_journal_refile_buffer(struct journal_head *jh)
+{
+ int was_dirty;
+ struct buffer_head *bh = jh2bh(jh);
+
+ J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ if (jh->b_transaction)
+ assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
+
+ /* If the buffer is now unused, just drop it. */
+ if (jh->b_next_transaction == NULL) {
+ __jbd2_journal_unfile_buffer(jh);
+ return;
+ }
+
+ /*
+ * It has been modified by a later transaction: add it to the new
+ * transaction's metadata list.
+ */
+
+ was_dirty = test_clear_buffer_jbddirty(bh);
+ __jbd2_journal_temp_unlink_buffer(jh);
+ jh->b_transaction = jh->b_next_transaction;
+ jh->b_next_transaction = NULL;
+ __jbd2_journal_file_buffer(jh, jh->b_transaction,
+ was_dirty ? BJ_Metadata : BJ_Reserved);
+ J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+
+ if (was_dirty)
+ set_buffer_jbddirty(bh);
+}
+
+/*
+ * For the unlocked version of this call, also make sure that any
+ * hanging journal_head is cleaned up if necessary.
+ *
+ * __jbd2_journal_refile_buffer is usually called as part of a single locked
+ * operation on a buffer_head, in which the caller is probably going to
+ * be hooking the journal_head onto other lists. In that case it is up
+ * to the caller to remove the journal_head if necessary. For the
+ * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
+ * doing anything else to the buffer so we need to do the cleanup
+ * ourselves to avoid a jh leak.
+ *
+ * *** The journal_head may be freed by this call! ***
+ */
+void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+
+ __jbd2_journal_refile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ jbd2_journal_remove_journal_head(bh);
+
+ spin_unlock(&journal->j_list_lock);
+ __brelse(bh);
+}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 6de374513c01..bc4b8106a490 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -334,10 +334,10 @@ static int __init init_jffs2_fs(void)
which means just 'no padding', without the alignment
thing. But GCC doesn't have that -- we have to just
hope the structs are the right sizes, instead. */
- BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
- BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
- BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
- BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+ BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+ BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+ BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+ BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
printk(KERN_INFO "JFFS2 version 2.2."
#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 87e1d03e8267..b85a0ad2cfb6 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -100,12 +100,12 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
/*
* The server lockd has called us back to tell us the lock was granted
*/
-u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
{
const struct file_lock *fl = &lock->fl;
const struct nfs_fh *fh = &lock->fh;
struct nlm_wait *block;
- u32 res = nlm_lck_denied;
+ __be32 res = nlm_lck_denied;
/*
* Look up blocked request based on arguments.
@@ -144,42 +144,12 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
*/
/*
- * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
- * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
- */
-static void nlmclnt_prepare_reclaim(struct nlm_host *host)
-{
- down_write(&host->h_rwsem);
- host->h_monitored = 0;
- host->h_state++;
- host->h_nextrebind = 0;
- nlm_rebind_host(host);
-
- /*
- * Mark the locks for reclaiming.
- */
- list_splice_init(&host->h_granted, &host->h_reclaim);
-
- dprintk("NLM: reclaiming locks for host %s\n", host->h_name);
-}
-
-static void nlmclnt_finish_reclaim(struct nlm_host *host)
-{
- host->h_reclaiming = 0;
- up_write(&host->h_rwsem);
- dprintk("NLM: done reclaiming locks for host %s", host->h_name);
-}
-
-/*
* Reclaim all locks on server host. We do this by spawning a separate
* reclaimer thread.
*/
void
-nlmclnt_recovery(struct nlm_host *host, u32 newstate)
+nlmclnt_recovery(struct nlm_host *host)
{
- if (host->h_nsmstate == newstate)
- return;
- host->h_nsmstate = newstate;
if (!host->h_reclaiming++) {
nlm_get_host(host);
__module_get(THIS_MODULE);
@@ -199,18 +169,30 @@ reclaimer(void *ptr)
daemonize("%s-reclaim", host->h_name);
allow_signal(SIGKILL);
+ down_write(&host->h_rwsem);
+
/* This one ensures that our parent doesn't terminate while the
* reclaim is in progress */
lock_kernel();
lockd_up(0); /* note: this cannot fail as lockd is already running */
- nlmclnt_prepare_reclaim(host);
- /* First, reclaim all locks that have been marked. */
+ dprintk("lockd: reclaiming locks for host %s", host->h_name);
+
restart:
nsmstate = host->h_nsmstate;
+
+ /* Force a portmap getport - the peer's lockd will
+ * most likely end up on a different port.
+ */
+ host->h_nextrebind = jiffies;
+ nlm_rebind_host(host);
+
+ /* First, reclaim all locks that have been granted. */
+ list_splice_init(&host->h_granted, &host->h_reclaim);
list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
list_del_init(&fl->fl_u.nfs_fl.list);
+ /* Why are we leaking memory here? --okir */
if (signalled())
continue;
if (nlmclnt_reclaim(host, fl) != 0)
@@ -218,11 +200,13 @@ restart:
list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
if (host->h_nsmstate != nsmstate) {
/* Argh! The server rebooted again! */
- list_splice_init(&host->h_granted, &host->h_reclaim);
goto restart;
}
}
- nlmclnt_finish_reclaim(host);
+
+ host->h_reclaiming = 0;
+ up_write(&host->h_rwsem);
+ dprintk("NLM: done reclaiming locks for host %s", host->h_name);
/* Now, wake up all processes that sleep on a blocked lock */
list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 0116729cec5f..3d84f600b633 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -36,14 +36,14 @@ static const struct rpc_call_ops nlmclnt_cancel_ops;
/*
* Cookie counter for NLM requests
*/
-static u32 nlm_cookie = 0x1234;
+static atomic_t nlm_cookie = ATOMIC_INIT(0x1234);
-static inline void nlmclnt_next_cookie(struct nlm_cookie *c)
+void nlmclnt_next_cookie(struct nlm_cookie *c)
{
- memcpy(c->data, &nlm_cookie, 4);
- memset(c->data+4, 0, 4);
+ u32 cookie = atomic_inc_return(&nlm_cookie);
+
+ memcpy(c->data, &cookie, 4);
c->len=4;
- nlm_cookie++;
}
static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
@@ -153,6 +153,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
{
struct rpc_clnt *client = NFS_CLIENT(inode);
struct sockaddr_in addr;
+ struct nfs_server *nfssrv = NFS_SERVER(inode);
struct nlm_host *host;
struct nlm_rqst *call;
sigset_t oldset;
@@ -166,7 +167,9 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
}
rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
- host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers);
+ host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
+ nfssrv->nfs_client->cl_hostname,
+ strlen(nfssrv->nfs_client->cl_hostname));
if (host == NULL)
return -ENOLCK;
@@ -499,7 +502,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
unsigned char fl_flags = fl->fl_flags;
int status = -ENOLCK;
- if (!host->h_monitored && nsm_monitor(host) < 0) {
+ if (nsm_monitor(host) < 0) {
printk(KERN_NOTICE "lockd: failed to monitor %s\n",
host->h_name);
goto out;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a0d0b58ce7a4..fb24a9730345 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -27,46 +27,60 @@
#define NLM_HOST_EXPIRE ((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ)
#define NLM_HOST_COLLECT ((nrhosts > NLM_HOST_MAX)? 120 * HZ : 60 * HZ)
-static struct nlm_host * nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head nlm_hosts[NLM_HOST_NRHASH];
static unsigned long next_gc;
static int nrhosts;
static DEFINE_MUTEX(nlm_host_mutex);
static void nlm_gc_hosts(void);
+static struct nsm_handle * __nsm_find(const struct sockaddr_in *,
+ const char *, int, int);
/*
* Find an NLM server handle in the cache. If there is none, create it.
*/
struct nlm_host *
-nlmclnt_lookup_host(struct sockaddr_in *sin, int proto, int version)
+nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
+ const char *hostname, int hostname_len)
{
- return nlm_lookup_host(0, sin, proto, version);
+ return nlm_lookup_host(0, sin, proto, version,
+ hostname, hostname_len);
}
/*
* Find an NLM client handle in the cache. If there is none, create it.
*/
struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp)
+nlmsvc_lookup_host(struct svc_rqst *rqstp,
+ const char *hostname, int hostname_len)
{
return nlm_lookup_host(1, &rqstp->rq_addr,
- rqstp->rq_prot, rqstp->rq_vers);
+ rqstp->rq_prot, rqstp->rq_vers,
+ hostname, hostname_len);
}
/*
* Common host lookup routine for server & client
*/
struct nlm_host *
-nlm_lookup_host(int server, struct sockaddr_in *sin,
- int proto, int version)
+nlm_lookup_host(int server, const struct sockaddr_in *sin,
+ int proto, int version,
+ const char *hostname,
+ int hostname_len)
{
- struct nlm_host *host, **hp;
- u32 addr;
+ struct hlist_head *chain;
+ struct hlist_node *pos;
+ struct nlm_host *host;
+ struct nsm_handle *nsm = NULL;
int hash;
- dprintk("lockd: nlm_lookup_host(%08x, p=%d, v=%d)\n",
- (unsigned)(sin? ntohl(sin->sin_addr.s_addr) : 0), proto, version);
+ dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n",
+ NIPQUAD(sin->sin_addr.s_addr), proto, version,
+ server? "server" : "client",
+ hostname_len,
+ hostname? hostname : "<none>");
+
hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
@@ -76,7 +90,22 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
if (time_after_eq(jiffies, next_gc))
nlm_gc_hosts();
- for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
+ /* We may keep several nlm_host objects for a peer, because each
+ * nlm_host is identified by
+ * (address, protocol, version, server/client)
+ * We could probably simplify this a little by putting all those
+ * different NLM rpc_clients into one single nlm_host object.
+ * This would allow us to have one nlm_host per address.
+ */
+ chain = &nlm_hosts[hash];
+ hlist_for_each_entry(host, pos, chain, h_hash) {
+ if (!nlm_cmp_addr(&host->h_addr, sin))
+ continue;
+
+ /* See if we have an NSM handle for this client */
+ if (!nsm)
+ nsm = host->h_nsmhandle;
+
if (host->h_proto != proto)
continue;
if (host->h_version != version)
@@ -84,28 +113,30 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
if (host->h_server != server)
continue;
- if (nlm_cmp_addr(&host->h_addr, sin)) {
- if (hp != nlm_hosts + hash) {
- *hp = host->h_next;
- host->h_next = nlm_hosts[hash];
- nlm_hosts[hash] = host;
- }
- nlm_get_host(host);
- mutex_unlock(&nlm_host_mutex);
- return host;
- }
- }
+ /* Move to head of hash chain. */
+ hlist_del(&host->h_hash);
+ hlist_add_head(&host->h_hash, chain);
- /* Ooops, no host found, create it */
- dprintk("lockd: creating host entry\n");
+ nlm_get_host(host);
+ goto out;
+ }
+ if (nsm)
+ atomic_inc(&nsm->sm_count);
- host = kzalloc(sizeof(*host), GFP_KERNEL);
- if (!host)
- goto nohost;
+ host = NULL;
- addr = sin->sin_addr.s_addr;
- sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr));
+ /* Sadly, the host isn't in our hash table yet. See if
+ * we have an NSM handle for it. If not, create one.
+ */
+ if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
+ goto out;
+ host = kzalloc(sizeof(*host), GFP_KERNEL);
+ if (!host) {
+ nsm_release(nsm);
+ goto out;
+ }
+ host->h_name = nsm->sm_name;
host->h_addr = *sin;
host->h_addr.sin_port = 0; /* ouch! */
host->h_version = version;
@@ -119,9 +150,9 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
init_rwsem(&host->h_rwsem);
host->h_state = 0; /* pseudo NSM state */
host->h_nsmstate = 0; /* real NSM state */
+ host->h_nsmhandle = nsm;
host->h_server = server;
- host->h_next = nlm_hosts[hash];
- nlm_hosts[hash] = host;
+ hlist_add_head(&host->h_hash, chain);
INIT_LIST_HEAD(&host->h_lockowners);
spin_lock_init(&host->h_lock);
INIT_LIST_HEAD(&host->h_granted);
@@ -130,35 +161,39 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
if (++nrhosts > NLM_HOST_MAX)
next_gc = 0;
-nohost:
+out:
mutex_unlock(&nlm_host_mutex);
return host;
}
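
The move-to-front step above keeps recently used hosts at the head of their hash chain so hot peers stay cheap to find. A minimal sketch of the idiom in isolation (not part of this patch):

static void example_touch_host(struct hlist_head *chain, struct nlm_host *host)
{
	/* Unlink from wherever the host sits in the chain... */
	hlist_del(&host->h_hash);
	/* ...and re-insert at the head so the next lookup hits it early. */
	hlist_add_head(&host->h_hash, chain);
}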
-struct nlm_host *
-nlm_find_client(void)
+/*
+ * Destroy a host
+ */
+static void
+nlm_destroy_host(struct nlm_host *host)
{
- /* find a nlm_host for a client for which h_killed == 0.
- * and return it
+ struct rpc_clnt *clnt;
+
+ BUG_ON(!list_empty(&host->h_lockowners));
+ BUG_ON(atomic_read(&host->h_count));
+
+ /*
+ * Release NSM handle and unmonitor host.
*/
- int hash;
- mutex_lock(&nlm_host_mutex);
- for (hash = 0 ; hash < NLM_HOST_NRHASH; hash++) {
- struct nlm_host *host, **hp;
- for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
- if (host->h_server &&
- host->h_killed == 0) {
- nlm_get_host(host);
- mutex_unlock(&nlm_host_mutex);
- return host;
- }
+ nsm_unmonitor(host);
+
+ if ((clnt = host->h_rpcclnt) != NULL) {
+ if (atomic_read(&clnt->cl_users)) {
+ printk(KERN_WARNING
+ "lockd: active RPC handle\n");
+ clnt->cl_dead = 1;
+ } else {
+ rpc_destroy_client(host->h_rpcclnt);
}
}
- mutex_unlock(&nlm_host_mutex);
- return NULL;
+ kfree(host);
}
-
/*
* Create the NLM RPC client for an NLM peer
*/
@@ -260,22 +295,82 @@ void nlm_release_host(struct nlm_host *host)
}
/*
+ * We were notified that the host indicated by address &sin
+ * has rebooted.
+ * Release all resources held by that peer.
+ */
+void nlm_host_rebooted(const struct sockaddr_in *sin,
+ const char *hostname, int hostname_len,
+ u32 new_state)
+{
+ struct hlist_head *chain;
+ struct hlist_node *pos;
+ struct nsm_handle *nsm;
+ struct nlm_host *host;
+
+ dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
+ hostname, NIPQUAD(sin->sin_addr));
+
+ /* Find the NSM handle for this peer */
+ if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+ return;
+
+ /* When reclaiming locks on this peer, make sure that
+ * we set up a new notification */
+ nsm->sm_monitored = 0;
+
+ /* Mark all hosts tied to this NSM state as having rebooted.
+ * We run the loop repeatedly, because we drop the host table
+ * lock for this.
+ * To avoid processing a host several times, we match the nsmstate.
+ */
+again: mutex_lock(&nlm_host_mutex);
+ for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+ hlist_for_each_entry(host, pos, chain, h_hash) {
+ if (host->h_nsmhandle == nsm
+ && host->h_nsmstate != new_state) {
+ host->h_nsmstate = new_state;
+ host->h_state++;
+
+ nlm_get_host(host);
+ mutex_unlock(&nlm_host_mutex);
+
+ if (host->h_server) {
+ /* We're server for this guy, just ditch
+ * all the locks he held. */
+ nlmsvc_free_host_resources(host);
+ } else {
+ /* He's the server, initiate lock recovery. */
+ nlmclnt_recovery(host);
+ }
+
+ nlm_release_host(host);
+ goto again;
+ }
+ }
+ }
+
+ mutex_unlock(&nlm_host_mutex);
+}
+
+/*
* Shut down the hosts module.
* Note that this routine is called only at server shutdown time.
*/
void
nlm_shutdown_hosts(void)
{
+ struct hlist_head *chain;
+ struct hlist_node *pos;
struct nlm_host *host;
- int i;
dprintk("lockd: shutting down host module\n");
mutex_lock(&nlm_host_mutex);
/* First, make all hosts eligible for gc */
dprintk("lockd: nuking all hosts...\n");
- for (i = 0; i < NLM_HOST_NRHASH; i++) {
- for (host = nlm_hosts[i]; host; host = host->h_next)
+ for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+ hlist_for_each_entry(host, pos, chain, h_hash)
host->h_expires = jiffies - 1;
}
@@ -287,8 +382,8 @@ nlm_shutdown_hosts(void)
if (nrhosts) {
printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
dprintk("lockd: %d hosts left:\n", nrhosts);
- for (i = 0; i < NLM_HOST_NRHASH; i++) {
- for (host = nlm_hosts[i]; host; host = host->h_next) {
+ for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+ hlist_for_each_entry(host, pos, chain, h_hash) {
dprintk(" %s (cnt %d use %d exp %ld)\n",
host->h_name, atomic_read(&host->h_count),
host->h_inuse, host->h_expires);
@@ -305,45 +400,32 @@ nlm_shutdown_hosts(void)
static void
nlm_gc_hosts(void)
{
- struct nlm_host **q, *host;
- struct rpc_clnt *clnt;
- int i;
+ struct hlist_head *chain;
+ struct hlist_node *pos, *next;
+ struct nlm_host *host;
dprintk("lockd: host garbage collection\n");
- for (i = 0; i < NLM_HOST_NRHASH; i++) {
- for (host = nlm_hosts[i]; host; host = host->h_next)
+ for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+ hlist_for_each_entry(host, pos, chain, h_hash)
host->h_inuse = 0;
}
/* Mark all hosts that hold locks, blocks or shares */
nlmsvc_mark_resources();
- for (i = 0; i < NLM_HOST_NRHASH; i++) {
- q = &nlm_hosts[i];
- while ((host = *q) != NULL) {
+ for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+ hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
if (atomic_read(&host->h_count) || host->h_inuse
|| time_before(jiffies, host->h_expires)) {
dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
host->h_name, atomic_read(&host->h_count),
host->h_inuse, host->h_expires);
- q = &host->h_next;
continue;
}
dprintk("lockd: delete host %s\n", host->h_name);
- *q = host->h_next;
- /* Don't unmonitor hosts that have been invalidated */
- if (host->h_monitored && !host->h_killed)
- nsm_unmonitor(host);
- if ((clnt = host->h_rpcclnt) != NULL) {
- if (atomic_read(&clnt->cl_users)) {
- printk(KERN_WARNING
- "lockd: active RPC handle\n");
- clnt->cl_dead = 1;
- } else {
- rpc_destroy_client(host->h_rpcclnt);
- }
- }
- kfree(host);
+ hlist_del_init(&host->h_hash);
+
+ nlm_destroy_host(host);
nrhosts--;
}
}
@@ -351,3 +433,88 @@ nlm_gc_hosts(void)
next_gc = jiffies + NLM_HOST_COLLECT;
}
+
+/*
+ * Manage NSM handles
+ */
+static LIST_HEAD(nsm_handles);
+static DEFINE_MUTEX(nsm_mutex);
+
+static struct nsm_handle *
+__nsm_find(const struct sockaddr_in *sin,
+ const char *hostname, int hostname_len,
+ int create)
+{
+ struct nsm_handle *nsm = NULL;
+ struct list_head *pos;
+
+ if (!sin)
+ return NULL;
+
+ if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "Invalid hostname \"%.*s\" "
+ "in NFS lock request\n",
+ hostname_len, hostname);
+ }
+ return NULL;
+ }
+
+ mutex_lock(&nsm_mutex);
+ list_for_each(pos, &nsm_handles) {
+ nsm = list_entry(pos, struct nsm_handle, sm_link);
+
+ if (hostname && nsm_use_hostnames) {
+ if (strlen(nsm->sm_name) != hostname_len
+ || memcmp(nsm->sm_name, hostname, hostname_len))
+ continue;
+ } else if (!nlm_cmp_addr(&nsm->sm_addr, sin))
+ continue;
+ atomic_inc(&nsm->sm_count);
+ goto out;
+ }
+
+ if (!create) {
+ nsm = NULL;
+ goto out;
+ }
+
+ nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
+ if (nsm != NULL) {
+ nsm->sm_addr = *sin;
+ nsm->sm_name = (char *) (nsm + 1);
+ memcpy(nsm->sm_name, hostname, hostname_len);
+ nsm->sm_name[hostname_len] = '\0';
+ atomic_set(&nsm->sm_count, 1);
+
+ list_add(&nsm->sm_link, &nsm_handles);
+ }
+
+out:
+ mutex_unlock(&nsm_mutex);
+ return nsm;
+}
+
+struct nsm_handle *
+nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
+{
+ return __nsm_find(sin, hostname, hostname_len, 1);
+}
+
+/*
+ * Release an NSM handle
+ */
+void
+nsm_release(struct nsm_handle *nsm)
+{
+ if (!nsm)
+ return;
+ if (atomic_dec_and_test(&nsm->sm_count)) {
+ mutex_lock(&nsm_mutex);
+ if (atomic_read(&nsm->sm_count) == 0) {
+ list_del(&nsm->sm_link);
+ kfree(nsm);
+ }
+ mutex_unlock(&nsm_mutex);
+ }
+}
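
The intended usage pattern is that every successful nsm_find() is balanced by an nsm_release(), with the handle freed only when the last reference is dropped. A minimal sketch (not part of this patch):

static void example_nsm_usage(const struct sockaddr_in *sin,
			      const char *hostname, int hostname_len)
{
	struct nsm_handle *nsm = nsm_find(sin, hostname, hostname_len);

	if (nsm == NULL)
		return;
	/* ... use nsm->sm_name / nsm->sm_addr while holding the reference ... */
	nsm_release(nsm);
}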
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index a816b920d431..eb243edf8932 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -24,13 +24,13 @@ static struct rpc_program nsm_program;
/*
* Local NSM state
*/
-u32 nsm_local_state;
+int nsm_local_state;
/*
* Common procedure for SM_MON/SM_UNMON calls
*/
static int
-nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
+nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
{
struct rpc_clnt *clnt;
int status;
@@ -46,10 +46,11 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
goto out;
}
- args.addr = host->h_addr.sin_addr.s_addr;
- args.proto= (host->h_proto<<1) | host->h_server;
+ memset(&args, 0, sizeof(args));
+ args.mon_name = nsm->sm_name;
+ args.addr = nsm->sm_addr.sin_addr.s_addr;
args.prog = NLM_PROGRAM;
- args.vers = host->h_version;
+ args.vers = 3;
args.proc = NLMPROC_NSM_NOTIFY;
memset(res, 0, sizeof(*res));
@@ -70,17 +71,22 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
int
nsm_monitor(struct nlm_host *host)
{
+ struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
int status;
dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
+ BUG_ON(nsm == NULL);
- status = nsm_mon_unmon(host, SM_MON, &res);
+ if (nsm->sm_monitored)
+ return 0;
+
+ status = nsm_mon_unmon(nsm, SM_MON, &res);
if (status < 0 || res.status != 0)
printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
else
- host->h_monitored = 1;
+ nsm->sm_monitored = 1;
return status;
}
@@ -90,16 +96,26 @@ nsm_monitor(struct nlm_host *host)
int
nsm_unmonitor(struct nlm_host *host)
{
+ struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
- int status;
-
- dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
-
- status = nsm_mon_unmon(host, SM_UNMON, &res);
- if (status < 0)
- printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", host->h_name);
- else
- host->h_monitored = 0;
+ int status = 0;
+
+ if (nsm == NULL)
+ return 0;
+ host->h_nsmhandle = NULL;
+
+ if (atomic_read(&nsm->sm_count) == 1
+ && nsm->sm_monitored && !nsm->sm_sticky) {
+ dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
+
+ status = nsm_mon_unmon(nsm, SM_UNMON, &res);
+ if (status < 0)
+ printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
+ host->h_name);
+ else
+ nsm->sm_monitored = 0;
+ }
+ nsm_release(nsm);
return status;
}
@@ -132,10 +148,10 @@ nsm_create(void)
* XDR functions for NSM.
*/
-static u32 *
-xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
+static __be32 *
+xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
{
- char buffer[20];
+ char buffer[20], *name;
/*
* Use the dotted-quad IP address of the remote host as
@@ -143,8 +159,13 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
* hostname first for whatever remote hostname it receives,
* so this works alright.
*/
- sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
- if (!(p = xdr_encode_string(p, buffer))
+ if (nsm_use_hostnames) {
+ name = argp->mon_name;
+ } else {
+ sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
+ name = buffer;
+ }
+ if (!(p = xdr_encode_string(p, name))
|| !(p = xdr_encode_string(p, utsname()->nodename)))
return ERR_PTR(-EIO);
*p++ = htonl(argp->prog);
@@ -155,21 +176,23 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
}
static int
-xdr_encode_mon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
+xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
{
p = xdr_encode_common(rqstp, p, argp);
if (IS_ERR(p))
return PTR_ERR(p);
+
+ /* Surprise - there may even be room for an IPv6 address now */
*p++ = argp->addr;
- *p++ = argp->vers;
- *p++ = argp->proto;
+ *p++ = 0;
+ *p++ = 0;
*p++ = 0;
rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
return 0;
}
static int
-xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
+xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
{
p = xdr_encode_common(rqstp, p, argp);
if (IS_ERR(p))
@@ -179,7 +202,7 @@ xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
}
static int
-xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
+xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
{
resp->status = ntohl(*p++);
resp->state = ntohl(*p++);
@@ -189,7 +212,7 @@ xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
}
static int
-xdr_decode_stat(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
+xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
{
resp->state = ntohl(*p++);
return 0;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 3cc369e5693f..634139232aaf 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -33,6 +33,7 @@
#include <linux/sunrpc/svcsock.h>
#include <net/ip.h>
#include <linux/lockd/lockd.h>
+#include <linux/lockd/sm_inter.h>
#include <linux/nfs.h>
#define NLMDBG_FACILITY NLMDBG_SVC
@@ -61,6 +62,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lockd_exit);
static unsigned long nlm_grace_period;
static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
static int nlm_udpport, nlm_tcpport;
+int nsm_use_hostnames = 0;
/*
* Constants needed for the sysctl interface.
@@ -395,6 +397,22 @@ static ctl_table nlm_sysctls[] = {
.extra1 = (int *) &nlm_port_min,
.extra2 = (int *) &nlm_port_max,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nsm_use_hostnames",
+ .data = &nsm_use_hostnames,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nsm_local_state",
+ .data = &nsm_local_state,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
@@ -483,6 +501,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
&nlm_udpport, 0644);
module_param_call(nlm_tcpport, param_set_port, param_get_int,
&nlm_tcpport, 0644);
+module_param(nsm_use_hostnames, bool, 0644);
/*
* Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a2dd9ccb9b32..0ce5c81ff507 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -24,22 +24,22 @@
/*
* Obtain client and file from arguments
*/
-static u32
+static __be32
nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_host **hostp, struct nlm_file **filp)
{
struct nlm_host *host = NULL;
struct nlm_file *file = NULL;
struct nlm_lock *lock = &argp->lock;
- u32 error = 0;
+ __be32 error = 0;
/* nfsd callbacks must have been installed for this procedure */
if (!nlmsvc_ops)
return nlm_lck_denied_nolocks;
/* Obtain host handle */
- if (!(host = nlmsvc_lookup_host(rqstp))
- || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0))
+ if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
+ || (argp->monitor && nsm_monitor(host) < 0))
goto no_locks;
*hostp = host;
@@ -68,7 +68,7 @@ no_locks:
/*
* NULL: Test for presence of service
*/
-static int
+static __be32
nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
dprintk("lockd: NULL called\n");
@@ -78,7 +78,7 @@ nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
/*
* TEST: Check for conflicting lock
*/
-static int
+static __be32
nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -96,7 +96,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now check for conflicting locks */
resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock);
@@ -107,7 +107,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
return rpc_success;
}
-static int
+static __be32
nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -126,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
#if 0
/* If supplied state doesn't match current state, we assume it's
@@ -150,7 +150,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
return rpc_success;
}
-static int
+static __be32
nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -169,7 +169,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Try to cancel request. */
resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
@@ -183,7 +183,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* UNLOCK: release a lock
*/
-static int
+static __be32
nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -202,7 +202,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now try to remove the lock */
resp->status = nlmsvc_unlock(file, &argp->lock);
@@ -217,7 +217,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
* GRANTED: A server calls us to tell that a process' lock request
* was granted
*/
-static int
+static __be32
nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -253,14 +253,16 @@ static const struct rpc_call_ops nlm4svc_callback_ops = {
* because we send the callback before the reply proper. I hope this
* doesn't break any clients.
*/
-static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
- int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
+static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+ __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
{
struct nlm_host *host;
struct nlm_rqst *call;
- int stat;
+ __be32 stat;
- host = nlmsvc_lookup_host(rqstp);
+ host = nlmsvc_lookup_host(rqstp,
+ argp->lock.caller,
+ argp->lock.len);
if (host == NULL)
return rpc_system_err;
@@ -280,35 +282,35 @@ static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *a
return rpc_success;
}
-static int nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: TEST_MSG called\n");
return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test);
}
-static int nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: LOCK_MSG called\n");
return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock);
}
-static int nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: CANCEL_MSG called\n");
return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel);
}
-static int nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: UNLOCK_MSG called\n");
return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock);
}
-static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: GRANTED_MSG called\n");
@@ -318,7 +320,7 @@ static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *arg
/*
* SHARE: create a DOS share or alter existing share.
*/
-static int
+static __be32
nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -337,7 +339,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now try to create the share */
resp->status = nlmsvc_share_file(host, file, argp);
@@ -351,7 +353,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* UNSHARE: Release a DOS share.
*/
-static int
+static __be32
nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -370,7 +372,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now try to lock the file */
resp->status = nlmsvc_unshare_file(host, file, argp);
@@ -384,7 +386,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* NM_LOCK: Create an unmonitored lock
*/
-static int
+static __be32
nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -397,7 +399,7 @@ nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* FREE_ALL: Release all locks and shares held by client
*/
-static int
+static __be32
nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
@@ -415,15 +417,11 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* SM_NOTIFY: private callback from statd (not part of official NLM proto)
*/
-static int
+static __be32
nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
void *resp)
{
struct sockaddr_in saddr = rqstp->rq_addr;
- int vers = argp->vers;
- int prot = argp->proto >> 1;
-
- struct nlm_host *host;
dprintk("lockd: SM_NOTIFY called\n");
if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -438,28 +436,17 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
/* Obtain the host pointer for this NFS server and try to
* reclaim all locks we hold on this server.
*/
+ memset(&saddr, 0, sizeof(saddr));
saddr.sin_addr.s_addr = argp->addr;
+ nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
- if ((argp->proto & 1)==0) {
- if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
- nlmclnt_recovery(host, argp->state);
- nlm_release_host(host);
- }
- } else {
- /* If we run on an NFS server, delete all locks held by the client */
-
- if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
- nlmsvc_free_host_resources(host);
- nlm_release_host(host);
- }
- }
return rpc_success;
}
/*
* client sent a GRANTED_RES, let's remove the associated block
*/
-static int
+static __be32
nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
void *resp)
{
@@ -468,7 +455,7 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
dprintk("lockd: GRANTED_RES called\n");
- nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status);
+ nlmsvc_grant_reply(&argp->cookie, argp->status);
return rpc_success;
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 93c00ee7189d..7e219b938552 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -40,7 +40,7 @@
static void nlmsvc_release_block(struct nlm_block *block);
static void nlmsvc_insert_block(struct nlm_block *block, unsigned long);
-static int nlmsvc_remove_block(struct nlm_block *block);
+static void nlmsvc_remove_block(struct nlm_block *block);
static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
static void nlmsvc_freegrantargs(struct nlm_rqst *call);
@@ -49,7 +49,7 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
/*
* The list of blocked locks to retry
*/
-static struct nlm_block * nlm_blocked;
+static LIST_HEAD(nlm_blocked);
/*
* Insert a blocked lock into the global list
@@ -57,48 +57,44 @@ static struct nlm_block * nlm_blocked;
static void
nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
{
- struct nlm_block **bp, *b;
+ struct nlm_block *b;
+ struct list_head *pos;
dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when);
- kref_get(&block->b_count);
- if (block->b_queued)
- nlmsvc_remove_block(block);
- bp = &nlm_blocked;
+ if (list_empty(&block->b_list)) {
+ kref_get(&block->b_count);
+ } else {
+ list_del_init(&block->b_list);
+ }
+
+ pos = &nlm_blocked;
if (when != NLM_NEVER) {
if ((when += jiffies) == NLM_NEVER)
when ++;
- while ((b = *bp) && time_before_eq(b->b_when,when) && b->b_when != NLM_NEVER)
- bp = &b->b_next;
- } else
- while ((b = *bp) != 0)
- bp = &b->b_next;
+ list_for_each(pos, &nlm_blocked) {
+ b = list_entry(pos, struct nlm_block, b_list);
+ if (time_after(b->b_when,when) || b->b_when == NLM_NEVER)
+ break;
+ }
+ /* On normal exit from the loop, pos == &nlm_blocked,
+ * so we will be adding to the end of the list - good
+ */
+ }
- block->b_queued = 1;
+ list_add_tail(&block->b_list, pos);
block->b_when = when;
- block->b_next = b;
- *bp = block;
}
/*
* Remove a block from the global list
*/
-static int
+static inline void
nlmsvc_remove_block(struct nlm_block *block)
{
- struct nlm_block **bp, *b;
-
- if (!block->b_queued)
- return 1;
- for (bp = &nlm_blocked; (b = *bp) != 0; bp = &b->b_next) {
- if (b == block) {
- *bp = block->b_next;
- block->b_queued = 0;
- nlmsvc_release_block(block);
- return 1;
- }
+ if (!list_empty(&block->b_list)) {
+ list_del_init(&block->b_list);
+ nlmsvc_release_block(block);
}
-
- return 0;
}
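
The list_head conversion relies on a simple invariant: a block is queued exactly when its b_list is non-empty, and the queue owns one kref for as long as that holds. A minimal sketch (not part of this patch):

static int example_block_is_queued(const struct nlm_block *block)
{
	/* Non-empty b_list <=> the block is on nlm_blocked and owns a ref. */
	return !list_empty(&block->b_list);
}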
/*
@@ -107,14 +103,14 @@ nlmsvc_remove_block(struct nlm_block *block)
static struct nlm_block *
nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
{
- struct nlm_block **head, *block;
+ struct nlm_block *block;
struct file_lock *fl;
dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
file, lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end, lock->fl.fl_type);
- for (head = &nlm_blocked; (block = *head) != 0; head = &block->b_next) {
+ list_for_each_entry(block, &nlm_blocked, b_list) {
fl = &block->b_call->a_args.lock.fl;
dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
block->b_file, fl->fl_pid,
@@ -143,20 +139,20 @@ static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
* Find a block with a given NLM cookie.
*/
static inline struct nlm_block *
-nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin)
+nlmsvc_find_block(struct nlm_cookie *cookie)
{
struct nlm_block *block;
- for (block = nlm_blocked; block; block = block->b_next) {
- dprintk("cookie: head of blocked queue %p, block %p\n",
- nlm_blocked, block);
- if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)
- && nlm_cmp_addr(sin, &block->b_host->h_addr))
- break;
+ list_for_each_entry(block, &nlm_blocked, b_list) {
+ if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie))
+ goto found;
}
- if (block != NULL)
- kref_get(&block->b_count);
+ return NULL;
+
+found:
+ dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block);
+ kref_get(&block->b_count);
return block;
}
@@ -169,6 +165,11 @@ nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin)
* request, but (as I found out later) that's because some implementations
 * do just this. Never mind the standards committees, they support our
* logging industries.
+ *
+ * 10 years later: I hope we can safely ignore these old and broken
+ * clients by now. Let's fix this so we can uniquely identify an incoming
+ * GRANTED_RES message by cookie, without having to rely on the client's IP
+ * address. --okir
*/
static inline struct nlm_block *
nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
@@ -179,7 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_rqst *call = NULL;
/* Create host handle for callback */
- host = nlmsvc_lookup_host(rqstp);
+ host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
if (host == NULL)
return NULL;
@@ -192,6 +193,8 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
if (block == NULL)
goto failed;
kref_init(&block->b_count);
+ INIT_LIST_HEAD(&block->b_list);
+ INIT_LIST_HEAD(&block->b_flist);
if (!nlmsvc_setgrantargs(call, lock))
goto failed_free;
@@ -199,7 +202,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
/* Set notifier function for VFS, and init args */
call->a_args.lock.fl.fl_flags |= FL_SLEEP;
call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
- call->a_args.cookie = *cookie; /* see above */
+ nlmclnt_next_cookie(&call->a_args.cookie);
dprintk("lockd: created block %p...\n", block);
@@ -210,8 +213,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
file->f_count++;
/* Add to file's list of blocks */
- block->b_fnext = file->f_blocks;
- file->f_blocks = block;
+ list_add(&block->b_flist, &file->f_blocks);
/* Set up RPC arguments for callback */
block->b_call = call;
@@ -248,19 +250,13 @@ static void nlmsvc_free_block(struct kref *kref)
{
struct nlm_block *block = container_of(kref, struct nlm_block, b_count);
struct nlm_file *file = block->b_file;
- struct nlm_block **bp;
dprintk("lockd: freeing block %p...\n", block);
- down(&file->f_sema);
/* Remove block from file's list of blocks */
- for (bp = &file->f_blocks; *bp; bp = &(*bp)->b_fnext) {
- if (*bp == block) {
- *bp = block->b_fnext;
- break;
- }
- }
- up(&file->f_sema);
+ mutex_lock(&file->f_mutex);
+ list_del_init(&block->b_flist);
+ mutex_unlock(&file->f_mutex);
nlmsvc_freegrantargs(block->b_call);
nlm_release_call(block->b_call);
@@ -274,47 +270,32 @@ static void nlmsvc_release_block(struct nlm_block *block)
kref_put(&block->b_count, nlmsvc_free_block);
}
-static void nlmsvc_act_mark(struct nlm_host *host, struct nlm_file *file)
-{
- struct nlm_block *block;
-
- down(&file->f_sema);
- for (block = file->f_blocks; block != NULL; block = block->b_fnext)
- block->b_host->h_inuse = 1;
- up(&file->f_sema);
-}
-
-static void nlmsvc_act_unlock(struct nlm_host *host, struct nlm_file *file)
+/*
+ * Loop over all blocks and delete blocks held by
+ * a matching host.
+ */
+void nlmsvc_traverse_blocks(struct nlm_host *host,
+ struct nlm_file *file,
+ nlm_host_match_fn_t match)
{
- struct nlm_block *block;
+ struct nlm_block *block, *next;
restart:
- down(&file->f_sema);
- for (block = file->f_blocks; block != NULL; block = block->b_fnext) {
- if (host != NULL && host != block->b_host)
+ mutex_lock(&file->f_mutex);
+ list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) {
+ if (!match(block->b_host, host))
continue;
- if (!block->b_queued)
+ /* Do not destroy blocks that are not on
+ * the global retry list - why? */
+ if (list_empty(&block->b_list))
continue;
kref_get(&block->b_count);
- up(&file->f_sema);
+ mutex_unlock(&file->f_mutex);
nlmsvc_unlink_block(block);
nlmsvc_release_block(block);
goto restart;
}
- up(&file->f_sema);
-}
-
-/*
- * Loop over all blocks and perform the action specified.
- * (NLM_ACT_CHECK handled by nlmsvc_inspect_file).
- */
-void
-nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
-{
- if (action == NLM_ACT_MARK)
- nlmsvc_act_mark(host, file);
- else
- nlmsvc_act_unlock(host, file);
+ mutex_unlock(&file->f_mutex);
}
/*
@@ -353,13 +334,13 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
* Attempt to establish a lock, and if it can't be granted, block it
* if required.
*/
-u32
+__be32
nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
{
struct nlm_block *block, *newblock = NULL;
int error;
- u32 ret;
+ __be32 ret;
dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
file->f_file->f_dentry->d_inode->i_sb->s_id,
@@ -373,7 +354,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
lock->fl.fl_flags &= ~FL_SLEEP;
again:
/* Lock file against concurrent access */
- down(&file->f_sema);
+ mutex_lock(&file->f_mutex);
/* Get existing block (in case client is busy-waiting) */
block = nlmsvc_lookup_block(file, lock);
if (block == NULL) {
@@ -411,10 +392,10 @@ again:
/* If we don't have a block, create and initialize it. Then
* retry because we may have slept in kmalloc. */
- /* We have to release f_sema as nlmsvc_create_block may try to
+ /* We have to release f_mutex as nlmsvc_create_block may try
* to claim it while doing host garbage collection */
if (newblock == NULL) {
- up(&file->f_sema);
+ mutex_unlock(&file->f_mutex);
dprintk("lockd: blocking on this lock (allocating).\n");
if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie)))
return nlm_lck_denied_nolocks;
@@ -424,7 +405,7 @@ again:
/* Append to list of blocked */
nlmsvc_insert_block(newblock, NLM_NEVER);
out:
- up(&file->f_sema);
+ mutex_unlock(&file->f_mutex);
nlmsvc_release_block(newblock);
nlmsvc_release_block(block);
dprintk("lockd: nlmsvc_lock returned %u\n", ret);
@@ -434,7 +415,7 @@ out:
/*
* Test for presence of a conflicting lock.
*/
-u32
+__be32
nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
struct nlm_lock *conflock)
{
@@ -451,6 +432,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
(long long)conflock->fl.fl_start,
(long long)conflock->fl.fl_end);
conflock->caller = "somehost"; /* FIXME */
+ conflock->len = strlen(conflock->caller);
conflock->oh.len = 0; /* don't return OH info */
conflock->svid = conflock->fl.fl_pid;
return nlm_lck_denied;
@@ -466,7 +448,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
* afterwards. In this case the block will still be there, and hence
* must be removed.
*/
-u32
+__be32
nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
{
int error;
@@ -494,7 +476,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
* be in progress.
* The calling procedure must check whether the file can be closed.
*/
-u32
+__be32
nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
{
struct nlm_block *block;
@@ -507,9 +489,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
- down(&file->f_sema);
+ mutex_lock(&file->f_mutex);
block = nlmsvc_lookup_block(file, lock);
- up(&file->f_sema);
+ mutex_unlock(&file->f_mutex);
if (block != NULL) {
status = nlmsvc_unlink_block(block);
nlmsvc_release_block(block);
@@ -527,10 +509,10 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
static void
nlmsvc_notify_blocked(struct file_lock *fl)
{
- struct nlm_block **bp, *block;
+ struct nlm_block *block;
dprintk("lockd: VFS unblock notification for block %p\n", fl);
- for (bp = &nlm_blocked; (block = *bp) != 0; bp = &block->b_next) {
+ list_for_each_entry(block, &nlm_blocked, b_list) {
if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
nlmsvc_insert_block(block, 0);
svc_wake_up(block->b_daemon);
@@ -663,17 +645,14 @@ static const struct rpc_call_ops nlmsvc_grant_ops = {
* block.
*/
void
-nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status)
+nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status)
{
struct nlm_block *block;
- struct nlm_file *file;
- dprintk("grant_reply: looking for cookie %x, host (%08x), s=%d \n",
- *(unsigned int *)(cookie->data),
- ntohl(rqstp->rq_addr.sin_addr.s_addr), status);
- if (!(block = nlmsvc_find_block(cookie, &rqstp->rq_addr)))
+ dprintk("grant_reply: looking for cookie %x, s=%d \n",
+ *(unsigned int *)(cookie->data), status);
+ if (!(block = nlmsvc_find_block(cookie)))
return;
- file = block->b_file;
if (block) {
if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
@@ -696,16 +675,19 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status
unsigned long
nlmsvc_retry_blocked(void)
{
- struct nlm_block *block;
+ unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
+ struct nlm_block *block;
+
+ while (!list_empty(&nlm_blocked)) {
+ block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
- dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
- nlm_blocked,
- nlm_blocked? nlm_blocked->b_when : 0);
- while ((block = nlm_blocked) != 0) {
if (block->b_when == NLM_NEVER)
break;
- if (time_after(block->b_when,jiffies))
+ if (time_after(block->b_when,jiffies)) {
+ timeout = block->b_when - jiffies;
break;
+ }
+
dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
block, block->b_when);
kref_get(&block->b_count);
@@ -713,8 +695,5 @@ nlmsvc_retry_blocked(void)
nlmsvc_release_block(block);
}
- if ((block = nlm_blocked) && block->b_when != NLM_NEVER)
- return (block->b_when - jiffies);
-
- return MAX_SCHEDULE_TIMEOUT;
+ return timeout;
}
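
Note on the svclock.c hunk above: the rewritten nlmsvc_retry_blocked() walks the time-ordered nlm_blocked list and derives the next wake-up interval while it retries expired blocks, instead of recomputing it from the list head afterwards. The following is a minimal userspace sketch of that pattern, assuming a plain singly linked list; the names (pending, when, NEVER) are illustrative stand-ins, not the kernel's types.

```c
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

#define NEVER        ULONG_MAX       /* stand-in for NLM_NEVER            */
#define MAX_TIMEOUT  ULONG_MAX       /* stand-in for MAX_SCHEDULE_TIMEOUT */

struct pending {
	unsigned long when;           /* absolute retry time ("jiffies")   */
	struct pending *next;
};

/* Retry every expired entry and report how long to sleep until the next
 * one becomes due, mirroring the rewritten nlmsvc_retry_blocked(). */
static unsigned long retry_pending(struct pending **list, unsigned long now)
{
	unsigned long timeout = MAX_TIMEOUT;

	while (*list) {
		struct pending *p = *list;

		if (p->when == NEVER)         /* blocked indefinitely      */
			break;
		if (p->when > now) {          /* first future deadline     */
			timeout = p->when - now;
			break;
		}
		printf("retrying entry due at %lu\n", p->when);
		*list = p->next;              /* dequeue the expired entry */
		free(p);
	}
	return timeout;
}

int main(void)
{
	struct pending *head = NULL, **tail = &head;
	unsigned long due[] = { 5, 10, 42 };

	for (size_t i = 0; i < sizeof(due) / sizeof(due[0]); i++) {
		struct pending *p = malloc(sizeof(*p));
		p->when = due[i];
		p->next = NULL;
		*tail = p;
		tail = &p->next;
	}
	printf("sleep for %lu\n", retry_pending(&head, 12)); /* retries 5, 10 */
	return 0;
}
```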
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index dbb66a3b5cd9..32e99a6e8dca 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -22,8 +22,8 @@
#define NLMDBG_FACILITY NLMDBG_CLIENT
#ifdef CONFIG_LOCKD_V4
-static u32
-cast_to_nlm(u32 status, u32 vers)
+static __be32
+cast_to_nlm(__be32 status, u32 vers)
{
/* Note: status is assumed to be in network byte order !!! */
if (vers != 4){
@@ -52,22 +52,22 @@ cast_to_nlm(u32 status, u32 vers)
/*
* Obtain client and file from arguments
*/
-static u32
+static __be32
nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_host **hostp, struct nlm_file **filp)
{
struct nlm_host *host = NULL;
struct nlm_file *file = NULL;
struct nlm_lock *lock = &argp->lock;
- u32 error;
+ __be32 error = 0;
/* nfsd callbacks must have been installed for this procedure */
if (!nlmsvc_ops)
return nlm_lck_denied_nolocks;
/* Obtain host handle */
- if (!(host = nlmsvc_lookup_host(rqstp))
- || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0))
+ if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
+ || (argp->monitor && nsm_monitor(host) < 0))
goto no_locks;
*hostp = host;
@@ -88,13 +88,15 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
no_locks:
if (host)
nlm_release_host(host);
+ if (error)
+ return error;
return nlm_lck_denied_nolocks;
}
/*
* NULL: Test for presence of service
*/
-static int
+static __be32
nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
dprintk("lockd: NULL called\n");
@@ -104,7 +106,7 @@ nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
/*
* TEST: Check for conflicting lock
*/
-static int
+static __be32
nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -122,7 +124,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now check for conflicting locks */
resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock));
@@ -134,7 +136,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
return rpc_success;
}
-static int
+static __be32
nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -153,7 +155,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
#if 0
/* If supplied state doesn't match current state, we assume it's
@@ -177,7 +179,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
return rpc_success;
}
-static int
+static __be32
nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -196,7 +198,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Try to cancel request. */
resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
@@ -210,7 +212,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* UNLOCK: release a lock
*/
-static int
+static __be32
nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -229,7 +231,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now try to remove the lock */
resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
@@ -244,7 +246,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
* GRANTED: A server calls us to tell that a process' lock request
* was granted
*/
-static int
+static __be32
nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -280,14 +282,16 @@ static const struct rpc_call_ops nlmsvc_callback_ops = {
* because we send the callback before the reply proper. I hope this
* doesn't break any clients.
*/
-static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
- int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
+static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+ __be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res *))
{
struct nlm_host *host;
struct nlm_rqst *call;
- int stat;
+ __be32 stat;
- host = nlmsvc_lookup_host(rqstp);
+ host = nlmsvc_lookup_host(rqstp,
+ argp->lock.caller,
+ argp->lock.len);
if (host == NULL)
return rpc_system_err;
@@ -307,28 +311,28 @@ static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *ar
return rpc_success;
}
-static int nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: TEST_MSG called\n");
return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test);
}
-static int nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: LOCK_MSG called\n");
return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock);
}
-static int nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
dprintk("lockd: CANCEL_MSG called\n");
return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel);
}
-static int
+static __be32
nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
@@ -336,7 +340,7 @@ nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock);
}
-static int
+static __be32
nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
@@ -347,7 +351,7 @@ nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* SHARE: create a DOS share or alter existing share.
*/
-static int
+static __be32
nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -366,7 +370,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now try to create the share */
resp->status = cast_status(nlmsvc_share_file(host, file, argp));
@@ -380,7 +384,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* UNSHARE: Release a DOS share.
*/
-static int
+static __be32
nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -399,7 +403,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
- return rpc_success;
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now try to unshare the file */
resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
@@ -413,7 +417,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* NM_LOCK: Create an unmonitored lock
*/
-static int
+static __be32
nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
struct nlm_res *resp)
{
@@ -426,7 +430,7 @@ nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* FREE_ALL: Release all locks and shares held by client
*/
-static int
+static __be32
nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
void *resp)
{
@@ -444,14 +448,11 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
/*
* SM_NOTIFY: private callback from statd (not part of official NLM proto)
*/
-static int
+static __be32
nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
void *resp)
{
struct sockaddr_in saddr = rqstp->rq_addr;
- int vers = argp->vers;
- int prot = argp->proto >> 1;
- struct nlm_host *host;
dprintk("lockd: SM_NOTIFY called\n");
if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -466,19 +467,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
/* Obtain the host pointer for this NFS server and try to
* reclaim all locks we hold on this server.
*/
+ memset(&saddr, 0, sizeof(saddr));
saddr.sin_addr.s_addr = argp->addr;
- if ((argp->proto & 1)==0) {
- if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
- nlmclnt_recovery(host, argp->state);
- nlm_release_host(host);
- }
- } else {
- /* If we run on an NFS server, delete all locks held by the client */
- if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
- nlmsvc_free_host_resources(host);
- nlm_release_host(host);
- }
- }
+ nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
return rpc_success;
}
@@ -486,7 +477,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
/*
* client sent a GRANTED_RES, let's remove the associated block
*/
-static int
+static __be32
nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
void *resp)
{
@@ -495,7 +486,7 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp,
dprintk("lockd: GRANTED_RES called\n");
- nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status);
+ nlmsvc_grant_reply(&argp->cookie, argp->status);
return rpc_success;
}
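
Note on the svcproc.c hunks above: the repeated `return resp->status == nlm_drop_reply ? rpc_drop_reply : rpc_success;` lines let a procedure tell the RPC dispatcher to drop the request entirely instead of sending a reply. Below is a rough userspace sketch of that two-level status mapping; the enum values and function names are invented for illustration and are not the sunrpc API.

```c
#include <stdio.h>

/* Internal per-procedure outcome (stand-in for NLM status codes). */
enum proc_status { PROC_OK, PROC_DENIED, PROC_DROP_REPLY };

/* What the dispatcher should do with the transport
 * (stand-in for rpc_success / rpc_drop_reply). */
enum rpc_disposition { RPC_SEND_REPLY, RPC_DROP_REPLY };

static enum proc_status handle_lock_request(int resource_busy)
{
	if (resource_busy)
		return PROC_DROP_REPLY;   /* answer later, not now */
	return PROC_OK;
}

static enum rpc_disposition dispatch(int resource_busy)
{
	enum proc_status st = handle_lock_request(resource_busy);

	/* Same shape as the lockd hunks: only a "drop" status suppresses
	 * the reply; every other status is reported back to the caller. */
	return st == PROC_DROP_REPLY ? RPC_DROP_REPLY : RPC_SEND_REPLY;
}

int main(void)
{
	printf("idle resource -> %s\n",
	       dispatch(0) == RPC_SEND_REPLY ? "send reply" : "drop");
	printf("busy resource -> %s\n",
	       dispatch(1) == RPC_SEND_REPLY ? "send reply" : "drop");
	return 0;
}
```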
diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c
index 27288c83da96..6220dc2a3f2c 100644
--- a/fs/lockd/svcshare.c
+++ b/fs/lockd/svcshare.c
@@ -23,7 +23,7 @@ nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh)
&& !memcmp(share->s_owner.data, oh->data, oh->len);
}
-u32
+__be32
nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file,
struct nlm_args *argp)
{
@@ -64,7 +64,7 @@ update:
/*
* Delete a share.
*/
-u32
+__be32
nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
struct nlm_args *argp)
{
@@ -85,24 +85,20 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
}
/*
- * Traverse all shares for a given file (and host).
- * NLM_ACT_CHECK is handled by nlmsvc_inspect_file.
+ * Traverse all shares for a given file, and delete
+ * those owned by the given (type of) host
*/
-void
-nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, int action)
+void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file,
+ nlm_host_match_fn_t match)
{
struct nlm_share *share, **shpp;
shpp = &file->f_shares;
while ((share = *shpp) != NULL) {
- if (action == NLM_ACT_MARK)
- share->s_host->h_inuse = 1;
- else if (action == NLM_ACT_UNLOCK) {
- if (host == NULL || host == share->s_host) {
- *shpp = share->s_next;
- kfree(share);
- continue;
- }
+ if (match(share->s_host, host)) {
+ *shpp = share->s_next;
+ kfree(share);
+ continue;
}
shpp = &share->s_next;
}
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index a92dd98f8401..e83024e16042 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -25,9 +25,9 @@
/*
* Global file hash table
*/
-#define FILE_HASH_BITS 5
+#define FILE_HASH_BITS 7
#define FILE_NRHASH (1<<FILE_HASH_BITS)
-static struct nlm_file * nlm_files[FILE_NRHASH];
+static struct hlist_head nlm_files[FILE_NRHASH];
static DEFINE_MUTEX(nlm_file_mutex);
#ifdef NFSD_DEBUG
@@ -78,13 +78,14 @@ static inline unsigned int file_hash(struct nfs_fh *f)
* This is not quite right, but for now, we assume the client performs
* the proper R/W checking.
*/
-u32
+__be32
nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
struct nfs_fh *f)
{
+ struct hlist_node *pos;
struct nlm_file *file;
unsigned int hash;
- u32 nfserr;
+ __be32 nfserr;
nlm_debug_print_fh("nlm_file_lookup", f);
@@ -93,7 +94,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
/* Lock file table */
mutex_lock(&nlm_file_mutex);
- for (file = nlm_files[hash]; file; file = file->f_next)
+ hlist_for_each_entry(file, pos, &nlm_files[hash], f_list)
if (!nfs_compare_fh(&file->f_handle, f))
goto found;
@@ -105,8 +106,9 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
goto out_unlock;
memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
- file->f_hash = hash;
- init_MUTEX(&file->f_sema);
+ mutex_init(&file->f_mutex);
+ INIT_HLIST_NODE(&file->f_list);
+ INIT_LIST_HEAD(&file->f_blocks);
/* Open the file. Note that this must not sleep for too long, else
* we would lock up lockd:-) So no NFS re-exports, folks.
@@ -115,12 +117,11 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
* the file.
*/
if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) {
- dprintk("lockd: open failed (nfserr %d)\n", ntohl(nfserr));
+ dprintk("lockd: open failed (error %d)\n", nfserr);
goto out_free;
}
- file->f_next = nlm_files[hash];
- nlm_files[hash] = file;
+ hlist_add_head(&file->f_list, &nlm_files[hash]);
found:
dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
@@ -134,12 +135,6 @@ out_unlock:
out_free:
kfree(file);
-#ifdef CONFIG_LOCKD_V4
- if (nfserr == 1)
- nfserr = nlm4_stale_fh;
- else
-#endif
- nfserr = nlm_lck_denied;
goto out_unlock;
}
@@ -149,22 +144,14 @@ out_free:
static inline void
nlm_delete_file(struct nlm_file *file)
{
- struct nlm_file **fp, *f;
-
nlm_debug_print_file("closing file", file);
-
- fp = nlm_files + file->f_hash;
- while ((f = *fp) != NULL) {
- if (f == file) {
- *fp = file->f_next;
- nlmsvc_ops->fclose(file->f_file);
- kfree(file);
- return;
- }
- fp = &f->f_next;
+ if (!hlist_unhashed(&file->f_list)) {
+ hlist_del(&file->f_list);
+ nlmsvc_ops->fclose(file->f_file);
+ kfree(file);
+ } else {
+ printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
}
-
- printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
}
/*
@@ -172,7 +159,8 @@ nlm_delete_file(struct nlm_file *file)
* action.
*/
static int
-nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, int action)
+nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
+ nlm_host_match_fn_t match)
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
@@ -186,17 +174,11 @@ again:
/* update current lock count */
file->f_locks++;
+
lockhost = (struct nlm_host *) fl->fl_owner;
- if (action == NLM_ACT_MARK)
- lockhost->h_inuse = 1;
- else if (action == NLM_ACT_CHECK)
- return 1;
- else if (action == NLM_ACT_UNLOCK) {
+ if (match(lockhost, host)) {
struct file_lock lock = *fl;
- if (host && lockhost != host)
- continue;
-
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
@@ -213,53 +195,66 @@ again:
}
/*
- * Operate on a single file
+ * Inspect a single file
+ */
+static inline int
+nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match)
+{
+ nlmsvc_traverse_blocks(host, file, match);
+ nlmsvc_traverse_shares(host, file, match);
+ return nlm_traverse_locks(host, file, match);
+}
+
+/*
+ * Quick check whether there are still any locks, blocks or
+ * shares on a given file.
*/
static inline int
-nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, int action)
+nlm_file_inuse(struct nlm_file *file)
{
- if (action == NLM_ACT_CHECK) {
- /* Fast path for mark and sweep garbage collection */
- if (file->f_count || file->f_blocks || file->f_shares)
+ struct inode *inode = nlmsvc_file_inode(file);
+ struct file_lock *fl;
+
+ if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
+ return 1;
+
+ for (fl = inode->i_flock; fl; fl = fl->fl_next) {
+ if (fl->fl_lmops == &nlmsvc_lock_operations)
return 1;
- } else {
- nlmsvc_traverse_blocks(host, file, action);
- nlmsvc_traverse_shares(host, file, action);
}
- return nlm_traverse_locks(host, file, action);
+ file->f_locks = 0;
+ return 0;
}
/*
* Loop over all files in the file table.
*/
static int
-nlm_traverse_files(struct nlm_host *host, int action)
+nlm_traverse_files(struct nlm_host *host, nlm_host_match_fn_t match)
{
- struct nlm_file *file, **fp;
+ struct hlist_node *pos, *next;
+ struct nlm_file *file;
int i, ret = 0;
mutex_lock(&nlm_file_mutex);
for (i = 0; i < FILE_NRHASH; i++) {
- fp = nlm_files + i;
- while ((file = *fp) != NULL) {
+ hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) {
file->f_count++;
mutex_unlock(&nlm_file_mutex);
/* Traverse locks, blocks and shares of this file
* and update file->f_locks count */
- if (nlm_inspect_file(host, file, action))
+ if (nlm_inspect_file(host, file, match))
ret = 1;
mutex_lock(&nlm_file_mutex);
file->f_count--;
/* No more references to this file. Let go of it. */
- if (!file->f_blocks && !file->f_locks
+ if (list_empty(&file->f_blocks) && !file->f_locks
&& !file->f_shares && !file->f_count) {
- *fp = file->f_next;
+ hlist_del(&file->f_list);
nlmsvc_ops->fclose(file->f_file);
kfree(file);
- } else {
- fp = &file->f_next;
}
}
}
@@ -286,23 +281,63 @@ nlm_release_file(struct nlm_file *file)
mutex_lock(&nlm_file_mutex);
/* If there are no more locks etc, delete the file */
- if(--file->f_count == 0) {
- if(!nlm_inspect_file(NULL, file, NLM_ACT_CHECK))
- nlm_delete_file(file);
- }
+ if (--file->f_count == 0 && !nlm_file_inuse(file))
+ nlm_delete_file(file);
mutex_unlock(&nlm_file_mutex);
}
/*
+ * Helper functions for resource traversal
+ *
+ * nlmsvc_mark_host:
+ * used by the garbage collector; simply sets h_inuse.
+ * Always returns 0.
+ *
+ * nlmsvc_same_host:
+ * returns 1 iff the two hosts match. Used to release
+ * all resources bound to a specific host.
+ *
+ * nlmsvc_is_client:
+ * returns 1 iff the host is a client.
+ * Used by nlmsvc_invalidate_all
+ */
+static int
+nlmsvc_mark_host(struct nlm_host *host, struct nlm_host *dummy)
+{
+ host->h_inuse = 1;
+ return 0;
+}
+
+static int
+nlmsvc_same_host(struct nlm_host *host, struct nlm_host *other)
+{
+ return host == other;
+}
+
+static int
+nlmsvc_is_client(struct nlm_host *host, struct nlm_host *dummy)
+{
+ if (host->h_server) {
+ /* we are destroying locks even though the client
+ * hasn't asked us to, so don't unmonitor the
+ * client
+ */
+ if (host->h_nsmhandle)
+ host->h_nsmhandle->sm_sticky = 1;
+ return 1;
+ } else
+ return 0;
+}
+
+/*
* Mark all hosts that still hold resources
*/
void
nlmsvc_mark_resources(void)
{
dprintk("lockd: nlmsvc_mark_resources\n");
-
- nlm_traverse_files(NULL, NLM_ACT_MARK);
+ nlm_traverse_files(NULL, nlmsvc_mark_host);
}
/*
@@ -313,23 +348,25 @@ nlmsvc_free_host_resources(struct nlm_host *host)
{
dprintk("lockd: nlmsvc_free_host_resources\n");
- if (nlm_traverse_files(host, NLM_ACT_UNLOCK))
+ if (nlm_traverse_files(host, nlmsvc_same_host)) {
printk(KERN_WARNING
- "lockd: couldn't remove all locks held by %s",
+ "lockd: couldn't remove all locks held by %s\n",
host->h_name);
+ BUG();
+ }
}
/*
- * delete all hosts structs for clients
+ * Remove all locks held for clients
*/
void
nlmsvc_invalidate_all(void)
{
- struct nlm_host *host;
- while ((host = nlm_find_client()) != NULL) {
- nlmsvc_free_host_resources(host);
- host->h_expires = 0;
- host->h_killed = 1;
- nlm_release_host(host);
- }
+ /* Release all locks held by NFS clients.
+ * Previously, the code would call
+ * nlmsvc_free_host_resources for each client in
+ * turn, which is about as inefficient as it gets.
+ * Now we just do it once in nlm_traverse_files.
+ */
+ nlm_traverse_files(NULL, nlmsvc_is_client);
}
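
Note on the svcsubs.c hunks above: the old `int action` flags (NLM_ACT_MARK/CHECK/UNLOCK) are replaced by an `nlm_host_match_fn_t` callback, so one traversal routine can mark hosts, compare against a specific host, or select all clients. Below is a minimal, self-contained sketch of that callback-driven sweep over a singly linked list; the types and helpers are illustrative, not the lockd ones.

```c
#include <stdio.h>
#include <stdlib.h>

struct host { int id; int is_client; };

struct share {
	struct host *owner;
	struct share *next;
};

/* Same role as nlm_host_match_fn_t: decide whether an entry owned by
 * "owner" should be released on behalf of "key". */
typedef int (*host_match_fn)(struct host *owner, struct host *key);

static int same_host(struct host *owner, struct host *key)
{
	return owner == key;
}

static int is_client(struct host *owner, struct host *key)
{
	(void)key;
	return owner->is_client;
}

/* Walk the list and free every entry the match function selects,
 * mirroring nlmsvc_traverse_shares() after the rewrite. */
static void traverse_shares(struct share **list, struct host *key,
			    host_match_fn match)
{
	struct share **shpp = list, *share;

	while ((share = *shpp) != NULL) {
		if (match(share->owner, key)) {
			*shpp = share->next;
			free(share);
			continue;
		}
		shpp = &share->next;
	}
}

int main(void)
{
	struct host a = { 1, 1 }, b = { 2, 0 };
	struct share *list = NULL;
	struct host *owners[] = { &a, &b, &a };

	for (int i = 2; i >= 0; i--) {
		struct share *s = malloc(sizeof(*s));
		s->owner = owners[i];
		s->next = list;
		list = s;
	}
	traverse_shares(&list, &a, same_host);    /* drops both of a's shares */
	traverse_shares(&list, NULL, is_client);  /* drops any client-owned   */

	for (struct share *s = list; s; s = s->next)
		printf("left: host %d\n", s->owner->id);
	return 0;
}
```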
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 61c46facf257..b7c949256e5a 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -43,7 +43,7 @@ loff_t_to_s32(loff_t offset)
/*
* XDR functions for basic NLM types
*/
-static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c)
+static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
{
unsigned int len;
@@ -69,8 +69,8 @@ static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c)
return p;
}
-static inline u32 *
-nlm_encode_cookie(u32 *p, struct nlm_cookie *c)
+static inline __be32 *
+nlm_encode_cookie(__be32 *p, struct nlm_cookie *c)
{
*p++ = htonl(c->len);
memcpy(p, c->data, c->len);
@@ -78,8 +78,8 @@ nlm_encode_cookie(u32 *p, struct nlm_cookie *c)
return p;
}
-static u32 *
-nlm_decode_fh(u32 *p, struct nfs_fh *f)
+static __be32 *
+nlm_decode_fh(__be32 *p, struct nfs_fh *f)
{
unsigned int len;
@@ -95,8 +95,8 @@ nlm_decode_fh(u32 *p, struct nfs_fh *f)
return p + XDR_QUADLEN(NFS2_FHSIZE);
}
-static inline u32 *
-nlm_encode_fh(u32 *p, struct nfs_fh *f)
+static inline __be32 *
+nlm_encode_fh(__be32 *p, struct nfs_fh *f)
{
*p++ = htonl(NFS2_FHSIZE);
memcpy(p, f->data, NFS2_FHSIZE);
@@ -106,20 +106,20 @@ nlm_encode_fh(u32 *p, struct nfs_fh *f)
/*
* Encode and decode owner handle
*/
-static inline u32 *
-nlm_decode_oh(u32 *p, struct xdr_netobj *oh)
+static inline __be32 *
+nlm_decode_oh(__be32 *p, struct xdr_netobj *oh)
{
return xdr_decode_netobj(p, oh);
}
-static inline u32 *
-nlm_encode_oh(u32 *p, struct xdr_netobj *oh)
+static inline __be32 *
+nlm_encode_oh(__be32 *p, struct xdr_netobj *oh)
{
return xdr_encode_netobj(p, oh);
}
-static u32 *
-nlm_decode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
{
struct file_lock *fl = &lock->fl;
s32 start, len, end;
@@ -153,8 +153,8 @@ nlm_decode_lock(u32 *p, struct nlm_lock *lock)
/*
* Encode a lock as part of an NLM call
*/
-static u32 *
-nlm_encode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
{
struct file_lock *fl = &lock->fl;
__s32 start, len;
@@ -184,8 +184,8 @@ nlm_encode_lock(u32 *p, struct nlm_lock *lock)
/*
* Encode result of a TEST/TEST_MSG call
*/
-static u32 *
-nlm_encode_testres(u32 *p, struct nlm_res *resp)
+static __be32 *
+nlm_encode_testres(__be32 *p, struct nlm_res *resp)
{
s32 start, len;
@@ -221,7 +221,7 @@ nlm_encode_testres(u32 *p, struct nlm_res *resp)
* First, the server side XDR functions
*/
int
-nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
u32 exclusive;
@@ -238,7 +238,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_encode_testres(p, resp)))
return 0;
@@ -246,7 +246,7 @@ nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
u32 exclusive;
@@ -266,7 +266,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
u32 exclusive;
@@ -282,7 +282,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
if (!(p = nlm_decode_cookie(p, &argp->cookie))
|| !(p = nlm_decode_lock(p, &argp->lock)))
@@ -292,7 +292,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -313,7 +313,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_encode_cookie(p, &resp->cookie)))
return 0;
@@ -323,7 +323,7 @@ nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_encode_cookie(p, &resp->cookie)))
return 0;
@@ -332,7 +332,7 @@ nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
+nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -344,7 +344,7 @@ nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
}
int
-nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
+nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
{
if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
return 0;
@@ -357,7 +357,7 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
}
int
-nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_decode_cookie(p, &resp->cookie)))
return 0;
@@ -366,13 +366,13 @@ nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlmsvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_argsize_check(rqstp, p);
}
int
-nlmsvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_ressize_check(rqstp, p);
}
@@ -389,7 +389,7 @@ nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
#endif
static int
-nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -403,7 +403,7 @@ nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_decode_cookie(p, &resp->cookie)))
return -EIO;
@@ -438,7 +438,7 @@ nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
static int
-nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -455,7 +455,7 @@ nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -470,7 +470,7 @@ nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -483,7 +483,7 @@ nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_encode_cookie(p, &resp->cookie)))
return -EIO;
@@ -493,7 +493,7 @@ nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
}
static int
-nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_encode_testres(p, resp)))
return -EIO;
@@ -502,7 +502,7 @@ nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
}
static int
-nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm_decode_cookie(p, &resp->cookie)))
return -EIO;
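
Note on the xdr.c hunk above: it is almost entirely a type change from `u32 *` to `__be32 *`, so sparse can check that on-the-wire words are always produced with htonl() and consumed with ntohl(). A small userspace illustration of the encode/decode pairing follows; it uses plain uint32_t plus arpa/inet.h rather than the kernel's annotated types, and the cookie layout is only a simplified XDR-style example.

```c
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Encode a length-prefixed opaque cookie XDR-style: a big-endian 32-bit
 * length followed by the data, padded out to a 4-byte boundary. */
static uint32_t *encode_cookie(uint32_t *p, const unsigned char *data,
			       uint32_t len)
{
	*p++ = htonl(len);                 /* wire word: big-endian   */
	memcpy(p, data, len);
	return p + ((len + 3) >> 2);       /* XDR_QUADLEN-style round */
}

static uint32_t *decode_cookie(uint32_t *p, unsigned char *out,
			       uint32_t *lenp)
{
	uint32_t len = ntohl(*p++);        /* back to host order      */

	memcpy(out, p, len);
	*lenp = len;
	return p + ((len + 3) >> 2);
}

int main(void)
{
	uint32_t buf[8] = { 0 };
	unsigned char cookie[] = { 0xde, 0xad, 0xbe, 0xef, 0x01 };
	unsigned char copy[8];
	uint32_t len;

	encode_cookie(buf, cookie, sizeof(cookie));
	decode_cookie(buf, copy, &len);
	printf("decoded %u bytes, first = 0x%02x\n", len, copy[0]);
	return 0;
}
```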
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 36eb175ec335..f4c0b2b9f75a 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -44,8 +44,8 @@ loff_t_to_s64(loff_t offset)
/*
* XDR functions for basic NLM types
*/
-static u32 *
-nlm4_decode_cookie(u32 *p, struct nlm_cookie *c)
+static __be32 *
+nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
{
unsigned int len;
@@ -71,8 +71,8 @@ nlm4_decode_cookie(u32 *p, struct nlm_cookie *c)
return p;
}
-static u32 *
-nlm4_encode_cookie(u32 *p, struct nlm_cookie *c)
+static __be32 *
+nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c)
{
*p++ = htonl(c->len);
memcpy(p, c->data, c->len);
@@ -80,8 +80,8 @@ nlm4_encode_cookie(u32 *p, struct nlm_cookie *c)
return p;
}
-static u32 *
-nlm4_decode_fh(u32 *p, struct nfs_fh *f)
+static __be32 *
+nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
{
memset(f->data, 0, sizeof(f->data));
f->size = ntohl(*p++);
@@ -95,8 +95,8 @@ nlm4_decode_fh(u32 *p, struct nfs_fh *f)
return p + XDR_QUADLEN(f->size);
}
-static u32 *
-nlm4_encode_fh(u32 *p, struct nfs_fh *f)
+static __be32 *
+nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
{
*p++ = htonl(f->size);
if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
@@ -107,20 +107,20 @@ nlm4_encode_fh(u32 *p, struct nfs_fh *f)
/*
* Encode and decode owner handle
*/
-static u32 *
-nlm4_decode_oh(u32 *p, struct xdr_netobj *oh)
+static __be32 *
+nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
{
return xdr_decode_netobj(p, oh);
}
-static u32 *
-nlm4_encode_oh(u32 *p, struct xdr_netobj *oh)
+static __be32 *
+nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
{
return xdr_encode_netobj(p, oh);
}
-static u32 *
-nlm4_decode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
{
struct file_lock *fl = &lock->fl;
__s64 len, start, end;
@@ -153,8 +153,8 @@ nlm4_decode_lock(u32 *p, struct nlm_lock *lock)
/*
* Encode a lock as part of an NLM call
*/
-static u32 *
-nlm4_encode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
{
struct file_lock *fl = &lock->fl;
__s64 start, len;
@@ -185,8 +185,8 @@ nlm4_encode_lock(u32 *p, struct nlm_lock *lock)
/*
* Encode result of a TEST/TEST_MSG call
*/
-static u32 *
-nlm4_encode_testres(u32 *p, struct nlm_res *resp)
+static __be32 *
+nlm4_encode_testres(__be32 *p, struct nlm_res *resp)
{
s64 start, len;
@@ -227,7 +227,7 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp)
* First, the server side XDR functions
*/
int
-nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
u32 exclusive;
@@ -244,7 +244,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_encode_testres(p, resp)))
return 0;
@@ -252,7 +252,7 @@ nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
u32 exclusive;
@@ -272,7 +272,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
u32 exclusive;
@@ -288,7 +288,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
if (!(p = nlm4_decode_cookie(p, &argp->cookie))
|| !(p = nlm4_decode_lock(p, &argp->lock)))
@@ -298,7 +298,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -319,7 +319,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
}
int
-nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
return 0;
@@ -329,7 +329,7 @@ nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
return 0;
@@ -338,7 +338,7 @@ nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
+nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -350,7 +350,7 @@ nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
}
int
-nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
+nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
{
if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
return 0;
@@ -363,7 +363,7 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
}
int
-nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
return 0;
@@ -372,13 +372,13 @@ nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
}
int
-nlm4svc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_argsize_check(rqstp, p);
}
int
-nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_ressize_check(rqstp, p);
}
@@ -388,14 +388,14 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
*/
#ifdef NLMCLNT_SUPPORT_SHARES
static int
-nlm4clt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
+nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
{
return 0;
}
#endif
static int
-nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -409,7 +409,7 @@ nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
return -EIO;
@@ -444,7 +444,7 @@ nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
static int
-nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -461,7 +461,7 @@ nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -476,7 +476,7 @@ nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
{
struct nlm_lock *lock = &argp->lock;
@@ -489,7 +489,7 @@ nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
}
static int
-nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
return -EIO;
@@ -499,7 +499,7 @@ nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
}
static int
-nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_encode_testres(p, resp)))
return -EIO;
@@ -508,7 +508,7 @@ nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
}
static int
-nlm4clt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
{
if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
return -EIO;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index c11a4b9fb863..1e36bae4d0eb 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -149,12 +149,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
return -ENOMEM;
s->s_fs_info = sbi;
- /* N.B. These should be compile-time tests.
- Unfortunately that is impossible. */
- if (32 != sizeof (struct minix_inode))
- panic("bad V1 i-node size");
- if (64 != sizeof(struct minix2_inode))
- panic("bad V2 i-node size");
+ BUILD_BUG_ON(32 != sizeof (struct minix_inode));
+ BUILD_BUG_ON(64 != sizeof(struct minix2_inode));
if (!sb_set_blocksize(s, BLOCK_SIZE))
goto out_bad_hblock;
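
Note on the minix hunk above: BUILD_BUG_ON() turns a wrong on-disk structure size into a compile error instead of a runtime panic at mount time. A userspace approximation using C11 _Static_assert is sketched below; the struct is a placeholder layout, not the real minix_inode definition.

```c
#include <stdio.h>
#include <stdint.h>

/* Placeholder for an on-disk structure whose size is part of the
 * filesystem format and therefore must never drift. */
struct demo_inode {
	uint16_t mode;
	uint16_t uid;
	uint32_t size;
	uint32_t time;
	uint8_t  gid;
	uint8_t  nlinks;
	uint16_t zone[9];
};

/* Userspace stand-in for the kernel's BUILD_BUG_ON(): fail the build,
 * not the boot, if the layout ever changes. */
_Static_assert(sizeof(struct demo_inode) == 32,
	       "on-disk inode must stay 32 bytes");

int main(void)
{
	printf("struct demo_inode is %zu bytes\n", sizeof(struct demo_inode));
	return 0;
}
```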
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index a89ac84a8241..589d1eac55c1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -726,7 +726,7 @@ outrel:
struct compat_ncp_privatedata_ioctl user32;
user32.len = user.len;
user32.data = (unsigned long) user.data;
- if (copy_to_user(&user32, argp, sizeof(user32)))
+ if (copy_to_user(argp, &user32, sizeof(user32)))
return -EFAULT;
} else
#endif
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 5676163d26e8..db3d7919c601 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -31,10 +31,10 @@ struct cb_compound_hdr_arg {
};
struct cb_compound_hdr_res {
- uint32_t *status;
+ __be32 *status;
int taglen;
const char *tag;
- uint32_t *nops;
+ __be32 *nops;
};
struct cb_getattrargs {
@@ -44,7 +44,7 @@ struct cb_getattrargs {
};
struct cb_getattrres {
- uint32_t status;
+ __be32 status;
uint32_t bitmap[2];
uint64_t size;
uint64_t change_attr;
@@ -59,8 +59,8 @@ struct cb_recallargs {
uint32_t truncate;
};
-extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
-extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
#ifdef CONFIG_NFS_V4
extern int nfs_callback_up(void);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 97cf8f71451f..72e55d83756d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -14,7 +14,7 @@
#define NFSDBG_FACILITY NFSDBG_CALLBACK
-unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
{
struct nfs_client *clp;
struct nfs_delegation *delegation;
@@ -55,11 +55,11 @@ out:
return res->status;
}
-unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
{
struct nfs_client *clp;
struct inode *inode;
- unsigned res;
+ __be32 res;
res = htonl(NFS4ERR_BADHANDLE);
clp = nfs_find_client(args->addr, 4);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 29f932192054..f8ea1f51f590 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -22,9 +22,9 @@
#define NFSDBG_FACILITY NFSDBG_CALLBACK
-typedef unsigned (*callback_process_op_t)(void *, void *);
-typedef unsigned (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
-typedef unsigned (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
+typedef __be32 (*callback_process_op_t)(void *, void *);
+typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
+typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
struct callback_op {
@@ -36,24 +36,24 @@ struct callback_op {
static struct callback_op callback_ops[];
-static int nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
+static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
return htonl(NFS4_OK);
}
-static int nfs4_decode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_argsize_check(rqstp, p);
}
-static int nfs4_encode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_ressize_check(rqstp, p);
}
-static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
+static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
{
- uint32_t *p;
+ __be32 *p;
p = xdr_inline_decode(xdr, nbytes);
if (unlikely(p == NULL))
@@ -61,9 +61,9 @@ static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
return p;
}
-static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
+static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
{
- uint32_t *p;
+ __be32 *p;
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
@@ -81,9 +81,9 @@ static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const c
return 0;
}
-static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
- uint32_t *p;
+ __be32 *p;
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
@@ -99,9 +99,9 @@ static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
return 0;
}
-static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
{
- uint32_t *p;
+ __be32 *p;
unsigned int attrlen;
p = read_buf(xdr, 4);
@@ -118,9 +118,9 @@ static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
return 0;
}
-static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
- uint32_t *p;
+ __be32 *p;
p = read_buf(xdr, 16);
if (unlikely(p == NULL))
@@ -129,11 +129,11 @@ static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
return 0;
}
-static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
+static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
{
- uint32_t *p;
+ __be32 *p;
unsigned int minor_version;
- unsigned status;
+ __be32 status;
status = decode_string(xdr, &hdr->taglen, &hdr->tag);
if (unlikely(status != 0))
@@ -159,9 +159,9 @@ static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compou
return 0;
}
-static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
+static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
{
- uint32_t *p;
+ __be32 *p;
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
@@ -169,9 +169,9 @@ static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
return 0;
}
-static unsigned decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
+static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
{
- unsigned status;
+ __be32 status;
status = decode_fh(xdr, &args->fh);
if (unlikely(status != 0))
@@ -183,10 +183,10 @@ out:
return status;
}
-static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
+static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
{
- uint32_t *p;
- unsigned status;
+ __be32 *p;
+ __be32 status;
args->addr = &rqstp->rq_addr;
status = decode_stateid(xdr, &args->stateid);
@@ -204,9 +204,9 @@ out:
return status;
}
-static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
{
- uint32_t *p;
+ __be32 *p;
p = xdr_reserve_space(xdr, 4 + len);
if (unlikely(p == NULL))
@@ -217,10 +217,10 @@ static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const ch
#define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
#define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
-static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, uint32_t **savep)
+static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep)
{
- uint32_t bm[2];
- uint32_t *p;
+ __be32 bm[2];
+ __be32 *p;
bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
@@ -247,9 +247,9 @@ static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitma
return 0;
}
-static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
+static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
{
- uint32_t *p;
+ __be32 *p;
if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
return 0;
@@ -260,9 +260,9 @@ static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitma
return 0;
}
-static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
+static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
{
- uint32_t *p;
+ __be32 *p;
if (!(bitmap[0] & FATTR4_WORD0_SIZE))
return 0;
@@ -273,9 +273,9 @@ static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap,
return 0;
}
-static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
+static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
{
- uint32_t *p;
+ __be32 *p;
p = xdr_reserve_space(xdr, 12);
if (unlikely(p == 0))
@@ -285,23 +285,23 @@ static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *
return 0;
}
-static unsigned encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
{
if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
return 0;
return encode_attr_time(xdr,time);
}
-static unsigned encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
{
if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
return 0;
return encode_attr_time(xdr,time);
}
-static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
+static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
{
- unsigned status;
+ __be32 status;
hdr->status = xdr_reserve_space(xdr, 4);
if (unlikely(hdr->status == NULL))
@@ -315,9 +315,9 @@ static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compou
return 0;
}
-static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
+static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
{
- uint32_t *p;
+ __be32 *p;
p = xdr_reserve_space(xdr, 8);
if (unlikely(p == NULL))
@@ -327,10 +327,10 @@ static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
return 0;
}
-static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
+static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
{
- uint32_t *savep = NULL;
- unsigned status = res->status;
+ __be32 *savep = NULL;
+ __be32 status = res->status;
if (unlikely(status != 0))
goto out;
@@ -353,15 +353,15 @@ out:
return status;
}
-static unsigned process_op(struct svc_rqst *rqstp,
+static __be32 process_op(struct svc_rqst *rqstp,
struct xdr_stream *xdr_in, void *argp,
struct xdr_stream *xdr_out, void *resp)
{
struct callback_op *op = &callback_ops[0];
unsigned int op_nr = OP_CB_ILLEGAL;
- unsigned int status = 0;
+ __be32 status = 0;
long maxlen;
- unsigned res;
+ __be32 res;
dprintk("%s: start\n", __FUNCTION__);
status = decode_op_hdr(xdr_in, &op_nr);
@@ -399,20 +399,20 @@ static unsigned process_op(struct svc_rqst *rqstp,
/*
* Decode, process and encode a COMPOUND
*/
-static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
+static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
{
struct cb_compound_hdr_arg hdr_arg;
struct cb_compound_hdr_res hdr_res;
struct xdr_stream xdr_in, xdr_out;
- uint32_t *p;
- unsigned int status;
+ __be32 *p;
+ __be32 status;
unsigned int nops = 1;
dprintk("%s: start\n", __FUNCTION__);
xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
- p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
+ p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
decode_compound_hdr_arg(&xdr_in, &hdr_arg);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8106f3b29e4a..5fea638743e4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -10,7 +10,6 @@
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -233,11 +232,15 @@ void nfs_put_client(struct nfs_client *clp)
* Find a client by address
* - caller must hold nfs_client_lock
*/
-static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port)
{
struct nfs_client *clp;
list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+ /* Don't match clients that failed to initialise properly */
+ if (clp->cl_cons_state < 0)
+ continue;
+
/* Different NFS versions cannot share the same nfs_client */
if (clp->cl_nfsversion != nfsversion)
continue;
@@ -246,7 +249,7 @@ static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int
sizeof(clp->cl_addr.sin_addr)) != 0)
continue;
- if (clp->cl_addr.sin_port == addr->sin_port)
+ if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
goto found;
}
@@ -266,11 +269,12 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
struct nfs_client *clp;
spin_lock(&nfs_client_lock);
- clp = __nfs_find_client(addr, nfsversion);
+ clp = __nfs_find_client(addr, nfsversion, 0);
spin_unlock(&nfs_client_lock);
-
- BUG_ON(clp && clp->cl_cons_state == 0);
-
+ if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) {
+ nfs_put_client(clp);
+ clp = NULL;
+ }
return clp;
}
@@ -293,7 +297,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
do {
spin_lock(&nfs_client_lock);
- clp = __nfs_find_client(addr, nfsversion);
+ clp = __nfs_find_client(addr, nfsversion, 1);
if (clp)
goto found_client;
if (new)
@@ -323,25 +327,11 @@ found_client:
if (new)
nfs_free_client(new);
- if (clp->cl_cons_state == NFS_CS_INITING) {
- DECLARE_WAITQUEUE(myself, current);
-
- add_wait_queue(&nfs_client_active_wq, &myself);
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (signal_pending(current) ||
- clp->cl_cons_state > NFS_CS_READY)
- break;
- schedule();
- }
-
- remove_wait_queue(&nfs_client_active_wq, &myself);
-
- if (signal_pending(current)) {
- nfs_put_client(clp);
- return ERR_PTR(-ERESTARTSYS);
- }
+ error = wait_event_interruptible(nfs_client_active_wq,
+ clp->cl_cons_state != NFS_CS_INITING);
+ if (error < 0) {
+ nfs_put_client(clp);
+ return ERR_PTR(-ERESTARTSYS);
}
if (clp->cl_cons_state < NFS_CS_READY) {
@@ -864,6 +854,7 @@ error:
*/
static int nfs4_init_client(struct nfs_client *clp,
int proto, int timeo, int retrans,
+ const char *ip_addr,
rpc_authflavor_t authflavour)
{
int error;
@@ -880,6 +871,7 @@ static int nfs4_init_client(struct nfs_client *clp,
error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour);
if (error < 0)
goto error;
+ memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
error = nfs_idmap_new(clp);
if (error < 0) {
@@ -903,6 +895,7 @@ error:
*/
static int nfs4_set_client(struct nfs_server *server,
const char *hostname, const struct sockaddr_in *addr,
+ const char *ip_addr,
rpc_authflavor_t authflavour,
int proto, int timeo, int retrans)
{
@@ -917,7 +910,7 @@ static int nfs4_set_client(struct nfs_server *server,
error = PTR_ERR(clp);
goto error;
}
- error = nfs4_init_client(clp, proto, timeo, retrans, authflavour);
+ error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour);
if (error < 0)
goto error_put;
@@ -986,7 +979,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
return ERR_PTR(-ENOMEM);
/* Get a client record */
- error = nfs4_set_client(server, hostname, addr, authflavour,
+ error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour,
data->proto, data->timeo, data->retrans);
if (error < 0)
goto error;
@@ -1056,6 +1049,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
/* Get a client representation.
* Note: NFSv4 always uses TCP, */
error = nfs4_set_client(server, data->hostname, data->addr,
+ parent_client->cl_ipaddr,
data->authflavor,
parent_server->client->cl_xprt->prot,
parent_client->retrans_timeo,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 481f8892a919..4133ef5264e5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -142,12 +142,12 @@ nfs_opendir(struct inode *inode, struct file *filp)
return res;
}
-typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int);
+typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
typedef struct {
struct file *file;
struct page *page;
unsigned long page_index;
- u32 *ptr;
+ __be32 *ptr;
u64 *dir_cookie;
loff_t current_index;
struct nfs_entry *entry;
@@ -203,8 +203,10 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
* Note: assumes we have exclusive access to this mapping either
* through inode->i_mutex or some other mechanism.
*/
- if (page->index == 0)
- invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
+ if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
+ /* Should never happen */
+ nfs_zap_mapping(inode, inode->i_mapping);
+ }
unlock_page(page);
return 0;
error:
@@ -218,7 +220,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
static inline
int dir_decode(nfs_readdir_descriptor_t *desc)
{
- u32 *p = desc->ptr;
+ __be32 *p = desc->ptr;
p = desc->decode(p, desc->entry, desc->plus);
if (IS_ERR(p))
return PTR_ERR(p);
@@ -1517,8 +1519,8 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
pagevec_init(&lru_pvec, 0);
if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
GFP_KERNEL)) {
- if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
+ pagevec_add(&lru_pvec, page);
+ pagevec_lru_add(&lru_pvec);
SetPageUptodate(page);
unlock_page(page);
} else
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9f7f8b9ea1e2..bdfabf854a51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -497,6 +497,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
if (dreq->commit_data != NULL)
nfs_commit_free(dreq->commit_data);
nfs_direct_free_writedata(dreq);
+ nfs_zap_mapping(inode, inode->i_mapping);
nfs_direct_complete(dreq);
}
}
@@ -517,6 +518,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
{
nfs_end_data_update(inode);
nfs_direct_free_writedata(dreq);
+ nfs_zap_mapping(inode, inode->i_mapping);
nfs_direct_complete(dreq);
}
#endif
@@ -532,10 +534,12 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
spin_lock(&dreq->lock);
- if (likely(status >= 0))
- dreq->count += data->res.count;
- else
- dreq->error = task->tk_status;
+ if (unlikely(status < 0)) {
+ dreq->error = status;
+ goto out_unlock;
+ }
+
+ dreq->count += data->res.count;
if (data->res.verf->committed != NFS_FILE_SYNC) {
switch (dreq->flags) {
@@ -550,7 +554,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
}
}
}
-
+out_unlock:
spin_unlock(&dreq->lock);
}
@@ -828,17 +832,6 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
- /*
- * XXX: nfs_end_data_update() already ensures this file's
- * cached data is subsequently invalidated. Do we really
- * need to call invalidate_inode_pages2() again here?
- *
- * For aio writes, this invalidation will almost certainly
- * occur before the writes complete. Kind of racey.
- */
- if (mapping->nrpages)
- invalidate_inode_pages2(mapping);
-
if (retval > 0)
iocb->ki_pos = pos + retval;
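The direct.c hunks above remove the racy submit-time invalidate_inode_pages2() and instead call the new nfs_zap_mapping() when the direct write (or its commit) completes, so cached pages are flushed lazily on the next revalidation rather than while aio writes are still in flight. A userspace model of that "flag now, flush later" pattern — a sketch of the idea, not kernel code:

/* Completion paths only set an "invalid data" flag; the next reader performs
 * the actual flush.  Names are stand-ins for the kernel structures. */
#include <stdbool.h>
#include <stdio.h>

struct demo_inode {
	bool     invalid_data;   /* plays the role of NFS_INO_INVALID_DATA */
	unsigned cached_pages;   /* plays the role of mapping->nrpages */
};

/* Analogue of nfs_zap_mapping(): cheap and safe from a completion context. */
static void demo_zap_mapping(struct demo_inode *inode)
{
	if (inode->cached_pages != 0)
		inode->invalid_data = true;
}

/* Analogue of nfs_revalidate_mapping(): the flush happens here, lazily. */
static void demo_revalidate_mapping(struct demo_inode *inode)
{
	if (inode->invalid_data) {
		printf("invalidating %u cached pages\n", inode->cached_pages);
		inode->cached_pages = 0;
		inode->invalid_data = false;
	}
}

int main(void)
{
	struct demo_inode inode = { .cached_pages = 4 };

	demo_zap_mapping(&inode);        /* direct write completed */
	demo_revalidate_mapping(&inode); /* next cached read flushes */
	return 0;
}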
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 76b08ae9ed82..20c6f39ea38a 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -9,7 +9,6 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bc9376ca86cd..08cc4c5919ab 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -131,6 +131,15 @@ void nfs_zap_caches(struct inode *inode)
spin_unlock(&inode->i_lock);
}
+void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
+{
+ if (mapping->nrpages != 0) {
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
+ }
+}
+
static void nfs_zap_acl_cache(struct inode *inode)
{
void (*clear_acl_cache)(struct inode *);
@@ -574,7 +583,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
lock_kernel();
- if (!inode || is_bad_inode(inode))
+ if (is_bad_inode(inode))
goto out_nowait;
if (NFS_STALE(inode))
goto out_nowait;
@@ -671,13 +680,20 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
|| nfs_attribute_timeout(inode))
ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (ret < 0)
+ goto out;
if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
- nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
- if (S_ISREG(inode->i_mode))
- nfs_sync_mapping(mapping);
- invalidate_inode_pages2(mapping);
-
+ if (mapping->nrpages != 0) {
+ if (S_ISREG(inode->i_mode)) {
+ ret = nfs_sync_mapping(mapping);
+ if (ret < 0)
+ goto out;
+ }
+ ret = invalidate_inode_pages2(mapping);
+ if (ret < 0)
+ goto out;
+ }
spin_lock(&inode->i_lock);
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
if (S_ISDIR(inode->i_mode)) {
@@ -687,10 +703,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
}
spin_unlock(&inode->i_lock);
+ nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode));
}
+out:
return ret;
}
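Besides adding nfs_zap_mapping(), the inode.c hunks make nfs_revalidate_mapping() propagate failures from nfs_sync_mapping() and invalidate_inode_pages2() instead of ignoring them, and only touch the mapping when it actually holds pages. A userspace sketch of that single-exit error-propagation structure, using stand-in functions rather than the kernel ones:

/* Each step that can fail is checked and short-circuits to one exit label,
 * so callers see the first failure instead of a silently ignored one. */
#include <errno.h>
#include <stdio.h>

static int sync_mapping(void)       { return 0; }      /* pretend writeback worked */
static int invalidate_mapping(void) { return -EBUSY; } /* pretend invalidation failed */

static int revalidate_mapping(int has_pages, int is_regular_file)
{
	int ret = 0;

	if (has_pages) {
		if (is_regular_file) {
			ret = sync_mapping();
			if (ret < 0)
				goto out;   /* writeback failed: report it */
		}
		ret = invalidate_mapping();
		if (ret < 0)
			goto out;           /* pages still busy: report it */
	}
	/* only on success would the "data invalid" flag be cleared here */
out:
	return ret;
}

int main(void)
{
	printf("revalidate returned %d\n", revalidate_mapping(1, 1));
	return 0;
}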
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bea0b016bd70..d205466233f6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -93,15 +93,15 @@ extern void nfs_destroy_directcache(void);
/* nfs2xdr.c */
extern int nfs_stat_to_errno(int);
extern struct rpc_procinfo nfs_procedures[];
-extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
+extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
/* nfs3xdr.c */
extern struct rpc_procinfo nfs3_procedures[];
-extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
+extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
/* nfs4xdr.c */
#ifdef CONFIG_NFS_V4
-extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
#endif
/* nfs4proc.c */
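From here on, the patch converts the XDR helpers from u32 to __be32. In the kernel, __be32 is a sparse "bitwise" type: the generated code is identical, but static analysis flags any use of a big-endian wire word as a host-order integer unless it passes through ntohl()/htonl() (or the cpu_to_be32()/be32_to_cpu() helpers). A small userspace illustration of why the two representations deserve distinct types; the byte layouts differ on a little-endian host:

/* Using a wire-order word where a host-order one is expected silently
 * corrupts data; __be32 lets sparse catch that at analysis time.  This
 * sketch just shows the raw difference with htonl()/ntohl(). */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t host = 80;             /* host-order value, e.g. a count */
	uint32_t wire = htonl(host);    /* big-endian form that goes on the wire */

	printf("host order: 0x%08x\n", host);
	printf("wire order: 0x%08x (as misread by the host)\n", wire);
	printf("round trip: %u\n", ntohl(wire));
	return 0;
}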
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d507b021207f..f75fe72b4160 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -95,7 +95,7 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
* XDR encode/decode functions for MOUNT
*/
static int
-xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path)
+xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path)
{
p = xdr_encode_string(p, path);
@@ -104,7 +104,7 @@ xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path)
}
static int
-xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
+xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
{
struct nfs_fh *fh = res->fh;
@@ -116,7 +116,7 @@ xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
}
static int
-xdr_decode_fhstatus3(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
+xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
{
struct nfs_fh *fh = res->fh;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 60408646176b..ec1114b33d89 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -7,8 +7,6 @@
* NFS namespace
*/
-#include <linux/config.h>
-
#include <linux/dcache.h>
#include <linux/mount.h>
#include <linux/namei.h>
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index b49501fc0a79..3be4e72a0227 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -66,15 +66,15 @@
/*
* Common NFS XDR functions as inlines
*/
-static inline u32 *
-xdr_encode_fhandle(u32 *p, struct nfs_fh *fhandle)
+static inline __be32 *
+xdr_encode_fhandle(__be32 *p, struct nfs_fh *fhandle)
{
memcpy(p, fhandle->data, NFS2_FHSIZE);
return p + XDR_QUADLEN(NFS2_FHSIZE);
}
-static inline u32 *
-xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle)
+static inline __be32 *
+xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
{
/* NFSv2 handles have a fixed length */
fhandle->size = NFS2_FHSIZE;
@@ -82,8 +82,8 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle)
return p + XDR_QUADLEN(NFS2_FHSIZE);
}
-static inline u32*
-xdr_encode_time(u32 *p, struct timespec *timep)
+static inline __be32*
+xdr_encode_time(__be32 *p, struct timespec *timep)
{
*p++ = htonl(timep->tv_sec);
/* Convert nanoseconds into microseconds */
@@ -91,8 +91,8 @@ xdr_encode_time(u32 *p, struct timespec *timep)
return p;
}
-static inline u32*
-xdr_encode_current_server_time(u32 *p, struct timespec *timep)
+static inline __be32*
+xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
{
/*
* Passing the invalid value useconds=1000000 is a
@@ -108,8 +108,8 @@ xdr_encode_current_server_time(u32 *p, struct timespec *timep)
return p;
}
-static inline u32*
-xdr_decode_time(u32 *p, struct timespec *timep)
+static inline __be32*
+xdr_decode_time(__be32 *p, struct timespec *timep)
{
timep->tv_sec = ntohl(*p++);
/* Convert microseconds into nanoseconds */
@@ -117,8 +117,8 @@ xdr_decode_time(u32 *p, struct timespec *timep)
return p;
}
-static u32 *
-xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
+static __be32 *
+xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
{
u32 rdev;
fattr->type = (enum nfs_ftype) ntohl(*p++);
@@ -146,10 +146,10 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
return p;
}
-static inline u32 *
-xdr_encode_sattr(u32 *p, struct iattr *attr)
+static inline __be32 *
+xdr_encode_sattr(__be32 *p, struct iattr *attr)
{
- const u32 not_set = __constant_htonl(0xFFFFFFFF);
+ const __be32 not_set = __constant_htonl(0xFFFFFFFF);
*p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
*p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
@@ -184,7 +184,7 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
* GETATTR, READLINK, STATFS
*/
static int
-nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
+nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
{
p = xdr_encode_fhandle(p, fh);
req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -195,7 +195,7 @@ nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
* Encode SETATTR arguments
*/
static int
-nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args)
+nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_sattr(p, args->sattr);
@@ -208,7 +208,7 @@ nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args)
* LOOKUP, REMOVE, RMDIR
*/
static int
-nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args)
+nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_array(p, args->name, args->len);
@@ -222,7 +222,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args)
* exactly to the page we want to fetch.
*/
static int
-nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
+nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
{
struct rpc_auth *auth = req->rq_task->tk_auth;
unsigned int replen;
@@ -246,7 +246,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
* Decode READ reply
*/
static int
-nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
+nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
{
struct kvec *iov = req->rq_rcv_buf.head;
int status, count, recvd, hdrlen;
@@ -286,7 +286,7 @@ nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
* Write arguments. Splice the buffer to be written into the iovec.
*/
static int
-nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
+nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
{
struct xdr_buf *sndbuf = &req->rq_snd_buf;
u32 offset = (u32)args->offset;
@@ -309,7 +309,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
* CREATE, MKDIR
*/
static int
-nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args)
+nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_array(p, args->name, args->len);
@@ -322,7 +322,7 @@ nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args)
* Encode RENAME arguments
*/
static int
-nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args)
+nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
{
p = xdr_encode_fhandle(p, args->fromfh);
p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -336,7 +336,7 @@ nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args)
* Encode LINK arguments
*/
static int
-nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
+nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
{
p = xdr_encode_fhandle(p, args->fromfh);
p = xdr_encode_fhandle(p, args->tofh);
@@ -349,7 +349,7 @@ nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
* Encode SYMLINK arguments
*/
static int
-nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
+nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
{
struct xdr_buf *sndbuf = &req->rq_snd_buf;
size_t pad;
@@ -378,7 +378,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
* Encode arguments to readdir call
*/
static int
-nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args)
+nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
{
struct rpc_task *task = req->rq_task;
struct rpc_auth *auth = task->tk_auth;
@@ -404,7 +404,7 @@ nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args)
* from nfs_readdir for each entry.
*/
static int
-nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
+nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
{
struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
struct kvec *iov = rcvbuf->head;
@@ -412,7 +412,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
int hdrlen, recvd;
int status, nr;
unsigned int len, pglen;
- u32 *end, *entry, *kaddr;
+ __be32 *end, *entry, *kaddr;
if ((status = ntohl(*p++)))
return -nfs_stat_to_errno(status);
@@ -432,8 +432,8 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
if (pglen > recvd)
pglen = recvd;
page = rcvbuf->pages;
- kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0);
- end = (u32 *)((char *)p + pglen);
+ kaddr = p = kmap_atomic(*page, KM_USER0);
+ end = (__be32 *)((char *)p + pglen);
entry = p;
for (nr = 0; *p++; nr++) {
if (p + 2 > end)
@@ -468,8 +468,8 @@ err_unmap:
goto out;
}
-u32 *
-nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
+__be32 *
+nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
{
if (!*p++) {
if (!*p)
@@ -496,7 +496,7 @@ nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
* Decode simple status reply
*/
static int
-nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy)
+nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
{
int status;
@@ -510,7 +510,7 @@ nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy)
* GETATTR, SETATTR, WRITE
*/
static int
-nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
{
int status;
@@ -525,7 +525,7 @@ nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
* LOOKUP, CREATE, MKDIR
*/
static int
-nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res)
+nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
{
int status;
@@ -540,7 +540,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res)
* Encode READLINK args
*/
static int
-nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args)
+nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
{
struct rpc_auth *auth = req->rq_task->tk_auth;
unsigned int replen;
@@ -558,7 +558,7 @@ nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args
* Decode READLINK reply
*/
static int
-nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy)
+nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
{
struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
struct kvec *iov = rcvbuf->head;
@@ -601,7 +601,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy)
* Decode WRITE reply
*/
static int
-nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
+nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
{
res->verf->committed = NFS_FILE_SYNC;
return nfs_xdr_attrstat(req, p, res->fattr);
@@ -611,7 +611,7 @@ nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
* Decode STATFS reply
*/
static int
-nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs2_fsstat *res)
+nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
{
int status;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3b234d4601e7..e5f128ffc32d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -668,7 +668,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
{
struct inode *dir = dentry->d_inode;
struct nfs_fattr dir_attr;
- u32 *verf = NFS_COOKIEVERF(dir);
+ __be32 *verf = NFS_COOKIEVERF(dir);
struct nfs3_readdirargs arg = {
.fh = NFS_FH(dir),
.cookie = cookie,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 16556fa4effb..0ace092d126f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -105,14 +105,14 @@ static struct {
/*
* Common NFS XDR functions as inlines
*/
-static inline u32 *
-xdr_encode_fhandle(u32 *p, struct nfs_fh *fh)
+static inline __be32 *
+xdr_encode_fhandle(__be32 *p, struct nfs_fh *fh)
{
return xdr_encode_array(p, fh->data, fh->size);
}
-static inline u32 *
-xdr_decode_fhandle(u32 *p, struct nfs_fh *fh)
+static inline __be32 *
+xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
{
if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
memcpy(fh->data, p, fh->size);
@@ -124,24 +124,24 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fh)
/*
* Encode/decode time.
*/
-static inline u32 *
-xdr_encode_time3(u32 *p, struct timespec *timep)
+static inline __be32 *
+xdr_encode_time3(__be32 *p, struct timespec *timep)
{
*p++ = htonl(timep->tv_sec);
*p++ = htonl(timep->tv_nsec);
return p;
}
-static inline u32 *
-xdr_decode_time3(u32 *p, struct timespec *timep)
+static inline __be32 *
+xdr_decode_time3(__be32 *p, struct timespec *timep)
{
timep->tv_sec = ntohl(*p++);
timep->tv_nsec = ntohl(*p++);
return p;
}
-static u32 *
-xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
+static __be32 *
+xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
{
unsigned int type, major, minor;
int fmode;
@@ -177,8 +177,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
return p;
}
-static inline u32 *
-xdr_encode_sattr(u32 *p, struct iattr *attr)
+static inline __be32 *
+xdr_encode_sattr(__be32 *p, struct iattr *attr)
{
if (attr->ia_valid & ATTR_MODE) {
*p++ = xdr_one;
@@ -223,8 +223,8 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
return p;
}
-static inline u32 *
-xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
{
p = xdr_decode_hyper(p, &fattr->pre_size);
p = xdr_decode_time3(p, &fattr->pre_mtime);
@@ -233,16 +233,16 @@ xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr)
return p;
}
-static inline u32 *
-xdr_decode_post_op_attr(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
{
if (*p++)
p = xdr_decode_fattr(p, fattr);
return p;
}
-static inline u32 *
-xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
{
if (*p++)
return xdr_decode_wcc_attr(p, fattr);
@@ -250,8 +250,8 @@ xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr)
}
-static inline u32 *
-xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
{
p = xdr_decode_pre_op_attr(p, fattr);
return xdr_decode_post_op_attr(p, fattr);
@@ -265,7 +265,7 @@ xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr)
* Encode file handle argument
*/
static int
-nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
+nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
{
p = xdr_encode_fhandle(p, fh);
req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -276,7 +276,7 @@ nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
* Encode SETATTR arguments
*/
static int
-nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args)
+nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_sattr(p, args->sattr);
@@ -291,7 +291,7 @@ nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args)
* Encode directory ops argument
*/
static int
-nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args)
+nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_array(p, args->name, args->len);
@@ -303,7 +303,7 @@ nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args)
* Encode access() argument
*/
static int
-nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args)
+nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
*p++ = htonl(args->access);
@@ -317,7 +317,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args)
* exactly to the page we want to fetch.
*/
static int
-nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
+nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
{
struct rpc_auth *auth = req->rq_task->tk_auth;
unsigned int replen;
@@ -339,7 +339,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
* Write arguments. Splice the buffer to be written into the iovec.
*/
static int
-nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
+nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
{
struct xdr_buf *sndbuf = &req->rq_snd_buf;
u32 count = args->count;
@@ -360,7 +360,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
* Encode CREATE arguments
*/
static int
-nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args)
+nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_array(p, args->name, args->len);
@@ -380,7 +380,7 @@ nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args)
* Encode MKDIR arguments
*/
static int
-nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args)
+nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_array(p, args->name, args->len);
@@ -393,7 +393,7 @@ nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args)
* Encode SYMLINK arguments
*/
static int
-nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args)
+nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
{
p = xdr_encode_fhandle(p, args->fromfh);
p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -410,7 +410,7 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args
* Encode MKNOD arguments
*/
static int
-nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args)
+nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_array(p, args->name, args->len);
@@ -429,7 +429,7 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args)
* Encode RENAME arguments
*/
static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args)
+nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
{
p = xdr_encode_fhandle(p, args->fromfh);
p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -443,7 +443,7 @@ nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args)
* Encode LINK arguments
*/
static int
-nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args)
+nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
{
p = xdr_encode_fhandle(p, args->fromfh);
p = xdr_encode_fhandle(p, args->tofh);
@@ -456,7 +456,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args)
* Encode arguments to readdir call
*/
static int
-nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args)
+nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
{
struct rpc_auth *auth = req->rq_task->tk_auth;
unsigned int replen;
@@ -485,7 +485,7 @@ nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args
* We just check for syntactical correctness.
*/
static int
-nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
+nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res)
{
struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
struct kvec *iov = rcvbuf->head;
@@ -493,7 +493,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
int hdrlen, recvd;
int status, nr;
unsigned int len, pglen;
- u32 *entry, *end, *kaddr;
+ __be32 *entry, *end, *kaddr;
status = ntohl(*p++);
/* Decode post_op_attrs */
@@ -523,8 +523,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
if (pglen > recvd)
pglen = recvd;
page = rcvbuf->pages;
- kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0);
- end = (u32 *)((char *)p + pglen);
+ kaddr = p = kmap_atomic(*page, KM_USER0);
+ end = (__be32 *)((char *)p + pglen);
entry = p;
for (nr = 0; *p++; nr++) {
if (p + 3 > end)
@@ -583,8 +583,8 @@ err_unmap:
goto out;
}
-u32 *
-nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
+__be32 *
+nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
{
struct nfs_entry old = *entry;
@@ -626,7 +626,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
* Encode COMMIT arguments
*/
static int
-nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
+nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
{
p = xdr_encode_fhandle(p, args->fh);
p = xdr_encode_hyper(p, args->offset);
@@ -640,7 +640,7 @@ nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
* Encode GETACL arguments
*/
static int
-nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p,
+nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
struct nfs3_getaclargs *args)
{
struct rpc_auth *auth = req->rq_task->tk_auth;
@@ -664,7 +664,7 @@ nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p,
* Encode SETACL arguments
*/
static int
-nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p,
+nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
struct nfs3_setaclargs *args)
{
struct xdr_buf *buf = &req->rq_snd_buf;
@@ -711,7 +711,7 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p,
* Decode attrstat reply.
*/
static int
-nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
{
int status;
@@ -726,7 +726,7 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
* SATTR, REMOVE, RMDIR
*/
static int
-nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
{
int status;
@@ -740,7 +740,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
* Decode LOOKUP reply
*/
static int
-nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
+nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
{
int status;
@@ -759,7 +759,7 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
* Decode ACCESS reply
*/
static int
-nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res)
+nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
{
int status = ntohl(*p++);
@@ -771,7 +771,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res)
}
static int
-nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *args)
+nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
{
struct rpc_auth *auth = req->rq_task->tk_auth;
unsigned int replen;
@@ -789,7 +789,7 @@ nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *ar
* Decode READLINK reply
*/
static int
-nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
{
struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
struct kvec *iov = rcvbuf->head;
@@ -837,7 +837,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
* Decode READ reply
*/
static int
-nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
+nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
{
struct kvec *iov = req->rq_rcv_buf.head;
int status, count, ocount, recvd, hdrlen;
@@ -888,7 +888,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
* Decode WRITE response
*/
static int
-nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
+nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
{
int status;
@@ -910,7 +910,7 @@ nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
* Decode a CREATE response
*/
static int
-nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
+nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
{
int status;
@@ -937,7 +937,7 @@ nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
* Decode RENAME reply
*/
static int
-nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res)
+nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
{
int status;
@@ -952,7 +952,7 @@ nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res)
* Decode LINK reply
*/
static int
-nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res)
+nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
{
int status;
@@ -967,7 +967,7 @@ nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res)
* Decode FSSTAT reply
*/
static int
-nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res)
+nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res)
{
int status;
@@ -992,7 +992,7 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res)
* Decode FSINFO reply
*/
static int
-nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res)
+nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
{
int status;
@@ -1020,7 +1020,7 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res)
* Decode PATHCONF reply
*/
static int
-nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res)
+nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
{
int status;
@@ -1040,7 +1040,7 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res)
* Decode COMMIT reply
*/
static int
-nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
+nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
{
int status;
@@ -1059,7 +1059,7 @@ nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
* Decode GETACL reply
*/
static int
-nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p,
+nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
struct nfs3_getaclres *res)
{
struct xdr_buf *buf = &req->rq_rcv_buf;
@@ -1091,7 +1091,7 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p,
* Decode setacl reply.
*/
static int
-nfs3_xdr_setaclres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
{
int status = ntohl(*p++);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 61095fe4b5ca..6f346677332d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -212,7 +212,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
extern const nfs4_stateid zero_stateid;
/* nfs4xdr.c */
-extern uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
extern struct rpc_procinfo nfs4_procedures[];
struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24e47f3bbd17..b872779d7cd5 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -7,8 +7,6 @@
* NFSv4 namespace
*/
-#include <linux/config.h>
-
#include <linux/dcache.h>
#include <linux/mount.h>
#include <linux/namei.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47c7e6e3910d..8118036cc449 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -138,10 +138,10 @@ const u32 nfs4_fs_locations_bitmap[2] = {
| FATTR4_WORD1_MOUNTED_ON_FILEID
};
-static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
+static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
struct nfs4_readdir_arg *readdir)
{
- u32 *start, *p;
+ __be32 *start, *p;
BUG_ON(readdir->count < 80);
if (cookie > 2) {
@@ -162,7 +162,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
* when talking to the server, we always send cookie 0
* instead of 1 or 2.
*/
- start = p = (u32 *)kmap_atomic(*readdir->pages, KM_USER0);
+ start = p = kmap_atomic(*readdir->pages, KM_USER0);
if (cookie == 0) {
*p++ = xdr_one; /* next */
@@ -1314,11 +1314,9 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
case -EROFS:
lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
return 1;
- case -ENOENT:
- if (dentry->d_inode == NULL)
- return 1;
+ default:
+ goto out_drop;
}
- goto out_drop;
}
if (state->inode == dentry->d_inode) {
nfs4_intent_set_file(nd, dentry, state);
@@ -2917,11 +2915,11 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
.rpc_resp = clp,
.rpc_cred = cred,
};
- u32 *p;
+ __be32 *p;
int loop = 0;
int status;
- p = (u32*)sc_verifier.data;
+ p = (__be32*)sc_verifier.data;
*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
*p = htonl((u32)clp->cl_boot_time.tv_nsec);
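The nfs4proc.c hunk above now fills the SETCLIENTID boot-time verifier through a __be32 pointer, since the two words it stores with htonl() are wire-order data. A hedged userspace sketch of packing an 8-byte verifier the same way (illustrative types, not the kernel's nfs4_verifier):

/* Pack seconds/nanoseconds into an 8-byte opaque verifier as two big-endian
 * 32-bit words, the way the hunk above fills sc_verifier.data. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

int main(void)
{
	unsigned char verifier[8];
	struct timespec boot;
	uint32_t words[2];

	clock_gettime(CLOCK_REALTIME, &boot);     /* stand-in for cl_boot_time */
	words[0] = htonl((uint32_t)boot.tv_sec);
	words[1] = htonl((uint32_t)boot.tv_nsec);
	memcpy(verifier, words, sizeof(verifier));

	for (size_t i = 0; i < sizeof(verifier); i++)
		printf("%02x", verifier[i]);
	putchar('\n');
	return 0;
}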
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3dd413f52da1..0cf3fa312a33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -471,7 +471,7 @@ struct compound_hdr {
static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
{
- uint32_t *p;
+ __be32 *p;
p = xdr_reserve_space(xdr, 4 + len);
BUG_ON(p == NULL);
@@ -480,7 +480,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- uint32_t *p;
+ __be32 *p;
dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
@@ -494,7 +494,7 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
{
- uint32_t *p;
+ __be32 *p;
p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
BUG_ON(p == NULL);
@@ -507,8 +507,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
char owner_group[IDMAP_NAMESZ];
int owner_namelen = 0;
int owner_grouplen = 0;
- uint32_t *p;
- uint32_t *q;
+ __be32 *p;
+ __be32 *q;
int len;
uint32_t bmval0 = 0;
uint32_t bmval1 = 0;
@@ -630,7 +630,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
static int encode_access(struct xdr_stream *xdr, u32 access)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8);
WRITE32(OP_ACCESS);
@@ -641,7 +641,7 @@ static int encode_access(struct xdr_stream *xdr, u32 access)
static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8+sizeof(arg->stateid->data));
WRITE32(OP_CLOSE);
@@ -653,7 +653,7 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(16);
WRITE32(OP_COMMIT);
@@ -665,7 +665,7 @@ static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *arg
static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8);
WRITE32(OP_CREATE);
@@ -697,7 +697,7 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(12);
WRITE32(OP_GETATTR);
@@ -708,7 +708,7 @@ static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(16);
WRITE32(OP_GETATTR);
@@ -740,7 +740,7 @@ static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
static int encode_getfh(struct xdr_stream *xdr)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_GETFH);
@@ -750,7 +750,7 @@ static int encode_getfh(struct xdr_stream *xdr)
static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8 + name->len);
WRITE32(OP_LINK);
@@ -780,7 +780,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
*/
static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(32);
WRITE32(OP_LOCK);
@@ -809,7 +809,7 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(40);
WRITE32(OP_LOCKT);
@@ -825,7 +825,7 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(44);
WRITE32(OP_LOCKU);
@@ -841,7 +841,7 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
{
int len = name->len;
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8 + len);
WRITE32(OP_LOOKUP);
@@ -853,7 +853,7 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
static void encode_share_access(struct xdr_stream *xdr, int open_flags)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8);
switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
@@ -874,7 +874,7 @@ static void encode_share_access(struct xdr_stream *xdr, int open_flags)
static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
- uint32_t *p;
+ __be32 *p;
/*
* opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
* owner 4 = 32
@@ -891,7 +891,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
switch(arg->open_flags & O_EXCL) {
@@ -907,7 +907,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
switch (arg->open_flags & O_CREAT) {
@@ -923,7 +923,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
switch (delegation_type) {
@@ -943,7 +943,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation
static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(NFS4_OPEN_CLAIM_NULL);
@@ -952,7 +952,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
@@ -961,7 +961,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4+sizeof(stateid->data));
WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
@@ -991,7 +991,7 @@ static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8+sizeof(arg->stateid->data));
WRITE32(OP_OPEN_CONFIRM);
@@ -1003,7 +1003,7 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8+sizeof(arg->stateid->data));
WRITE32(OP_OPEN_DOWNGRADE);
@@ -1017,7 +1017,7 @@ static int
encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
{
int len = fh->size;
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8 + len);
WRITE32(OP_PUTFH);
@@ -1029,7 +1029,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
static int encode_putrootfh(struct xdr_stream *xdr)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_PUTROOTFH);
@@ -1040,7 +1040,7 @@ static int encode_putrootfh(struct xdr_stream *xdr)
static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
{
nfs4_stateid stateid;
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(16);
if (ctx->state != NULL) {
@@ -1052,7 +1052,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_READ);
@@ -1074,7 +1074,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
int replen;
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(32+sizeof(nfs4_verifier));
WRITE32(OP_READDIR);
@@ -1116,7 +1116,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
{
struct rpc_auth *auth = req->rq_task->tk_auth;
unsigned int replen;
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_READLINK);
@@ -1134,7 +1134,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8 + name->len);
WRITE32(OP_REMOVE);
@@ -1146,7 +1146,7 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(8 + oldname->len);
WRITE32(OP_RENAME);
@@ -1162,7 +1162,7 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(12);
WRITE32(OP_RENEW);
@@ -1174,7 +1174,7 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_
static int
encode_restorefh(struct xdr_stream *xdr)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_RESTOREFH);
@@ -1185,7 +1185,7 @@ encode_restorefh(struct xdr_stream *xdr)
static int
encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4+sizeof(zero_stateid.data));
WRITE32(OP_SETATTR);
@@ -1204,7 +1204,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
static int
encode_savefh(struct xdr_stream *xdr)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_SAVEFH);
@@ -1215,7 +1215,7 @@ encode_savefh(struct xdr_stream *xdr)
static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server)
{
int status;
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4+sizeof(arg->stateid.data));
WRITE32(OP_SETATTR);
@@ -1229,7 +1229,7 @@ static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *
static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data));
WRITE32(OP_SETCLIENTID);
@@ -1248,7 +1248,7 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(12 + sizeof(client_state->cl_confirm.data));
WRITE32(OP_SETCLIENTID_CONFIRM);
@@ -1260,7 +1260,7 @@ static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_c
static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_WRITE);
@@ -1279,7 +1279,7 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid)
{
- uint32_t *p;
+ __be32 *p;
RESERVE_SPACE(20);
@@ -1295,7 +1295,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
/*
* Encode an ACCESS request
*/
-static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct nfs4_accessargs *args)
+static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1313,7 +1313,7 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct n
/*
* Encode LOOKUP request
*/
-static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_arg *args)
+static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1337,7 +1337,7 @@ out:
/*
* Encode LOOKUP_ROOT request
*/
-static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_root_arg *args)
+static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1358,7 +1358,7 @@ out:
/*
* Encode REMOVE request
*/
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, uint32_t *p, const struct nfs4_remove_arg *args)
+static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs4_remove_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1380,7 +1380,7 @@ out:
/*
* Encode RENAME request
*/
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1410,7 +1410,7 @@ out:
/*
* Encode LINK request
*/
-static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs4_link_arg *args)
+static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1440,7 +1440,7 @@ out:
/*
* Encode CREATE request
*/
-static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args)
+static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1470,7 +1470,7 @@ out:
/*
* Encode SYMLINK request
*/
-static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args)
+static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
{
return nfs4_xdr_enc_create(req, p, args);
}
@@ -1478,7 +1478,7 @@ static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct
/*
* Encode GETATTR request
*/
-static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct nfs4_getattr_arg *args)
+static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1496,7 +1496,7 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct
/*
* Encode a CLOSE request
*/
-static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args)
+static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1520,7 +1520,7 @@ out:
/*
* Encode an OPEN request
*/
-static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args)
+static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1556,7 +1556,7 @@ out:
/*
* Encode an OPEN_CONFIRM request
*/
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_open_confirmargs *args)
+static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1577,7 +1577,7 @@ out:
/*
* Encode an OPEN request with no attributes.
*/
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args)
+static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1601,7 +1601,7 @@ out:
/*
* Encode an OPEN_DOWNGRADE request
*/
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args)
+static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1625,7 +1625,7 @@ out:
/*
* Encode a LOCK request
*/
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args)
+static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1646,7 +1646,7 @@ out:
/*
* Encode a LOCKT request
*/
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args)
+static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1667,7 +1667,7 @@ out:
/*
* Encode a LOCKU request
*/
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args)
+static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1688,7 +1688,7 @@ out:
/*
* Encode a READLINK request
*/
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readlink *args)
+static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1709,7 +1709,7 @@ out:
/*
* Encode a READDIR request
*/
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readdir_arg *args)
+static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1730,7 +1730,7 @@ out:
/*
* Encode a READ request
*/
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, uint32_t *p, struct nfs_readargs *args)
+static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
{
struct rpc_auth *auth = req->rq_task->tk_auth;
struct xdr_stream xdr;
@@ -1762,7 +1762,7 @@ out:
/*
* Encode an SETATTR request
*/
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, uint32_t *p, struct nfs_setattrargs *args)
+static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
{
struct xdr_stream xdr;
@@ -1788,7 +1788,7 @@ out:
* Encode a GETACL request
*/
static int
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p,
+nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
struct nfs_getaclargs *args)
{
struct xdr_stream xdr;
@@ -1815,7 +1815,7 @@ out:
/*
* Encode a WRITE request
*/
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args)
+static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1839,7 +1839,7 @@ out:
/*
* a COMMIT request
*/
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args)
+static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1863,7 +1863,7 @@ out:
/*
* FSINFO request
*/
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fsinfo_arg *args)
+static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1882,7 +1882,7 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs
/*
* a PATHCONF request
*/
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct nfs4_pathconf_arg *args)
+static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1902,7 +1902,7 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct
/*
* a STATFS request
*/
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct nfs4_statfs_arg *args)
+static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1923,7 +1923,7 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct n
/*
* GETATTR_BITMAP request
*/
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const struct nfs_fh *fhandle)
+static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1945,7 +1945,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const str
/*
* a RENEW request
*/
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1960,7 +1960,7 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_clie
/*
* a SETCLIENTID request
*/
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nfs4_setclientid *sc)
+static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1975,7 +1975,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nf
/*
* a SETCLIENTID_CONFIRM request
*/
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -1997,7 +1997,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
/*
* DELEGRETURN request
*/
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const struct nfs4_delegreturnargs *args)
+static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -2021,7 +2021,7 @@ out:
/*
* Encode FS_LOCATIONS request
*/
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
+static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -2086,7 +2086,7 @@ out:
static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
{
- uint32_t *p;
+ __be32 *p;
READ_BUF(4);
READ32(*len);
@@ -2097,7 +2097,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- uint32_t *p;
+ __be32 *p;
READ_BUF(8);
READ32(hdr->status);
@@ -2112,7 +2112,7 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
{
- uint32_t *p;
+ __be32 *p;
uint32_t opnum;
int32_t nfserr;
@@ -2134,7 +2134,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
/* Dummy routine */
static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
{
- uint32_t *p;
+ __be32 *p;
unsigned int strlen;
char *str;
@@ -2144,7 +2144,8 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
{
- uint32_t bmlen, *p;
+ uint32_t bmlen;
+ __be32 *p;
READ_BUF(4);
READ32(bmlen);
@@ -2159,9 +2160,9 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
return 0;
}
-static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, uint32_t **savep)
+static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
{
- uint32_t *p;
+ __be32 *p;
READ_BUF(4);
READ32(*attrlen);
@@ -2182,7 +2183,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
{
- uint32_t *p;
+ __be32 *p;
*type = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2202,7 +2203,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
{
- uint32_t *p;
+ __be32 *p;
*change = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2219,7 +2220,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
{
- uint32_t *p;
+ __be32 *p;
*size = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2235,7 +2236,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
{
- uint32_t *p;
+ __be32 *p;
*res = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
@@ -2251,7 +2252,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
{
- uint32_t *p;
+ __be32 *p;
*res = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
@@ -2267,7 +2268,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
{
- uint32_t *p;
+ __be32 *p;
fsid->major = 0;
fsid->minor = 0;
@@ -2287,7 +2288,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
{
- uint32_t *p;
+ __be32 *p;
*res = 60;
if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
@@ -2303,7 +2304,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
{
- uint32_t *p;
+ __be32 *p;
*res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL;
if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
@@ -2319,7 +2320,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
{
- uint32_t *p;
+ __be32 *p;
*fileid = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2335,7 +2336,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
{
- uint32_t *p;
+ __be32 *p;
*fileid = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2351,7 +2352,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 0;
@@ -2368,7 +2369,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 0;
@@ -2385,7 +2386,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 0;
@@ -2403,7 +2404,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
{
int n;
- uint32_t *p;
+ __be32 *p;
int status = 0;
READ_BUF(4);
@@ -2448,7 +2449,7 @@ out_eio:
static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
{
int n;
- uint32_t *p;
+ __be32 *p;
int status = -EIO;
if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
@@ -2512,7 +2513,7 @@ out_eio:
static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 0;
@@ -2529,7 +2530,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*maxlink = 1;
@@ -2546,7 +2547,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*maxname = 1024;
@@ -2563,7 +2564,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 1024;
@@ -2584,7 +2585,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 1024;
@@ -2605,7 +2606,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
{
- uint32_t *p;
+ __be32 *p;
*mode = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
@@ -2622,7 +2623,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
{
- uint32_t *p;
+ __be32 *p;
*nlink = 1;
if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2638,7 +2639,8 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid)
{
- uint32_t len, *p;
+ uint32_t len;
+ __be32 *p;
*uid = -2;
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2662,7 +2664,8 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid)
{
- uint32_t len, *p;
+ uint32_t len;
+ __be32 *p;
*gid = -2;
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2686,7 +2689,8 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
{
- uint32_t major = 0, minor = 0, *p;
+ uint32_t major = 0, minor = 0;
+ __be32 *p;
*rdev = MKDEV(0,0);
if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2708,7 +2712,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 0;
@@ -2725,7 +2729,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 0;
@@ -2742,7 +2746,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
{
- uint32_t *p;
+ __be32 *p;
int status = 0;
*res = 0;
@@ -2759,7 +2763,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
{
- uint32_t *p;
+ __be32 *p;
*used = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2776,7 +2780,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
{
- uint32_t *p;
+ __be32 *p;
uint64_t sec;
uint32_t nsec;
@@ -2836,7 +2840,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
return status;
}
-static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t attrlen)
+static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen)
{
unsigned int attrwords = XDR_QUADLEN(attrlen);
unsigned int nwords = xdr->p - savep;
@@ -2854,7 +2858,7 @@ static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t att
static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
{
- uint32_t *p;
+ __be32 *p;
READ_BUF(20);
READ32(cinfo->atomic);
@@ -2865,7 +2869,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
{
- uint32_t *p;
+ __be32 *p;
uint32_t supp, acc;
int status;
@@ -2882,7 +2886,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
{
- uint32_t *p;
+ __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_CLOSE);
@@ -2895,7 +2899,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
{
- uint32_t *p;
+ __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_COMMIT);
@@ -2908,7 +2912,7 @@ static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
{
- uint32_t *p;
+ __be32 *p;
uint32_t bmlen;
int status;
@@ -2925,7 +2929,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
{
- uint32_t *savep;
+ __be32 *savep;
uint32_t attrlen,
bitmap[2] = {0};
int status;
@@ -2952,7 +2956,7 @@ xdr_error:
static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
{
- uint32_t *savep;
+ __be32 *savep;
uint32_t attrlen,
bitmap[2] = {0};
int status;
@@ -2985,7 +2989,7 @@ xdr_error:
static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
{
- uint32_t *savep;
+ __be32 *savep;
uint32_t attrlen,
bitmap[2] = {0};
int status;
@@ -3010,7 +3014,7 @@ xdr_error:
static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
{
- uint32_t *savep;
+ __be32 *savep;
uint32_t attrlen,
bitmap[2] = {0},
type;
@@ -3079,7 +3083,7 @@ xdr_error:
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
- uint32_t *savep;
+ __be32 *savep;
uint32_t attrlen, bitmap[2];
int status;
@@ -3111,7 +3115,7 @@ xdr_error:
static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
- uint32_t *p;
+ __be32 *p;
uint32_t len;
int status;
@@ -3147,7 +3151,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
{
uint64_t offset, length, clientid;
- uint32_t *p;
+ __be32 *p;
uint32_t namelen, type;
READ_BUF(32);
@@ -3172,7 +3176,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
{
- uint32_t *p;
+ __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_LOCK);
@@ -3195,7 +3199,7 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
{
- uint32_t *p;
+ __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_LOCKU);
@@ -3214,7 +3218,7 @@ static int decode_lookup(struct xdr_stream *xdr)
/* This is too sick! */
static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
{
- uint32_t *p;
+ __be32 *p;
uint32_t limit_type, nblocks, blocksize;
READ_BUF(12);
@@ -3233,7 +3237,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
{
- uint32_t *p;
+ __be32 *p;
uint32_t delegation_type;
READ_BUF(4);
@@ -3259,7 +3263,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
{
- uint32_t *p;
+ __be32 *p;
uint32_t bmlen;
int status;
@@ -3287,7 +3291,7 @@ xdr_error:
static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
{
- uint32_t *p;
+ __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
@@ -3300,7 +3304,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
{
- uint32_t *p;
+ __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
@@ -3324,7 +3328,7 @@ static int decode_putrootfh(struct xdr_stream *xdr)
static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res)
{
struct kvec *iov = req->rq_rcv_buf.head;
- uint32_t *p;
+ __be32 *p;
uint32_t count, eof, recvd, hdrlen;
int status;
@@ -3354,7 +3358,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
struct page *page = *rcvbuf->pages;
struct kvec *iov = rcvbuf->head;
unsigned int nr, pglen = rcvbuf->page_len;
- uint32_t *end, *entry, *p, *kaddr;
+ __be32 *end, *entry, *p, *kaddr;
uint32_t len, attrlen, xlen;
int hdrlen, recvd, status;
@@ -3376,7 +3380,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
xdr_read_pages(xdr, pglen);
BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
- kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0);
+ kaddr = p = kmap_atomic(page, KM_USER0);
end = p + ((pglen + readdir->pgbase) >> 2);
entry = p;
for (nr = 0; *p++; nr++) {
@@ -3428,7 +3432,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
struct kvec *iov = rcvbuf->head;
int hdrlen, len, recvd;
- uint32_t *p;
+ __be32 *p;
char *kaddr;
int status;
@@ -3505,7 +3509,7 @@ decode_restorefh(struct xdr_stream *xdr)
static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
size_t *acl_len)
{
- uint32_t *savep;
+ __be32 *savep;
uint32_t attrlen,
bitmap[2] = {0};
struct kvec *iov = req->rq_rcv_buf.head;
@@ -3551,7 +3555,7 @@ decode_savefh(struct xdr_stream *xdr)
static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
{
- uint32_t *p;
+ __be32 *p;
uint32_t bmlen;
int status;
@@ -3567,7 +3571,7 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
{
- uint32_t *p;
+ __be32 *p;
uint32_t opnum;
int32_t nfserr;
@@ -3610,7 +3614,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)
static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
{
- uint32_t *p;
+ __be32 *p;
int status;
status = decode_op_hdr(xdr, OP_WRITE);
@@ -3632,7 +3636,7 @@ static int decode_delegreturn(struct xdr_stream *xdr)
/*
* Decode OPEN_DOWNGRADE response
*/
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3660,7 +3664,7 @@ out:
/*
* Decode ACCESS response
*/
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3678,7 +3682,7 @@ out:
/*
* Decode LOOKUP response
*/
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3701,7 +3705,7 @@ out:
/*
* Decode LOOKUP_ROOT response
*/
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3721,7 +3725,7 @@ out:
/*
* Decode REMOVE response
*/
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_remove_res *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_remove_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3742,7 +3746,7 @@ out:
/*
* Decode RENAME response
*/
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3772,7 +3776,7 @@ out:
/*
* Decode LINK response
*/
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3805,7 +3809,7 @@ out:
/*
* Decode CREATE response
*/
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3834,7 +3838,7 @@ out:
/*
* Decode SYMLINK response
*/
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
{
return nfs4_xdr_dec_create(rqstp, p, res);
}
@@ -3842,7 +3846,7 @@ static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4
/*
* Decode GETATTR response
*/
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3865,7 +3869,7 @@ out:
* Encode an SETACL request
*/
static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
+nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
@@ -3886,7 +3890,7 @@ out:
* Decode SETACL response
*/
static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3908,7 +3912,7 @@ out:
* Decode GETACL response
*/
static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3930,7 +3934,7 @@ out:
/*
* Decode CLOSE response
*/
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3960,7 +3964,7 @@ out:
/*
* Decode OPEN response
*/
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -3994,7 +3998,7 @@ out:
/*
* Decode OPEN_CONFIRM response
*/
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4015,7 +4019,7 @@ out:
/*
* Decode OPEN response
*/
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4039,7 +4043,7 @@ out:
/*
* Decode SETATTR response
*/
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4065,7 +4069,7 @@ out:
/*
* Decode LOCK response
*/
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4086,7 +4090,7 @@ out:
/*
* Decode LOCKT response
*/
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4107,7 +4111,7 @@ out:
/*
* Decode LOCKU response
*/
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4128,7 +4132,7 @@ out:
/*
* Decode READLINK response
*/
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4149,7 +4153,7 @@ out:
/*
* Decode READDIR response
*/
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4170,7 +4174,7 @@ out:
/*
* Decode Read response
*/
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4193,7 +4197,7 @@ out:
/*
* Decode WRITE response
*/
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4219,7 +4223,7 @@ out:
/*
* Decode COMMIT response
*/
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4243,7 +4247,7 @@ out:
/*
* FSINFO request
*/
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4263,7 +4267,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsi
/*
* PATHCONF request
*/
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4281,7 +4285,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_p
/*
* STATFS request
*/
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4299,7 +4303,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fss
/*
* GETATTR_BITMAP request
*/
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, uint32_t *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4318,7 +4322,7 @@ out:
/*
* Decode RENEW response
*/
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4334,7 +4338,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
/*
* a SETCLIENTID request
*/
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
struct nfs_client *clp)
{
struct xdr_stream xdr;
@@ -4353,7 +4357,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
/*
* a SETCLIENTID_CONFIRM request
*/
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4375,7 +4379,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
/*
* DELEGRETURN request
*/
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4397,7 +4401,7 @@ out:
/*
* FS_LOCATIONS request
*/
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
{
struct xdr_stream xdr;
struct compound_hdr hdr;
@@ -4417,7 +4421,7 @@ out:
return status;
}
-uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
{
uint32_t bitmap[2] = {0};
uint32_t len;
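
The bulk of the nfs4xdr.c changes above swap uint32_t pointers into the XDR
buffers for __be32 pointers. A minimal user-space sketch of the distinction
(illustrative only, not code from the patch): XDR carries every 32-bit word in
big-endian order, and the kernel's __be32 is a sparse-checked type, so mixing
wire words with host-order integers without ntohl()/htonl() (or
be32_to_cpu()/cpu_to_be32()) is flagged when the tree is checked with
"make C=1". Here a plain typedef stands in for __be32.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>          /* ntohl(), htonl() */

typedef uint32_t be32;          /* user-space stand-in for the kernel's __be32 */

/* Decode one XDR word: convert from wire (big-endian) order to host order. */
static uint32_t xdr_decode_u32(const be32 *p)
{
        return ntohl(*p);
}

int main(void)
{
        be32 wire = htonl(2049);        /* the word as it appears on the wire */

        printf("decoded: %u\n", xdr_decode_u32(&wire)); /* prints 2049 everywhere */
        return 0;
}
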
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 1d656a645199..8dfefe41a8da 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -69,7 +69,6 @@
* Fabian Frederick: Option parser rebuilt (using parser lib)
*/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e8d40030cab4..28108c82b887 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -20,7 +20,6 @@
* of another (see nfs_lookup())
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -835,7 +834,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
}
/* RFC3530: The default port for NFS is 2049 */
if (addr.sin_port == 0)
- addr.sin_port = NFS_PORT;
+ addr.sin_port = htons(NFS_PORT);
/* Grab the authentication type */
authflavour = RPC_AUTH_UNIX;
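
The one-line change to nfs4_get_sb() above fixes a byte-order bug:
sockaddr_in.sin_port holds the port in network byte order, so assigning the
bare host-order constant only happens to work on big-endian machines. A small
user-space sketch of the difference (illustrative only):

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        struct sockaddr_in addr;

        memset(&addr, 0, sizeof(addr));

        addr.sin_port = 2049;           /* bug: wire port 264 on little-endian */
        printf("raw assignment:   port %u\n", ntohs(addr.sin_port));

        addr.sin_port = htons(2049);    /* correct on every architecture */
        printf("htons assignment: port %u\n", ntohs(addr.sin_port));
        return 0;
}
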
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f6675d2c386c..883dd4a1c157 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -57,6 +57,8 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
#include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+
#include <asm/uaccess.h>
#include <linux/smp_lock.h>
@@ -395,7 +397,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
out:
clear_bit(BDI_write_congested, &bdi->state);
wake_up_all(&nfs_write_congestion);
- writeback_congestion_end();
+ congestion_end(WRITE);
return err;
}
@@ -588,10 +590,10 @@ static void nfs_cancel_commit_list(struct list_head *head)
while(!list_empty(head)) {
req = nfs_list_entry(head->next);
+ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
nfs_list_remove_request(req);
nfs_inode_remove_request(req);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- nfs_clear_page_writeback(req);
+ nfs_unlock_request(req);
}
}
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 0c2be8c0307d..c11f5375d7c1 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -46,7 +46,7 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
{
struct nfsacl_encode_desc *nfsacl_desc =
(struct nfsacl_encode_desc *) desc;
- u32 *p = (u32 *) elem;
+ __be32 *p = elem;
struct posix_acl_entry *entry =
&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
@@ -127,7 +127,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
{
struct nfsacl_decode_desc *nfsacl_desc =
(struct nfsacl_decode_desc *) desc;
- u32 *p = (u32 *) elem;
+ __be32 *p = elem;
struct posix_acl_entry *entry;
if (!nfsacl_desc->acl) {
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index cfe141e5d759..f37df46d2eaa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -319,12 +319,25 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
static struct cache_head *export_table[EXPORT_HASHMAX];
+static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
+{
+ int i;
+
+ for (i = 0; i < fsloc->locations_count; i++) {
+ kfree(fsloc->locations[i].path);
+ kfree(fsloc->locations[i].hosts);
+ }
+ kfree(fsloc->locations);
+}
+
static void svc_export_put(struct kref *ref)
{
struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
dput(exp->ex_dentry);
mntput(exp->ex_mnt);
auth_domain_put(exp->ex_client);
+ kfree(exp->ex_path);
+ nfsd4_fslocs_free(&exp->ex_fslocs);
kfree(exp);
}
@@ -386,6 +399,69 @@ static int check_export(struct inode *inode, int flags)
}
+#ifdef CONFIG_NFSD_V4
+
+static int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
+{
+ int len;
+ int migrated, i, err;
+
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len != 5 || memcmp(buf, "fsloc", 5))
+ return 0;
+
+ /* listsize */
+ err = get_int(mesg, &fsloc->locations_count);
+ if (err)
+ return err;
+ if (fsloc->locations_count > MAX_FS_LOCATIONS)
+ return -EINVAL;
+ if (fsloc->locations_count == 0)
+ return 0;
+
+ fsloc->locations = kzalloc(fsloc->locations_count
+ * sizeof(struct nfsd4_fs_location), GFP_KERNEL);
+ if (!fsloc->locations)
+ return -ENOMEM;
+ for (i=0; i < fsloc->locations_count; i++) {
+ /* colon separated host list */
+ err = -EINVAL;
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len <= 0)
+ goto out_free_all;
+ err = -ENOMEM;
+ fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL);
+ if (!fsloc->locations[i].hosts)
+ goto out_free_all;
+ err = -EINVAL;
+ /* slash separated path component list */
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len <= 0)
+ goto out_free_all;
+ err = -ENOMEM;
+ fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL);
+ if (!fsloc->locations[i].path)
+ goto out_free_all;
+ }
+ /* migrated */
+ err = get_int(mesg, &migrated);
+ if (err)
+ goto out_free_all;
+ err = -EINVAL;
+ if (migrated < 0 || migrated > 1)
+ goto out_free_all;
+ fsloc->migrated = migrated;
+ return 0;
+out_free_all:
+ nfsd4_fslocs_free(fsloc);
+ return err;
+}
+
+#else /* CONFIG_NFSD_V4 */
+static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; }
+#endif
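
From the parsing code above, fsloc_parse() consumes its tokens in the order
"fsloc", a location count, then one colon-separated host list and one path per
location, and finally a 0/1 migrated flag. The user-space sketch below mirrors
that token order; the sample string and the parsing helper are made up for
illustration and are not the kernel's downcall interface.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        /* hypothetical example line: 2 locations, not migrated */
        char line[] = "fsloc 2 srv1:srv2 /export/a srv3 /export/b 0";
        char *tok = strtok(line, " ");
        int count, i;

        if (tok == NULL || strcmp(tok, "fsloc") != 0)
                return 1;
        count = atoi(strtok(NULL, " "));
        for (i = 0; i < count; i++) {
                const char *hosts = strtok(NULL, " ");  /* colon separated */
                const char *path  = strtok(NULL, " ");
                printf("location %d: %s on %s\n", i, path, hosts);
        }
        printf("migrated: %s\n", strtok(NULL, " "));
        return 0;
}
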
+
static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
{
/* client path expiry [flags anonuid anongid fsid] */
@@ -398,6 +474,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
int an_int;
nd.dentry = NULL;
+ exp.ex_path = NULL;
if (mesg[mlen-1] != '\n')
return -EINVAL;
@@ -428,6 +505,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_client = dom;
exp.ex_mnt = nd.mnt;
exp.ex_dentry = nd.dentry;
+ exp.ex_path = kstrdup(buf, GFP_KERNEL);
+ err = -ENOMEM;
+ if (!exp.ex_path)
+ goto out;
/* expiry */
err = -EINVAL;
@@ -435,6 +516,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
if (exp.h.expiry_time == 0)
goto out;
+ /* fs locations */
+ exp.ex_fslocs.locations = NULL;
+ exp.ex_fslocs.locations_count = 0;
+ exp.ex_fslocs.migrated = 0;
+
/* flags */
err = get_int(&mesg, &an_int);
if (err == -ENOENT)
@@ -460,6 +546,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
err = check_export(nd.dentry->d_inode, exp.ex_flags);
if (err) goto out;
+
+ err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
+ if (err)
+ goto out;
}
expp = svc_export_lookup(&exp);
@@ -473,6 +563,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
else
exp_put(expp);
out:
+ kfree(exp.ex_path);
if (nd.dentry)
path_release(&nd);
out_no_path:
@@ -482,7 +573,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
return err;
}
-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong);
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+ uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);
static int svc_export_show(struct seq_file *m,
struct cache_detail *cd,
@@ -501,8 +593,8 @@ static int svc_export_show(struct seq_file *m,
seq_putc(m, '(');
if (test_bit(CACHE_VALID, &h->flags) &&
!test_bit(CACHE_NEGATIVE, &h->flags))
- exp_flags(m, exp->ex_flags, exp->ex_fsid,
- exp->ex_anon_uid, exp->ex_anon_gid);
+ exp_flags(m, exp->ex_flags, exp->ex_fsid,
+ exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
seq_puts(m, ")\n");
return 0;
}
@@ -524,6 +616,10 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_client = item->ex_client;
new->ex_dentry = dget(item->ex_dentry);
new->ex_mnt = mntget(item->ex_mnt);
+ new->ex_path = NULL;
+ new->ex_fslocs.locations = NULL;
+ new->ex_fslocs.locations_count = 0;
+ new->ex_fslocs.migrated = 0;
}
static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -535,6 +631,14 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
new->ex_anon_uid = item->ex_anon_uid;
new->ex_anon_gid = item->ex_anon_gid;
new->ex_fsid = item->ex_fsid;
+ new->ex_path = item->ex_path;
+ item->ex_path = NULL;
+ new->ex_fslocs.locations = item->ex_fslocs.locations;
+ item->ex_fslocs.locations = NULL;
+ new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
+ item->ex_fslocs.locations_count = 0;
+ new->ex_fslocs.migrated = item->ex_fslocs.migrated;
+ item->ex_fslocs.migrated = 0;
}
static struct cache_head *svc_export_alloc(void)
@@ -1044,34 +1148,25 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
* for a given NFSv4 client. The root is defined to be the
* export point with fsid==0
*/
-int
+__be32
exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
struct cache_req *creq)
{
- struct svc_expkey *fsid_key;
struct svc_export *exp;
- int rv;
+ __be32 rv;
u32 fsidv[2];
mk_fsid_v1(fsidv, 0);
- fsid_key = exp_find_key(clp, 1, fsidv, creq);
- if (IS_ERR(fsid_key) && PTR_ERR(fsid_key) == -EAGAIN)
+ exp = exp_find(clp, 1, fsidv, creq);
+ if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN)
return nfserr_dropit;
- if (!fsid_key || IS_ERR(fsid_key))
- return nfserr_perm;
-
- exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq);
if (exp == NULL)
- rv = nfserr_perm;
+ return nfserr_perm;
else if (IS_ERR(exp))
- rv = nfserrno(PTR_ERR(exp));
- else {
- rv = fh_compose(fhp, exp,
- fsid_key->ek_dentry, NULL);
- exp_put(exp);
- }
- cache_put(&fsid_key->h, &svc_expkey_cache);
+ return nfserrno(PTR_ERR(exp));
+ rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
+ exp_put(exp);
return rv;
}
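
The rewritten exp_pseudoroot() above folds the key lookup and the export lookup
into a single exp_find() call and leans on the kernel's ERR_PTR convention: one
pointer return value encodes a valid object, NULL for "not found", or a small
negative errno. A user-space sketch of that pattern (mirroring
include/linux/err.h; the lookup function and its fsid values are hypothetical):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct export { const char *path; };

/* Hypothetical lookup: may succeed, miss, or ask the caller to retry. */
static struct export *find_export(int fsid)
{
        static struct export root = { "/" };

        if (fsid == 0)
                return &root;               /* found */
        if (fsid == 1)
                return ERR_PTR(-EAGAIN);    /* cache upcall pending: retry */
        return NULL;                        /* no such export */
}

int main(void)
{
        struct export *exp = find_export(0);

        if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN)
                printf("drop the request and retry later\n");
        else if (exp == NULL)
                printf("permission denied\n");
        else
                printf("pseudoroot export: %s\n", exp->path);
        return 0;
}
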
@@ -1158,7 +1253,8 @@ static struct flags {
{ 0, {"", ""}}
};
-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong)
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+ uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
{
int first = 0;
struct flags *flg;
@@ -1174,6 +1270,21 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t
seq_printf(m, "%sanonuid=%d", first++?",":"", anonu);
if (anong != (gid_t)-2 && anong != (0x10000-2))
seq_printf(m, "%sanongid=%d", first++?",":"", anong);
+ if (fsloc && fsloc->locations_count > 0) {
+ char *loctype = (fsloc->migrated) ? "refer" : "replicas";
+ int i;
+
+ seq_printf(m, "%s%s=", first++?",":"", loctype);
+ seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
+ seq_putc(m, '@');
+ seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
+ for (i = 1; i < fsloc->locations_count; i++) {
+ seq_putc(m, ';');
+ seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\");
+ seq_putc(m, '@');
+ seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\");
+ }
+ }
}
static int e_show(struct seq_file *m, void *p)
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 7b889ff15ae6..11fdaf7721b4 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -25,7 +25,7 @@
static u32
nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
{
- u32 nfserr;
+ __be32 nfserr;
struct svc_fh fh;
/* must initialize before using! but maxsize doesn't matter */
@@ -39,18 +39,20 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
fh_put(&fh);
rqstp->rq_client = NULL;
exp_readunlock();
- /* nlm and nfsd don't share error codes.
- * we invent: 0 = no error
- * 1 = stale file handle
- * 2 = other error
+ /* We return nlm error codes as nlm doesn't know
+ * about nfsd, but nfsd does know about nlm..
*/
switch (nfserr) {
case nfs_ok:
return 0;
+ case nfserr_dropit:
+ return nlm_drop_reply;
+#ifdef CONFIG_LOCKD_V4
case nfserr_stale:
- return 1;
+ return nlm4_stale_fh;
+#endif
default:
- return 2;
+ return nlm_lck_denied;
}
}
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index fe56b38364cc..e3eca0816986 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -21,7 +21,7 @@
/*
* NULL call.
*/
-static int
+static __be32
nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
return nfs_ok;
@@ -30,12 +30,12 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
/*
* Get the Access and/or Default ACL of a file.
*/
-static int nfsacld_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
{
svc_fh *fh;
struct posix_acl *acl;
- int nfserr = 0;
+ __be32 nfserr = 0;
dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
@@ -97,12 +97,12 @@ fail:
/*
* Set the Access and/or Default ACL of a file.
*/
-static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
struct nfsd3_setaclargs *argp,
struct nfsd_attrstat *resp)
{
svc_fh *fh;
- int nfserr = 0;
+ __be32 nfserr = 0;
dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
@@ -128,7 +128,7 @@ static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
/*
* Check file attributes
*/
-static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
{
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
@@ -140,10 +140,10 @@ static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
/*
* Check file access
*/
-static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
+static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
struct nfsd3_accessres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: ACCESS(2acl) %s 0x%x\n",
SVCFH_fmt(&argp->fh),
@@ -158,7 +158,7 @@ static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *
/*
* XDR decode functions
*/
-static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclargs *argp)
{
if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -169,7 +169,7 @@ static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
}
-static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_setaclargs *argp)
{
struct kvec *head = rqstp->rq_arg.head;
@@ -194,7 +194,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
return (n > 0);
}
-static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_fhandle *argp)
{
if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -202,7 +202,7 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
return xdr_argsize_check(rqstp, p);
}
-static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessargs *argp)
{
if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -217,7 +217,7 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
*/
/* GETACL */
-static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclres *resp)
{
struct dentry *dentry = resp->fh.fh_dentry;
@@ -241,7 +241,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_res.page_len = w;
while (w > 0) {
- if (!svc_take_res_page(rqstp))
+ if (!rqstp->rq_respages[rqstp->rq_resused++])
return 0;
w -= PAGE_SIZE;
}
@@ -259,7 +259,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
return 1;
}
-static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_attrstat *resp)
{
p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -267,7 +267,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
}
/* ACCESS */
-static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessres *resp)
{
p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -278,7 +278,7 @@ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
/*
* XDR release functions
*/
-static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclres *resp)
{
fh_put(&resp->fh);
@@ -287,7 +287,7 @@ static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
return 1;
}
-static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_fhandle *resp)
{
fh_put(&resp->fh);
@@ -333,4 +333,5 @@ struct svc_version nfsd_acl_version2 = {
.vs_proc = nfsd_acl_procedures2,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS3_SVC_XDRSIZE,
+ .vs_hidden = 1,
};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 16e10c170aed..fcad2895ddb0 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -19,7 +19,7 @@
/*
* NULL call.
*/
-static int
+static __be32
nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
return nfs_ok;
@@ -28,12 +28,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
/*
* Get the Access and/or Default ACL of a file.
*/
-static int nfsd3_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
{
svc_fh *fh;
struct posix_acl *acl;
- int nfserr = 0;
+ __be32 nfserr = 0;
fh = fh_copy(&resp->fh, &argp->fh);
if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
@@ -93,12 +93,12 @@ fail:
/*
* Set the Access and/or Default ACL of a file.
*/
-static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
struct nfsd3_setaclargs *argp,
struct nfsd3_attrstat *resp)
{
svc_fh *fh;
- int nfserr = 0;
+ __be32 nfserr = 0;
fh = fh_copy(&resp->fh, &argp->fh);
nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
@@ -122,7 +122,7 @@ static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
/*
* XDR decode functions
*/
-static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclargs *args)
{
if (!(p = nfs3svc_decode_fh(p, &args->fh)))
@@ -133,7 +133,7 @@ static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
}
-static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_setaclargs *args)
{
struct kvec *head = rqstp->rq_arg.head;
@@ -163,7 +163,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
*/
/* GETACL */
-static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclres *resp)
{
struct dentry *dentry = resp->fh.fh_dentry;
@@ -185,7 +185,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_res.page_len = w;
while (w > 0) {
- if (!svc_take_res_page(rqstp))
+ if (!rqstp->rq_respages[rqstp->rq_resused++])
return 0;
w -= PAGE_SIZE;
}
@@ -208,7 +208,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
}
/* SETACL */
-static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_attrstat *resp)
{
p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
@@ -219,7 +219,7 @@ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
/*
* XDR release functions
*/
-static int nfs3svc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclres *resp)
{
fh_put(&resp->fh);
@@ -263,5 +263,6 @@ struct svc_version nfsd_acl_version3 = {
.vs_proc = nfsd_acl_procedures3,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS3_SVC_XDRSIZE,
+ .vs_hidden = 1,
};
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index f61142afea44..64db601c2bd2 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -43,7 +43,7 @@ static int nfs3_ftypes[] = {
/*
* NULL call.
*/
-static int
+static __be32
nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
return nfs_ok;
@@ -52,11 +52,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
/*
* Get a file's attributes
*/
-static int
+static __be32
nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
struct nfsd3_attrstat *resp)
{
- int err, nfserr;
+ int err;
+ __be32 nfserr;
dprintk("nfsd: GETATTR(3) %s\n",
SVCFH_fmt(&argp->fh));
@@ -76,11 +77,11 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
/*
* Set a file's attributes
*/
-static int
+static __be32
nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
struct nfsd3_attrstat *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: SETATTR(3) %s\n",
SVCFH_fmt(&argp->fh));
@@ -94,11 +95,11 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
/*
* Look up a path name component
*/
-static int
+static __be32
nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
struct nfsd3_diropres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: LOOKUP(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -118,11 +119,11 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
/*
* Check file access
*/
-static int
+static __be32
nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
struct nfsd3_accessres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: ACCESS(3) %s 0x%x\n",
SVCFH_fmt(&argp->fh),
@@ -137,11 +138,11 @@ nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
/*
* Read a symlink.
*/
-static int
+static __be32
nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
struct nfsd3_readlinkres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
@@ -155,11 +156,12 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
/*
* Read a portion of a file.
*/
-static int
+static __be32
nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
struct nfsd3_readres *resp)
{
- int nfserr;
+ __be32 nfserr;
+ u32 max_blocksize = svc_max_payload(rqstp);
dprintk("nfsd: READ(3) %s %lu bytes at %lu\n",
SVCFH_fmt(&argp->fh),
@@ -172,15 +174,15 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
*/
resp->count = argp->count;
- if (NFSSVC_MAXBLKSIZE < resp->count)
- resp->count = NFSSVC_MAXBLKSIZE;
+ if (max_blocksize < resp->count)
+ resp->count = max_blocksize;
svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
fh_copy(&resp->fh, &argp->fh);
nfserr = nfsd_read(rqstp, &resp->fh, NULL,
argp->offset,
- argp->vec, argp->vlen,
+ rqstp->rq_vec, argp->vlen,
&resp->count);
if (nfserr == 0) {
struct inode *inode = resp->fh.fh_dentry->d_inode;
@@ -194,11 +196,11 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
/*
* Write data to a file
*/
-static int
+static __be32
nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
struct nfsd3_writeres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n",
SVCFH_fmt(&argp->fh),
@@ -210,7 +212,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
resp->committed = argp->stable;
nfserr = nfsd_write(rqstp, &resp->fh, NULL,
argp->offset,
- argp->vec, argp->vlen,
+ rqstp->rq_vec, argp->vlen,
argp->len,
&resp->committed);
resp->count = argp->count;
@@ -222,13 +224,13 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
* At least in theory; we'll see how it fares in practice when the
* first reports about SunOS compatibility problems start to pour in...
*/
-static int
+static __be32
nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
struct nfsd3_diropres *resp)
{
svc_fh *dirfhp, *newfhp = NULL;
struct iattr *attr;
- u32 nfserr;
+ __be32 nfserr;
dprintk("nfsd: CREATE(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -264,11 +266,11 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
/*
* Make directory. This operation is not idempotent.
*/
-static int
+static __be32
nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
struct nfsd3_diropres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: MKDIR(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -284,11 +286,11 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
RETURN_STATUS(nfserr);
}
-static int
+static __be32
nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
struct nfsd3_diropres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
SVCFH_fmt(&argp->ffh),
@@ -306,11 +308,12 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
/*
* Make socket/fifo/device.
*/
-static int
+static __be32
nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
struct nfsd3_diropres *resp)
{
- int nfserr, type;
+ __be32 nfserr;
+ int type;
dev_t rdev = 0;
dprintk("nfsd: MKNOD(3) %s %.*s\n",
@@ -342,11 +345,11 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
/*
* Remove file/fifo/socket etc.
*/
-static int
+static __be32
nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
struct nfsd3_attrstat *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: REMOVE(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -362,11 +365,11 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
/*
* Remove a directory
*/
-static int
+static __be32
nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
struct nfsd3_attrstat *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: RMDIR(3) %s %.*s\n",
SVCFH_fmt(&argp->fh),
@@ -378,11 +381,11 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
RETURN_STATUS(nfserr);
}
-static int
+static __be32
nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
struct nfsd3_renameres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: RENAME(3) %s %.*s ->\n",
SVCFH_fmt(&argp->ffh),
@@ -400,11 +403,11 @@ nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
RETURN_STATUS(nfserr);
}
-static int
+static __be32
nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
struct nfsd3_linkres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: LINK(3) %s ->\n",
SVCFH_fmt(&argp->ffh));
@@ -423,11 +426,12 @@ nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
/*
* Read a portion of a directory.
*/
-static int
+static __be32
nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
struct nfsd3_readdirres *resp)
{
- int nfserr, count;
+ __be32 nfserr;
+ int count;
dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -458,11 +462,12 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
* Read a portion of a directory, including file handles and attrs.
* For now, we choose to ignore the dircount parameter.
*/
-static int
+static __be32
nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
struct nfsd3_readdirres *resp)
{
- int nfserr, count = 0;
+ __be32 nfserr;
+ int count = 0;
loff_t offset;
int i;
caddr_t page_addr = NULL;
@@ -516,11 +521,11 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
/*
* Get file system stats
*/
-static int
+static __be32
nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
struct nfsd3_fsstatres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: FSSTAT(3) %s\n",
SVCFH_fmt(&argp->fh));
@@ -533,20 +538,21 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
/*
* Get file system info
*/
-static int
+static __be32
nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
struct nfsd3_fsinfores *resp)
{
- int nfserr;
+ __be32 nfserr;
+ u32 max_blocksize = svc_max_payload(rqstp);
dprintk("nfsd: FSINFO(3) %s\n",
SVCFH_fmt(&argp->fh));
- resp->f_rtmax = NFSSVC_MAXBLKSIZE;
- resp->f_rtpref = NFSSVC_MAXBLKSIZE;
+ resp->f_rtmax = max_blocksize;
+ resp->f_rtpref = max_blocksize;
resp->f_rtmult = PAGE_SIZE;
- resp->f_wtmax = NFSSVC_MAXBLKSIZE;
- resp->f_wtpref = NFSSVC_MAXBLKSIZE;
+ resp->f_wtmax = max_blocksize;
+ resp->f_wtpref = max_blocksize;
resp->f_wtmult = PAGE_SIZE;
resp->f_dtpref = PAGE_SIZE;
resp->f_maxfilesize = ~(u32) 0;
@@ -574,11 +580,11 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
/*
* Get pathconf info for the specified file
*/
-static int
+static __be32
nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
struct nfsd3_pathconfres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: PATHCONF(3) %s\n",
SVCFH_fmt(&argp->fh));
@@ -617,11 +623,11 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
/*
* Commit a file (range) to stable storage.
*/
-static int
+static __be32
nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
struct nfsd3_commitres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
SVCFH_fmt(&argp->fh),
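
The nfsd3_proc_* signature changes above (int nfserr becoming __be32 nfserr)
follow from the fact that nfsd's nfserr_* status constants are kept
pre-converted to network byte order so they can be written straight into the
XDR reply; they may be compared, but not treated as ordinary host-order
integers. A user-space sketch of that convention (illustrative only, with a
plain typedef standing in for __be32):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

typedef uint32_t be32;          /* stand-in for the kernel's __be32 */

#define NFSERR_PERM   1
#define NFSERR_STALE  70

int main(void)
{
        be32 nfs_ok       = htonl(0);
        be32 nfserr_perm  = htonl(NFSERR_PERM);
        be32 nfserr_stale = htonl(NFSERR_STALE);

        be32 status = nfserr_stale;     /* what a handler might return */

        /* Comparing wire-order values works; arithmetic on them would not. */
        if (status == nfs_ok)
                printf("ok\n");
        else if (status == nfserr_perm)
                printf("permission denied\n");
        else if (status == nfserr_stale)
                printf("stale file handle (wire word 0x%08x)\n", (unsigned)status);
        return 0;
}
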
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 243d94b9653a..b4baca3053c3 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -42,23 +42,23 @@ static u32 nfs3_ftypes[] = {
/*
* XDR functions for basic NFS types
*/
-static inline u32 *
-encode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+encode_time3(__be32 *p, struct timespec *time)
{
*p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
return p;
}
-static inline u32 *
-decode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+decode_time3(__be32 *p, struct timespec *time)
{
time->tv_sec = ntohl(*p++);
time->tv_nsec = ntohl(*p++);
return p;
}
-static inline u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
{
unsigned int size;
fh_init(fhp, NFS3_FHSIZE);
@@ -72,13 +72,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
}
/* Helper function for NFSv3 ACL code */
-u32 *nfs3svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
{
return decode_fh(p, fhp);
}
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
{
unsigned int size = fhp->fh_handle.fh_size;
*p++ = htonl(size);
@@ -91,8 +91,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
* Decode a file name and make sure that the path contains
* no slashes or null bytes.
*/
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
{
char *name;
int i;
@@ -107,8 +107,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
return p;
}
-static inline u32 *
-decode_sattr3(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr3(__be32 *p, struct iattr *iap)
{
u32 tmp;
@@ -153,8 +153,8 @@ decode_sattr3(u32 *p, struct iattr *iap)
return p;
}
-static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static inline __be32 *
+encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
struct kstat *stat)
{
struct dentry *dentry = fhp->fh_dentry;
@@ -186,8 +186,8 @@ encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
return p;
}
-static inline u32 *
-encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct inode *inode = fhp->fh_dentry->d_inode;
@@ -224,8 +224,8 @@ encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
* The inode may be NULL if the call failed because of a stale file
* handle. In this case, no attributes are returned.
*/
-static u32 *
-encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
if (dentry && dentry->d_inode != NULL) {
@@ -243,8 +243,8 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
}
/* Helper for NFSv3 ACLs */
-u32 *
-nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *
+nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
return encode_post_op_attr(rqstp, p, fhp);
}
@@ -252,8 +252,8 @@ nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
/*
* Encode weak cache consistency data
*/
-static u32 *
-encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct dentry *dentry = fhp->fh_dentry;
@@ -278,7 +278,7 @@ encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
* XDR decode functions
*/
int
-nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
@@ -286,7 +286,7 @@ nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args
}
int
-nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_sattrargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -303,7 +303,7 @@ nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_diropargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -314,7 +314,7 @@ nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessargs *args)
{
if (!(p = decode_fh(p, &args->fh)))
@@ -325,11 +325,12 @@ nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readargs *args)
{
unsigned int len;
int v,pn;
+ u32 max_blocksize = svc_max_payload(rqstp);
if (!(p = decode_fh(p, &args->fh))
|| !(p = xdr_decode_hyper(p, &args->offset)))
@@ -337,17 +338,16 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
len = args->count = ntohl(*p++);
- if (len > NFSSVC_MAXBLKSIZE)
- len = NFSSVC_MAXBLKSIZE;
+ if (len > max_blocksize)
+ len = max_blocksize;
/* set up the kvec */
v=0;
while (len > 0) {
- pn = rqstp->rq_resused;
- svc_take_page(rqstp);
- args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
- args->vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
- len -= args->vec[v].iov_len;
+ pn = rqstp->rq_resused++;
+ rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+ rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
+ len -= rqstp->rq_vec[v].iov_len;
v++;
}
args->vlen = v;
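The readargs and writeargs hunks around this point move the scatter/gather list from args->vec to the request-wide rqstp->rq_vec and replace svc_take_page() with direct consumption of rq_respages[] through rq_resused++. The page-splitting arithmetic itself is unchanged; here is a hedged, self-contained sketch of that loop with simplified names (the sketch_* types are not the kernel's):

#include <stddef.h>

#define SKETCH_PAGE_SIZE 4096u

struct sketch_kvec {
        void   *iov_base;
        size_t  iov_len;
};

/*
 * Split "len" bytes of a READ reply across page-sized buffers, one kvec
 * entry per page; the return value corresponds to args->vlen above.
 */
static int sketch_build_read_vec(struct sketch_kvec *vec, char **pages,
                                 size_t len)
{
        int v = 0;

        while (len > 0) {
                vec[v].iov_base = pages[v];
                vec[v].iov_len  = len < SKETCH_PAGE_SIZE ? len : SKETCH_PAGE_SIZE;
                len -= vec[v].iov_len;
                v++;
        }
        return v;
}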
@@ -355,10 +355,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_writeargs *args)
{
unsigned int len, v, hdr;
+ u32 max_blocksize = svc_max_payload(rqstp);
if (!(p = decode_fh(p, &args->fh))
|| !(p = xdr_decode_hyper(p, &args->offset)))
@@ -373,26 +374,26 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_arg.len - hdr < len)
return 0;
- args->vec[0].iov_base = (void*)p;
- args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+ rqstp->rq_vec[0].iov_base = (void*)p;
+ rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
- if (len > NFSSVC_MAXBLKSIZE)
- len = NFSSVC_MAXBLKSIZE;
+ if (len > max_blocksize)
+ len = max_blocksize;
v= 0;
- while (len > args->vec[v].iov_len) {
- len -= args->vec[v].iov_len;
+ while (len > rqstp->rq_vec[v].iov_len) {
+ len -= rqstp->rq_vec[v].iov_len;
v++;
- args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
- args->vec[v].iov_len = PAGE_SIZE;
+ rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+ rqstp->rq_vec[v].iov_len = PAGE_SIZE;
}
- args->vec[v].iov_len = len;
+ rqstp->rq_vec[v].iov_len = len;
args->vlen = v+1;
- return args->count == args->len && args->vec[0].iov_len > 0;
+ return args->count == args->len && rqstp->rq_vec[0].iov_len > 0;
}
int
-nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_createargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -416,7 +417,7 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
return xdr_argsize_check(rqstp, p);
}
int
-nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_createargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -428,7 +429,7 @@ nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_symlinkargs *args)
{
unsigned int len;
@@ -446,11 +447,11 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
* This page appears in the rq_res.pages list, but as pages_len is always
* 0, it won't get in the way
*/
- svc_take_page(rqstp);
len = ntohl(*p++);
if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
return 0;
- args->tname = new = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+ args->tname = new =
+ page_address(rqstp->rq_respages[rqstp->rq_resused++]);
args->tlen = len;
/* first copy and check from the first page */
old = (char*)p;
@@ -480,7 +481,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_mknodargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -504,7 +505,7 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_renameargs *args)
{
if (!(p = decode_fh(p, &args->ffh))
@@ -517,19 +518,19 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readlinkargs *args)
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
- svc_take_page(rqstp);
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+ args->buffer =
+ page_address(rqstp->rq_respages[rqstp->rq_resused++]);
return xdr_argsize_check(rqstp, p);
}
int
-nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_linkargs *args)
{
if (!(p = decode_fh(p, &args->ffh))
@@ -541,7 +542,7 @@ nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readdirargs *args)
{
if (!(p = decode_fh(p, &args->fh)))
@@ -554,17 +555,18 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
if (args->count > PAGE_SIZE)
args->count = PAGE_SIZE;
- svc_take_page(rqstp);
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+ args->buffer =
+ page_address(rqstp->rq_respages[rqstp->rq_resused++]);
return xdr_argsize_check(rqstp, p);
}
int
-nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readdirargs *args)
{
int len, pn;
+ u32 max_blocksize = svc_max_payload(rqstp);
if (!(p = decode_fh(p, &args->fh)))
return 0;
@@ -573,13 +575,12 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
args->dircount = ntohl(*p++);
args->count = ntohl(*p++);
- len = (args->count > NFSSVC_MAXBLKSIZE) ? NFSSVC_MAXBLKSIZE :
+ len = (args->count > max_blocksize) ? max_blocksize :
args->count;
args->count = len;
while (len > 0) {
- pn = rqstp->rq_resused;
- svc_take_page(rqstp);
+ pn = rqstp->rq_resused++;
if (!args->buffer)
args->buffer = page_address(rqstp->rq_respages[pn]);
len -= PAGE_SIZE;
@@ -589,7 +590,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_commitargs *args)
{
if (!(p = decode_fh(p, &args->fh)))
@@ -608,14 +609,14 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
* will work properly.
*/
int
-nfs3svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_ressize_check(rqstp, p);
}
/* GETATTR */
int
-nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_attrstat *resp)
{
if (resp->status == 0)
@@ -625,7 +626,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
/* SETATTR, REMOVE, RMDIR */
int
-nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_attrstat *resp)
{
p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -634,7 +635,7 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
/* LOOKUP */
int
-nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_diropres *resp)
{
if (resp->status == 0) {
@@ -647,7 +648,7 @@ nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
/* ACCESS */
int
-nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessres *resp)
{
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -658,7 +659,7 @@ nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
/* READLINK */
int
-nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readlinkres *resp)
{
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -668,7 +669,6 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_res.page_len = resp->len;
if (resp->len & 3) {
/* need to pad the tail */
- rqstp->rq_restailpage = 0;
rqstp->rq_res.tail[0].iov_base = p;
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -680,7 +680,7 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
/* READ */
int
-nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readres *resp)
{
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -693,7 +693,6 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_res.page_len = resp->count;
if (resp->count & 3) {
/* need to pad the tail */
- rqstp->rq_restailpage = 0;
rqstp->rq_res.tail[0].iov_base = p;
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
@@ -705,7 +704,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
/* WRITE */
int
-nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_writeres *resp)
{
p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -720,7 +719,7 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
/* CREATE, MKDIR, SYMLINK, MKNOD */
int
-nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_diropres *resp)
{
if (resp->status == 0) {
@@ -734,7 +733,7 @@ nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
/* RENAME */
int
-nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_renameres *resp)
{
p = encode_wcc_data(rqstp, p, &resp->ffh);
@@ -744,7 +743,7 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
/* LINK */
int
-nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_linkres *resp)
{
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -754,7 +753,7 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
/* READDIR */
int
-nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readdirres *resp)
{
p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -768,7 +767,6 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_res.page_len = (resp->count) << 2;
/* add the 'tail' to the end of the 'head' page - page 0. */
- rqstp->rq_restailpage = 0;
rqstp->rq_res.tail[0].iov_base = p;
*p++ = 0; /* no more entries */
*p++ = htonl(resp->common.err == nfserr_eof);
@@ -778,8 +776,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
return xdr_ressize_check(rqstp, p);
}
-static inline u32 *
-encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
+static inline __be32 *
+encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
int namlen, ino_t ino)
{
*p++ = xdr_one; /* mark entry present */
@@ -792,8 +790,8 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
return p;
}
-static inline u32 *
-encode_entryplus_baggage(struct nfsd3_readdirres *cd, u32 *p,
+static inline __be32 *
+encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
struct svc_fh *fhp)
{
p = encode_post_op_attr(cd->rqstp, p, fhp);
@@ -855,7 +853,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
{
struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
common);
- u32 *p = cd->buffer;
+ __be32 *p = cd->buffer;
caddr_t curr_page_addr = NULL;
int pn; /* current page number */
int slen; /* string (name) length */
@@ -921,7 +919,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
} else if (cd->rqstp->rq_respages[pn+1] != NULL) {
/* temporarily encode entry into next page, then move back to
* current and next page in rq_respages[] */
- u32 *p1, *tmp;
+ __be32 *p1, *tmp;
int len1, len2;
/* grab next page for temporary storage of entry */
@@ -1011,7 +1009,7 @@ nfs3svc_encode_entry_plus(struct readdir_cd *cd, const char *name,
/* FSSTAT */
int
-nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_fsstatres *resp)
{
struct kstatfs *s = &resp->stats;
@@ -1033,7 +1031,7 @@ nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
/* FSINFO */
int
-nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_fsinfores *resp)
{
*p++ = xdr_zero; /* no post_op_attr */
@@ -1057,7 +1055,7 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
/* PATHCONF */
int
-nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_pathconfres *resp)
{
*p++ = xdr_zero; /* no post_op_attr */
@@ -1076,7 +1074,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
/* COMMIT */
int
-nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_commitres *resp)
{
p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -1092,7 +1090,7 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
* XDR release functions
*/
int
-nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_attrstat *resp)
{
fh_put(&resp->fh);
@@ -1100,7 +1098,7 @@ nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
}
int
-nfs3svc_release_fhandle2(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_fhandle_pair *resp)
{
fh_put(&resp->fh1);
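A recurring detail in the encode routines above (READLINK, READ, READDIR) is padding the payload to the 4-byte XDR boundary before the reply tail is appended; this patch only drops the rq_restailpage bookkeeping, the padding rule is untouched. A tiny sketch of that rule, runnable in userspace:

#include <assert.h>

/* XDR opaque data always occupies a multiple of 4 bytes on the wire. */
static unsigned int xdr_pad_bytes(unsigned int len)
{
        return (len & 3) ? 4 - (len & 3) : 0;
}

int main(void)
{
        assert(xdr_pad_bytes(4096) == 0);   /* already aligned           */
        assert(xdr_pad_bytes(5)    == 3);   /* e.g. a 5-byte link target */
        assert(xdr_pad_bytes(7)    == 1);
        return 0;
}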
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index edb107e61b91..5d94555cdc83 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -63,6 +63,8 @@
#define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
| NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE)
+#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP)
+
#define MASK_EQUAL(mask1, mask2) \
( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
@@ -96,24 +98,26 @@ deny_mask(u32 allow_mask, unsigned int flags)
/* XXX: modify functions to return NFS errors; they're only ever
* used by nfs code, after all.... */
-static int
-mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
+/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
+ * side of being more restrictive, so the mode bit mapping below is
+ * pessimistic. An optimistic version would be needed to handle DENY's,
+ * but we expect to coalesce all ALLOWs and DENYs before mapping to mode
+ * bits. */
+
+static void
+low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
{
- u32 ignore = 0;
+ u32 write_mode = NFS4_WRITE_MODE;
- if (!(flags & NFS4_ACL_DIR))
- ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */
- perm |= ignore;
+ if (flags & NFS4_ACL_DIR)
+ write_mode |= NFS4_ACE_DELETE_CHILD;
*mode = 0;
if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
*mode |= ACL_READ;
- if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE)
+ if ((perm & write_mode) == write_mode)
*mode |= ACL_WRITE;
if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
*mode |= ACL_EXECUTE;
- if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags)))
- return -EINVAL;
- return 0;
}
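low_mode_from_nfs4() above is deliberately pessimistic: a POSIX mode bit is granted only when every NFSv4 access bit in the corresponding group is allowed, and for directories DELETE_CHILD is folded into the write group. A worked sketch of that all-or-nothing test under assumed bit values (the real NFS4_*_MODE masks are defined elsewhere in nfsd):

#include <stdio.h>

/* Illustrative bit values only; not the real NFSv4 access-mask bits. */
#define SK_WRITE_DATA   0x02
#define SK_APPEND_DATA  0x04
#define SK_WRITE_MODE   (SK_WRITE_DATA | SK_APPEND_DATA)

int main(void)
{
        unsigned int perm = SK_WRITE_DATA;      /* APPEND_DATA missing */

        /* Both bits of the write group must be allowed, or no 'w' bit. */
        printf("POSIX write bit granted: %s\n",
               (perm & SK_WRITE_MODE) == SK_WRITE_MODE ? "yes" : "no");
        return 0;
}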
struct ace_container {
@@ -338,38 +342,6 @@ sort_pacl(struct posix_acl *pacl)
return;
}
-static int
-write_pace(struct nfs4_ace *ace, struct posix_acl *pacl,
- struct posix_acl_entry **pace, short tag, unsigned int flags)
-{
- struct posix_acl_entry *this = *pace;
-
- if (*pace == pacl->a_entries + pacl->a_count)
- return -EINVAL; /* fell off the end */
- (*pace)++;
- this->e_tag = tag;
- if (tag == ACL_USER_OBJ)
- flags |= NFS4_ACL_OWNER;
- if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags))
- return -EINVAL;
- this->e_id = (tag == ACL_USER || tag == ACL_GROUP ?
- ace->who : ACL_UNDEFINED_ID);
- return 0;
-}
-
-static struct nfs4_ace *
-get_next_v4_ace(struct list_head **p, struct list_head *head)
-{
- struct nfs4_ace *ace;
-
- *p = (*p)->next;
- if (*p == head)
- return NULL;
- ace = list_entry(*p, struct nfs4_ace, l_ace);
-
- return ace;
-}
-
int
nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
struct posix_acl **dpacl, unsigned int flags)
@@ -385,42 +357,23 @@ nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
goto out;
error = nfs4_acl_split(acl, dacl);
- if (error < 0)
+ if (error)
goto out_acl;
- if (pacl != NULL) {
- if (acl->naces == 0) {
- error = -ENODATA;
- goto try_dpacl;
- }
-
- *pacl = _nfsv4_to_posix_one(acl, flags);
- if (IS_ERR(*pacl)) {
- error = PTR_ERR(*pacl);
- *pacl = NULL;
- goto out_acl;
- }
+ *pacl = _nfsv4_to_posix_one(acl, flags);
+ if (IS_ERR(*pacl)) {
+ error = PTR_ERR(*pacl);
+ *pacl = NULL;
+ goto out_acl;
}
-try_dpacl:
- if (dpacl != NULL) {
- if (dacl->naces == 0) {
- if (pacl == NULL || *pacl == NULL)
- error = -ENODATA;
- goto out_acl;
- }
-
- error = 0;
- *dpacl = _nfsv4_to_posix_one(dacl, flags);
- if (IS_ERR(*dpacl)) {
- error = PTR_ERR(*dpacl);
- *dpacl = NULL;
- goto out_acl;
- }
+ *dpacl = _nfsv4_to_posix_one(dacl, flags);
+ if (IS_ERR(*dpacl)) {
+ error = PTR_ERR(*dpacl);
+ *dpacl = NULL;
}
-
out_acl:
- if (error && pacl) {
+ if (error) {
posix_acl_release(*pacl);
*pacl = NULL;
}
@@ -429,349 +382,311 @@ out:
return error;
}
+/*
+ * While processing the NFSv4 ACE, this maintains bitmasks representing
+ * which permission bits have been allowed and which denied to a given
+ * entity: */
+struct posix_ace_state {
+ u32 allow;
+ u32 deny;
+};
+
+struct posix_user_ace_state {
+ uid_t uid;
+ struct posix_ace_state perms;
+};
+
+struct posix_ace_state_array {
+ int n;
+ struct posix_user_ace_state aces[];
+};
+
+/*
+ * While processing the NFSv4 ACE, this maintains the partial permissions
+ * calculated so far: */
+
+struct posix_acl_state {
+ struct posix_ace_state owner;
+ struct posix_ace_state group;
+ struct posix_ace_state other;
+ struct posix_ace_state everyone;
+ struct posix_ace_state mask; /* Deny unused in this case */
+ struct posix_ace_state_array *users;
+ struct posix_ace_state_array *groups;
+};
+
static int
-same_who(struct nfs4_ace *a, struct nfs4_ace *b)
+init_state(struct posix_acl_state *state, int cnt)
{
- return a->whotype == b->whotype &&
- (a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who);
+ int alloc;
+
+ memset(state, 0, sizeof(struct posix_acl_state));
+ /*
+ * In the worst case, each individual acl could be for a distinct
+ * named user or group, but we don't know which, so we allocate
+ * enough space for either:
+ */
+ alloc = sizeof(struct posix_ace_state_array)
+ + cnt*sizeof(struct posix_ace_state);
+ state->users = kzalloc(alloc, GFP_KERNEL);
+ if (!state->users)
+ return -ENOMEM;
+ state->groups = kzalloc(alloc, GFP_KERNEL);
+ if (!state->groups) {
+ kfree(state->users);
+ return -ENOMEM;
+ }
+ return 0;
}
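init_state() sizes the users and groups arrays for the worst case, in which every ACE names a distinct principal, using a C99 flexible array member at the end of the array struct. A hedged userspace sketch of that allocation pattern with simplified types (the sk_* names are illustrative):

#include <stdlib.h>

struct sk_ace_state { unsigned int allow, deny; };

struct sk_ace_state_array {
        int n;                          /* entries used so far        */
        struct sk_ace_state aces[];     /* C99 flexible array member  */
};

static struct sk_ace_state_array *sk_alloc_array(int worst_case)
{
        /* Header plus one element per possible distinct principal. */
        size_t bytes = sizeof(struct sk_ace_state_array)
                        + (size_t)worst_case * sizeof(struct sk_ace_state);

        return calloc(1, bytes);        /* zeroed, so n starts at 0 */
}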
-static int
-complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny,
- unsigned int flags)
-{
- int ignore = 0;
- if (!(flags & NFS4_ACL_DIR))
- ignore |= NFS4_ACE_DELETE_CHILD;
- return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags),
- ignore|deny->access_mask) &&
- allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
- deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE &&
- allow->flag == deny->flag &&
- same_who(allow, deny);
+static void
+free_state(struct posix_acl_state *state) {
+ kfree(state->users);
+ kfree(state->groups);
}
-static inline int
-user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
- struct posix_acl *pacl, struct posix_acl_entry **pace,
- unsigned int flags)
+static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate)
{
- int error = -EINVAL;
- struct nfs4_ace *ace, *ace2;
-
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- if (ace2type(ace) != ACL_USER_OBJ)
- goto out;
- error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags);
- if (error < 0)
- goto out;
- error = -EINVAL;
- ace2 = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace2 == NULL)
- goto out;
- if (!complementary_ace_pair(ace, ace2, flags))
- goto out;
- error = 0;
-out:
- return error;
+ state->mask.allow |= astate->allow;
}
-static inline int
-users_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
- struct nfs4_ace **mask_ace,
- struct posix_acl *pacl, struct posix_acl_entry **pace,
- unsigned int flags)
-{
- int error = -EINVAL;
- struct nfs4_ace *ace, *ace2;
+/*
+ * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS,
+ * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate
+ * to traditional read/write/execute permissions.
+ *
+ * It's problematic to reject acls that use certain mode bits, because it
+ * places the burden on users to learn the rules about which bits one
+ * particular server sets, without giving the user a lot of help--we return an
+ * error that could mean any number of different things. To make matters
+ * worse, the problematic bits might be introduced by some application that's
+ * automatically mapping from some other acl model.
+ *
+ * So wherever possible we accept anything, possibly erring on the side of
+ * denying more permissions than necessary.
+ *
+ * However we do reject *explicit* DENY's of a few bits representing
+ * permissions we could never deny:
+ */
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- while (ace2type(ace) == ACL_USER) {
- if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
- goto out;
- if (*mask_ace &&
- !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
- goto out;
- *mask_ace = ace;
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
- goto out;
- error = write_pace(ace, pacl, pace, ACL_USER, flags);
- if (error < 0)
- goto out;
- error = -EINVAL;
- ace2 = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace2 == NULL)
- goto out;
- if (!complementary_ace_pair(ace, ace2, flags))
- goto out;
- if ((*mask_ace)->flag != ace2->flag ||
- !same_who(*mask_ace, ace2))
- goto out;
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- }
- error = 0;
-out:
- return error;
+static inline int check_deny(u32 mask, int isowner)
+{
+ if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
+ return -EINVAL;
+ if (!isowner)
+ return 0;
+ if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
+ return -EINVAL;
+ return 0;
}
-static inline int
-group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
- struct nfs4_ace **mask_ace,
- struct posix_acl *pacl, struct posix_acl_entry **pace,
- unsigned int flags)
+static struct posix_acl *
+posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
{
- int error = -EINVAL;
- struct nfs4_ace *ace, *ace2;
- struct ace_container *ac;
- struct list_head group_l;
-
- INIT_LIST_HEAD(&group_l);
- ace = list_entry(*p, struct nfs4_ace, l_ace);
-
- /* group owner (mask and allow aces) */
+ struct posix_acl_entry *pace;
+ struct posix_acl *pacl;
+ int nace;
+ int i, error = 0;
- if (pacl->a_count != 3) {
- /* then the group owner should be preceded by mask */
- if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
- goto out;
- if (*mask_ace &&
- !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
- goto out;
- *mask_ace = ace;
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
+ nace = 4 + state->users->n + state->groups->n;
+ pacl = posix_acl_alloc(nace, GFP_KERNEL);
+ if (!pacl)
+ return ERR_PTR(-ENOMEM);
- if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace))
- goto out;
+ pace = pacl->a_entries;
+ pace->e_tag = ACL_USER_OBJ;
+ error = check_deny(state->owner.deny, 1);
+ if (error)
+ goto out_err;
+ low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
+ pace->e_id = ACL_UNDEFINED_ID;
+
+ for (i=0; i < state->users->n; i++) {
+ pace++;
+ pace->e_tag = ACL_USER;
+ error = check_deny(state->users->aces[i].perms.deny, 0);
+ if (error)
+ goto out_err;
+ low_mode_from_nfs4(state->users->aces[i].perms.allow,
+ &pace->e_perm, flags);
+ pace->e_id = state->users->aces[i].uid;
+ add_to_mask(state, &state->users->aces[i].perms);
}
- if (ace2type(ace) != ACL_GROUP_OBJ)
- goto out;
-
- ac = kmalloc(sizeof(*ac), GFP_KERNEL);
- error = -ENOMEM;
- if (ac == NULL)
- goto out;
- ac->ace = ace;
- list_add_tail(&ac->ace_l, &group_l);
-
- error = -EINVAL;
- if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
- goto out;
-
- error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags);
- if (error < 0)
- goto out;
-
- error = -EINVAL;
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
-
- /* groups (mask and allow aces) */
-
- while (ace2type(ace) == ACL_GROUP) {
- if (*mask_ace == NULL)
- goto out;
-
- if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE ||
- !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
- goto out;
- *mask_ace = ace;
+ pace++;
+ pace->e_tag = ACL_GROUP_OBJ;
+ error = check_deny(state->group.deny, 0);
+ if (error)
+ goto out_err;
+ low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
+ pace->e_id = ACL_UNDEFINED_ID;
+ add_to_mask(state, &state->group);
+
+ for (i=0; i < state->groups->n; i++) {
+ pace++;
+ pace->e_tag = ACL_GROUP;
+ error = check_deny(state->groups->aces[i].perms.deny, 0);
+ if (error)
+ goto out_err;
+ low_mode_from_nfs4(state->groups->aces[i].perms.allow,
+ &pace->e_perm, flags);
+ pace->e_id = state->groups->aces[i].uid;
+ add_to_mask(state, &state->groups->aces[i].perms);
+ }
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- ac = kmalloc(sizeof(*ac), GFP_KERNEL);
- error = -ENOMEM;
- if (ac == NULL)
- goto out;
- error = -EINVAL;
- if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE ||
- !same_who(ace, *mask_ace))
- goto out;
+ pace++;
+ pace->e_tag = ACL_MASK;
+ low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+ pace->e_id = ACL_UNDEFINED_ID;
- ac->ace = ace;
- list_add_tail(&ac->ace_l, &group_l);
+ pace++;
+ pace->e_tag = ACL_OTHER;
+ error = check_deny(state->other.deny, 0);
+ if (error)
+ goto out_err;
+ low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
+ pace->e_id = ACL_UNDEFINED_ID;
- error = write_pace(ace, pacl, pace, ACL_GROUP, flags);
- if (error < 0)
- goto out;
- error = -EINVAL;
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- }
+ return pacl;
+out_err:
+ posix_acl_release(pacl);
+ return ERR_PTR(error);
+}
- /* group owner (deny ace) */
+static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
+{
+ /* Allow all bits in the mask not already denied: */
+ astate->allow |= mask & ~astate->deny;
+}
- if (ace2type(ace) != ACL_GROUP_OBJ)
- goto out;
- ac = list_entry(group_l.next, struct ace_container, ace_l);
- ace2 = ac->ace;
- if (!complementary_ace_pair(ace2, ace, flags))
- goto out;
- list_del(group_l.next);
- kfree(ac);
+static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
+{
+ /* Deny all bits in the mask not already allowed: */
+ astate->deny |= mask & ~astate->allow;
+}
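allow_bits() and deny_bits() encode NFSv4's first-match-wins ACE evaluation for one entity: a bit already denied cannot be re-granted by a later ALLOW, and a bit already allowed cannot be taken back by a later DENY. A tiny runnable sketch of that accumulation (the bit values are arbitrary):

#include <assert.h>

struct sk_state { unsigned int allow, deny; };

static void sk_allow(struct sk_state *s, unsigned int mask)
{
        s->allow |= mask & ~s->deny;    /* only bits not already denied  */
}

static void sk_deny(struct sk_state *s, unsigned int mask)
{
        s->deny |= mask & ~s->allow;    /* only bits not already allowed */
}

int main(void)
{
        struct sk_state owner = { 0, 0 };

        sk_deny(&owner, 0x2);           /* DENY write first         */
        sk_allow(&owner, 0x3);          /* later ALLOW read | write */
        assert(owner.allow == 0x1);     /* write stays denied       */
        assert(owner.deny  == 0x2);
        return 0;
}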
- /* groups (deny aces) */
+static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid)
+{
+ int i;
- while (!list_empty(&group_l)) {
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- if (ace2type(ace) != ACL_GROUP)
- goto out;
- ac = list_entry(group_l.next, struct ace_container, ace_l);
- ace2 = ac->ace;
- if (!complementary_ace_pair(ace2, ace, flags))
- goto out;
- list_del(group_l.next);
- kfree(ac);
- }
+ for (i = 0; i < a->n; i++)
+ if (a->aces[i].uid == uid)
+ return i;
+ /* Not found: */
+ a->n++;
+ a->aces[i].uid = uid;
+ a->aces[i].perms.allow = state->everyone.allow;
+ a->aces[i].perms.deny = state->everyone.deny;
- ace = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace == NULL)
- goto out;
- if (ace2type(ace) != ACL_OTHER)
- goto out;
- error = 0;
-out:
- while (!list_empty(&group_l)) {
- ac = list_entry(group_l.next, struct ace_container, ace_l);
- list_del(group_l.next);
- kfree(ac);
- }
- return error;
+ return i;
}
-static inline int
-mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
- struct nfs4_ace **mask_ace,
- struct posix_acl *pacl, struct posix_acl_entry **pace,
- unsigned int flags)
+static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
{
- int error = -EINVAL;
- struct nfs4_ace *ace;
+ int i;
- ace = list_entry(*p, struct nfs4_ace, l_ace);
- if (pacl->a_count != 3) {
- if (*mask_ace == NULL)
- goto out;
- (*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags);
- write_pace(*mask_ace, pacl, pace, ACL_MASK, flags);
- }
- error = 0;
-out:
- return error;
+ for (i=0; i < a->n; i++)
+ deny_bits(&a->aces[i].perms, mask);
}
-static inline int
-other_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
- struct posix_acl *pacl, struct posix_acl_entry **pace,
- unsigned int flags)
+static void allow_bits_array(struct posix_ace_state_array *a, u32 mask)
{
- int error = -EINVAL;
- struct nfs4_ace *ace, *ace2;
+ int i;
- ace = list_entry(*p, struct nfs4_ace, l_ace);
- if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
- goto out;
- error = write_pace(ace, pacl, pace, ACL_OTHER, flags);
- if (error < 0)
- goto out;
- error = -EINVAL;
- ace2 = get_next_v4_ace(p, &n4acl->ace_head);
- if (ace2 == NULL)
- goto out;
- if (!complementary_ace_pair(ace, ace2, flags))
- goto out;
- error = 0;
-out:
- return error;
+ for (i=0; i < a->n; i++)
+ allow_bits(&a->aces[i].perms, mask);
}
-static int
-calculate_posix_ace_count(struct nfs4_acl *n4acl)
+static void process_one_v4_ace(struct posix_acl_state *state,
+ struct nfs4_ace *ace)
{
- if (n4acl->naces == 6) /* owner, owner group, and other only */
- return 3;
- else { /* Otherwise there must be a mask entry. */
- /* Also, the remaining entries are for named users and
- * groups, and come in threes (mask, allow, deny): */
- if (n4acl->naces < 7)
- return -EINVAL;
- if ((n4acl->naces - 7) % 3)
- return -EINVAL;
- return 4 + (n4acl->naces - 7)/3;
+ u32 mask = ace->access_mask;
+ int i;
+
+ switch (ace2type(ace)) {
+ case ACL_USER_OBJ:
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->owner, mask);
+ } else {
+ deny_bits(&state->owner, mask);
+ }
+ break;
+ case ACL_USER:
+ i = find_uid(state, state->users, ace->who);
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->users->aces[i].perms, mask);
+ } else {
+ deny_bits(&state->users->aces[i].perms, mask);
+ mask = state->users->aces[i].perms.deny;
+ deny_bits(&state->owner, mask);
+ }
+ break;
+ case ACL_GROUP_OBJ:
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->group, mask);
+ } else {
+ deny_bits(&state->group, mask);
+ mask = state->group.deny;
+ deny_bits(&state->owner, mask);
+ deny_bits(&state->everyone, mask);
+ deny_bits_array(state->users, mask);
+ deny_bits_array(state->groups, mask);
+ }
+ break;
+ case ACL_GROUP:
+ i = find_uid(state, state->groups, ace->who);
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->groups->aces[i].perms, mask);
+ } else {
+ deny_bits(&state->groups->aces[i].perms, mask);
+ mask = state->groups->aces[i].perms.deny;
+ deny_bits(&state->owner, mask);
+ deny_bits(&state->group, mask);
+ deny_bits(&state->everyone, mask);
+ deny_bits_array(state->users, mask);
+ deny_bits_array(state->groups, mask);
+ }
+ break;
+ case ACL_OTHER:
+ if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ allow_bits(&state->owner, mask);
+ allow_bits(&state->group, mask);
+ allow_bits(&state->other, mask);
+ allow_bits(&state->everyone, mask);
+ allow_bits_array(state->users, mask);
+ allow_bits_array(state->groups, mask);
+ } else {
+ deny_bits(&state->owner, mask);
+ deny_bits(&state->group, mask);
+ deny_bits(&state->other, mask);
+ deny_bits(&state->everyone, mask);
+ deny_bits_array(state->users, mask);
+ deny_bits_array(state->groups, mask);
+ }
}
}
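process_one_v4_ace() fans EVERYONE@ entries out to every tracked entity, and a DENY against the owning group also tightens the owner and everyone masks so that later entries cannot win those bits back. A simplified trace of that propagation (it reuses the allow/deny rules sketched above, ignores the named-user/group arrays, and the 0x1/0x2 bit values are arbitrary):

#include <assert.h>

struct sk_state { unsigned int allow, deny; };

static void sk_allow(struct sk_state *s, unsigned int m) { s->allow |= m & ~s->deny; }
static void sk_deny(struct sk_state *s, unsigned int m)  { s->deny  |= m & ~s->allow; }

int main(void)
{
        struct sk_state owner = {0, 0}, group = {0, 0}, everyone = {0, 0};

        /* GROUP@ DENY write (0x2) also propagates to owner and everyone. */
        sk_deny(&group, 0x2);
        sk_deny(&owner, group.deny);
        sk_deny(&everyone, group.deny);

        /* EVERYONE@ ALLOW read|write (0x3) fans out; write stays denied. */
        sk_allow(&owner, 0x3);
        sk_allow(&group, 0x3);
        sk_allow(&everyone, 0x3);

        assert(owner.allow == 0x1);
        assert(group.allow == 0x1);
        assert(everyone.allow == 0x1);
        return 0;
}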
-
static struct posix_acl *
_nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags)
{
+ struct posix_acl_state state;
struct posix_acl *pacl;
- int error = -EINVAL, nace = 0;
- struct list_head *p;
- struct nfs4_ace *mask_ace = NULL;
- struct posix_acl_entry *pace;
-
- nace = calculate_posix_ace_count(n4acl);
- if (nace < 0)
- goto out_err;
-
- pacl = posix_acl_alloc(nace, GFP_KERNEL);
- error = -ENOMEM;
- if (pacl == NULL)
- goto out_err;
-
- pace = &pacl->a_entries[0];
- p = &n4acl->ace_head;
-
- error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags);
- if (error)
- goto out_acl;
-
- error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
- if (error)
- goto out_acl;
+ struct nfs4_ace *ace;
+ int ret;
- error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace,
- flags);
- if (error)
- goto out_acl;
+ ret = init_state(&state, n4acl->naces);
+ if (ret)
+ return ERR_PTR(ret);
- error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
- if (error)
- goto out_acl;
- error = other_from_v4(n4acl, &p, pacl, &pace, flags);
- if (error)
- goto out_acl;
+ list_for_each_entry(ace, &n4acl->ace_head, l_ace)
+ process_one_v4_ace(&state, ace);
- error = -EINVAL;
- if (p->next != &n4acl->ace_head)
- goto out_acl;
- if (pace != pacl->a_entries + pacl->a_count)
- goto out_acl;
+ pacl = posix_state_to_acl(&state, flags);
- sort_pacl(pacl);
+ free_state(&state);
- return pacl;
-out_acl:
- posix_acl_release(pacl);
-out_err:
- pacl = ERR_PTR(error);
+ if (!IS_ERR(pacl))
+ sort_pacl(pacl);
return pacl;
}
@@ -785,22 +700,41 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
list_for_each_safe(h, n, &acl->ace_head) {
ace = list_entry(h, struct nfs4_ace, l_ace);
- if ((ace->flag & NFS4_INHERITANCE_FLAGS)
- != NFS4_INHERITANCE_FLAGS)
- continue;
+ if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
+ return -EINVAL;
- error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
- ace->access_mask, ace->whotype, ace->who);
- if (error < 0)
- goto out;
+ if (ace->flag & ~NFS4_SUPPORTED_FLAGS)
+ return -EINVAL;
- list_del(h);
- kfree(ace);
- acl->naces--;
+ switch (ace->flag & NFS4_INHERITANCE_FLAGS) {
+ case 0:
+ /* Leave this ace in the effective acl: */
+ continue;
+ case NFS4_INHERITANCE_FLAGS:
+ /* Add this ace to the default acl and remove it
+ * from the effective acl: */
+ error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
+ ace->access_mask, ace->whotype, ace->who);
+ if (error)
+ return error;
+ list_del(h);
+ kfree(ace);
+ acl->naces--;
+ break;
+ case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE:
+ /* Add this ace to the default, but leave it in
+ * the effective acl as well: */
+ error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
+ ace->access_mask, ace->whotype, ace->who);
+ if (error)
+ return error;
+ break;
+ default:
+ return -EINVAL;
+ }
}
-
-out:
- return error;
+ return 0;
}
static short
@@ -930,23 +864,6 @@ nfs4_acl_write_who(int who, char *p)
return -1;
}
-static inline int
-match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who)
-{
- switch (ace->whotype) {
- case NFS4_ACL_WHO_NAMED:
- return who == ace->who;
- case NFS4_ACL_WHO_OWNER:
- return who == owner;
- case NFS4_ACL_WHO_GROUP:
- return who == group;
- case NFS4_ACL_WHO_EVERYONE:
- return 1;
- default:
- return 0;
- }
-}
-
EXPORT_SYMBOL(nfs4_acl_new);
EXPORT_SYMBOL(nfs4_acl_free);
EXPORT_SYMBOL(nfs4_acl_add_ace);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index f6ca9fb3fc63..f57655a7a2b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -85,8 +85,8 @@ enum nfs_cb_opnum4 {
/*
* Generic encode routines from fs/nfs/nfs4xdr.c
*/
-static inline u32 *
-xdr_writemem(u32 *p, const void *ptr, int nbytes)
+static inline __be32 *
+xdr_writemem(__be32 *p, const void *ptr, int nbytes)
{
int tmp = XDR_QUADLEN(nbytes);
if (!tmp)
@@ -205,7 +205,7 @@ nfs_cb_stat_to_errno(int stat)
static int
encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
{
- u32 * p;
+ __be32 * p;
RESERVE_SPACE(16);
WRITE32(0); /* tag length is always 0 */
@@ -218,7 +218,7 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
static int
encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
{
- u32 *p;
+ __be32 *p;
int len = cb_rec->cbr_fhlen;
RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
@@ -231,7 +231,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
}
static int
-nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p)
+nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
{
struct xdr_stream xdrs, *xdr = &xdrs;
@@ -241,7 +241,7 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p)
}
static int
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args)
+nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args)
{
struct xdr_stream xdr;
struct nfs4_cb_compound_hdr hdr = {
@@ -257,7 +257,7 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args
static int
decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
- u32 *p;
+ __be32 *p;
READ_BUF(8);
READ32(hdr->status);
@@ -272,7 +272,7 @@ decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
static int
decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
{
- u32 *p;
+ __be32 *p;
u32 op;
int32_t nfserr;
@@ -291,13 +291,13 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
}
static int
-nfs4_xdr_dec_cb_null(struct rpc_rqst *req, u32 *p)
+nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
{
return 0;
}
static int
-nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p)
+nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
{
struct xdr_stream xdr;
struct nfs4_cb_compound_hdr hdr;
@@ -421,7 +421,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
/* Create RPC client */
cb->cb_client = rpc_create(&args);
- if (!cb->cb_client) {
+ if (IS_ERR(cb->cb_client)) {
dprintk("NFSD: couldn't create callback client\n");
goto out_err;
}
@@ -448,10 +448,10 @@ nfsd4_probe_callback(struct nfs4_client *clp)
out_rpciod:
atomic_dec(&clp->cl_count);
rpciod_down();
- cb->cb_client = NULL;
out_clnt:
rpc_shutdown_client(cb->cb_client);
out_err:
+ cb->cb_client = NULL;
dprintk("NFSD: warning: no callback path to client %.*s\n",
(int)clp->cl_name.len, clp->cl_name.data);
}
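The nfsd4_probe_callback() fix above matters because rpc_create() signals failure with an ERR_PTR-encoded pointer, never NULL, so the old NULL test let an error pointer through; the patch also clears cb_client on every error path. A minimal userspace imitation of the ERR_PTR/IS_ERR idiom (the real helpers live in <linux/err.h>; fake_rpc_create() and the sk_* names are illustrations only):

#include <stdio.h>
#include <errno.h>

#define SK_MAX_ERRNO 4095

static inline void *sk_err_ptr(long error)    { return (void *)error; }
static inline long  sk_ptr_err(const void *p) { return (long)p; }
static inline int   sk_is_err(const void *p)
{
        return (unsigned long)p >= (unsigned long)-SK_MAX_ERRNO;
}

static void *fake_rpc_create(int fail)
{
        static int dummy_client;

        return fail ? sk_err_ptr(-ECONNREFUSED) : (void *)&dummy_client;
}

int main(void)
{
        void *clnt = fake_rpc_create(1);

        if (sk_is_err(clnt))            /* a plain NULL check would miss this */
                printf("rpc_create failed: %ld\n", sk_ptr_err(clnt));
        return 0;
}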
@@ -461,7 +461,7 @@ nfs4_cb_null(struct rpc_task *task, void *dummy)
{
struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
struct nfs4_callback *cb = &clp->cl_callback;
- u32 addr = htonl(cb->cb_addr);
+ __be32 addr = htonl(cb->cb_addr);
dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 15ded7a30a72..0a7bbdc4a10a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -67,32 +67,32 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
*dst = *src;
}
-static int
-do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+static __be32
+do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
{
- int accmode, status;
+ __be32 status;
if (open->op_truncate &&
!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
return nfserr_inval;
- accmode = MAY_NOP;
if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
- accmode = MAY_READ;
- if (open->op_share_deny & NFS4_SHARE_ACCESS_WRITE)
+ accmode |= MAY_READ;
+ if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
accmode |= (MAY_WRITE | MAY_TRUNC);
- accmode |= MAY_OWNER_OVERRIDE;
+ if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
+ accmode |= MAY_WRITE;
status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
return status;
}
-static int
+static __be32
do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct svc_fh resfh;
- int status;
+ __be32 status;
fh_init(&resfh, NFS4_FHSIZE);
open->op_truncate = 0;
@@ -124,17 +124,17 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
&resfh.fh_handle.fh_base,
resfh.fh_handle.fh_size);
- status = do_open_permission(rqstp, current_fh, open);
+ status = do_open_permission(rqstp, current_fh, open, MAY_NOP);
}
fh_put(&resfh);
return status;
}
-static int
+static __be32
do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
- int status;
+ __be32 status;
/* Only reclaims from previously confirmed clients are valid */
if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
@@ -155,16 +155,16 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
(open->op_iattr.ia_size == 0);
- status = do_open_permission(rqstp, current_fh, open);
+ status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE);
return status;
}
-static inline int
+static inline __be32
nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, struct nfs4_stateowner **replay_owner)
{
- int status;
+ __be32 status;
dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
(int)open->op_fname.len, open->op_fname.data,
open->op_stateowner);
@@ -177,7 +177,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
/* check seqid for replay. set nfs4_owner */
status = nfsd4_process_open1(open);
- if (status == NFSERR_REPLAY_ME) {
+ if (status == nfserr_replay_me) {
struct nfs4_replay *rp = &open->op_stateowner->so_replay;
fh_put(current_fh);
current_fh->fh_handle.fh_size = rp->rp_openfh_len;
@@ -188,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
dprintk("nfsd4_open: replay failed"
" restoring previous filehandle\n");
else
- status = NFSERR_REPLAY_ME;
+ status = nfserr_replay_me;
}
if (status)
goto out;
@@ -261,7 +261,7 @@ out:
/*
* filehandle-manipulating ops.
*/
-static inline int
+static inline __be32
nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
{
if (!current_fh->fh_dentry)
@@ -271,7 +271,7 @@ nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
return nfs_ok;
}
-static inline int
+static inline __be32
nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putfh *putfh)
{
fh_put(current_fh);
@@ -280,10 +280,10 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putf
return fh_verify(rqstp, current_fh, 0, MAY_NOP);
}
-static inline int
+static inline __be32
nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
{
- int status;
+ __be32 status;
fh_put(current_fh);
status = exp_pseudoroot(rqstp->rq_client, current_fh,
@@ -291,7 +291,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
return status;
}
-static inline int
+static inline __be32
nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
{
if (!save_fh->fh_dentry)
@@ -301,7 +301,7 @@ nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
return nfs_ok;
}
-static inline int
+static inline __be32
nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
{
if (!current_fh->fh_dentry)
@@ -314,7 +314,7 @@ nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
/*
* misc nfsv4 ops
*/
-static inline int
+static inline __be32
nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_access *access)
{
if (access->ac_req_access & ~NFS3_ACCESS_FULL)
@@ -324,10 +324,10 @@ nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_acc
return nfsd_access(rqstp, current_fh, &access->ac_resp_access, &access->ac_supported);
}
-static inline int
+static inline __be32
nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_commit *commit)
{
- int status;
+ __be32 status;
u32 *p = (u32 *)commit->co_verf.data;
*p++ = nfssvc_boot.tv_sec;
@@ -339,11 +339,11 @@ nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_com
return status;
}
-static int
+static __be32
nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create)
{
struct svc_fh resfh;
- int status;
+ __be32 status;
dev_t rdev;
fh_init(&resfh, NFS4_FHSIZE);
@@ -423,10 +423,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre
return status;
}
-static inline int
+static inline __be32
nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_getattr *getattr)
{
- int status;
+ __be32 status;
status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
if (status)
@@ -442,11 +442,11 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ge
return nfs_ok;
}
-static inline int
+static inline __be32
nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
struct svc_fh *save_fh, struct nfsd4_link *link)
{
- int status = nfserr_nofilehandle;
+ __be32 status = nfserr_nofilehandle;
if (!save_fh->fh_dentry)
return status;
@@ -456,11 +456,11 @@ nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
return status;
}
-static int
+static __be32
nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
{
struct svc_fh tmp_fh;
- int ret;
+ __be32 ret;
fh_init(&tmp_fh, NFS4_FHSIZE);
if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh,
@@ -474,16 +474,16 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
return nfsd_lookup(rqstp, current_fh, "..", 2, current_fh);
}
-static inline int
+static inline __be32
nfsd4_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lookup *lookup)
{
return nfsd_lookup(rqstp, current_fh, lookup->lo_name, lookup->lo_len, current_fh);
}
-static inline int
+static inline __be32
nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read)
{
- int status;
+ __be32 status;
/* no need to check permission - this will be done in nfsd_read() */
@@ -508,7 +508,7 @@ out:
return status;
}
-static inline int
+static inline __be32
nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readdir *readdir)
{
u64 cookie = readdir->rd_cookie;
@@ -531,7 +531,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_re
return nfs_ok;
}
-static inline int
+static inline __be32
nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readlink *readlink)
{
readlink->rl_rqstp = rqstp;
@@ -539,10 +539,10 @@ nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_r
return nfs_ok;
}
-static inline int
+static inline __be32
nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_remove *remove)
{
- int status;
+ __be32 status;
if (nfs4_in_grace())
return nfserr_grace;
@@ -556,11 +556,11 @@ nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_rem
return status;
}
-static inline int
+static inline __be32
nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
struct svc_fh *save_fh, struct nfsd4_rename *rename)
{
- int status = nfserr_nofilehandle;
+ __be32 status = nfserr_nofilehandle;
if (!save_fh->fh_dentry)
return status;
@@ -589,10 +589,10 @@ nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
return status;
}
-static inline int
+static inline __be32
nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr)
{
- int status = nfs_ok;
+ __be32 status = nfs_ok;
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
nfs4_lock_state();
@@ -614,13 +614,13 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se
return status;
}
-static inline int
+static inline __be32
nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write)
{
stateid_t *stateid = &write->wr_stateid;
struct file *filp = NULL;
u32 *p;
- int status = nfs_ok;
+ __be32 status = nfs_ok;
/* no need to check permission - this will be done in nfsd_write() */
@@ -646,7 +646,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
*p++ = nfssvc_boot.tv_usec;
status = nfsd_write(rqstp, current_fh, filp, write->wr_offset,
- write->wr_vec, write->wr_vlen, write->wr_buflen,
+ rqstp->rq_vec, write->wr_vlen, write->wr_buflen,
&write->wr_how_written);
if (filp)
fput(filp);
@@ -661,12 +661,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
* attributes matched. VERIFY is implemented by mapping NFSERR_SAME
* to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK.
*/
-static int
+static __be32
nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_verify *verify)
{
- u32 *buf, *p;
+ __be32 *buf, *p;
int count;
- int status;
+ __be32 status;
status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
if (status)
@@ -715,7 +715,7 @@ out_kfree:
/*
* NULL call.
*/
-static int
+static __be32
nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
return nfs_ok;
@@ -731,7 +731,7 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
/*
* COMPOUND call.
*/
-static int
+static __be32
nfsd4_proc_compound(struct svc_rqst *rqstp,
struct nfsd4_compoundargs *args,
struct nfsd4_compoundres *resp)
@@ -741,7 +741,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
struct svc_fh *save_fh = NULL;
struct nfs4_stateowner *replay_owner = NULL;
int slack_space; /* in words, not bytes! */
- int status;
+ __be32 status;
status = nfserr_resource;
current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL);
@@ -802,13 +802,29 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
* SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH
* require a valid current filehandle
*/
- if ((!current_fh->fh_dentry) &&
- !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) ||
- (op->opnum == OP_SETCLIENTID) ||
- (op->opnum == OP_SETCLIENTID_CONFIRM) ||
- (op->opnum == OP_RENEW) || (op->opnum == OP_RESTOREFH) ||
- (op->opnum == OP_RELEASE_LOCKOWNER))) {
- op->status = nfserr_nofilehandle;
+ if (!current_fh->fh_dentry) {
+ if (!((op->opnum == OP_PUTFH) ||
+ (op->opnum == OP_PUTROOTFH) ||
+ (op->opnum == OP_SETCLIENTID) ||
+ (op->opnum == OP_SETCLIENTID_CONFIRM) ||
+ (op->opnum == OP_RENEW) ||
+ (op->opnum == OP_RESTOREFH) ||
+ (op->opnum == OP_RELEASE_LOCKOWNER))) {
+ op->status = nfserr_nofilehandle;
+ goto encode_op;
+ }
+ }
+ /* Check must be done at start of each operation, except
+ * for GETATTR and ops not listed as returning NFS4ERR_MOVED
+ */
+ else if (current_fh->fh_export->ex_fslocs.migrated &&
+ !((op->opnum == OP_GETATTR) ||
+ (op->opnum == OP_PUTROOTFH) ||
+ (op->opnum == OP_PUTPUBFH) ||
+ (op->opnum == OP_RENEW) ||
+ (op->opnum == OP_SETCLIENTID) ||
+ (op->opnum == OP_RELEASE_LOCKOWNER))) {
+ op->status = nfserr_moved;
goto encode_op;
}
switch (op->opnum) {
@@ -921,7 +937,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
}
encode_op:
- if (op->status == NFSERR_REPLAY_ME) {
+ if (op->status == nfserr_replay_me) {
op->replay = &replay_owner->so_replay;
nfsd4_encode_replay(resp, op);
status = op->status = op->replay->rp_status;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1cbd2e4ee122..e9d07704680e 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -83,13 +83,13 @@ md5_to_hex(char *out, char *md5)
*out = '\0';
}
-int
+__be32
nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
{
struct xdr_netobj cksum;
struct hash_desc desc;
struct scatterlist sg[1];
- int status = nfserr_resource;
+ __be32 status = nfserr_resource;
dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
clname->len, clname->data);
@@ -193,7 +193,7 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
struct dentry_list *child;
if (name && isdotent(name, namlen))
- return nfs_ok;
+ return 0;
dentry = lookup_one_len(name, parent, namlen);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -333,14 +333,14 @@ purge_old(struct dentry *parent, struct dentry *child)
int status;
if (nfs4_has_reclaimed_state(child->d_name.name))
- return nfs_ok;
+ return 0;
status = nfsd4_clear_clid_dir(parent, child);
if (status)
printk("failed to remove client recovery directory %s\n",
child->d_name.name);
/* Keep trying, success or failure: */
- return nfs_ok;
+ return 0;
}
void
@@ -365,10 +365,10 @@ load_recdir(struct dentry *parent, struct dentry *child)
printk("nfsd4: illegal name %s in recovery directory\n",
child->d_name.name);
/* Keep trying; maybe the others are OK: */
- return nfs_ok;
+ return 0;
}
nfs4_client_to_reclaim(child->d_name.name);
- return nfs_ok;
+ return 0;
}
int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ebcf226a9e4a..293b6495829f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -710,10 +710,10 @@ out_err:
* as described above.
*
*/
-int
+__be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
{
- u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
+ __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
struct xdr_netobj clname = {
.len = setclid->se_namelen,
.data = setclid->se_name,
@@ -721,7 +721,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
nfs4_verifier clverifier = setclid->se_verf;
unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
- int status;
+ __be32 status;
char dname[HEXDIR_LEN];
if (!check_name(clname))
@@ -875,14 +875,14 @@ out:
*
* NOTE: callback information will be processed here in a future patch
*/
-int
+__be32
nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm)
{
- u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
+ __be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
struct nfs4_client *conf, *unconf;
nfs4_verifier confirm = setclientid_confirm->sc_confirm;
clientid_t * clid = &setclientid_confirm->sc_clientid;
- int status;
+ __be32 status;
if (STALE_CLIENTID(clid))
return nfserr_stale_clientid;
@@ -1280,13 +1280,13 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
* Called to check deny when READ with all zero stateid or
* WRITE with all zero or all one stateid
*/
-static int
+static __be32
nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
{
struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfs4_file *fp;
struct nfs4_stateid *stp;
- int ret;
+ __be32 ret;
dprintk("NFSD: nfs4_share_conflict\n");
@@ -1444,7 +1444,7 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
};
-int
+__be32
nfsd4_process_open1(struct nfsd4_open *open)
{
clientid_t *clientid = &open->op_clientid;
@@ -1477,7 +1477,7 @@ nfsd4_process_open1(struct nfsd4_open *open)
}
if (open->op_seqid == sop->so_seqid - 1) {
if (sop->so_replay.rp_buflen)
- return NFSERR_REPLAY_ME;
+ return nfserr_replay_me;
/* The original OPEN failed so spectacularly
* that we don't even have replay data saved!
* Therefore, we have no choice but to continue
@@ -1501,7 +1501,7 @@ renew:
return nfs_ok;
}
-static inline int
+static inline __be32
nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
{
if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ))
@@ -1522,12 +1522,12 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
return NULL;
}
-static int
+static __be32
nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
struct nfs4_delegation **dp)
{
int flags;
- int status = nfserr_bad_stateid;
+ __be32 status = nfserr_bad_stateid;
*dp = find_delegation_file(fp, &open->op_delegate_stateid);
if (*dp == NULL)
@@ -1546,11 +1546,11 @@ out:
return nfs_ok;
}
-static int
+static __be32
nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
{
struct nfs4_stateid *local;
- int status = nfserr_share_denied;
+ __be32 status = nfserr_share_denied;
struct nfs4_stateowner *sop = open->op_stateowner;
list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
@@ -1575,7 +1575,7 @@ nfs4_alloc_stateid(void)
return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
}
-static int
+static __be32
nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
struct nfs4_delegation *dp,
struct svc_fh *cur_fh, int flags)
@@ -1590,7 +1590,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
get_file(dp->dl_vfs_file);
stp->st_vfs_file = dp->dl_vfs_file;
} else {
- int status;
+ __be32 status;
status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
&stp->st_vfs_file);
if (status) {
@@ -1604,7 +1604,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
return 0;
}
-static inline int
+static inline __be32
nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
struct nfsd4_open *open)
{
@@ -1619,22 +1619,22 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
}
-static int
+static __be32
nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
{
struct file *filp = stp->st_vfs_file;
struct inode *inode = filp->f_dentry->d_inode;
unsigned int share_access, new_writer;
- int status;
+ __be32 status;
set_access(&share_access, stp->st_access_bmap);
new_writer = (~share_access) & open->op_share_access
& NFS4_SHARE_ACCESS_WRITE;
if (new_writer) {
- status = get_write_access(inode);
- if (status)
- return nfserrno(status);
+ int err = get_write_access(inode);
+ if (err)
+ return nfserrno(err);
}
status = nfsd4_truncate(rqstp, cur_fh, open);
if (status) {
@@ -1738,14 +1738,14 @@ out:
/*
* called with nfs4_lock_state() held.
*/
-int
+__be32
nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct nfs4_file *fp = NULL;
struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfs4_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
- int status;
+ __be32 status;
status = nfserr_inval;
if (!access_valid(open->op_share_access)
@@ -1833,11 +1833,11 @@ static struct work_struct laundromat_work;
static void laundromat_main(void *);
static DECLARE_WORK(laundromat_work, laundromat_main, NULL);
-int
+__be32
nfsd4_renew(clientid_t *clid)
{
struct nfs4_client *clp;
- int status;
+ __be32 status;
nfs4_lock_state();
dprintk("process_renew(%08x/%08x): starting\n",
@@ -1996,9 +1996,9 @@ access_permit_write(unsigned long access_bmap)
}
static
-int nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
+__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
{
- int status = nfserr_openmode;
+ __be32 status = nfserr_openmode;
if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
goto out;
@@ -2009,7 +2009,7 @@ out:
return status;
}
-static inline int
+static inline __be32
check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
{
/* Trying to call delegreturn with a special stateid? Yuch: */
@@ -2043,14 +2043,14 @@ io_during_grace_disallowed(struct inode *inode, int flags)
/*
* Checks for stateid operations
*/
-int
+__be32
nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
{
struct nfs4_stateid *stp = NULL;
struct nfs4_delegation *dp = NULL;
stateid_t *stidp;
struct inode *ino = current_fh->fh_dentry->d_inode;
- int status;
+ __be32 status;
dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
stateid->si_boot, stateid->si_stateownerid,
@@ -2125,7 +2125,7 @@ setlkflg (int type)
/*
* Checks for sequence id mutating operations.
*/
-static int
+static __be32
nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
{
struct nfs4_stateid *stp;
@@ -2169,7 +2169,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
clientid_t *lockclid = &lock->v.new.clientid;
struct nfs4_client *clp = sop->so_client;
int lkflg = 0;
- int status;
+ __be32 status;
lkflg = setlkflg(lock->lk_type);
@@ -2233,7 +2233,7 @@ check_replay:
if (seqid == sop->so_seqid - 1) {
dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
/* indicate replay to calling function */
- return NFSERR_REPLAY_ME;
+ return nfserr_replay_me;
}
printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
sop->so_seqid, seqid);
@@ -2241,10 +2241,10 @@ check_replay:
return nfserr_bad_seqid;
}
-int
+__be32
nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc, struct nfs4_stateowner **replay_owner)
{
- int status;
+ __be32 status;
struct nfs4_stateowner *sop;
struct nfs4_stateid *stp;
@@ -2310,10 +2310,10 @@ reset_union_bmap_deny(unsigned long deny, unsigned long *bmap)
}
}
-int
+__be32
nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_downgrade *od, struct nfs4_stateowner **replay_owner)
{
- int status;
+ __be32 status;
struct nfs4_stateid *stp;
unsigned int share_access;
@@ -2365,10 +2365,10 @@ out:
/*
* nfs4_unlock_state() called after encode
*/
-int
+__be32
nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_close *close, struct nfs4_stateowner **replay_owner)
{
- int status;
+ __be32 status;
struct nfs4_stateid *stp;
dprintk("NFSD: nfsd4_close on file %.*s\n",
@@ -2404,10 +2404,10 @@ out:
return status;
}
-int
+__be32
nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr)
{
- int status;
+ __be32 status;
if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0)))
goto out;
@@ -2635,7 +2635,7 @@ check_lock_length(u64 offset, u64 length)
/*
* LOCK operation
*/
-int
+__be32
nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock, struct nfs4_stateowner **replay_owner)
{
struct nfs4_stateowner *open_sop = NULL;
@@ -2644,8 +2644,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
struct file *filp;
struct file_lock file_lock;
struct file_lock conflock;
- int status = 0;
+ __be32 status = 0;
unsigned int strhashval;
+ int err;
dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
(long long) lock->lk_offset,
@@ -2758,13 +2759,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
* locks_copy_lock: */
conflock.fl_ops = NULL;
conflock.fl_lmops = NULL;
- status = posix_lock_file_conf(filp, &file_lock, &conflock);
+ err = posix_lock_file_conf(filp, &file_lock, &conflock);
dprintk("NFSD: nfsd4_lock: posix_lock_file_conf status %d\n",status);
- switch (-status) {
+ switch (-err) {
case 0: /* success! */
update_stateid(&lock_stp->st_stateid);
memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid,
sizeof(stateid_t));
+ status = 0;
break;
case (EAGAIN): /* conflock holds conflicting lock */
status = nfserr_denied;
@@ -2775,7 +2777,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
status = nfserr_deadlock;
break;
default:
- dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",status);
+ dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",err);
status = nfserr_resource;
break;
}
@@ -2793,14 +2795,14 @@ out:
/*
* LOCKT operation
*/
-int
+__be32
nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lockt *lockt)
{
struct inode *inode;
struct file file;
struct file_lock file_lock;
struct file_lock conflock;
- int status;
+ __be32 status;
if (nfs4_in_grace())
return nfserr_grace;
@@ -2873,13 +2875,14 @@ out:
return status;
}
-int
+__be32
nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_locku *locku, struct nfs4_stateowner **replay_owner)
{
struct nfs4_stateid *stp;
struct file *filp = NULL;
struct file_lock file_lock;
- int status;
+ __be32 status;
+ int err;
dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
(long long) locku->lu_offset,
@@ -2917,8 +2920,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
/*
* Try to unlock the file in the VFS.
*/
- status = posix_lock_file(filp, &file_lock);
- if (status) {
+ err = posix_lock_file(filp, &file_lock);
+ if (err) {
dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n");
goto out_nfserr;
}
@@ -2937,7 +2940,7 @@ out:
return status;
out_nfserr:
- status = nfserrno(status);
+ status = nfserrno(err);
goto out;
}
@@ -2965,7 +2968,7 @@ out:
return status;
}
-int
+__be32
nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner)
{
clientid_t *clid = &rlockowner->rl_clientid;
@@ -2974,7 +2977,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *
struct xdr_netobj *owner = &rlockowner->rl_owner;
struct list_head matches;
int i;
- int status;
+ __be32 status;
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
@@ -3111,7 +3114,7 @@ nfs4_find_reclaim_client(clientid_t *clid)
/*
* Called from OPEN. Look for clientid in reclaim list.
*/
-int
+__be32
nfs4_check_open_reclaim(clientid_t *clid)
{
return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5be00436b5b8..f3f239db04bb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -60,8 +60,16 @@
#define NFSDDBG_FACILITY NFSDDBG_XDR
-static int
-check_filename(char *str, int len, int err)
+/*
+ * As per the referral draft, the fsid for a referral MUST be different from
+ * the fsid of the containing directory, in order to indicate to the client
+ * that a filesystem boundary is present. We use a fixed fsid for a referral.
+ */
+#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL
+#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
+
+static __be32
+check_filename(char *str, int len, __be32 err)
{
int i;
@@ -86,8 +94,8 @@ check_filename(char *str, int len, int err)
* consistent with the style used in NFSv2/v3...
*/
#define DECODE_HEAD \
- u32 *p; \
- int status
+ __be32 *p; \
+ __be32 status
#define DECODE_TAIL \
status = 0; \
out: \
@@ -136,13 +144,13 @@ xdr_error: \
} \
} while (0)
-static u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
{
/* We want more bytes than seem to be available.
* Maybe we need a new page, maybe we have just run out
*/
int avail = (char*)argp->end - (char*)argp->p;
- u32 *p;
+ __be32 *p;
if (avail + argp->pagelen < nbytes)
return NULL;
if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
@@ -189,7 +197,7 @@ defer_free(struct nfsd4_compoundargs *argp,
return 0;
}
-static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
+static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
{
void *new = NULL;
if (p == argp->tmp) {
@@ -209,7 +217,7 @@ static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
}
-static int
+static __be32
nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
{
u32 bmlen;
@@ -232,13 +240,14 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
struct nfs4_acl **acl)
{
int expected_len, len = 0;
u32 dummy32;
char *buf;
+ int host_err;
DECODE_HEAD;
iattr->ia_valid = 0;
@@ -272,7 +281,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
*acl = nfs4_acl_new();
if (*acl == NULL) {
- status = -ENOMEM;
+ host_err = -ENOMEM;
goto out_nfserr;
}
defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl);
@@ -287,20 +296,20 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
len += XDR_QUADLEN(dummy32) << 2;
READMEM(buf, dummy32);
ace.whotype = nfs4_acl_get_whotype(buf, dummy32);
- status = 0;
+ host_err = 0;
if (ace.whotype != NFS4_ACL_WHO_NAMED)
ace.who = 0;
else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP)
- status = nfsd_map_name_to_gid(argp->rqstp,
+ host_err = nfsd_map_name_to_gid(argp->rqstp,
buf, dummy32, &ace.who);
else
- status = nfsd_map_name_to_uid(argp->rqstp,
+ host_err = nfsd_map_name_to_uid(argp->rqstp,
buf, dummy32, &ace.who);
- if (status)
+ if (host_err)
goto out_nfserr;
- status = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
+ host_err = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
ace.access_mask, ace.whotype, ace.who);
- if (status)
+ if (host_err)
goto out_nfserr;
}
} else
@@ -319,7 +328,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
READ_BUF(dummy32);
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
- if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+ if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
goto out_nfserr;
iattr->ia_valid |= ATTR_UID;
}
@@ -330,7 +339,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
READ_BUF(dummy32);
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
- if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+ if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
goto out_nfserr;
iattr->ia_valid |= ATTR_GID;
}
@@ -406,11 +415,11 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
DECODE_TAIL;
out_nfserr:
- status = nfserrno(status);
+ status = nfserrno(host_err);
goto out;
}
-static int
+static __be32
nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
{
DECODE_HEAD;
@@ -421,7 +430,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
{
DECODE_HEAD;
@@ -436,7 +445,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
}
-static int
+static __be32
nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
{
DECODE_HEAD;
@@ -448,7 +457,7 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
{
DECODE_HEAD;
@@ -488,7 +497,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
DECODE_TAIL;
}
-static inline int
+static inline __be32
nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
{
DECODE_HEAD;
@@ -500,13 +509,13 @@ nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegretu
DECODE_TAIL;
}
-static inline int
+static inline __be32
nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
{
return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
}
-static int
+static __be32
nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
{
DECODE_HEAD;
@@ -521,7 +530,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
{
DECODE_HEAD;
@@ -560,7 +569,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
{
DECODE_HEAD;
@@ -579,7 +588,7 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
{
DECODE_HEAD;
@@ -598,7 +607,7 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
{
DECODE_HEAD;
@@ -613,7 +622,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
{
DECODE_HEAD;
@@ -691,7 +700,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
{
DECODE_HEAD;
@@ -705,7 +714,7 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
{
DECODE_HEAD;
@@ -721,7 +730,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
{
DECODE_HEAD;
@@ -736,7 +745,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
DECODE_HEAD;
@@ -750,7 +759,7 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
{
DECODE_HEAD;
@@ -766,7 +775,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
{
DECODE_HEAD;
@@ -781,7 +790,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
{
DECODE_HEAD;
@@ -801,7 +810,7 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
{
DECODE_HEAD;
@@ -812,7 +821,7 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
{
DECODE_HEAD;
@@ -826,7 +835,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
{
DECODE_HEAD;
@@ -851,7 +860,7 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
{
DECODE_HEAD;
@@ -864,7 +873,7 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
}
/* Also used for NVERIFY */
-static int
+static __be32
nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
{
#if 0
@@ -900,7 +909,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
{
int avail;
@@ -926,32 +935,32 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__);
goto xdr_error;
}
- write->wr_vec[0].iov_base = p;
- write->wr_vec[0].iov_len = avail;
+ argp->rqstp->rq_vec[0].iov_base = p;
+ argp->rqstp->rq_vec[0].iov_len = avail;
v = 0;
len = write->wr_buflen;
- while (len > write->wr_vec[v].iov_len) {
- len -= write->wr_vec[v].iov_len;
+ while (len > argp->rqstp->rq_vec[v].iov_len) {
+ len -= argp->rqstp->rq_vec[v].iov_len;
v++;
- write->wr_vec[v].iov_base = page_address(argp->pagelist[0]);
+ argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]);
argp->pagelist++;
if (argp->pagelen >= PAGE_SIZE) {
- write->wr_vec[v].iov_len = PAGE_SIZE;
+ argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE;
argp->pagelen -= PAGE_SIZE;
} else {
- write->wr_vec[v].iov_len = argp->pagelen;
+ argp->rqstp->rq_vec[v].iov_len = argp->pagelen;
argp->pagelen -= len;
}
}
- argp->end = (u32*) (write->wr_vec[v].iov_base + write->wr_vec[v].iov_len);
- argp->p = (u32*) (write->wr_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
- write->wr_vec[v].iov_len = len;
+ argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len);
+ argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
+ argp->rqstp->rq_vec[v].iov_len = len;
write->wr_vlen = v+1;
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
{
DECODE_HEAD;
@@ -965,7 +974,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
DECODE_TAIL;
}
-static int
+static __be32
nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
{
DECODE_HEAD;
@@ -1171,7 +1180,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* task to translate them into Linux-specific versions which are more
* consistent with the style used in NFSv2/v3...
*/
-#define ENCODE_HEAD u32 *p
+#define ENCODE_HEAD __be32 *p
#define WRITE32(n) *p++ = htonl(n)
#define WRITE64(n) do { \
@@ -1201,8 +1210,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* Header routine to setup seqid operation replay cache
*/
#define ENCODE_SEQID_OP_HEAD \
- u32 *p; \
- u32 *save; \
+ __be32 *p; \
+ __be32 *save; \
\
save = resp->p;
@@ -1223,6 +1232,120 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
stateowner->so_replay.rp_buflen); \
} } while (0);
+/* Encode as an array of strings the string given with components
+ * seperated @sep.
+ */
+static __be32 nfsd4_encode_components(char sep, char *components,
+ __be32 **pp, int *buflen)
+{
+ __be32 *p = *pp;
+ __be32 *countp = p;
+ int strlen, count=0;
+ char *str, *end;
+
+ dprintk("nfsd4_encode_components(%s)\n", components);
+ if ((*buflen -= 4) < 0)
+ return nfserr_resource;
+ WRITE32(0); /* We will fill this in with @count later */
+ end = str = components;
+ while (*end) {
+ for (; *end && (*end != sep); end++)
+ ; /* Point to end of component */
+ strlen = end - str;
+ if (strlen) {
+ if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
+ return nfserr_resource;
+ WRITE32(strlen);
+ WRITEMEM(str, strlen);
+ count++;
+ }
+ else
+ end++;
+ str = end;
+ }
+ *pp = p;
+ p = countp;
+ WRITE32(count);
+ return 0;
+}
+
+/*
+ * encode a location element of a fs_locations structure
+ */
+static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
+ __be32 **pp, int *buflen)
+{
+ __be32 status;
+ __be32 *p = *pp;
+
+ status = nfsd4_encode_components(':', location->hosts, &p, buflen);
+ if (status)
+ return status;
+ status = nfsd4_encode_components('/', location->path, &p, buflen);
+ if (status)
+ return status;
+ *pp = p;
+ return 0;
+}
+
+/*
+ * Return the path to an export point in the pseudo filesystem namespace
+ * Returned string is safe to use as long as the caller holds a reference
+ * to @exp.
+ */
+static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
+{
+ struct svc_fh tmp_fh;
+ char *path, *rootpath;
+
+ fh_init(&tmp_fh, NFS4_FHSIZE);
+ *stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle);
+ if (*stat)
+ return NULL;
+ rootpath = tmp_fh.fh_export->ex_path;
+
+ path = exp->ex_path;
+
+ if (strncmp(path, rootpath, strlen(rootpath))) {
+ printk("nfsd: fs_locations failed;"
+ "%s is not contained in %s\n", path, rootpath);
+ *stat = nfserr_notsupp;
+ return NULL;
+ }
+
+ return path + strlen(rootpath);
+}
+
+/*
+ * encode a fs_locations structure
+ */
+static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
+ struct svc_export *exp,
+ __be32 **pp, int *buflen)
+{
+ __be32 status;
+ int i;
+ __be32 *p = *pp;
+ struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
+ char *root = nfsd4_path(rqstp, exp, &status);
+
+ if (status)
+ return status;
+ status = nfsd4_encode_components('/', root, &p, buflen);
+ if (status)
+ return status;
+ if ((*buflen -= 4) < 0)
+ return nfserr_resource;
+ WRITE32(fslocs->locations_count);
+ for (i=0; i<fslocs->locations_count; i++) {
+ status = nfsd4_encode_fs_location4(&fslocs->locations[i],
+ &p, buflen);
+ if (status)
+ return status;
+ }
+ *pp = p;
+ return 0;
+}
static u32 nfs4_ftypes[16] = {
NF4BAD, NF4FIFO, NF4CHR, NF4BAD,
@@ -1231,9 +1354,9 @@ static u32 nfs4_ftypes[16] = {
NF4SOCK, NF4BAD, NF4LNK, NF4BAD,
};
-static int
+static __be32
nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
- u32 **p, int *buflen)
+ __be32 **p, int *buflen)
{
int status;
@@ -1253,25 +1376,44 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
return 0;
}
-static inline int
-nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, u32 **p, int *buflen)
+static inline __be32
+nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen)
{
return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
}
-static inline int
-nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, u32 **p, int *buflen)
+static inline __be32
+nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen)
{
return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
}
-static inline int
+static inline __be32
nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
- u32 **p, int *buflen)
+ __be32 **p, int *buflen)
{
return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
}
+#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
+ FATTR4_WORD0_RDATTR_ERROR)
+#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+
+static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
+{
+ /* As per referral draft: */
+ if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS ||
+ *bmval1 & ~WORD1_ABSENT_FS_ATTRS) {
+ if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR ||
+ *bmval0 & FATTR4_WORD0_FS_LOCATIONS)
+ *rdattr_err = NFSERR_MOVED;
+ else
+ return nfserr_moved;
+ }
+ *bmval0 &= WORD0_ABSENT_FS_ATTRS;
+ *bmval1 &= WORD1_ABSENT_FS_ATTRS;
+ return 0;
+}
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
@@ -1280,9 +1422,9 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
* @countp is the buffer size in _words_; upon successful return this becomes
* replaced with the number of words written.
*/
-int
+__be32
nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, u32 *buffer, int *countp, u32 *bmval,
+ struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
struct svc_rqst *rqstp)
{
u32 bmval0 = bmval[0];
@@ -1291,11 +1433,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
struct svc_fh tempfh;
struct kstatfs statfs;
int buflen = *countp << 2;
- u32 *attrlenp;
+ __be32 *attrlenp;
u32 dummy;
u64 dummy64;
- u32 *p = buffer;
- int status;
+ u32 rdattr_err = 0;
+ __be32 *p = buffer;
+ __be32 status;
+ int err;
int aclsupport = 0;
struct nfs4_acl *acl = NULL;
@@ -1303,14 +1447,20 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1);
- status = vfs_getattr(exp->ex_mnt, dentry, &stat);
- if (status)
+ if (exp->ex_fslocs.migrated) {
+ status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
+ if (status)
+ goto out;
+ }
+
+ err = vfs_getattr(exp->ex_mnt, dentry, &stat);
+ if (err)
goto out_nfserr;
if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
- status = vfs_statfs(dentry, &statfs);
- if (status)
+ err = vfs_statfs(dentry, &statfs);
+ if (err)
goto out_nfserr;
}
if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
@@ -1322,18 +1472,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
}
if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
| FATTR4_WORD0_SUPPORTED_ATTRS)) {
- status = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
- aclsupport = (status == 0);
+ err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+ aclsupport = (err == 0);
if (bmval0 & FATTR4_WORD0_ACL) {
- if (status == -EOPNOTSUPP)
+ if (err == -EOPNOTSUPP)
bmval0 &= ~FATTR4_WORD0_ACL;
- else if (status == -EINVAL) {
+ else if (err == -EINVAL) {
status = nfserr_attrnotsupp;
goto out;
- } else if (status != 0)
+ } else if (err != 0)
goto out_nfserr;
}
}
+ if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
+ if (exp->ex_fslocs.locations == NULL) {
+ bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
+ }
+ }
if ((buflen -= 16) < 0)
goto out_resource;
@@ -1343,12 +1498,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
attrlenp = p++; /* to be backfilled later */
if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0;
if ((buflen -= 12) < 0)
goto out_resource;
+ if (!aclsupport)
+ word0 &= ~FATTR4_WORD0_ACL;
+ if (!exp->ex_fslocs.locations)
+ word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
WRITE32(2);
- WRITE32(aclsupport ?
- NFSD_SUPPORTED_ATTRS_WORD0 :
- NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL);
+ WRITE32(word0);
WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
}
if (bmval0 & FATTR4_WORD0_TYPE) {
@@ -1402,7 +1560,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (bmval0 & FATTR4_WORD0_FSID) {
if ((buflen -= 16) < 0)
goto out_resource;
- if (is_fsid(fhp, rqstp->rq_reffh)) {
+ if (exp->ex_fslocs.migrated) {
+ WRITE64(NFS4_REFERRAL_FSID_MAJOR);
+ WRITE64(NFS4_REFERRAL_FSID_MINOR);
+ } else if (is_fsid(fhp, rqstp->rq_reffh)) {
WRITE64((u64)exp->ex_fsid);
WRITE64((u64)0);
} else {
@@ -1425,7 +1586,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
if ((buflen -= 4) < 0)
goto out_resource;
- WRITE32(0);
+ WRITE32(rdattr_err);
}
if (bmval0 & FATTR4_WORD0_ACL) {
struct nfs4_ace *ace;
@@ -1513,6 +1674,13 @@ out_acl:
goto out_resource;
WRITE64((u64) statfs.f_files);
}
+ if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
+ status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
+ if (status == nfserr_resource)
+ goto out_resource;
+ if (status)
+ goto out;
+ }
if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
if ((buflen -= 4) < 0)
goto out_resource;
@@ -1536,12 +1704,12 @@ out_acl:
if (bmval0 & FATTR4_WORD0_MAXREAD) {
if ((buflen -= 8) < 0)
goto out_resource;
- WRITE64((u64) NFSSVC_MAXBLKSIZE);
+ WRITE64((u64) svc_max_payload(rqstp));
}
if (bmval0 & FATTR4_WORD0_MAXWRITE) {
if ((buflen -= 8) < 0)
goto out_resource;
- WRITE64((u64) NFSSVC_MAXBLKSIZE);
+ WRITE64((u64) svc_max_payload(rqstp));
}
if (bmval1 & FATTR4_WORD1_MODE) {
if ((buflen -= 4) < 0)
@@ -1652,7 +1820,7 @@ out:
fh_put(&tempfh);
return status;
out_nfserr:
- status = nfserrno(status);
+ status = nfserrno(err);
goto out;
out_resource:
*countp = 0;
@@ -1663,13 +1831,13 @@ out_serverfault:
goto out;
}
-static int
+static __be32
nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
- const char *name, int namlen, u32 *p, int *buflen)
+ const char *name, int namlen, __be32 *p, int *buflen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
- int nfserr;
+ __be32 nfserr;
dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
@@ -1698,10 +1866,10 @@ out_put:
return nfserr;
}
-static u32 *
-nfsd4_encode_rdattr_error(u32 *p, int buflen, int nfserr)
+static __be32 *
+nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
{
- u32 *attrlenp;
+ __be32 *attrlenp;
if (buflen < 6)
return NULL;
@@ -1721,8 +1889,8 @@ nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen,
{
struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
int buflen;
- u32 *p = cd->buffer;
- int nfserr = nfserr_toosmall;
+ __be32 *p = cd->buffer;
+ __be32 nfserr = nfserr_toosmall;
/* In nfsv4, "." and ".." never make it onto the wire.. */
if (name && isdotent(name, namlen)) {
@@ -1778,7 +1946,7 @@ fail:
}
static void
-nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_access *access)
+nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
ENCODE_HEAD;
@@ -1791,7 +1959,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_acc
}
static void
-nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_close *close)
+nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
{
ENCODE_SEQID_OP_HEAD;
@@ -1806,7 +1974,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_clos
static void
-nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_commit *commit)
+nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
{
ENCODE_HEAD;
@@ -1818,7 +1986,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_com
}
static void
-nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_create *create)
+nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
{
ENCODE_HEAD;
@@ -1832,8 +2000,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_cre
}
}
-static int
-nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_getattr *getattr)
+static __be32
+nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
{
struct svc_fh *fhp = getattr->ga_fhp;
int buflen;
@@ -1845,14 +2013,13 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ge
nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
resp->p, &buflen, getattr->ga_bmval,
resp->rqstp);
-
if (!nfserr)
resp->p += buflen;
return nfserr;
}
static void
-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, int nfserr, struct svc_fh *fhp)
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp)
{
unsigned int len;
ENCODE_HEAD;
@@ -1892,7 +2059,7 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
}
static void
-nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock *lock)
+nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
{
ENCODE_SEQID_OP_HEAD;
@@ -1908,14 +2075,14 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
}
static void
-nfsd4_encode_lockt(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lockt *lockt)
+nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
{
if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
}
static void
-nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_locku *locku)
+nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
{
ENCODE_SEQID_OP_HEAD;
@@ -1931,7 +2098,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
static void
-nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link *link)
+nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
{
ENCODE_HEAD;
@@ -1944,7 +2111,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link
static void
-nfsd4_encode_open(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open *open)
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
ENCODE_SEQID_OP_HEAD;
@@ -2009,7 +2176,7 @@ out:
}
static void
-nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_confirm *oc)
+nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
ENCODE_SEQID_OP_HEAD;
@@ -2024,7 +2191,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfs
}
static void
-nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_downgrade *od)
+nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
ENCODE_SEQID_OP_HEAD;
@@ -2038,8 +2205,9 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct n
ENCODE_SEQID_OP_TAIL(od->od_stateowner);
}
-static int
-nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read *read)
+static __be32
+nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
{
u32 eof;
int v, pn;
@@ -2054,31 +2222,33 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
RESERVE_SPACE(8); /* eof flag and byte count */
- maxcount = NFSSVC_MAXBLKSIZE;
+ maxcount = svc_max_payload(resp->rqstp);
if (maxcount > read->rd_length)
maxcount = read->rd_length;
len = maxcount;
v = 0;
while (len > 0) {
- pn = resp->rqstp->rq_resused;
- svc_take_page(resp->rqstp);
- read->rd_iov[v].iov_base = page_address(resp->rqstp->rq_respages[pn]);
- read->rd_iov[v].iov_len = len < PAGE_SIZE ? len : PAGE_SIZE;
+ pn = resp->rqstp->rq_resused++;
+ resp->rqstp->rq_vec[v].iov_base =
+ page_address(resp->rqstp->rq_respages[pn]);
+ resp->rqstp->rq_vec[v].iov_len =
+ len < PAGE_SIZE ? len : PAGE_SIZE;
v++;
len -= PAGE_SIZE;
}
read->rd_vlen = v;
nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
- read->rd_offset, read->rd_iov, read->rd_vlen,
+ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
&maxcount);
if (nfserr == nfserr_symlink)
nfserr = nfserr_inval;
if (nfserr)
return nfserr;
- eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size);
+ eof = (read->rd_offset + maxcount >=
+ read->rd_fhp->fh_dentry->d_inode->i_size);
WRITE32(eof);
WRITE32(maxcount);
@@ -2088,7 +2258,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
resp->xbuf->page_len = maxcount;
/* Use rest of head for padding and remaining ops: */
- resp->rqstp->rq_restailpage = 0;
resp->xbuf->tail[0].iov_base = p;
resp->xbuf->tail[0].iov_len = 0;
if (maxcount&3) {
@@ -2101,8 +2270,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
return 0;
}
-static int
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readlink *readlink)
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
{
int maxcount;
char *page;
@@ -2113,8 +2282,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
if (resp->xbuf->page_len)
return nfserr_resource;
- svc_take_page(resp->rqstp);
- page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+ page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
maxcount = PAGE_SIZE;
RESERVE_SPACE(4);
@@ -2138,7 +2306,6 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
resp->xbuf->page_len = maxcount;
/* Use rest of head for padding and remaining ops: */
- resp->rqstp->rq_restailpage = 0;
resp->xbuf->tail[0].iov_base = p;
resp->xbuf->tail[0].iov_len = 0;
if (maxcount&3) {
@@ -2151,12 +2318,12 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
return 0;
}
-static int
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readdir *readdir)
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
{
int maxcount;
loff_t offset;
- u32 *page, *savep, *tailbase;
+ __be32 *page, *savep, *tailbase;
ENCODE_HEAD;
if (nfserr)
@@ -2189,8 +2356,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
goto err_no_verf;
}
- svc_take_page(resp->rqstp);
- page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+ page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
readdir->common.err = 0;
readdir->buflen = maxcount;
readdir->buffer = page;
@@ -2215,10 +2381,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
p = readdir->buffer;
*p++ = 0; /* no more entries */
*p++ = htonl(readdir->common.err == nfserr_eof);
- resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+ resp->xbuf->page_len = ((char*)p) - (char*)page_address(
+ resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
/* Use rest of head for padding and remaining ops: */
- resp->rqstp->rq_restailpage = 0;
resp->xbuf->tail[0].iov_base = tailbase;
resp->xbuf->tail[0].iov_len = 0;
resp->p = resp->xbuf->tail[0].iov_base;
@@ -2232,7 +2398,7 @@ err_no_verf:
}
static void
-nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_remove *remove)
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
{
ENCODE_HEAD;
@@ -2244,7 +2410,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rem
}
static void
-nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rename *rename)
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
{
ENCODE_HEAD;
@@ -2261,7 +2427,7 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ren
* regardless of the error status.
*/
static void
-nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setattr *setattr)
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
{
ENCODE_HEAD;
@@ -2280,7 +2446,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_se
}
static void
-nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setclientid *scd)
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
{
ENCODE_HEAD;
@@ -2299,7 +2465,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd
}
static void
-nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_write *write)
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
{
ENCODE_HEAD;
@@ -2315,7 +2481,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_writ
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
- u32 *statp;
+ __be32 *statp;
ENCODE_HEAD;
RESERVE_SPACE(8);
@@ -2453,7 +2619,7 @@ nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
*/
int
-nfs4svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_ressize_check(rqstp, p);
}
@@ -2475,9 +2641,9 @@ void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
}
int
-nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundargs *args)
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
{
- int status;
+ __be32 status;
args->p = p;
args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
@@ -2496,7 +2662,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoun
}
int
-nfs4svc_encode_compoundres(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundres *resp)
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp)
{
/*
* All that remains is to write the tag and operation count...
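The recurring pattern in the nfs4xdr.c hunks above is the split between on-the-wire NFS status codes, now carried in sparse-annotated __be32 variables, and host-endian -errno values from VFS calls, which stay in a plain int (err/host_err) and only cross the boundary through nfserrno(). A minimal sketch of that convention follows; it is not part of the patch, and some_vfs_operation() is a hypothetical stand-in for any kernel call that returns a negative errno.

static __be32 example_op(struct svc_rqst *rqstp, struct svc_fh *fhp,
			 struct iattr *iattr)
{
	__be32 status;	/* network-order NFS status: nfs_ok, nfserr_* */
	int host_err;	/* host-order -errno from the VFS */

	status = fh_verify(rqstp, fhp, S_IFREG, MAY_WRITE);
	if (status)
		return status;		/* already an nfserr_* value */

	host_err = some_vfs_operation(fhp->fh_dentry, iattr);	/* hypothetical helper */
	if (host_err)
		return nfserrno(host_err);	/* translate -errno to __be32 */

	return nfs_ok;
}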
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index fdf7cf3dfadc..6100bbe27432 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,7 +29,7 @@
*/
#define CACHESIZE 1024
#define HASHSIZE 64
-#define REQHASH(xid) ((((xid) >> 24) ^ (xid)) & (HASHSIZE-1))
+#define REQHASH(xid) (((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
static struct hlist_head * hash_list;
static struct list_head lru_head;
@@ -127,8 +127,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
struct hlist_node *hn;
struct hlist_head *rh;
struct svc_cacherep *rp;
- u32 xid = rqstp->rq_xid,
- proto = rqstp->rq_prot,
+ __be32 xid = rqstp->rq_xid;
+ u32 proto = rqstp->rq_prot,
vers = rqstp->rq_vers,
proc = rqstp->rq_proc;
unsigned long age;
@@ -258,7 +258,7 @@ found_entry:
* In this case, nfsd_cache_update is called with statp == NULL.
*/
void
-nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp)
+nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
struct svc_cacherep *rp;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
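For reference, the reply-cache hunk above keeps the XID in its wire form: REQHASH() now takes a __be32 and applies __force casts internally so sparse does not warn about the byte-order mixing. A usage sketch, not part of the patch:

	__be32 xid = rqstp->rq_xid;	/* raw wire XID, never byte-swapped */
	struct hlist_head *rh = &hash_list[REQHASH(xid)];	/* pick the hash bucket */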
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5c6a477c20ec..39aed901514b 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -57,6 +57,7 @@ enum {
NFSD_Pool_Threads,
NFSD_Versions,
NFSD_Ports,
+ NFSD_MaxBlkSize,
/*
* The below MUST come last. Otherwise we leave a hole in nfsd_files[]
* with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -82,6 +83,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size);
static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
static ssize_t write_versions(struct file *file, char *buf, size_t size);
static ssize_t write_ports(struct file *file, char *buf, size_t size);
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
#ifdef CONFIG_NFSD_V4
static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
@@ -100,6 +102,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_Pool_Threads] = write_pool_threads,
[NFSD_Versions] = write_versions,
[NFSD_Ports] = write_ports,
+ [NFSD_MaxBlkSize] = write_maxblksize,
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = write_leasetime,
[NFSD_RecoveryDir] = write_recoverydir,
@@ -523,18 +526,20 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
err = nfsd_create_serv();
if (!err) {
int proto = 0;
- err = lockd_up(proto);
- if (!err) {
- err = svc_addsock(nfsd_serv, fd, buf, &proto);
- if (err)
- lockd_down();
+ err = svc_addsock(nfsd_serv, fd, buf, &proto);
+ if (err >= 0) {
+ err = lockd_up(proto);
+ if (err < 0)
+ svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
}
/* Decrease the count, but don't shutdown the
* the service
*/
+ lock_kernel();
nfsd_serv->sv_nrthreads--;
+ unlock_kernel();
}
- return err;
+ return err < 0 ? err : 0;
}
if (buf[0] == '-') {
char *toclose = kstrdup(buf+1, GFP_KERNEL);
@@ -545,12 +550,43 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
if (nfsd_serv)
len = svc_sock_names(buf, nfsd_serv, toclose);
unlock_kernel();
+ if (len >= 0)
+ lockd_down();
kfree(toclose);
return len;
}
return -EINVAL;
}
+int nfsd_max_blksize;
+
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
+{
+ char *mesg = buf;
+ if (size > 0) {
+ int bsize;
+ int rv = get_int(&mesg, &bsize);
+ if (rv)
+ return rv;
+ /* force bsize into allowed range and
+ * required alignment.
+ */
+ if (bsize < 1024)
+ bsize = 1024;
+ if (bsize > NFSSVC_MAXBLKSIZE)
+ bsize = NFSSVC_MAXBLKSIZE;
+ bsize &= ~(1024-1);
+ lock_kernel();
+ if (nfsd_serv && nfsd_serv->sv_nrthreads) {
+ unlock_kernel();
+ return -EBUSY;
+ }
+ nfsd_max_blksize = bsize;
+ unlock_kernel();
+ }
+ return sprintf(buf, "%d\n", nfsd_max_blksize);
+}
+
#ifdef CONFIG_NFSD_V4
extern time_t nfs4_leasetime(void);
@@ -616,6 +652,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
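The new max_block_size entry above accepts a byte count, forces it into the range 1024..NFSSVC_MAXBLKSIZE, and rounds it down to a 1024-byte multiple before it becomes nfsd_max_blksize. A standalone restatement of that clamping (not part of the patch; the helper name is invented for illustration); writing 5000, for example, yields 4096:

static int nfsd_clamp_blksize(int bsize)
{
	if (bsize < 1024)
		bsize = 1024;			/* lower bound */
	if (bsize > NFSSVC_MAXBLKSIZE)
		bsize = NFSSVC_MAXBLKSIZE;	/* upper bound */
	return bsize & ~(1024 - 1);		/* round down to a 1K multiple */
}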
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 501d83884530..727ab3bd450d 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -76,7 +76,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
* comment in the NFSv3 spec says this is incorrect (implementation notes for
* the write call).
*/
-static inline int
+static inline __be32
nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
{
/* Type can be negative when creating hardlinks - not to a dir */
@@ -110,13 +110,13 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
* This is only called at the start of an nfsproc call, so fhp points to
* a svc_fh which is all 0 except for the over-the-wire file handle.
*/
-u32
+__be32
fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
{
struct knfsd_fh *fh = &fhp->fh_handle;
struct svc_export *exp = NULL;
struct dentry *dentry;
- u32 error = 0;
+ __be32 error = 0;
dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
@@ -315,7 +315,7 @@ static inline void _fh_update_old(struct dentry *dentry,
fh->ofh_dirino = 0;
}
-int
+__be32
fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct svc_fh *ref_fh)
{
/* ref_fh is a reference file handle.
@@ -451,7 +451,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
* Update file handle information after changing a dentry.
* This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create
*/
-int
+__be32
fh_update(struct svc_fh *fhp)
{
struct dentry *dentry;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 06cd0db0f32b..ec983b777680 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -30,22 +30,22 @@ typedef struct svc_buf svc_buf;
#define NFSDDBG_FACILITY NFSDDBG_PROC
-static int
+static __be32
nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
{
return nfs_ok;
}
-static int
-nfsd_return_attrs(int err, struct nfsd_attrstat *resp)
+static __be32
+nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
{
if (err) return err;
return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
resp->fh.fh_dentry,
&resp->stat));
}
-static int
-nfsd_return_dirop(int err, struct nfsd_diropres *resp)
+static __be32
+nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
{
if (err) return err;
return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
@@ -56,11 +56,11 @@ nfsd_return_dirop(int err, struct nfsd_diropres *resp)
* Get a file's attributes
* N.B. After this call resp->fh needs an fh_put
*/
-static int
+static __be32
nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
struct nfsd_attrstat *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
@@ -72,11 +72,11 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
* Set a file's attributes
* N.B. After this call resp->fh needs an fh_put
*/
-static int
+static __be32
nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
struct nfsd_attrstat *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
SVCFH_fmt(&argp->fh),
argp->attrs.ia_valid, (long) argp->attrs.ia_size);
@@ -92,11 +92,11 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
* doesn't exist yet.
* N.B. After this call resp->fh needs an fh_put
*/
-static int
+static __be32
nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
struct nfsd_diropres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: LOOKUP %s %.*s\n",
SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -112,11 +112,11 @@ nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
/*
* Read a symlink.
*/
-static int
+static __be32
nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
struct nfsd_readlinkres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
@@ -132,11 +132,11 @@ nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
* Read a portion of a file.
* N.B. After this call resp->fh needs an fh_put
*/
-static int
+static __be32
nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
struct nfsd_readres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: READ %s %d bytes at %d\n",
SVCFH_fmt(&argp->fh),
@@ -146,20 +146,20 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
* status, 17 words for fattr, and 1 word for the byte count.
*/
- if (NFSSVC_MAXBLKSIZE < argp->count) {
+ if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
printk(KERN_NOTICE
"oversized read request from %u.%u.%u.%u:%d (%d bytes)\n",
NIPQUAD(rqstp->rq_addr.sin_addr.s_addr),
ntohs(rqstp->rq_addr.sin_port),
argp->count);
- argp->count = NFSSVC_MAXBLKSIZE;
+ argp->count = NFSSVC_MAXBLKSIZE_V2;
}
svc_reserve(rqstp, (19<<2) + argp->count + 4);
resp->count = argp->count;
nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
argp->offset,
- argp->vec, argp->vlen,
+ rqstp->rq_vec, argp->vlen,
&resp->count);
if (nfserr) return nfserr;
@@ -172,11 +172,11 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
* Write data to a file
* N.B. After this call resp->fh needs an fh_put
*/
-static int
+static __be32
nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
struct nfsd_attrstat *resp)
{
- int nfserr;
+ __be32 nfserr;
int stable = 1;
dprintk("nfsd: WRITE %s %d bytes at %d\n",
@@ -185,7 +185,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
argp->offset,
- argp->vec, argp->vlen,
+ rqstp->rq_vec, argp->vlen,
argp->len,
&stable);
return nfsd_return_attrs(nfserr, resp);
@@ -197,7 +197,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
* and the actual create() call in compliance with VFS protocols.
* N.B. After this call _both_ argp->fh and resp->fh need an fh_put
*/
-static int
+static __be32
nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
struct nfsd_diropres *resp)
{
@@ -206,7 +206,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
struct iattr *attr = &argp->attrs;
struct inode *inode;
struct dentry *dchild;
- int nfserr, type, mode;
+ int type, mode;
+ __be32 nfserr;
dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size);
dprintk("nfsd: CREATE %s %.*s\n",
@@ -225,7 +226,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
nfserr = nfserr_exist;
if (isdotent(argp->name, argp->len))
goto done;
- fh_lock(dirfhp);
+ fh_lock_nested(dirfhp, I_MUTEX_PARENT);
dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
if (IS_ERR(dchild)) {
nfserr = nfserrno(PTR_ERR(dchild));
@@ -348,11 +349,11 @@ done:
return nfsd_return_dirop(nfserr, resp);
}
-static int
+static __be32
nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
void *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh),
argp->len, argp->name);
@@ -363,11 +364,11 @@ nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
return nfserr;
}
-static int
+static __be32
nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
void *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: RENAME %s %.*s -> \n",
SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
@@ -381,11 +382,11 @@ nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
return nfserr;
}
-static int
+static __be32
nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
void *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: LINK %s ->\n",
SVCFH_fmt(&argp->ffh));
@@ -401,12 +402,12 @@ nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
return nfserr;
}
-static int
+static __be32
nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
void *resp)
{
struct svc_fh newfh;
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n",
SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
@@ -430,11 +431,11 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
* Make directory. This operation is not idempotent.
* N.B. After this call resp->fh needs an fh_put
*/
-static int
+static __be32
nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
struct nfsd_diropres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -454,11 +455,11 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
/*
* Remove a directory
*/
-static int
+static __be32
nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
void *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -470,11 +471,12 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
/*
* Read a portion of a directory.
*/
-static int
+static __be32
nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
struct nfsd_readdirres *resp)
{
- int nfserr, count;
+ int count;
+ __be32 nfserr;
loff_t offset;
dprintk("nfsd: READDIR %s %d bytes at %d\n",
@@ -509,11 +511,11 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
/*
* Get file system info
*/
-static int
+static __be32
nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
struct nfsd_statfsres *resp)
{
- int nfserr;
+ __be32 nfserr;
dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
@@ -553,7 +555,7 @@ static struct svc_procedure nfsd_procedures2[18] = {
PROC(none, void, void, none, RC_NOCACHE, ST),
PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT),
PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
- PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4),
+ PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4),
PROC(none, void, void, none, RC_NOCACHE, ST),
PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT),
PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT),
@@ -579,11 +581,11 @@ struct svc_version nfsd_version2 = {
/*
* Map errnos to NFS errnos.
*/
-int
+__be32
nfserrno (int errno)
{
static struct {
- int nfserr;
+ __be32 nfserr;
int syserr;
} nfs_errtbl[] = {
{ nfs_ok, 0 },
@@ -615,11 +617,10 @@ nfserrno (int errno)
{ nfserr_badname, -ESRCH },
{ nfserr_io, -ETXTBSY },
{ nfserr_notsupp, -EOPNOTSUPP },
- { -1, -EIO }
};
int i;
- for (i = 0; nfs_errtbl[i].nfserr != -1; i++) {
+ for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
if (nfs_errtbl[i].syserr == errno)
return nfs_errtbl[i].nfserr;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 19443056ec30..0aaccb03bf76 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -198,9 +198,26 @@ int nfsd_create_serv(void)
unlock_kernel();
return 0;
}
+ if (nfsd_max_blksize == 0) {
+ /* choose a suitable default */
+ struct sysinfo i;
+ si_meminfo(&i);
+ /* Aim for 1/4096 of memory per thread
+ * This gives 1MB on 4Gig machines
+ * But only uses 32K on 128M machines.
+ * Bottom out at 8K on 32M and smaller.
+ * Of course, this is only a default.
+ */
+ nfsd_max_blksize = NFSSVC_MAXBLKSIZE;
+ i.totalram <<= PAGE_SHIFT - 12;
+ while (nfsd_max_blksize > i.totalram &&
+ nfsd_max_blksize >= 8*1024*2)
+ nfsd_max_blksize /= 2;
+ }
atomic_set(&nfsd_busy, 0);
- nfsd_serv = svc_create_pooled(&nfsd_program, NFSD_BUFSIZE,
+ nfsd_serv = svc_create_pooled(&nfsd_program,
+ nfsd_max_blksize,
nfsd_last_thread,
nfsd, SIG_NOCLEAN, THIS_MODULE);
if (nfsd_serv == NULL)
@@ -474,12 +491,12 @@ out:
}
int
-nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
+nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
struct svc_procedure *proc;
kxdrproc_t xdr;
- u32 nfserr;
- u32 *nfserrp;
+ __be32 nfserr;
+ __be32 *nfserrp;
dprintk("nfsd_dispatch: vers %d proc %d\n",
rqstp->rq_vers, rqstp->rq_proc);
@@ -498,7 +515,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
/* Decode arguments */
xdr = proc->pc_decode;
- if (xdr && !xdr(rqstp, (u32*)rqstp->rq_arg.head[0].iov_base,
+ if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base,
rqstp->rq_argp)) {
dprintk("nfsd: failed to decode arguments!\n");
nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
@@ -511,7 +528,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
*/
nfserrp = rqstp->rq_res.head[0].iov_base
+ rqstp->rq_res.head[0].iov_len;
- rqstp->rq_res.head[0].iov_len += sizeof(u32);
+ rqstp->rq_res.head[0].iov_len += sizeof(__be32);
/* Now call the procedure handler, and encode NFS status. */
nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
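
The default block size picked in the nfsd_create_serv() hunk above scales with physical memory: si_meminfo() reports totalram in pages, the shift by PAGE_SHIFT - 12 turns that into total memory divided by 4096, and the candidate size is halved until it fits, never dropping below 8K. A minimal user-space sketch of the same arithmetic (the 1MB ceiling and 4K pages are assumptions for illustration, not values taken from this patch):

#include <stdio.h>

/* model of the nfsd_max_blksize heuristic above; the 1MB ceiling and
 * 4K pages are assumed here purely for illustration */
static unsigned long default_blksize(unsigned long totalram_pages)
{
	unsigned long blksize = 1024 * 1024;    /* assumed ceiling */
	/* with 4K pages, page count == total bytes / 4096, i.e. the
	 * "1/4096 of memory per thread" target from the comment */
	unsigned long target = totalram_pages;

	while (blksize > target && blksize >= 8 * 1024 * 2)
		blksize /= 2;
	return blksize;
}

int main(void)
{
	unsigned long pages[] = { 1048576, 32768, 8192 };  /* 4G, 128M, 32M */
	int i;

	for (i = 0; i < 3; i++)
		printf("%lu pages -> %lu byte blocks\n",
		       pages[i], default_blksize(pages[i]));
	return 0;   /* prints 1048576, 32768 and 8192 byte blocks */
}
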
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 3f14a17eaa6e..56ebb1443e0e 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -37,8 +37,8 @@ static u32 nfs_ftypes[] = {
/*
* XDR functions for basic NFS types
*/
-static u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
{
fh_init(fhp, NFS_FHSIZE);
memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
@@ -50,13 +50,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
}
/* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
{
return decode_fh(p, fhp);
}
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
{
memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
return p + (NFS_FHSIZE>> 2);
@@ -66,8 +66,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
* Decode a file name and make sure that the path contains
* no slashes or null bytes.
*/
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
{
char *name;
int i;
@@ -82,8 +82,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
return p;
}
-static inline u32 *
-decode_pathname(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_pathname(__be32 *p, char **namp, int *lenp)
{
char *name;
int i;
@@ -98,8 +98,8 @@ decode_pathname(u32 *p, char **namp, int *lenp)
return p;
}
-static inline u32 *
-decode_sattr(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr(__be32 *p, struct iattr *iap)
{
u32 tmp, tmp1;
@@ -151,8 +151,8 @@ decode_sattr(u32 *p, struct iattr *iap)
return p;
}
-static u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static __be32 *
+encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
struct kstat *stat)
{
struct dentry *dentry = fhp->fh_dentry;
@@ -195,7 +195,7 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
}
/* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
{
struct kstat stat;
vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
@@ -206,13 +206,13 @@ u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
* XDR decode functions
*/
int
-nfssvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_argsize_check(rqstp, p);
}
int
-nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
@@ -220,7 +220,7 @@ nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
}
int
-nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_sattrargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -231,7 +231,7 @@ nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_diropargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -242,7 +242,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readargs *args)
{
unsigned int len;
@@ -254,19 +254,18 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
len = args->count = ntohl(*p++);
p++; /* totalcount - unused */
- if (len > NFSSVC_MAXBLKSIZE)
- len = NFSSVC_MAXBLKSIZE;
+ if (len > NFSSVC_MAXBLKSIZE_V2)
+ len = NFSSVC_MAXBLKSIZE_V2;
/* set up somewhere to store response.
* We take pages, put them on reslist and include in iovec
*/
v=0;
while (len > 0) {
- pn=rqstp->rq_resused;
- svc_take_page(rqstp);
- args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
- args->vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
- len -= args->vec[v].iov_len;
+ pn = rqstp->rq_resused++;
+ rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+ rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
+ len -= rqstp->rq_vec[v].iov_len;
v++;
}
args->vlen = v;
@@ -274,7 +273,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_writeargs *args)
{
unsigned int len;
@@ -286,25 +285,25 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
args->offset = ntohl(*p++); /* offset */
p++; /* totalcount */
len = args->len = ntohl(*p++);
- args->vec[0].iov_base = (void*)p;
- args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
+ rqstp->rq_vec[0].iov_base = (void*)p;
+ rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
(((void*)p) - rqstp->rq_arg.head[0].iov_base);
- if (len > NFSSVC_MAXBLKSIZE)
- len = NFSSVC_MAXBLKSIZE;
+ if (len > NFSSVC_MAXBLKSIZE_V2)
+ len = NFSSVC_MAXBLKSIZE_V2;
v = 0;
- while (len > args->vec[v].iov_len) {
- len -= args->vec[v].iov_len;
+ while (len > rqstp->rq_vec[v].iov_len) {
+ len -= rqstp->rq_vec[v].iov_len;
v++;
- args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
- args->vec[v].iov_len = PAGE_SIZE;
+ rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+ rqstp->rq_vec[v].iov_len = PAGE_SIZE;
}
- args->vec[v].iov_len = len;
+ rqstp->rq_vec[v].iov_len = len;
args->vlen = v+1;
- return args->vec[0].iov_len > 0;
+ return rqstp->rq_vec[0].iov_len > 0;
}
int
-nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_createargs *args)
{
if (!(p = decode_fh(p, &args->fh))
@@ -316,7 +315,7 @@ nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_renameargs *args)
{
if (!(p = decode_fh(p, &args->ffh))
@@ -329,18 +328,17 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_readlinkargs *args)
+nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args)
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
- svc_take_page(rqstp);
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+ args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
return xdr_argsize_check(rqstp, p);
}
int
-nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_linkargs *args)
{
if (!(p = decode_fh(p, &args->ffh))
@@ -352,7 +350,7 @@ nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_symlinkargs *args)
{
if (!(p = decode_fh(p, &args->ffh))
@@ -365,7 +363,7 @@ nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readdirargs *args)
{
if (!(p = decode_fh(p, &args->fh)))
@@ -375,8 +373,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
if (args->count > PAGE_SIZE)
args->count = PAGE_SIZE;
- svc_take_page(rqstp);
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+ args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
return xdr_argsize_check(rqstp, p);
}
@@ -385,13 +382,13 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
* XDR encode functions
*/
int
-nfssvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
{
return xdr_ressize_check(rqstp, p);
}
int
-nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_attrstat *resp)
{
p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -399,7 +396,7 @@ nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_diropres *resp)
{
p = encode_fh(p, &resp->fh);
@@ -408,7 +405,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readlinkres *resp)
{
*p++ = htonl(resp->len);
@@ -416,7 +413,6 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_res.page_len = resp->len;
if (resp->len & 3) {
/* need to pad the tail */
- rqstp->rq_restailpage = 0;
rqstp->rq_res.tail[0].iov_base = p;
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -425,7 +421,7 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readres *resp)
{
p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -436,7 +432,6 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
rqstp->rq_res.page_len = resp->count;
if (resp->count & 3) {
/* need to pad the tail */
- rqstp->rq_restailpage = 0;
rqstp->rq_res.tail[0].iov_base = p;
*p = 0;
rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
@@ -445,7 +440,7 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readdirres *resp)
{
xdr_ressize_check(rqstp, p);
@@ -458,12 +453,12 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
}
int
-nfssvc_encode_statfsres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_statfsres *resp)
{
struct kstatfs *stat = &resp->stats;
- *p++ = htonl(NFSSVC_MAXBLKSIZE); /* max transfer size */
+ *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */
*p++ = htonl(stat->f_bsize);
*p++ = htonl(stat->f_blocks);
*p++ = htonl(stat->f_bfree);
@@ -476,7 +471,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
int namlen, loff_t offset, ino_t ino, unsigned int d_type)
{
struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
- u32 *p = cd->buffer;
+ __be32 *p = cd->buffer;
int buflen, slen;
/*
@@ -502,7 +497,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
*p++ = htonl((u32) ino); /* file id */
p = xdr_encode_array(p, name, namlen);/* name length & name */
cd->offset = p; /* remember pointer */
- *p++ = ~(u32) 0; /* offset of next entry */
+ *p++ = htonl(~0U); /* offset of next entry */
cd->buflen = buflen;
cd->buffer = p;
@@ -514,7 +509,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
* XDR release functions
*/
int
-nfssvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_fhandle *resp)
{
fh_put(&resp->fh);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 443ebc52e382..f21e917bb8ed 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -54,6 +54,7 @@
#include <linux/nfsd_idmap.h>
#include <linux/security.h>
#endif /* CONFIG_NFSD_V4 */
+#include <linux/jhash.h>
#include <asm/uaccess.h>
@@ -81,10 +82,19 @@ struct raparms {
dev_t p_dev;
int p_set;
struct file_ra_state p_ra;
+ unsigned int p_hindex;
};
+struct raparm_hbucket {
+ struct raparms *pb_head;
+ spinlock_t pb_lock;
+} ____cacheline_aligned_in_smp;
+
static struct raparms * raparml;
-static struct raparms * raparm_cache;
+#define RAPARM_HASH_BITS 4
+#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
+#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
+static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
/*
* Called from nfsd_lookup and encode_dirent. Check if we have crossed
@@ -100,7 +110,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct dentry *dentry = *dpp;
struct vfsmount *mnt = mntget(exp->ex_mnt);
struct dentry *mounts = dget(dentry);
- int err = nfs_ok;
+ int err = 0;
while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
@@ -138,14 +148,15 @@ out:
* clients and is explicitly disallowed for NFSv3
* NeilBrown <neilb@cse.unsw.edu.au>
*/
-int
+__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
int len, struct svc_fh *resfh)
{
struct svc_export *exp;
struct dentry *dparent;
struct dentry *dentry;
- int err;
+ __be32 err;
+ int host_err;
dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
@@ -183,7 +194,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
exp2 = exp_parent(exp->ex_client, mnt, dentry,
&rqstp->rq_chandle);
if (IS_ERR(exp2)) {
- err = PTR_ERR(exp2);
+ host_err = PTR_ERR(exp2);
dput(dentry);
mntput(mnt);
goto out_nfserr;
@@ -200,14 +211,14 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
} else {
fh_lock(fhp);
dentry = lookup_one_len(name, dparent, len);
- err = PTR_ERR(dentry);
+ host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
/*
* check if we have crossed a mount point ...
*/
if (d_mountpoint(dentry)) {
- if ((err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+ if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
dput(dentry);
goto out_nfserr;
}
@@ -226,7 +237,7 @@ out:
return err;
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
goto out;
}
@@ -234,7 +245,7 @@ out_nfserr:
* Set various file attributes.
* N.B. After this call fhp needs an fh_put
*/
-int
+__be32
nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
int check_guard, time_t guardtime)
{
@@ -243,7 +254,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
int accmode = MAY_SATTR;
int ftype = 0;
int imode;
- int err;
+ __be32 err;
+ int host_err;
int size_change = 0;
if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
@@ -309,19 +321,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
* If we are changing the size of the file, then
* we need to break all leases.
*/
- err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
- if (err == -EWOULDBLOCK)
- err = -ETIMEDOUT;
- if (err) /* ENOMEM or EWOULDBLOCK */
+ host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
+ if (host_err == -EWOULDBLOCK)
+ host_err = -ETIMEDOUT;
+ if (host_err) /* ENOMEM or EWOULDBLOCK */
goto out_nfserr;
- err = get_write_access(inode);
- if (err)
+ host_err = get_write_access(inode);
+ if (host_err)
goto out_nfserr;
size_change = 1;
- err = locks_verify_truncate(inode, NULL, iap->ia_size);
- if (err) {
+ host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+ if (host_err) {
put_write_access(inode);
goto out_nfserr;
}
@@ -347,8 +359,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
err = nfserr_notsync;
if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
fh_lock(fhp);
- err = notify_change(dentry, iap);
- err = nfserrno(err);
+ host_err = notify_change(dentry, iap);
+ err = nfserrno(host_err);
fh_unlock(fhp);
}
if (size_change)
@@ -360,7 +372,7 @@ out:
return err;
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
goto out;
}
@@ -410,11 +422,12 @@ out:
return error;
}
-int
+__be32
nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfs4_acl *acl)
{
- int error;
+ __be32 error;
+ int host_error;
struct dentry *dentry;
struct inode *inode;
struct posix_acl *pacl = NULL, *dpacl = NULL;
@@ -430,22 +443,20 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (S_ISDIR(inode->i_mode))
flags = NFS4_ACL_DIR;
- error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
- if (error == -EINVAL) {
+ host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+ if (host_error == -EINVAL) {
error = nfserr_attrnotsupp;
goto out;
- } else if (error < 0)
+ } else if (host_error < 0)
goto out_nfserr;
- if (pacl) {
- error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
- if (error < 0)
- goto out_nfserr;
- }
+ host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
+ if (host_error < 0)
+ goto out_nfserr;
- if (dpacl) {
- error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
- if (error < 0)
+ if (S_ISDIR(inode->i_mode)) {
+ host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
+ if (host_error < 0)
goto out_nfserr;
}
@@ -456,7 +467,7 @@ out:
posix_acl_release(dpacl);
return (error);
out_nfserr:
- error = nfserrno(error);
+ error = nfserrno(host_error);
goto out;
}
@@ -563,14 +574,14 @@ static struct accessmap nfs3_anyaccess[] = {
{ 0, 0 }
};
-int
+__be32
nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
{
struct accessmap *map;
struct svc_export *export;
struct dentry *dentry;
u32 query, result = 0, sresult = 0;
- unsigned int error;
+ __be32 error;
error = fh_verify(rqstp, fhp, 0, MAY_NOP);
if (error)
@@ -590,7 +601,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
query = *access;
for (; map->access; map++) {
if (map->access & query) {
- unsigned int err2;
+ __be32 err2;
sresult |= map->access;
@@ -629,13 +640,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
* The access argument indicates the type of open (read/write/lock)
* N.B. After this call fhp needs an fh_put
*/
-int
+__be32
nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
int access, struct file **filp)
{
struct dentry *dentry;
struct inode *inode;
- int flags = O_RDONLY|O_LARGEFILE, err;
+ int flags = O_RDONLY|O_LARGEFILE;
+ __be32 err;
+ int host_err;
/*
* If we get here, then the client has already done an "open",
@@ -665,10 +678,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* Check to see if there are any leases on this file.
* This may block while leases are broken.
*/
- err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
- if (err == -EWOULDBLOCK)
- err = -ETIMEDOUT;
- if (err) /* NOMEM or WOULDBLOCK */
+ host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
+ if (host_err == -EWOULDBLOCK)
+ host_err = -ETIMEDOUT;
+ if (host_err) /* NOMEM or WOULDBLOCK */
goto out_nfserr;
if (access & MAY_WRITE) {
@@ -681,10 +694,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
}
*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags);
if (IS_ERR(*filp))
- err = PTR_ERR(*filp);
+ host_err = PTR_ERR(*filp);
out_nfserr:
- if (err)
- err = nfserrno(err);
+ err = nfserrno(host_err);
out:
return err;
}
@@ -743,16 +755,20 @@ nfsd_sync_dir(struct dentry *dp)
* Obtain the readahead parameters for the file
* specified by (dev, ino).
*/
-static DEFINE_SPINLOCK(ra_lock);
static inline struct raparms *
nfsd_get_raparms(dev_t dev, ino_t ino)
{
struct raparms *ra, **rap, **frap = NULL;
int depth = 0;
+ unsigned int hash;
+ struct raparm_hbucket *rab;
- spin_lock(&ra_lock);
- for (rap = &raparm_cache; (ra = *rap); rap = &ra->p_next) {
+ hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
+ rab = &raparm_hash[hash];
+
+ spin_lock(&rab->pb_lock);
+ for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
if (ra->p_ino == ino && ra->p_dev == dev)
goto found;
depth++;
@@ -761,7 +777,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
}
depth = nfsdstats.ra_size*11/10;
if (!frap) {
- spin_unlock(&ra_lock);
+ spin_unlock(&rab->pb_lock);
return NULL;
}
rap = frap;
@@ -769,15 +785,16 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
ra->p_dev = dev;
ra->p_ino = ino;
ra->p_set = 0;
+ ra->p_hindex = hash;
found:
- if (rap != &raparm_cache) {
+ if (rap != &rab->pb_head) {
*rap = ra->p_next;
- ra->p_next = raparm_cache;
- raparm_cache = ra;
+ ra->p_next = rab->pb_head;
+ rab->pb_head = ra;
}
ra->p_count++;
nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
- spin_unlock(&ra_lock);
+ spin_unlock(&rab->pb_lock);
return ra;
}
@@ -791,36 +808,41 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
{
unsigned long count = desc->count;
struct svc_rqst *rqstp = desc->arg.data;
+ struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
if (size > count)
size = count;
if (rqstp->rq_res.page_len == 0) {
get_page(page);
- rqstp->rq_respages[rqstp->rq_resused++] = page;
+ put_page(*pp);
+ *pp = page;
+ rqstp->rq_resused++;
rqstp->rq_res.page_base = offset;
rqstp->rq_res.page_len = size;
- } else if (page != rqstp->rq_respages[rqstp->rq_resused-1]) {
+ } else if (page != pp[-1]) {
get_page(page);
- rqstp->rq_respages[rqstp->rq_resused++] = page;
+ put_page(*pp);
+ *pp = page;
+ rqstp->rq_resused++;
rqstp->rq_res.page_len += size;
- } else {
+ } else
rqstp->rq_res.page_len += size;
- }
desc->count = count - size;
desc->written += size;
return size;
}
-static int
+static __be32
nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
{
struct inode *inode;
struct raparms *ra;
mm_segment_t oldfs;
- int err;
+ __be32 err;
+ int host_err;
err = nfserr_perm;
inode = file->f_dentry->d_inode;
@@ -837,32 +859,33 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
file->f_ra = ra->p_ra;
if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
- svc_pushback_unused_pages(rqstp);
- err = file->f_op->sendfile(file, &offset, *count,
+ rqstp->rq_resused = 1;
+ host_err = file->f_op->sendfile(file, &offset, *count,
nfsd_read_actor, rqstp);
} else {
oldfs = get_fs();
set_fs(KERNEL_DS);
- err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+ host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
set_fs(oldfs);
}
/* Write back readahead params */
if (ra) {
- spin_lock(&ra_lock);
+ struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+ spin_lock(&rab->pb_lock);
ra->p_ra = file->f_ra;
ra->p_set = 1;
ra->p_count--;
- spin_unlock(&ra_lock);
+ spin_unlock(&rab->pb_lock);
}
- if (err >= 0) {
- nfsdstats.io_read += err;
- *count = err;
+ if (host_err >= 0) {
+ nfsdstats.io_read += host_err;
+ *count = host_err;
err = 0;
fsnotify_access(file->f_dentry);
} else
- err = nfserrno(err);
+ err = nfserrno(host_err);
out:
return err;
}
@@ -877,7 +900,7 @@ static void kill_suid(struct dentry *dentry)
mutex_unlock(&dentry->d_inode->i_mutex);
}
-static int
+static __be32
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen,
unsigned long cnt, int *stablep)
@@ -886,7 +909,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
struct dentry *dentry;
struct inode *inode;
mm_segment_t oldfs;
- int err = 0;
+ __be32 err = 0;
+ int host_err;
int stable = *stablep;
#ifdef MSNFS
@@ -922,18 +946,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
- err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
+ host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
set_fs(oldfs);
- if (err >= 0) {
+ if (host_err >= 0) {
nfsdstats.io_write += cnt;
fsnotify_modify(file->f_dentry);
}
/* clear setuid/setgid flag after write */
- if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+ if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
kill_suid(dentry);
- if (err >= 0 && stable) {
+ if (host_err >= 0 && stable) {
static ino_t last_ino;
static dev_t last_dev;
@@ -959,7 +983,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (inode->i_state & I_DIRTY) {
dprintk("nfsd: write sync %d\n", current->pid);
- err=nfsd_sync(file);
+ host_err=nfsd_sync(file);
}
#if 0
wake_up(&inode->i_wait);
@@ -969,11 +993,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
last_dev = inode->i_sb->s_dev;
}
- dprintk("nfsd: write complete err=%d\n", err);
- if (err >= 0)
+ dprintk("nfsd: write complete host_err=%d\n", host_err);
+ if (host_err >= 0)
err = 0;
else
- err = nfserrno(err);
+ err = nfserrno(host_err);
out:
return err;
}
@@ -983,12 +1007,12 @@ out:
* on entry. On return, *count contains the number of bytes actually read.
* N.B. After this call fhp needs an fh_put
*/
-int
+__be32
nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen,
unsigned long *count)
{
- int err;
+ __be32 err;
if (file) {
err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1012,12 +1036,12 @@ out:
* The stable flag requests synchronous writes.
* N.B. After this call fhp needs an fh_put
*/
-int
+__be32
nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
int *stablep)
{
- int err = 0;
+ __be32 err = 0;
if (file) {
err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1049,12 +1073,12 @@ out:
* Unfortunately we cannot lock the file to make sure we return full WCC
* data to the client, as locking happens lower down in the filesystem.
*/
-int
+__be32
nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, unsigned long count)
{
struct file *file;
- int err;
+ __be32 err;
if ((u64)count > ~(u64)offset)
return nfserr_inval;
@@ -1082,14 +1106,15 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
*
* N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
*/
-int
+__be32
nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
int type, dev_t rdev, struct svc_fh *resfhp)
{
struct dentry *dentry, *dchild = NULL;
struct inode *dirp;
- int err;
+ __be32 err;
+ int host_err;
err = nfserr_perm;
if (!flen)
@@ -1116,7 +1141,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
fh_lock_nested(fhp, I_MUTEX_PARENT);
dchild = lookup_one_len(fname, dentry, flen);
- err = PTR_ERR(dchild);
+ host_err = PTR_ERR(dchild);
if (IS_ERR(dchild))
goto out_nfserr;
err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
@@ -1155,22 +1180,22 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserr_perm;
switch (type) {
case S_IFREG:
- err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+ host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
break;
case S_IFDIR:
- err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+ host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+ host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
break;
default:
printk("nfsd: bad file type %o in nfsd_create\n", type);
- err = -EINVAL;
+ host_err = -EINVAL;
}
- if (err < 0)
+ if (host_err < 0)
goto out_nfserr;
if (EX_ISSYNC(fhp->fh_export)) {
@@ -1185,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
* directories via NFS.
*/
if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
- int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+ __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
if (err2)
err = err2;
}
@@ -1200,7 +1225,7 @@ out:
return err;
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
goto out;
}
@@ -1208,7 +1233,7 @@ out_nfserr:
/*
* NFSv3 version of nfsd_create
*/
-int
+__be32
nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
struct svc_fh *resfhp, int createmode, u32 *verifier,
@@ -1216,7 +1241,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
{
struct dentry *dentry, *dchild = NULL;
struct inode *dirp;
- int err;
+ __be32 err;
+ int host_err;
__u32 v_mtime=0, v_atime=0;
int v_mode=0;
@@ -1246,7 +1272,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
* Compose the response file handle.
*/
dchild = lookup_one_len(fname, dentry, flen);
- err = PTR_ERR(dchild);
+ host_err = PTR_ERR(dchild);
if (IS_ERR(dchild))
goto out_nfserr;
@@ -1302,8 +1328,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
- err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
- if (err < 0)
+ host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+ if (host_err < 0)
goto out_nfserr;
if (EX_ISSYNC(fhp->fh_export)) {
@@ -1332,7 +1358,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
*/
set_attr:
if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) {
- int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+ __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
if (err2)
err = err2;
}
@@ -1350,7 +1376,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
return err;
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
goto out;
}
#endif /* CONFIG_NFSD_V3 */
@@ -1360,13 +1386,14 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
* fits into the buffer. On return, it contains the true length.
* N.B. After this call fhp needs an fh_put
*/
-int
+__be32
nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
{
struct dentry *dentry;
struct inode *inode;
mm_segment_t oldfs;
- int err;
+ __be32 err;
+ int host_err;
err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
if (err)
@@ -1385,18 +1412,18 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
*/
oldfs = get_fs(); set_fs(KERNEL_DS);
- err = inode->i_op->readlink(dentry, buf, *lenp);
+ host_err = inode->i_op->readlink(dentry, buf, *lenp);
set_fs(oldfs);
- if (err < 0)
+ if (host_err < 0)
goto out_nfserr;
- *lenp = err;
+ *lenp = host_err;
err = 0;
out:
return err;
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
goto out;
}
@@ -1404,7 +1431,7 @@ out_nfserr:
* Create a symlink and look up its inode
* N.B. After this call _both_ fhp and resfhp need an fh_put
*/
-int
+__be32
nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen,
char *path, int plen,
@@ -1412,7 +1439,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct iattr *iap)
{
struct dentry *dentry, *dnew;
- int err, cerr;
+ __be32 err, cerr;
+ int host_err;
umode_t mode;
err = nfserr_noent;
@@ -1428,7 +1456,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
fh_lock(fhp);
dentry = fhp->fh_dentry;
dnew = lookup_one_len(fname, dentry, flen);
- err = PTR_ERR(dnew);
+ host_err = PTR_ERR(dnew);
if (IS_ERR(dnew))
goto out_nfserr;
@@ -1440,21 +1468,21 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (unlikely(path[plen] != 0)) {
char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
if (path_alloced == NULL)
- err = -ENOMEM;
+ host_err = -ENOMEM;
else {
strncpy(path_alloced, path, plen);
path_alloced[plen] = 0;
- err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
+ host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
kfree(path_alloced);
}
} else
- err = vfs_symlink(dentry->d_inode, dnew, path, mode);
+ host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
- if (!err)
+ if (!host_err) {
if (EX_ISSYNC(fhp->fh_export))
- err = nfsd_sync_dir(dentry);
- if (err)
- err = nfserrno(err);
+ host_err = nfsd_sync_dir(dentry);
+ }
+ err = nfserrno(host_err);
fh_unlock(fhp);
cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
@@ -1464,7 +1492,7 @@ out:
return err;
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
goto out;
}
@@ -1472,13 +1500,14 @@ out_nfserr:
* Create a hardlink
* N.B. After this call _both_ ffhp and tfhp need an fh_put
*/
-int
+__be32
nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
char *name, int len, struct svc_fh *tfhp)
{
struct dentry *ddir, *dnew, *dold;
struct inode *dirp, *dest;
- int err;
+ __be32 err;
+ int host_err;
err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
if (err)
@@ -1499,24 +1528,25 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
dirp = ddir->d_inode;
dnew = lookup_one_len(name, ddir, len);
- err = PTR_ERR(dnew);
+ host_err = PTR_ERR(dnew);
if (IS_ERR(dnew))
goto out_nfserr;
dold = tfhp->fh_dentry;
dest = dold->d_inode;
- err = vfs_link(dold, dirp, dnew);
- if (!err) {
+ host_err = vfs_link(dold, dirp, dnew);
+ if (!host_err) {
if (EX_ISSYNC(ffhp->fh_export)) {
err = nfserrno(nfsd_sync_dir(ddir));
write_inode_now(dest, 1);
}
+ err = 0;
} else {
- if (err == -EXDEV && rqstp->rq_vers == 2)
+ if (host_err == -EXDEV && rqstp->rq_vers == 2)
err = nfserr_acces;
else
- err = nfserrno(err);
+ err = nfserrno(host_err);
}
dput(dnew);
@@ -1526,7 +1556,7 @@ out:
return err;
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
goto out_unlock;
}
@@ -1534,13 +1564,14 @@ out_nfserr:
* Rename a file
* N.B. After this call _both_ ffhp and tfhp need an fh_put
*/
-int
+__be32
nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
struct svc_fh *tfhp, char *tname, int tlen)
{
struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
struct inode *fdir, *tdir;
- int err;
+ __be32 err;
+ int host_err;
err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
if (err)
@@ -1571,22 +1602,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
fill_pre_wcc(tfhp);
odentry = lookup_one_len(fname, fdentry, flen);
- err = PTR_ERR(odentry);
+ host_err = PTR_ERR(odentry);
if (IS_ERR(odentry))
goto out_nfserr;
- err = -ENOENT;
+ host_err = -ENOENT;
if (!odentry->d_inode)
goto out_dput_old;
- err = -EINVAL;
+ host_err = -EINVAL;
if (odentry == trap)
goto out_dput_old;
ndentry = lookup_one_len(tname, tdentry, tlen);
- err = PTR_ERR(ndentry);
+ host_err = PTR_ERR(ndentry);
if (IS_ERR(ndentry))
goto out_dput_old;
- err = -ENOTEMPTY;
+ host_err = -ENOTEMPTY;
if (ndentry == trap)
goto out_dput_new;
@@ -1594,14 +1625,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
((atomic_read(&odentry->d_count) > 1)
|| (atomic_read(&ndentry->d_count) > 1))) {
- err = -EPERM;
+ host_err = -EPERM;
} else
#endif
- err = vfs_rename(fdir, odentry, tdir, ndentry);
- if (!err && EX_ISSYNC(tfhp->fh_export)) {
- err = nfsd_sync_dir(tdentry);
- if (!err)
- err = nfsd_sync_dir(fdentry);
+ host_err = vfs_rename(fdir, odentry, tdir, ndentry);
+ if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
+ host_err = nfsd_sync_dir(tdentry);
+ if (!host_err)
+ host_err = nfsd_sync_dir(fdentry);
}
out_dput_new:
@@ -1609,8 +1640,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
out_dput_old:
dput(odentry);
out_nfserr:
- if (err)
- err = nfserrno(err);
+ err = nfserrno(host_err);
/* we cannot rely on fh_unlock on the two filehandles,
* as that would do the wrong thing if the two directories
@@ -1629,13 +1659,14 @@ out:
* Unlink a file or directory
* N.B. After this call fhp needs an fh_put
*/
-int
+__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
char *fname, int flen)
{
struct dentry *dentry, *rdentry;
struct inode *dirp;
- int err;
+ __be32 err;
+ int host_err;
err = nfserr_acces;
if (!flen || isdotent(fname, flen))
@@ -1649,7 +1680,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
dirp = dentry->d_inode;
rdentry = lookup_one_len(fname, dentry, flen);
- err = PTR_ERR(rdentry);
+ host_err = PTR_ERR(rdentry);
if (IS_ERR(rdentry))
goto out_nfserr;
@@ -1666,22 +1697,23 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
#ifdef MSNFS
if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
(atomic_read(&rdentry->d_count) > 1)) {
- err = -EPERM;
+ host_err = -EPERM;
} else
#endif
- err = vfs_unlink(dirp, rdentry);
+ host_err = vfs_unlink(dirp, rdentry);
} else { /* It's RMDIR */
- err = vfs_rmdir(dirp, rdentry);
+ host_err = vfs_rmdir(dirp, rdentry);
}
dput(rdentry);
- if (err == 0 &&
- EX_ISSYNC(fhp->fh_export))
- err = nfsd_sync_dir(dentry);
+ if (host_err)
+ goto out_nfserr;
+ if (EX_ISSYNC(fhp->fh_export))
+ host_err = nfsd_sync_dir(dentry);
out_nfserr:
- err = nfserrno(err);
+ err = nfserrno(host_err);
out:
return err;
}
@@ -1690,11 +1722,12 @@ out:
* Read entries from a directory.
* The NFSv3/4 verifier we ignore for now.
*/
-int
+__be32
nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
struct readdir_cd *cdp, encode_dent_fn func)
{
- int err;
+ __be32 err;
+ int host_err;
struct file *file;
loff_t offset = *offsetp;
@@ -1716,10 +1749,10 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
do {
cdp->err = nfserr_eof; /* will be cleared on successful read */
- err = vfs_readdir(file, (filldir_t) func, cdp);
- } while (err >=0 && cdp->err == nfs_ok);
- if (err)
- err = nfserrno(err);
+ host_err = vfs_readdir(file, (filldir_t) func, cdp);
+ } while (host_err >=0 && cdp->err == nfs_ok);
+ if (host_err)
+ err = nfserrno(host_err);
else
err = cdp->err;
*offsetp = vfs_llseek(file, 0, 1);
@@ -1736,10 +1769,10 @@ out:
* Get file system stats
* N.B. After this call fhp needs an fh_put
*/
-int
+__be32
nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
{
- int err = fh_verify(rqstp, fhp, 0, MAY_NOP);
+ __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
if (!err && vfs_statfs(fhp->fh_dentry,stat))
err = nfserr_io;
return err;
@@ -1748,7 +1781,7 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
/*
* Check for a user's access permissions to this inode.
*/
-int
+__be32
nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
{
struct inode *inode = dentry->d_inode;
@@ -1829,11 +1862,11 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
void
nfsd_racache_shutdown(void)
{
- if (!raparm_cache)
+ if (!raparml)
return;
dprintk("nfsd: freeing readahead buffers.\n");
kfree(raparml);
- raparm_cache = raparml = NULL;
+ raparml = NULL;
}
/*
* Initialize readahead param cache
@@ -1842,19 +1875,31 @@ int
nfsd_racache_init(int cache_size)
{
int i;
+ int j = 0;
+ int nperbucket;
+
- if (raparm_cache)
+ if (raparml)
return 0;
+ if (cache_size < 2*RAPARM_HASH_SIZE)
+ cache_size = 2*RAPARM_HASH_SIZE;
raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL);
if (raparml != NULL) {
dprintk("nfsd: allocating %d readahead buffers.\n",
cache_size);
+ for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
+ raparm_hash[i].pb_head = NULL;
+ spin_lock_init(&raparm_hash[i].pb_lock);
+ }
+ nperbucket = cache_size >> RAPARM_HASH_BITS;
memset(raparml, 0, sizeof(struct raparms) * cache_size);
for (i = 0; i < cache_size - 1; i++) {
- raparml[i].p_next = raparml + i + 1;
+ if (i % nperbucket == 0)
+ raparm_hash[j++].pb_head = raparml + i;
+ if (i % nperbucket < nperbucket-1)
+ raparml[i].p_next = raparml + i + 1;
}
- raparm_cache = raparml;
} else {
printk(KERN_WARNING
"nfsd: Could not allocate memory read-ahead cache.\n");
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index e1fceb8aa32d..d11753c50bc1 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -152,14 +152,16 @@ static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
struct o2nm_node *node, *ret = NULL;
while (*p) {
+ int cmp;
+
parent = *p;
node = rb_entry(parent, struct o2nm_node, nd_ip_node);
- if (memcmp(&ip_needle, &node->nd_ipv4_address,
- sizeof(ip_needle)) < 0)
+ cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
+ sizeof(ip_needle));
+ if (cmp < 0)
p = &(*p)->rb_left;
- else if (memcmp(&ip_needle, &node->nd_ipv4_address,
- sizeof(ip_needle)) > 0)
+ else if (cmp > 0)
p = &(*p)->rb_right;
else {
ret = node;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d9ba0a931a03..1be74c4e7814 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -30,6 +30,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
+#include <linux/sched.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -691,6 +692,12 @@ static int ocfs2_zero_extend(struct inode *inode,
}
start_off += sb->s_blocksize;
+
+ /*
+ * Very large extends have the potential to lock up
+ * the cpu for extended periods of time.
+ */
+ cond_resched();
}
out:
@@ -728,31 +735,36 @@ static int ocfs2_extend_file(struct inode *inode,
clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
OCFS2_I(inode)->ip_clusters;
- if (clusters_to_add) {
- /*
- * protect the pages that ocfs2_zero_extend is going to
- * be pulling into the page cache.. we do this before the
- * metadata extend so that we don't get into the situation
- * where we've extended the metadata but can't get the data
- * lock to zero.
- */
- ret = ocfs2_data_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
+ /*
+ * protect the pages that ocfs2_zero_extend is going to be
+ * pulling into the page cache.. we do this before the
+ * metadata extend so that we don't get into the situation
+ * where we've extended the metadata but can't get the data
+ * lock to zero.
+ */
+ ret = ocfs2_data_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (clusters_to_add) {
ret = ocfs2_extend_allocation(inode, clusters_to_add);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
+ }
- ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_unlock;
- }
+ /*
+ * Call this even if we don't add any clusters to the tree. We
+ * still need to zero the area between the old i_size and the
+ * new i_size.
+ */
+ ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_unlock;
}
if (!tail_to_skip) {
@@ -764,8 +776,7 @@ static int ocfs2_extend_file(struct inode *inode,
}
out_unlock:
- if (clusters_to_add) /* this is the only case in which we lock */
- ocfs2_data_unlock(inode, 1);
+ ocfs2_data_unlock(inode, 1);
out:
return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 259155f0eb2e..a57b751d4f40 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1085,14 +1085,6 @@ static int ocfs2_rename(struct inode *old_dir,
BUG();
}
- if (atomic_read(&old_dentry->d_count) > 2) {
- shrink_dcache_parent(old_dentry);
- if (atomic_read(&old_dentry->d_count) > 2) {
- status = -EBUSY;
- goto bail;
- }
- }
-
/* Assume a directory hierarchy thusly:
* a/b/c
* a/d
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4c29cd7cc8e6..76b46ebbb10c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -339,7 +339,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
#if BITS_PER_LONG == 32
# if defined(CONFIG_LBD)
- BUG_ON(sizeof(sector_t) != 8);
+ BUILD_BUG_ON(sizeof(sector_t) != 8);
pagefactor = PAGE_CACHE_SIZE;
bitshift = BITS_PER_LONG;
# else
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 51c6a748df49..6fb4b6150d77 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -376,18 +376,48 @@ static char *make_block_name(struct gendisk *disk)
return name;
}
-static void disk_sysfs_symlinks(struct gendisk *disk)
+static int disk_sysfs_symlinks(struct gendisk *disk)
{
struct device *target = get_device(disk->driverfs_dev);
+ int err;
+ char *disk_name = NULL;
+
if (target) {
- char *disk_name = make_block_name(disk);
- sysfs_create_link(&disk->kobj,&target->kobj,"device");
- if (disk_name) {
- sysfs_create_link(&target->kobj,&disk->kobj,disk_name);
- kfree(disk_name);
+ disk_name = make_block_name(disk);
+ if (!disk_name) {
+ err = -ENOMEM;
+ goto err_out;
}
+
+ err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
+ if (err)
+ goto err_out_disk_name;
+
+ err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
+ if (err)
+ goto err_out_dev_link;
}
- sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, "subsystem");
+
+ err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj,
+ "subsystem");
+ if (err)
+ goto err_out_disk_name_lnk;
+
+ kfree(disk_name);
+
+ return 0;
+
+err_out_disk_name_lnk:
+ if (target) {
+ sysfs_remove_link(&target->kobj, disk_name);
+err_out_dev_link:
+ sysfs_remove_link(&disk->kobj, "device");
+err_out_disk_name:
+ kfree(disk_name);
+err_out:
+ put_device(target);
+ }
+ return err;
}
/* Not exported, helper to add_disk(). */
@@ -406,7 +436,11 @@ void register_disk(struct gendisk *disk)
*s = '!';
if ((err = kobject_add(&disk->kobj)))
return;
- disk_sysfs_symlinks(disk);
+ err = disk_sysfs_symlinks(disk);
+ if (err) {
+ kobject_del(&disk->kobj);
+ return;
+ }
disk_sysfs_add_subdirs(disk);
/* No minors to use for partitions */
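
disk_sysfs_symlinks() above now reports failures instead of ignoring the sysfs_create_link() return values, unwinding whatever it had already created in reverse order through a ladder of error labels. A small stand-alone sketch of that unwind idiom (the step names and helpers are illustrative, not the sysfs API):

#include <stdio.h>

/* illustrative helpers; not the real sysfs calls */
static int make_link(const char *name, int fail)
{
	if (fail)
		return -1;
	printf("created %s\n", name);
	return 0;
}

static void remove_link(const char *name)
{
	printf("removed %s\n", name);
}

static int setup_links(int fail_at)
{
	int err;

	err = make_link("device", fail_at == 1);
	if (err)
		goto err_out;
	err = make_link("disk name", fail_at == 2);
	if (err)
		goto err_out_device;
	err = make_link("subsystem", fail_at == 3);
	if (err)
		goto err_out_name;
	return 0;

err_out_name:                   /* undo only what was already done */
	remove_link("disk name");
err_out_device:
	remove_link("device");
err_out:
	return err;
}

int main(void)
{
	return setup_links(3) ? 1 : 0;  /* creates two links, then unwinds both */
}
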
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 4f8df71e49d3..8c7af1777819 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -32,13 +32,11 @@
#include <asm/unaligned.h>
#define SYS_IND(p) (get_unaligned(&p->sys_ind))
-#define NR_SECTS(p) ({ __typeof__(p->nr_sects) __a = \
- get_unaligned(&p->nr_sects); \
+#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
le32_to_cpu(__a); \
})
-#define START_SECT(p) ({ __typeof__(p->start_sect) __a = \
- get_unaligned(&p->start_sect); \
+#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \
le32_to_cpu(__a); \
})
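
The tightened NR_SECTS()/START_SECT() macros above still do the same two things: fetch a possibly unaligned 32-bit field with get_unaligned() and convert it from the on-disk little-endian format with le32_to_cpu(); the __le32 type merely lets sparse catch code that uses the raw value directly. What that pair of calls amounts to, written out byte by byte in portable user-space C (the buffer layout is made up for illustration):

#include <stdint.h>
#include <stdio.h>

/* what get_unaligned() + le32_to_cpu() achieve, spelled out byte by byte:
 * read a 32-bit little-endian field at any alignment, on any host */
static uint32_t read_le32(const unsigned char *p)
{
	return (uint32_t)p[0]
	     | (uint32_t)p[1] << 8
	     | (uint32_t)p[2] << 16
	     | (uint32_t)p[3] << 24;
}

int main(void)
{
	/* a fake partition-entry fragment with a count field at an odd offset */
	unsigned char raw[] = { 0x80, 0x00, 0x20, 0x00, 0x00, 0x00 };

	printf("nr_sects = %u\n", read_le32(raw + 2));  /* 0x00000020 = 32 */
	return 0;
}
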
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 82da55b5cffe..8df27401d292 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -72,6 +72,7 @@
#include <linux/audit.h>
#include <linux/poll.h>
#include <linux/nsproxy.h>
+#include <linux/oom.h>
#include "internal.h"
/* NOTE:
@@ -86,7 +87,7 @@
/* Worst case buffer size needed for holding an integer. */
-#define PROC_NUMBUF 10
+#define PROC_NUMBUF 13
struct pid_entry {
int len;
@@ -689,7 +690,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
if (copy_from_user(buffer, buf, count))
return -EFAULT;
oom_adjust = simple_strtol(buffer, &end, 0);
- if ((oom_adjust < -16 || oom_adjust > 15) && oom_adjust != OOM_DISABLE)
+ if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
+ oom_adjust != OOM_DISABLE)
return -EINVAL;
if (*end == '\n')
end++;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8d88e58ed5cc..93c43b676e59 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -647,7 +647,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
if (get_user(c, buf))
return -EFAULT;
- __handle_sysrq(c, NULL, NULL, 0);
+ __handle_sysrq(c, NULL, 0);
}
return count;
}
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 1bfae42117ca..e3d466a228d4 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1304,8 +1304,8 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
bh = sb_bread(sb, block);
if (bh == NULL)
- reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%lu) "
- "reading failed", __FUNCTION__, bh->b_blocknr);
+ reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
+ "reading failed", __FUNCTION__, block);
else {
if (buffer_locked(bh)) {
PROC_INFO_INC(sb, scan_bitmap.wait);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index c093642fb983..b67ce9354048 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -2,7 +2,6 @@
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
-#include <linux/config.h>
#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 7e5a2f5ebeb0..9c69bcacad22 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1780,7 +1780,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
err = -EDQUOT;
goto out_end_trans;
}
- if (!dir || !dir->i_nlink) {
+ if (!dir->i_nlink) {
err = -EPERM;
goto out_bad_inode;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ad8cbc49883a..85ce23268302 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -53,6 +53,7 @@
#include <linux/workqueue.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
/* gets a struct reiserfs_journal_list * from a list head */
#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
@@ -970,7 +971,7 @@ int reiserfs_async_progress_wait(struct super_block *s)
DEFINE_WAIT(wait);
struct reiserfs_journal *j = SB_JOURNAL(s);
if (atomic_read(&j->j_async_throttle))
- blk_congestion_wait(WRITE, HZ / 10);
+ congestion_wait(WRITE, HZ / 10);
return 0;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c89aa2338191..9041802df832 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -430,20 +430,29 @@ int remove_save_link(struct inode *inode, int truncate)
return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
}
-static void reiserfs_put_super(struct super_block *s)
+static void reiserfs_kill_sb(struct super_block *s)
{
- struct reiserfs_transaction_handle th;
- th.t_trans_id = 0;
+ if (REISERFS_SB(s)) {
+ if (REISERFS_SB(s)->xattr_root) {
+ d_invalidate(REISERFS_SB(s)->xattr_root);
+ dput(REISERFS_SB(s)->xattr_root);
+ REISERFS_SB(s)->xattr_root = NULL;
+ }
- if (REISERFS_SB(s)->xattr_root) {
- d_invalidate(REISERFS_SB(s)->xattr_root);
- dput(REISERFS_SB(s)->xattr_root);
+ if (REISERFS_SB(s)->priv_root) {
+ d_invalidate(REISERFS_SB(s)->priv_root);
+ dput(REISERFS_SB(s)->priv_root);
+ REISERFS_SB(s)->priv_root = NULL;
+ }
}
- if (REISERFS_SB(s)->priv_root) {
- d_invalidate(REISERFS_SB(s)->priv_root);
- dput(REISERFS_SB(s)->priv_root);
- }
+ kill_block_super(s);
+}
+
+static void reiserfs_put_super(struct super_block *s)
+{
+ struct reiserfs_transaction_handle th;
+ th.t_trans_id = 0;
/* change file system state to current state if it was mounted with read-write permissions */
if (!(s->s_flags & MS_RDONLY)) {
@@ -2156,7 +2165,7 @@ struct file_system_type reiserfs_fs_type = {
.owner = THIS_MODULE,
.name = "reiserfs",
.get_sb = get_super_block,
- .kill_sb = kill_block_super,
+ .kill_sb = reiserfs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
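The super.c hunk moves the xattr_root/priv_root cleanup out of ->put_super() and into a reiserfs-specific ->kill_sb(). The reason is visible in the fs/super.c hunk below: generic_shutdown_super() now tears the dcache down with shrink_dcache_for_umount(), so a filesystem must drop any dentries it still holds before kill_block_super() runs, not from inside it. A minimal sketch of the pattern with hypothetical names (myfs_*, my_sb_info, my_root):

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/module.h>

struct my_sb_info {
	struct dentry *my_root;	/* privately held dentry, hypothetical */
};

static void myfs_kill_sb(struct super_block *s)
{
	struct my_sb_info *info = s->s_fs_info;

	/* release privately held dentries before the generic teardown */
	if (info && info->my_root) {
		d_invalidate(info->my_root);
		dput(info->my_root);
		info->my_root = NULL;
	}
	kill_block_super(s);
}

static struct file_system_type myfs_fs_type = {
	.owner    = THIS_MODULE,
	.name     = "myfs",
	.kill_sb  = myfs_kill_sb,	/* instead of plain kill_block_super */
	.fs_flags = FS_REQUIRES_DEV,
};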
diff --git a/fs/splice.c b/fs/splice.c
index 13e92dd19fbb..a567010b62ac 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -607,7 +607,7 @@ find_page:
ret = -ENOMEM;
page = page_cache_alloc_cold(mapping);
if (unlikely(!page))
- goto out_nomem;
+ goto out_ret;
/*
* This will also lock the page
@@ -666,7 +666,7 @@ find_page:
if (sd->pos + this_len > isize)
vmtruncate(mapping->host, isize);
- goto out;
+ goto out_ret;
}
if (buf->page != page) {
@@ -698,7 +698,7 @@ find_page:
out:
page_cache_release(page);
unlock_page(page);
-out_nomem:
+out_ret:
return ret;
}
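In the splice.c hunks the allocation-failure label out_nomem is renamed to out_ret, and the truncation path now jumps there as well, returning without running the page_cache_release()/unlock_page() pair at out:. The sketch below illustrates only the general idiom of one exit label per "what is still held" state, as standalone userspace C with hypothetical names, not the splice internals:

#include <stdio.h>
#include <stdlib.h>

static int do_work(void)
{
	int ret = -1;
	FILE *f;
	char *buf = malloc(64);

	if (!buf)
		goto out_ret;		/* nothing acquired yet */

	f = fopen("/dev/null", "w");	/* hypothetical second resource */
	if (!f)
		goto out_free;		/* buffer held, file not */

	ret = 0;
	fclose(f);
out_free:
	free(buf);
out_ret:
	return ret;
}

int main(void)
{
	return do_work() ? 1 : 0;
}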
diff --git a/fs/super.c b/fs/super.c
index aec99ddbe53f..47e554c12e76 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -260,17 +260,17 @@ int fsync_super(struct super_block *sb)
* that need destruction out of superblock, call generic_shutdown_super()
* and release aforementioned objects. Note: dentries and inodes _are_
* taken care of and do not need specific handling.
+ *
+ * Upon calling this function, the filesystem may no longer alter or
+ * rearrange the set of dentries belonging to this super_block, nor may it
+ * change the attachments of dentries to inodes.
*/
void generic_shutdown_super(struct super_block *sb)
{
- struct dentry *root = sb->s_root;
struct super_operations *sop = sb->s_op;
- if (root) {
- sb->s_root = NULL;
- shrink_dcache_parent(root);
- shrink_dcache_sb(sb);
- dput(root);
+ if (sb->s_root) {
+ shrink_dcache_for_umount(sb);
fsync_super(sb);
lock_super(sb);
sb->s_flags &= ~MS_ACTIVE;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 146f1dedec84..298303b5a716 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -483,17 +483,12 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
(victim->d_parent->d_inode == dir->d_inode)) {
victim->d_inode->i_mtime = CURRENT_TIME;
fsnotify_modify(victim);
-
- /**
- * Drop reference from initial sysfs_get_dentry().
- */
- dput(victim);
res = 0;
} else
d_drop(victim);
/**
- * Drop the reference acquired from sysfs_get_dentry() above.
+ * Drop the reference acquired from lookup_one_len() above.
*/
dput(victim);
}
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 350cba5d6803..dc9e7dc07fb7 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -358,16 +358,11 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
unsigned long blocknr;
int size = 0, i;
- if (1024 != sizeof (struct xenix_super_block))
- panic("Xenix FS: bad superblock size");
- if (512 != sizeof (struct sysv4_super_block))
- panic("SystemV FS: bad superblock size");
- if (512 != sizeof (struct sysv2_super_block))
- panic("SystemV FS: bad superblock size");
- if (500 != sizeof (struct coh_super_block))
- panic("Coherent FS: bad superblock size");
- if (64 != sizeof (struct sysv_inode))
- panic("sysv fs: bad inode size");
+ BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
+ BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
+ BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
+ BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
+ BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
if (!sbi)
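The sysv hunk converts run-time panics over wrong on-disk structure sizes into BUILD_BUG_ON() checks, so a mis-sized structure fails at compile time instead of taking the machine down at mount time. A userspace sketch of the same idea; MY_BUILD_BUG_ON and on_disk_inode are hypothetical, and the macro body mirrors the kernel's negative-array-size trick:

#include <stdint.h>

/* Userspace spelling of the kernel's BUILD_BUG_ON(): a negative array
 * size makes compilation fail when the condition is true. */
#define MY_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

struct on_disk_inode {
	uint16_t mode;
	uint16_t nlink;
	uint32_t size;
	uint8_t  pad[56];
};	/* intended to be exactly 64 bytes */

int main(void)
{
	/* compilation fails here if the layout drifts from 64 bytes */
	MY_BUILD_BUG_ON(sizeof(struct on_disk_inode) != 64);
	return 0;
}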
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1d3b5d2070e5..1aea6a4f9a4a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1621,9 +1621,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
goto error_out;
}
- if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY)
+ if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY) {
printk("UDF-fs: Partition marked readonly; forcing readonly mount\n");
sb->s_flags |= MS_RDONLY;
+ }
if ( udf_find_fileset(sb, &fileset, &rootdir) )
{
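The udf hunk is a missing-braces fix: the indentation suggests both statements were meant to be conditional, but only the printk() was, so sb->s_flags |= MS_RDONLY ran on every mount and forced UDF read-only regardless of the partition flag. Adding braces scopes both statements to the check. A tiny standalone illustration of the bug class, with hypothetical flag names:

#include <stdio.h>

#define FLAG_PART_RO	0x1
#define FLAG_MOUNT_RO	0x2

int main(void)
{
	unsigned int part_flags = 0;	/* partition NOT read-only */
	unsigned int mount_flags = 0;

	/* buggy form: only the printf is conditional, the |= always runs
	 *
	 * if (part_flags & FLAG_PART_RO)
	 *	printf("forcing readonly mount\n");
	 *	mount_flags |= FLAG_MOUNT_RO;
	 */
	if (part_flags & FLAG_PART_RO) {
		printf("forcing readonly mount\n");
		mount_flags |= FLAG_MOUNT_RO;
	}

	printf("mount read-only: %s\n",
	       (mount_flags & FLAG_MOUNT_RO) ? "yes" : "no");
	return 0;
}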
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 22f820a9b15c..17437574f79c 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -184,14 +184,13 @@ void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi,
dev_t
ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
{
- __fs32 fs32;
+ __u32 fs32;
dev_t dev;
if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
- fs32 = ufsi->i_u1.i_data[1];
+ fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]);
else
- fs32 = ufsi->i_u1.i_data[0];
- fs32 = fs32_to_cpu(sb, fs32);
+ fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]);
switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
case UFS_ST_SUNx86:
case UFS_ST_SUN:
@@ -212,7 +211,7 @@ ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
void
ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev)
{
- __fs32 fs32;
+ __u32 fs32;
switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
case UFS_ST_SUNx86:
@@ -227,11 +226,10 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
fs32 = old_encode_dev(dev);
break;
}
- fs32 = cpu_to_fs32(sb, fs32);
if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
- ufsi->i_u1.i_data[1] = fs32;
+ ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32);
else
- ufsi->i_u1.i_data[0] = fs32;
+ ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32);
}
/**
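In the ufs hunks, __fs32 is the sparse-annotated disk-endian type; the old code stored the CPU-endian result of fs32_to_cpu() back into a __fs32 local and then handed it to old_decode_dev(), mixing the annotations. The rewrite keeps the locals as plain __u32 and converts exactly where the value crosses the i_data[] boundary. A standalone sketch of that convert-at-the-boundary rule, using ntohl()/htonl() as stand-ins for fs32_to_cpu()/cpu_to_fs32(); on_disk and the helpers are hypothetical:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* ntohl/htonl stand in for the fs32 helpers */

struct on_disk { uint32_t data[2]; };	/* stored big-endian, hypothetically */

static uint32_t disk_to_cpu(uint32_t v) { return ntohl(v); }
static uint32_t cpu_to_disk(uint32_t v) { return htonl(v); }

int main(void)
{
	struct on_disk d = { { htonl(0x1234u), 0 } };

	/* convert exactly once, where the value leaves the disk format */
	uint32_t dev = disk_to_cpu(d.data[0]);
	printf("device id: 0x%x\n", dev);

	/* and convert back exactly once when storing it again */
	d.data[0] = cpu_to_disk(dev + 1);
	return 0;
}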
diff --git a/fs/xattr.c b/fs/xattr.c
index c32f15b5f60f..395635100f77 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -135,6 +135,26 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
+ssize_t
+vfs_listxattr(struct dentry *d, char *list, size_t size)
+{
+ ssize_t error;
+
+ error = security_inode_listxattr(d);
+ if (error)
+ return error;
+ error = -EOPNOTSUPP;
+ if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+ error = d->d_inode->i_op->listxattr(d, list, size);
+ } else {
+ error = security_inode_listsecurity(d->d_inode, list, size);
+ if (size && error > size)
+ error = -ERANGE;
+ }
+ return error;
+}
+EXPORT_SYMBOL_GPL(vfs_listxattr);
+
int
vfs_removexattr(struct dentry *dentry, char *name)
{
@@ -346,17 +366,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
return -ENOMEM;
}
- error = security_inode_listxattr(d);
- if (error)
- goto out;
- error = -EOPNOTSUPP;
- if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
- error = d->d_inode->i_op->listxattr(d, klist, size);
- } else {
- error = security_inode_listsecurity(d->d_inode, klist, size);
- if (size && error > size)
- error = -ERANGE;
- }
+ error = vfs_listxattr(d, klist, size);
if (error > 0) {
if (size && copy_to_user(list, klist, error))
error = -EFAULT;
@@ -365,7 +375,6 @@ listxattr(struct dentry *d, char __user *list, size_t size)
than XATTR_LIST_MAX bytes. Not possible. */
error = -E2BIG;
}
-out:
kfree(klist);
return error;
}
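The xattr.c hunks lift the security_inode_listxattr() check and the ->listxattr()/listsecurity fallback out of the listxattr() syscall into an exported vfs_listxattr(), so in-kernel users and the syscall path share one implementation. A sketch of how a caller might use it, assuming the helper behaves as shown above (negative errno on failure, byte count on success, and the usual size-0 probe to learn the required length); dump_xattr_names() is hypothetical:

#include <linux/xattr.h>
#include <linux/dcache.h>
#include <linux/slab.h>
#include <linux/errno.h>

static ssize_t dump_xattr_names(struct dentry *d)
{
	ssize_t len = vfs_listxattr(d, NULL, 0);
	char *names;

	if (len <= 0)
		return len;		/* error, or no attributes at all */

	names = kmalloc(len, GFP_KERNEL);
	if (!names)
		return -ENOMEM;

	len = vfs_listxattr(d, names, len);
	/* on success, 'names' holds NUL-separated attribute names */
	kfree(names);
	return len;
}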
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index d59737589815..004baf600611 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -21,6 +21,7 @@
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include "time.h"
#include "kmem.h"
@@ -53,7 +54,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
printk(KERN_ERR "XFS: possible memory allocation "
"deadlock in %s (mode:0x%x)\n",
__FUNCTION__, lflags);
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
} while (1);
}
@@ -131,7 +132,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
printk(KERN_ERR "XFS: possible memory allocation "
"deadlock in %s (mode:0x%x)\n",
__FUNCTION__, lflags);
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
} while (1);
}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9bbadafdcb00..db5f5a3608ca 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -30,6 +30,7 @@
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
+#include <linux/backing-dev.h>
#include "xfs_linux.h"
STATIC kmem_zone_t *xfs_buf_zone;
@@ -395,7 +396,7 @@ _xfs_buf_lookup_pages(
XFS_STATS_INC(xb_page_retries);
xfsbufd_wakeup(0, gfp_mask);
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
goto retry;
}