diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2006-10-04 09:06:16 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-10-04 09:06:16 -0700 |
commit | 4a61f17378c2cdd9bd8f34ef8bd7422861d0c1f1 (patch) | |
tree | a2054556900af8c16fd9f5419f012dcf1ee2995a /fs/gfs2 | |
parent | d002ec481c24f325ed6cfcb7810d317c015dd1b5 (diff) | |
parent | 7ecdb70a0ea436c06540140242bfac6ac3babfc0 (diff) | |
download | linux-stable-4a61f17378c2cdd9bd8f34ef8bd7422861d0c1f1.tar.gz linux-stable-4a61f17378c2cdd9bd8f34ef8bd7422861d0c1f1.tar.bz2 linux-stable-4a61f17378c2cdd9bd8f34ef8bd7422861d0c1f1.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6: (292 commits)
[GFS2] Fix endian bug for de_type
[GFS2] Initialize SELinux extended attributes at inode creation time.
[GFS2] Move logging code into log.c (mostly)
[GFS2] Mark nlink cleared so VFS sees it happen
[GFS2] Two redundant casts removed
[GFS2] Remove uneeded endian conversion
[GFS2] Remove duplicate sb reading code
[GFS2] Mark metadata reads for blktrace
[GFS2] Remove iflags.h, use FS_
[GFS2] Fix code style/indent in ops_file.c
[GFS2] streamline-generic_file_-interfaces-and-filemap gfs fix
[GFS2] Remove readv/writev methods and use aio_read/aio_write instead (gfs bits)
[GFS2] inode-diet: Eliminate i_blksize from the inode structure
[GFS2] inode_diet: Replace inode.u.generic_ip with inode.i_private (gfs)
[GFS2] Fix typo in last patch
[GFS2] Fix direct i/o logic in filemap.c
[GFS2] Fix bug in Makefiles for lock modules
[GFS2] Remove (extra) fs_subsys declaration
[GFS2/DLM] Fix trailing whitespace
[GFS2] Tidy up meta_io code
...
Diffstat (limited to 'fs/gfs2')
75 files changed, 27006 insertions, 0 deletions
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig new file mode 100644 index 000000000000..8c27de8b9568 --- /dev/null +++ b/fs/gfs2/Kconfig @@ -0,0 +1,44 @@ +config GFS2_FS + tristate "GFS2 file system support" + depends on EXPERIMENTAL + select FS_POSIX_ACL + help + A cluster filesystem. + + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. + + To use the GFS2 filesystem, you will need to enable one or more of + the below locking modules. Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster + +config GFS2_FS_LOCKING_NOLOCK + tristate "GFS2 \"nolock\" locking module" + depends on GFS2_FS + help + Single node locking module for GFS2. + + Use this module if you want to use GFS2 on a single node without + its clustering features. You can still take advantage of the + large file support, and upgrade to running a full cluster later on + if required. + + If you will only be using GFS2 in cluster mode, you do not need this + module. + +config GFS2_FS_LOCKING_DLM + tristate "GFS2 DLM locking module" + depends on GFS2_FS + select DLM + help + Multiple node locking module for GFS2 + + Most users of GFS2 will require this module. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. + diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile new file mode 100644 index 000000000000..e3f1ada643ac --- /dev/null +++ b/fs/gfs2/Makefile @@ -0,0 +1,10 @@ +obj-$(CONFIG_GFS2_FS) += gfs2.o +gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ + glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ + mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ + ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \ + recovery.o rgrp.o super.o sys.o trans.o util.o + +obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/ +obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/ + diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c new file mode 100644 index 000000000000..5f959b8ce406 --- /dev/null +++ b/fs/gfs2/acl.c @@ -0,0 +1,309 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "eaops.h" +#include "eattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" + +#define ACL_ACCESS 1 +#define ACL_DEFAULT 0 + +int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, + struct gfs2_ea_request *er, + int *remove, mode_t *mode) +{ + struct posix_acl *acl; + int error; + + error = gfs2_acl_validate_remove(ip, access); + if (error) + return error; + + if (!er->er_data) + return -EINVAL; + + acl = posix_acl_from_xattr(er->er_data, er->er_data_len); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (!acl) { + *remove = 1; + return 0; + } + + error = posix_acl_valid(acl); + if (error) + goto out; + + if (access) { + error = posix_acl_equiv_mode(acl, mode); + if (!error) + *remove = 1; + else if (error > 0) + error = 0; + } + +out: + posix_acl_release(acl); + return error; +} + +int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) +{ + if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl) + return -EOPNOTSUPP; + if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER)) + return -EPERM; + if (S_ISLNK(ip->i_di.di_mode)) + return -EOPNOTSUPP; + if (!access && !S_ISDIR(ip->i_di.di_mode)) + return -EACCES; + + return 0; +} + +static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, + struct gfs2_ea_location *el, char **data, unsigned int *len) +{ + struct gfs2_ea_request er; + struct gfs2_ea_location el_this; + int error; + + if (!ip->i_di.di_eattr) + return 0; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (access) { + er.er_name = GFS2_POSIX_ACL_ACCESS; + er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; + } else { + er.er_name = GFS2_POSIX_ACL_DEFAULT; + er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; + } + er.er_type = GFS2_EATYPE_SYS; + + if (!el) + el = &el_this; + + error = gfs2_ea_find(ip, &er, el); + if (error) + return error; + if (!el->el_ea) + return 0; + if (!GFS2_EA_DATA_LEN(el->el_ea)) + goto out; + + er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); + er.er_data = kmalloc(er.er_data_len, GFP_KERNEL); + error = -ENOMEM; + if (!er.er_data) + goto out; + + error = gfs2_ea_get_copy(ip, el, er.er_data); + if (error) + goto out_kfree; + + if (acl) { + *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); + if (IS_ERR(*acl)) + error = PTR_ERR(*acl); + } + +out_kfree: + if (error || !data) + kfree(er.er_data); + else { + *data = er.er_data; + *len = er.er_data_len; + } +out: + if (error || el == &el_this) + brelse(el->el_bh); + return error; +} + +/** + * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something + * @inode: the file we want to do something to + * @mask: what we want to do + * + * Returns: errno + */ + +int gfs2_check_acl_locked(struct inode *inode, int mask) +{ + struct posix_acl *acl = NULL; + int error; + + error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); + if (error) + return error; + + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + + return -EAGAIN; +} + +int gfs2_check_acl(struct inode *inode, int mask) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (!error) { + error = gfs2_check_acl_locked(inode, mask); + gfs2_glock_dq_uninit(&i_gh); + } + + return error; +} + +static int munge_mode(struct gfs2_inode *ip, mode_t mode) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_assert_withdraw(sdp, + (ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT)); + ip->i_di.di_mode = mode; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + + return 0; +} + +int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct posix_acl *acl = NULL, *clone; + struct gfs2_ea_request er; + mode_t mode = ip->i_di.di_mode; + int error; + + if (!sdp->sd_args.ar_posix_acl) + return 0; + if (S_ISLNK(ip->i_di.di_mode)) + return 0; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = GFS2_EATYPE_SYS; + + error = acl_get(dip, ACL_DEFAULT, &acl, NULL, + &er.er_data, &er.er_data_len); + if (error) + return error; + if (!acl) { + mode &= ~current->fs->umask; + if (mode != ip->i_di.di_mode) + error = munge_mode(ip, mode); + return error; + } + + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + if (S_ISDIR(ip->i_di.di_mode)) { + er.er_name = GFS2_POSIX_ACL_DEFAULT; + er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; + error = gfs2_system_eaops.eo_set(ip, &er); + if (error) + goto out; + } + + error = posix_acl_create_masq(acl, &mode); + if (error < 0) + goto out; + if (error > 0) { + er.er_name = GFS2_POSIX_ACL_ACCESS; + er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; + posix_acl_to_xattr(acl, er.er_data, er.er_data_len); + er.er_mode = mode; + er.er_flags = GFS2_ERF_MODE; + error = gfs2_system_eaops.eo_set(ip, &er); + if (error) + goto out; + } else + munge_mode(ip, mode); + +out: + posix_acl_release(acl); + kfree(er.er_data); + return error; +} + +int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) +{ + struct posix_acl *acl = NULL, *clone; + struct gfs2_ea_location el; + char *data; + unsigned int len; + int error; + + error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); + if (error) + return error; + if (!acl) + return gfs2_setattr_simple(ip, attr); + + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto out; + posix_acl_release(acl); + acl = clone; + + error = posix_acl_chmod_masq(acl, attr->ia_mode); + if (!error) { + posix_acl_to_xattr(acl, data, len); + error = gfs2_ea_acl_chmod(ip, &el, attr, data); + } + +out: + posix_acl_release(acl); + brelse(el.el_bh); + kfree(data); + return error; +} + diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h new file mode 100644 index 000000000000..05c294fe0d78 --- /dev/null +++ b/fs/gfs2/acl.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __ACL_DOT_H__ +#define __ACL_DOT_H__ + +#include "incore.h" + +#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" +#define GFS2_POSIX_ACL_ACCESS_LEN 16 +#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" +#define GFS2_POSIX_ACL_DEFAULT_LEN 17 + +#define GFS2_ACL_IS_ACCESS(name, len) \ + ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \ + !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len))) + +#define GFS2_ACL_IS_DEFAULT(name, len) \ + ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \ + !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len))) + +struct gfs2_ea_request; + +int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, + struct gfs2_ea_request *er, + int *remove, mode_t *mode); +int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access); +int gfs2_check_acl_locked(struct inode *inode, int mask); +int gfs2_check_acl(struct inode *inode, int mask); +int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip); +int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); + +#endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c new file mode 100644 index 000000000000..cc57f2ecd219 --- /dev/null +++ b/fs/gfs2/bmap.c @@ -0,0 +1,1221 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "dir.h" +#include "util.h" +#include "ops_address.h" + +/* This doesn't need to be that large as max 64 bit pointers in a 4k + * block is 512, so __u16 is fine for that. It saves stack space to + * keep it small. + */ +struct metapath { + __u16 mp_list[GFS2_MAX_META_HEIGHT]; +}; + +typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, u64 *top, + u64 *bottom, unsigned int height, + void *data); + +struct strip_mine { + int sm_first; + unsigned int sm_height; +}; + +/** + * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * @ip: the inode + * @dibh: the dinode buffer + * @block: the block number that was allocated + * @private: any locked page held by the caller process + * + * Returns: errno + */ + +static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct buffer_head *bh; + int release = 0; + + if (!page || page->index) { + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + release = 1; + } + + if (!PageUptodate(page)) { + void *kaddr = kmap(page); + + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), + ip->i_di.di_size); + memset(kaddr + ip->i_di.di_size, 0, + PAGE_CACHE_SIZE - ip->i_di.di_size); + kunmap(page); + + SetPageUptodate(page); + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, + (1 << BH_Uptodate)); + + bh = page_buffers(page); + + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); + + set_buffer_uptodate(bh); + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + mark_buffer_dirty(bh); + + if (release) { + unlock_page(page); + page_cache_release(page); + } + + return 0; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * @unstuffer: the routine that handles unstuffing a non-zero length file + * @private: private data for the unstuffer + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *bh, *dibh; + struct gfs2_dinode *di; + u64 block = 0; + int isdir = gfs2_is_dir(ip); + int error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (ip->i_di.di_size) { + /* Get a free block, fill it with the stuffed data, + and write it out to disk */ + + if (isdir) { + block = gfs2_alloc_meta(ip); + + error = gfs2_dir_get_new_buffer(ip, block, &bh); + if (error) + goto out_brelse; + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(bh); + } else { + block = gfs2_alloc_data(ip); + + error = gfs2_unstuffer_page(ip, dibh, block, page); + if (error) + goto out_brelse; + } + } + + /* Set up the pointer to the new block */ + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + if (ip->i_di.di_size) { + *(__be64 *)(di + 1) = cpu_to_be64(block); + ip->i_di.di_blocks++; + di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); + } + + ip->i_di.di_height = 1; + di->di_height = cpu_to_be16(1); + +out_brelse: + brelse(dibh); +out: + up_write(&ip->i_rw_mutex); + return error; +} + +/** + * calc_tree_height - Calculate the height of a metadata tree + * @ip: The GFS2 inode + * @size: The proposed size of the file + * + * Work out how tall a metadata tree needs to be in order to accommodate a + * file of a particular size. If size is less than the current size of + * the inode, then the current size of the inode is used instead of the + * supplied one. + * + * Returns: the height the tree should be + */ + +static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 *arr; + unsigned int max, height; + + if (ip->i_di.di_size > size) + size = ip->i_di.di_size; + + if (gfs2_is_dir(ip)) { + arr = sdp->sd_jheightsize; + max = sdp->sd_max_jheight; + } else { + arr = sdp->sd_heightsize; + max = sdp->sd_max_height; + } + + for (height = 0; height < max; height++) + if (arr[height] >= size) + break; + + return height; +} + +/** + * build_height - Build a metadata tree of the requested height + * @ip: The GFS2 inode + * @height: The height to build to + * + * + * Returns: errno + */ + +static int build_height(struct inode *inode, unsigned height) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned new_height = height - ip->i_di.di_height; + struct buffer_head *dibh; + struct buffer_head *blocks[GFS2_MAX_META_HEIGHT]; + struct gfs2_dinode *di; + int error; + u64 *bp; + u64 bn; + unsigned n; + + if (height <= ip->i_di.di_height) + return 0; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + for(n = 0; n < new_height; n++) { + bn = gfs2_alloc_meta(ip); + blocks[n] = gfs2_meta_new(ip->i_gl, bn); + gfs2_trans_add_bh(ip->i_gl, blocks[n], 1); + } + + n = 0; + bn = blocks[0]->b_blocknr; + if (new_height > 1) { + for(; n < new_height-1; n++) { + gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, + GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(blocks[n], + sizeof(struct gfs2_meta_header)); + bp = (u64 *)(blocks[n]->b_data + + sizeof(struct gfs2_meta_header)); + *bp = cpu_to_be64(blocks[n+1]->b_blocknr); + brelse(blocks[n]); + blocks[n] = NULL; + } + } + gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(blocks[n]); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + *(__be64 *)(di + 1) = cpu_to_be64(bn); + ip->i_di.di_height += new_height; + ip->i_di.di_blocks += new_height; + di->di_height = cpu_to_be16(ip->i_di.di_height); + di->di_blocks = cpu_to_be64(ip->i_di.di_blocks); + brelse(dibh); + return error; +} + +/** + * find_metapath - Find path through the metadata tree + * @ip: The inode pointer + * @mp: The metapath to return the result in + * @block: The disk block to look up + * + * This routine returns a struct metapath structure that defines a path + * through the metadata of inode "ip" to get to block "block". + * + * Example: + * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a + * filesystem with a blocksize of 4096. + * + * find_metapath() would return a struct metapath structure set to: + * mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48, + * and mp_list[2] = 165. + * + * That means that in order to get to the block containing the byte at + * offset 101342453, we would load the indirect block pointed to by pointer + * 0 in the dinode. We would then load the indirect block pointed to by + * pointer 48 in that indirect block. We would then load the data block + * pointed to by pointer 165 in that indirect block. + * + * ---------------------------------------- + * | Dinode | | + * | | 4| + * | |0 1 2 3 4 5 9| + * | | 6| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 5| + * | 4 4 4 4 4 5 5 1| + * |0 5 6 7 8 9 0 1 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 1 1 1 1 1 5| + * | 6 6 6 6 6 1| + * |0 3 4 5 6 7 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Data block containing offset | + * | 101342453 | + * | | + * | | + * ---------------------------------------- + * + */ + +static void find_metapath(struct gfs2_inode *ip, u64 block, + struct metapath *mp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 b = block; + unsigned int i; + + for (i = ip->i_di.di_height; i--;) + mp->mp_list[i] = do_div(b, sdp->sd_inptrs); + +} + +/** + * metapointer - Return pointer to start of metadata in a buffer + * @bh: The buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + * + * Return a pointer to the block number of the next height of the metadata + * tree given a buffer containing the pointer to the current height of the + * metadata tree. + */ + +static inline u64 *metapointer(struct buffer_head *bh, int *boundary, + unsigned int height, const struct metapath *mp) +{ + unsigned int head_size = (height > 0) ? + sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); + u64 *ptr; + *boundary = 0; + ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height]; + if (ptr + 1 == (u64 *)(bh->b_data + bh->b_size)) + *boundary = 1; + return ptr; +} + +/** + * lookup_block - Get the next metadata block in metadata tree + * @ip: The GFS2 inode + * @bh: Buffer containing the pointers to metadata blocks + * @height: The height of the tree (0 = dinode) + * @mp: The metapath + * @create: Non-zero if we may create a new meatdata block + * @new: Used to indicate if we did create a new metadata block + * @block: the returned disk block number + * + * Given a metatree, complete to a particular height, checks to see if the next + * height of the tree exists. If not the next height of the tree is created. + * The block number of the next height of the metadata tree is returned. + * + */ + +static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh, + unsigned int height, struct metapath *mp, int create, + int *new, u64 *block) +{ + int boundary; + u64 *ptr = metapointer(bh, &boundary, height, mp); + + if (*ptr) { + *block = be64_to_cpu(*ptr); + return boundary; + } + + *block = 0; + + if (!create) + return 0; + + if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip)) + *block = gfs2_alloc_data(ip); + else + *block = gfs2_alloc_meta(ip); + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + *ptr = cpu_to_be64(*block); + ip->i_di.di_blocks++; + + *new = 1; + return 0; +} + +/** + * gfs2_block_pointers - Map a block from an inode to a disk block + * @inode: The inode + * @lblock: The logical block number + * @map_bh: The bh to be mapped + * @mp: metapath to use + * + * Find the block number on the current device which corresponds to an + * inode's block. If the block had to be created, "new" will be set. + * + * Returns: errno + */ + +static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create, + struct buffer_head *bh_map, struct metapath *mp, + unsigned int maxlen) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + unsigned int bsize; + unsigned int height; + unsigned int end_of_metadata; + unsigned int x; + int error = 0; + int new = 0; + u64 dblock = 0; + int boundary; + + BUG_ON(maxlen == 0); + + if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) + return 0; + + bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; + + height = calc_tree_height(ip, (lblock + 1) * bsize); + if (ip->i_di.di_height < height) { + if (!create) + return 0; + + error = build_height(inode, height); + if (error) + return error; + } + + find_metapath(ip, lblock, mp); + end_of_metadata = ip->i_di.di_height - 1; + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + + for (x = 0; x < end_of_metadata; x++) { + lookup_block(ip, bh, x, mp, create, &new, &dblock); + brelse(bh); + if (!dblock) + return 0; + + error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh); + if (error) + return error; + } + + boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock); + clear_buffer_mapped(bh_map); + clear_buffer_new(bh_map); + clear_buffer_boundary(bh_map); + + if (dblock) { + map_bh(bh_map, inode->i_sb, dblock); + if (boundary) + set_buffer_boundary(bh); + if (new) { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + set_buffer_new(bh_map); + goto out_brelse; + } + while(--maxlen && !buffer_boundary(bh_map)) { + u64 eblock; + + mp->mp_list[end_of_metadata]++; + boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock); + if (eblock != ++dblock) + break; + bh_map->b_size += (1 << inode->i_blkbits); + if (boundary) + set_buffer_boundary(bh_map); + } + } +out_brelse: + brelse(bh); + return 0; +} + + +static inline void bmap_lock(struct inode *inode, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + if (create) + down_write(&ip->i_rw_mutex); + else + down_read(&ip->i_rw_mutex); +} + +static inline void bmap_unlock(struct inode *inode, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + if (create) + up_write(&ip->i_rw_mutex); + else + up_read(&ip->i_rw_mutex); +} + +int gfs2_block_map(struct inode *inode, u64 lblock, int create, + struct buffer_head *bh, unsigned int maxlen) +{ + struct metapath mp; + int ret; + + bmap_lock(inode, create); + ret = gfs2_block_pointers(inode, lblock, create, bh, &mp, maxlen); + bmap_unlock(inode, create); + return ret; +} + +int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) +{ + struct metapath mp; + struct buffer_head bh = { .b_state = 0, .b_blocknr = 0, .b_size = 0 }; + int ret; + int create = *new; + + BUG_ON(!extlen); + BUG_ON(!dblock); + BUG_ON(!new); + + bmap_lock(inode, create); + ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp, 32); + bmap_unlock(inode, create); + *extlen = bh.b_size >> inode->i_blkbits; + *dblock = bh.b_blocknr; + if (buffer_new(&bh)) + *new = 1; + else + *new = 0; + return ret; +} + +/** + * recursive_scan - recursively scan through the end of a file + * @ip: the inode + * @dibh: the dinode buffer + * @mp: the path through the metadata to the point to start + * @height: the height the recursion is at + * @block: the indirect block to look at + * @first: 1 if this is the first block + * @bc: the call to make for each piece of metadata + * @data: data opaque to this function to pass to @bc + * + * When this is first called @height and @block should be zero and + * @first should be 1. + * + * Returns: errno + */ + +static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, + struct metapath *mp, unsigned int height, + u64 block, int first, block_call_t bc, + void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh = NULL; + u64 *top, *bottom; + u64 bn; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (!height) { + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + dibh = bh; + + top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; + bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; + } else { + error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh); + if (error) + return error; + + top = (u64 *)(bh->b_data + mh_size) + + (first ? mp->mp_list[height] : 0); + + bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; + } + + error = bc(ip, dibh, bh, top, bottom, height, data); + if (error) + goto out; + + if (height < ip->i_di.di_height - 1) + for (; top < bottom; top++, first = 0) { + if (!*top) + continue; + + bn = be64_to_cpu(*top); + + error = recursive_scan(ip, dibh, mp, height + 1, bn, + first, bc, data); + if (error) + break; + } + +out: + brelse(bh); + return error; +} + +/** + * do_strip - Look for a layer a particular layer of the file and strip it off + * @ip: the inode + * @dibh: the dinode buffer + * @bh: A buffer of pointers + * @top: The first pointer in the buffer + * @bottom: One more than the last pointer + * @height: the height this buffer is at + * @data: a pointer to a struct strip_mine + * + * Returns: errno + */ + +static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, + struct buffer_head *bh, u64 *top, u64 *bottom, + unsigned int height, void *data) +{ + struct strip_mine *sm = data; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + u64 bn, bstart; + u32 blen; + u64 *p; + unsigned int rg_blocks = 0; + int metadata; + unsigned int revokes = 0; + int x; + int error; + + if (!*top) + sm->sm_first = 0; + + if (height != sm->sm_height) + return 0; + + if (sm->sm_first) { + top++; + sm->sm_first = 0; + } + + metadata = (height != ip->i_di.di_height - 1); + if (metadata) + revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; + + error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh); + if (error) + return error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + bstart = 0; + blen = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + + bstart = bn; + blen = 1; + } + } + + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; /* Nothing to do */ + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_ri.ri_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + + RES_INDIRECT + RES_STATFS + RES_QUOTA, + revokes); + if (error) + goto out_rg_gunlock; + + down_write(&ip->i_rw_mutex); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + bstart = 0; + blen = 0; + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + + bn = be64_to_cpu(*p); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) { + if (metadata) + gfs2_free_meta(ip, bstart, blen); + else + gfs2_free_data(ip, bstart, blen); + } + + bstart = bn; + blen = 1; + } + + *p = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + } + if (bstart) { + if (metadata) + gfs2_free_meta(ip, bstart, blen); + else + gfs2_free_data(ip, bstart, blen); + } + + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + + gfs2_dinode_out(&ip->i_di, dibh->b_data); + + up_write(&ip->i_rw_mutex); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); +out: + gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh); + return error; +} + +/** + * do_grow - Make a file look bigger than it is + * @ip: the inode + * @size: the size to set the file to + * + * Called with an exclusive lock on @ip. + * + * Returns: errno + */ + +static int do_grow(struct gfs2_inode *ip, u64 size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al; + struct buffer_head *dibh; + unsigned int h; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = sdp->sd_max_height + RES_DATA; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, + sdp->sd_max_height + al->al_rgd->rd_ri.ri_length + + RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto out_end_trans; + } + + h = calc_tree_height(ip, size); + if (ip->i_di.di_height < h) { + down_write(&ip->i_rw_mutex); + error = build_height(&ip->i_inode, h); + up_write(&ip->i_rw_mutex); + if (error) + goto out_end_trans; + } + } + + ip->i_di.di_size = size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + gfs2_inplace_release(ip); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + gfs2_alloc_put(ip); + return error; +} + + +/** + * gfs2_block_truncate_page - Deal with zeroing out data for truncate + * + * This is partly borrowed from ext3. + */ +static int gfs2_block_truncate_page(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t from = inode->i_size; + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct buffer_head *bh; + struct page *page; + void *kaddr; + int err; + + page = grab_cache_page(mapping, index); + if (!page) + return 0; + + blocksize = inode->i_sb->s_blocksize; + length = blocksize - (offset & (blocksize - 1)); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + + if (!buffer_mapped(bh)) { + gfs2_get_block(inode, iblock, bh, 0); + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int trunc_start(struct gfs2_inode *ip, u64 size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int journaled = gfs2_is_jdata(ip); + int error; + + error = gfs2_trans_begin(sdp, + RES_DINODE + (journaled ? RES_JDATA : 0), 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (gfs2_is_stuffed(ip)) { + ip->i_di.di_size = size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size); + error = 1; + + } else { + if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) + error = gfs2_block_truncate_page(ip->i_inode.i_mapping); + + if (!error) { + ip->i_di.di_size = size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + } + } + + brelse(dibh); + +out: + gfs2_trans_end(sdp); + return error; +} + +static int trunc_dealloc(struct gfs2_inode *ip, u64 size) +{ + unsigned int height = ip->i_di.di_height; + u64 lblock; + struct metapath mp; + int error; + + if (!size) + lblock = 0; + else + lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift; + + find_metapath(ip, lblock, &mp); + gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + while (height--) { + struct strip_mine sm; + sm.sm_first = !!size; + sm.sm_height = height; + + error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm); + if (error) + break; + } + + gfs2_quota_unhold(ip); + +out: + gfs2_alloc_put(ip); + return error; +} + +static int trunc_end(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (!ip->i_di.di_size) { + ip->i_di.di_height = 0; + ip->i_di.di_goal_meta = + ip->i_di.di_goal_data = + ip->i_num.no_addr; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + } + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + +out: + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + return error; +} + +/** + * do_shrink - make a file smaller + * @ip: the inode + * @size: the size to make the file + * @truncator: function to truncate the last partial block + * + * Called with an exclusive lock on @ip. + * + * Returns: errno + */ + +static int do_shrink(struct gfs2_inode *ip, u64 size) +{ + int error; + + error = trunc_start(ip, size); + if (error < 0) + return error; + if (error > 0) + return 0; + + error = trunc_dealloc(ip, size); + if (!error) + error = trunc_end(ip); + + return error; +} + +/** + * gfs2_truncatei - make a file a given size + * @ip: the inode + * @size: the size to make the file + * @truncator: function to truncate the last partial block + * + * The file size can grow, shrink, or stay the same size. + * + * Returns: errno + */ + +int gfs2_truncatei(struct gfs2_inode *ip, u64 size) +{ + int error; + + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode))) + return -EINVAL; + + if (size > ip->i_di.di_size) + error = do_grow(ip, size); + else + error = do_shrink(ip, size); + + return error; +} + +int gfs2_truncatei_resume(struct gfs2_inode *ip) +{ + int error; + error = trunc_dealloc(ip, ip->i_di.di_size); + if (!error) + error = trunc_end(ip); + return error; +} + +int gfs2_file_dealloc(struct gfs2_inode *ip) +{ + return trunc_dealloc(ip, 0); +} + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, + unsigned int *data_blocks, unsigned int *ind_blocks) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + if (gfs2_is_dir(ip)) { + *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2; + *ind_blocks = 3 * (sdp->sd_max_jheight - 1); + } else { + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + } + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + +/** + * gfs2_write_alloc_required - figure out if a write will require an allocation + * @ip: the file being written to + * @offset: the offset to write to + * @len: the number of bytes being written + * @alloc_required: set to 1 if an alloc is required, 0 otherwise + * + * Returns: errno + */ + +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len, int *alloc_required) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 lblock, lblock_stop, dblock; + u32 extlen; + int new = 0; + int error = 0; + + *alloc_required = 0; + + if (!len) + return 0; + + if (gfs2_is_stuffed(ip)) { + if (offset + len > + sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + *alloc_required = 1; + return 0; + } + + if (gfs2_is_dir(ip)) { + unsigned int bsize = sdp->sd_jbsize; + lblock = offset; + do_div(lblock, bsize); + lblock_stop = offset + len + bsize - 1; + do_div(lblock_stop, bsize); + } else { + unsigned int shift = sdp->sd_sb.sb_bsize_shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + } + + for (; lblock < lblock_stop; lblock += extlen) { + error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); + if (error) + return error; + + if (!dblock) { + *alloc_required = 1; + return 0; + } + } + + return 0; +} + diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h new file mode 100644 index 000000000000..0fd379b4cd9e --- /dev/null +++ b/fs/gfs2/bmap.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __BMAP_DOT_H__ +#define __BMAP_DOT_H__ + +struct inode; +struct gfs2_inode; +struct page; + +int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh, unsigned int maxlen); +int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); + +int gfs2_truncatei(struct gfs2_inode *ip, u64 size); +int gfs2_truncatei_resume(struct gfs2_inode *ip); +int gfs2_file_dealloc(struct gfs2_inode *ip); + +void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks); +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len, int *alloc_required); + +#endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c new file mode 100644 index 000000000000..cab1f68d4685 --- /dev/null +++ b/fs/gfs2/daemon.c @@ -0,0 +1,196 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "daemon.h" +#include "glock.h" +#include "log.h" +#include "quota.h" +#include "recovery.h" +#include "super.h" +#include "util.h" + +/* This uses schedule_timeout() instead of msleep() because it's good for + the daemons to wake up more often than the timeout when unmounting so + the user's unmount doesn't sit there forever. + + The kthread functions used to start these daemons block and flush signals. */ + +/** + * gfs2_scand - Look for cached glocks and inodes to toss from memory + * @sdp: Pointer to GFS2 superblock + * + * One of these daemons runs, finding candidates to add to sd_reclaim_list. + * See gfs2_glockd() + */ + +int gfs2_scand(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + + while (!kthread_should_stop()) { + gfs2_scand_internal(sdp); + t = gfs2_tune_get(sdp, gt_scand_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + +/** + * gfs2_glockd - Reclaim unused glock structures + * @sdp: Pointer to GFS2 superblock + * + * One or more of these daemons run, reclaiming glocks on sd_reclaim_list. + * Number of daemons can be set by user, with num_glockd mount option. + */ + +int gfs2_glockd(void *data) +{ + struct gfs2_sbd *sdp = data; + + while (!kthread_should_stop()) { + while (atomic_read(&sdp->sd_reclaim_count)) + gfs2_reclaim_glock(sdp); + + wait_event_interruptible(sdp->sd_reclaim_wq, + (atomic_read(&sdp->sd_reclaim_count) || + kthread_should_stop())); + } + + return 0; +} + +/** + * gfs2_recoverd - Recover dead machine's journals + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_recoverd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + + while (!kthread_should_stop()) { + gfs2_check_journals(sdp); + t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + +/** + * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks + * @sdp: Pointer to GFS2 superblock + * + * Also, periodically check to make sure that we're using the most recent + * journal index. + */ + +int gfs2_logd(void *data) +{ + struct gfs2_sbd *sdp = data; + struct gfs2_holder ji_gh; + unsigned long t; + + while (!kthread_should_stop()) { + /* Advance the log tail */ + + t = sdp->sd_log_flush_time + + gfs2_tune_get(sdp, gt_log_flush_secs) * HZ; + + gfs2_ail1_empty(sdp, DIO_ALL); + + if (time_after_eq(jiffies, t)) { + gfs2_log_flush(sdp, NULL); + sdp->sd_log_flush_time = jiffies; + } + + /* Check for latest journal index */ + + t = sdp->sd_jindex_refresh_time + + gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ; + + if (time_after_eq(jiffies, t)) { + if (!gfs2_jindex_hold(sdp, &ji_gh)) + gfs2_glock_dq_uninit(&ji_gh); + sdp->sd_jindex_refresh_time = jiffies; + } + + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + +/** + * gfs2_quotad - Write cached quota changes into the quota file + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_quotad(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + int error; + + while (!kthread_should_stop()) { + /* Update the master statfs file */ + + t = sdp->sd_statfs_sync_time + + gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; + + if (time_after_eq(jiffies, t)) { + error = gfs2_statfs_sync(sdp); + if (error && + error != -EROFS && + !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "quotad: (1) error=%d\n", error); + sdp->sd_statfs_sync_time = jiffies; + } + + /* Update quota file */ + + t = sdp->sd_quota_sync_time + + gfs2_tune_get(sdp, gt_quota_quantum) * HZ; + + if (time_after_eq(jiffies, t)) { + error = gfs2_quota_sync(sdp); + if (error && + error != -EROFS && + !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "quotad: (2) error=%d\n", error); + sdp->sd_quota_sync_time = jiffies; + } + + gfs2_quota_scan(sdp); + + t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; + schedule_timeout_interruptible(t); + } + + return 0; +} + diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h new file mode 100644 index 000000000000..801007120fb2 --- /dev/null +++ b/fs/gfs2/daemon.h @@ -0,0 +1,19 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __DAEMON_DOT_H__ +#define __DAEMON_DOT_H__ + +int gfs2_scand(void *data); +int gfs2_glockd(void *data); +int gfs2_recoverd(void *data); +int gfs2_logd(void *data); +int gfs2_quotad(void *data); + +#endif /* __DAEMON_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c new file mode 100644 index 000000000000..459498cac93b --- /dev/null +++ b/fs/gfs2/dir.c @@ -0,0 +1,1961 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Implements Extendible Hashing as described in: + * "Extendible Hashing" by Fagin, et al in + * __ACM Trans. on Database Systems__, Sept 1979. + * + * + * Here's the layout of dirents which is essentially the same as that of ext2 + * within a single block. The field de_name_len is the number of bytes + * actually required for the name (no null terminator). The field de_rec_len + * is the number of bytes allocated to the dirent. The offset of the next + * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is + * deleted, the preceding dirent inherits its allocated space, ie + * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained + * by adding de_rec_len to the current dirent, this essentially causes the + * deleted dirent to get jumped over when iterating through all the dirents. + * + * When deleting the first dirent in a block, there is no previous dirent so + * the field de_ino is set to zero to designate it as deleted. When allocating + * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the + * first dirent has (de_ino == 0) and de_rec_len is large enough, this first + * dirent is allocated. Otherwise it must go through all the 'used' dirents + * searching for one in which the amount of total space minus the amount of + * used space will provide enough space for the new dirent. + * + * There are two types of blocks in which dirents reside. In a stuffed dinode, + * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of + * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the + * beginning of the leaf block. The dirents reside in leaves when + * + * dip->i_di.di_flags & GFS2_DIF_EXHASH is true + * + * Otherwise, the dirents are "linear", within a single stuffed dinode block. + * + * When the dirents are in leaves, the actual contents of the directory file are + * used as an array of 64-bit block pointers pointing to the leaf blocks. The + * dirents are NOT in the directory file itself. There can be more than one + * block pointer in the array that points to the same leaf. In fact, when a + * directory is first converted from linear to exhash, all of the pointers + * point to the same leaf. + * + * When a leaf is completely full, the size of the hash table can be + * doubled unless it is already at the maximum size which is hard coded into + * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list, + * but never before the maximum hash table size has been reached. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/buffer_head.h> +#include <linux/sort.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/vmalloc.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "bmap.h" +#include "util.h" + +#define IS_LEAF 1 /* Hashed (leaf) directory */ +#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ + +#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) +#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) + +typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, + u64 leaf_no, void *data); +typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, + const struct qstr *name, void *opaque); + + +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + *bhp = bh; + return 0; +} + +static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + int error; + + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + if (error) + return error; + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { + brelse(bh); + return -EIO; + } + *bhp = bh; + return 0; +} + +static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, + unsigned int offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); + if (ip->i_di.di_size < offset + size) + ip->i_di.di_size = offset + size; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + + brelse(dibh); + + return size; +} + + + +/** + * gfs2_dir_write_data - Write directory information to the inode + * @ip: The GFS2 inode + * @buf: The buffer containing information to be written + * @offset: The file offset to start writing at + * @size: The amount of data to write + * + * Returns: The number of bytes correctly written or error code + */ +static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, + u64 offset, unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip) && + offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) + return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, + size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + return error; + } + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + int new; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 1; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error) + goto fail; + error = -EIO; + if (gfs2_assert_withdraw(sdp, dblock)) + goto fail; + } + + if (amount == sdp->sd_jbsize || new) + error = gfs2_dir_get_new_buffer(ip, dblock, &bh); + else + error = gfs2_dir_get_existing_buffer(ip, dblock, &bh); + + if (error) + goto fail; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + memcpy(bh->b_data + o, buf, amount); + brelse(bh); + if (error) + goto fail; + + buf += amount; + copied += amount; + lblock++; + dblock++; + extlen--; + + o = sizeof(struct gfs2_meta_header); + } + +out: + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (ip->i_di.di_size < offset + copied) + ip->i_di.di_size = offset + copied; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + + return copied; +fail: + if (copied) + goto out; + return error; +} + +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf, + u64 offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + offset += sizeof(struct gfs2_dinode); + memcpy(buf, dibh->b_data + offset, size); + brelse(dibh); + } + + return (error) ? error : size; +} + + +/** + * gfs2_dir_read_data - Read a data from a directory inode + * @ip: The GFS2 Inode + * @buf: The buffer to place result into + * @offset: File offset to begin jdata_readng from + * @size: Amount of data to transfer + * + * Returns: The amount of data actually copied or the error + */ +static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, + unsigned int size, unsigned ra) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + + if (offset >= ip->i_di.di_size) + return 0; + + if (offset + size > ip->i_di.di_size) + size = ip->i_di.di_size - offset; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip)) + return gfs2_dir_read_stuffed(ip, buf, offset, size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + int new; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + new = 0; + error = gfs2_extent_map(&ip->i_inode, lblock, &new, + &dblock, &extlen); + if (error || !dblock) + goto fail; + BUG_ON(extlen < 1); + if (!ra) + extlen = 1; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + } + if (!bh) { + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + if (error) + goto fail; + } + error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD); + if (error) { + brelse(bh); + goto fail; + } + dblock++; + extlen--; + memcpy(buf, bh->b_data + o, amount); + brelse(bh); + bh = NULL; + buf += amount; + copied += amount; + lblock++; + o = sizeof(struct gfs2_meta_header); + } + + return copied; +fail: + return (copied) ? copied : error; +} + +static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, int ret) +{ + if (dent->de_inum.no_addr != 0 && + be32_to_cpu(dent->de_hash) == name->hash && + be16_to_cpu(dent->de_name_len) == name->len && + memcmp(dent+1, name->name, name->len) == 0) + return ret; + return 0; +} + +static int gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 1); +} + +static int gfs2_dirent_prev(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 2); +} + +/* + * name->name holds ptr to start of block. + * name->len holds size of block. + */ +static int gfs2_dirent_last(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + const char *start = name->name; + const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len); + if (name->len == (end - start)) + return 1; + return 0; +} + +static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (!dent->de_inum.no_addr) + actual = GFS2_DIRENT_SIZE(0); + if (totlen - actual >= required) + return 1; + return 0; +} + +struct dirent_gather { + const struct gfs2_dirent **pdent; + unsigned offset; +}; + +static int gfs2_dirent_gather(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + struct dirent_gather *g = opaque; + if (dent->de_inum.no_addr) { + g->pdent[g->offset++] = dent; + } + return 0; +} + +/* + * Other possible things to check: + * - Inode located within filesystem size (and on valid block) + * - Valid directory entry type + * Not sure how heavy-weight we want to make this... could also check + * hash is correct for example, but that would take a lot of extra time. + * For now the most important thing is to check that the various sizes + * are correct. + */ +static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset, + unsigned int size, unsigned int len, int first) +{ + const char *msg = "gfs2_dirent too small"; + if (unlikely(size < sizeof(struct gfs2_dirent))) + goto error; + msg = "gfs2_dirent misaligned"; + if (unlikely(offset & 0x7)) + goto error; + msg = "gfs2_dirent points beyond end of block"; + if (unlikely(offset + size > len)) + goto error; + msg = "zero inode number"; + if (unlikely(!first && !dent->de_inum.no_addr)) + goto error; + msg = "name length is greater than space in dirent"; + if (dent->de_inum.no_addr && + unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) > + size)) + goto error; + return 0; +error: + printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg, + first ? "first in block" : "not first in block"); + return -EIO; +} + +static int gfs2_dirent_offset(const void *buf) +{ + const struct gfs2_meta_header *h = buf; + int offset; + + BUG_ON(buf == NULL); + + switch(be32_to_cpu(h->mh_type)) { + case GFS2_METATYPE_LF: + offset = sizeof(struct gfs2_leaf); + break; + case GFS2_METATYPE_DI: + offset = sizeof(struct gfs2_dinode); + break; + default: + goto wrong_type; + } + return offset; +wrong_type: + printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n", + be32_to_cpu(h->mh_type)); + return -1; +} + +static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, + unsigned int len, gfs2_dscan_t scan, + const struct qstr *name, + void *opaque) +{ + struct gfs2_dirent *dent, *prev; + unsigned offset; + unsigned size; + int ret = 0; + + ret = gfs2_dirent_offset(buf); + if (ret < 0) + goto consist_inode; + + offset = ret; + prev = NULL; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 1)) + goto consist_inode; + do { + ret = scan(dent, name, opaque); + if (ret) + break; + offset += size; + if (offset == len) + break; + prev = dent; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(dent, offset, size, len, 0)) + goto consist_inode; + } while(1); + + switch(ret) { + case 0: + return NULL; + case 1: + return dent; + case 2: + return prev ? prev : dent; + default: + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + +consist_inode: + gfs2_consist_inode(GFS2_I(inode)); + return ERR_PTR(-EIO); +} + + +/** + * dirent_first - Return the first dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * return first dirent whether bh points to leaf or stuffed dinode + * + * Returns: IS_LEAF, IS_DINODE, or -errno + */ + +static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data; + + if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) { + if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh)) + return -EIO; + *dent = (struct gfs2_dirent *)(bh->b_data + + sizeof(struct gfs2_leaf)); + return IS_LEAF; + } else { + if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI)) + return -EIO; + *dent = (struct gfs2_dirent *)(bh->b_data + + sizeof(struct gfs2_dinode)); + return IS_DINODE; + } +} + +static int dirent_check_reclen(struct gfs2_inode *dip, + const struct gfs2_dirent *d, const void *end_p) +{ + const void *ptr = d; + u16 rec_len = be16_to_cpu(d->de_rec_len); + + if (unlikely(rec_len < sizeof(struct gfs2_dirent))) + goto broken; + ptr += rec_len; + if (ptr < end_p) + return rec_len; + if (ptr == end_p) + return -ENOENT; +broken: + gfs2_consist_inode(dip); + return -EIO; +} + +/** + * dirent_next - Next dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * Returns: 0 on success, error code otherwise + */ + +static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_dirent *cur = *dent, *tmp; + char *bh_end = bh->b_data + bh->b_size; + int ret; + + ret = dirent_check_reclen(dip, cur, bh_end); + if (ret < 0) + return ret; + + tmp = (void *)cur + ret; + ret = dirent_check_reclen(dip, tmp, bh_end); + if (ret == -EIO) + return ret; + + /* Only the first dent could ever have de_inum.no_addr == 0 */ + if (!tmp->de_inum.no_addr) { + gfs2_consist_inode(dip); + return -EIO; + } + + *dent = tmp; + return 0; +} + +/** + * dirent_del - Delete a dirent + * @dip: The GFS2 inode + * @bh: The buffer + * @prev: The previous dirent + * @cur: The current dirent + * + */ + +static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent *prev, struct gfs2_dirent *cur) +{ + u16 cur_rec_len, prev_rec_len; + + if (!cur->de_inum.no_addr) { + gfs2_consist_inode(dip); + return; + } + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + + /* If there is no prev entry, this is the first entry in the block. + The de_rec_len is already as big as it needs to be. Just zero + out the inode number and return. */ + + if (!prev) { + cur->de_inum.no_addr = 0; /* No endianess worries */ + return; + } + + /* Combine this dentry with the previous one. */ + + prev_rec_len = be16_to_cpu(prev->de_rec_len); + cur_rec_len = be16_to_cpu(cur->de_rec_len); + + if ((char *)prev + prev_rec_len != (char *)cur) + gfs2_consist_inode(dip); + if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size) + gfs2_consist_inode(dip); + + prev_rec_len += cur_rec_len; + prev->de_rec_len = cpu_to_be16(prev_rec_len); +} + +/* + * Takes a dent from which to grab space as an argument. Returns the + * newly created dent. + */ +static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned offset = 0, totlen; + + if (dent->de_inum.no_addr) + offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + +static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name) +{ + struct gfs2_dirent *dent; + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_find_space, name, NULL); + if (!dent || IS_ERR(dent)) + return dent; + return gfs2_init_dirent(inode, dent, name, bh); +} + +static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, + struct buffer_head **bhp) +{ + int error; + + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { + /* printk(KERN_INFO "block num=%llu\n", leaf_no); */ + error = -EIO; + } + + return error; +} + +/** + * get_leaf_nr - Get a leaf number associated with the index + * @dip: The GFS2 inode + * @index: + * @leaf_out: + * + * Returns: 0 on success, error code otherwise + */ + +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, + u64 *leaf_out) +{ + u64 leaf_no; + int error; + + error = gfs2_dir_read_data(dip, (char *)&leaf_no, + index * sizeof(u64), + sizeof(u64), 0); + if (error != sizeof(u64)) + return (error < 0) ? error : -EIO; + + *leaf_out = be64_to_cpu(leaf_no); + + return 0; +} + +static int get_first_leaf(struct gfs2_inode *dip, u32 index, + struct buffer_head **bh_out) +{ + u64 leaf_no; + int error; + + error = get_leaf_nr(dip, index, &leaf_no); + if (!error) + error = get_leaf(dip, leaf_no, bh_out); + + return error; +} + +static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, + const struct qstr *name, + gfs2_dscan_t scan, + struct buffer_head **pbh) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf; + unsigned hsize = 1 << ip->i_di.di_depth; + unsigned index; + u64 ln; + if (hsize * sizeof(u64) != ip->i_di.di_size) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + index = name->hash >> (32 - ip->i_di.di_depth); + error = get_first_leaf(ip, index, &bh); + if (error) + return ERR_PTR(error); + do { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + scan, name, NULL); + if (dent) + goto got_dent; + leaf = (struct gfs2_leaf *)bh->b_data; + ln = be64_to_cpu(leaf->lf_next); + brelse(bh); + if (!ln) + break; + + error = get_leaf(ip, ln, &bh); + } while(!error); + + return error ? ERR_PTR(error) : NULL; + } + + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return ERR_PTR(error); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); +got_dent: + if (unlikely(dent == NULL || IS_ERR(dent))) { + brelse(bh); + bh = NULL; + } + *pbh = bh; + return dent; +} + +static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) +{ + struct gfs2_inode *ip = GFS2_I(inode); + u64 bn = gfs2_alloc_meta(ip); + struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); + struct gfs2_leaf *leaf; + struct gfs2_dirent *dent; + struct qstr name = { .name = "", .len = 0, .hash = 0 }; + if (!bh) + return NULL; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_depth = cpu_to_be16(depth); + leaf->lf_entries = 0; + leaf->lf_dirent_format = cpu_to_be16(GFS2_FORMAT_DE); + leaf->lf_next = 0; + memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + dent = (struct gfs2_dirent *)(leaf+1); + gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); + *pbh = bh; + return leaf; +} + +/** + * dir_make_exhash - Convert a stuffed directory into an ExHash directory + * @dip: The GFS2 inode + * + * Returns: 0 on success, error code otherwise + */ + +static int dir_make_exhash(struct inode *inode) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_dirent *dent; + struct qstr args; + struct buffer_head *bh, *dibh; + struct gfs2_leaf *leaf; + int y; + u32 x; + u64 *lp, bn; + int error; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + /* Turn over a new leaf */ + + leaf = new_leaf(inode, &bh, 0); + if (!leaf) + return -ENOSPC; + bn = bh->b_blocknr; + + gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); + + /* Copy dirents */ + + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh, + sizeof(struct gfs2_dinode)); + + /* Find last entry */ + + x = 0; + args.len = bh->b_size - sizeof(struct gfs2_dinode) + + sizeof(struct gfs2_leaf); + args.name = bh->b_data; + dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size, + gfs2_dirent_last, &args, NULL); + if (!dent) { + brelse(bh); + brelse(dibh); + return -EIO; + } + if (IS_ERR(dent)) { + brelse(bh); + brelse(dibh); + return PTR_ERR(dent); + } + + /* Adjust the last dirent's record length + (Remember that dent still points to the last entry.) */ + + dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) + + sizeof(struct gfs2_dinode) - + sizeof(struct gfs2_leaf)); + + brelse(bh); + + /* We're done with the new leaf block, now setup the new + hash table. */ + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); + + for (x = sdp->sd_hash_ptrs; x--; lp++) + *lp = cpu_to_be64(bn); + + dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; + dip->i_di.di_blocks++; + dip->i_di.di_flags |= GFS2_DIF_EXHASH; + dip->i_di.di_payload_format = 0; + + for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; + dip->i_di.di_depth = y; + + gfs2_dinode_out(&dip->i_di, dibh->b_data); + + brelse(dibh); + + return 0; +} + +/** + * dir_split_leaf - Split a leaf block into two + * @dip: The GFS2 inode + * @index: + * @leaf_no: + * + * Returns: 0 on success, error code on failure + */ + +static int dir_split_leaf(struct inode *inode, const struct qstr *name) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct buffer_head *nbh, *obh, *dibh; + struct gfs2_leaf *nleaf, *oleaf; + struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new; + u32 start, len, half_len, divider; + u64 bn, *lp, leaf_no; + u32 index; + int x, moved = 0; + int error; + + index = name->hash >> (32 - dip->i_di.di_depth); + error = get_leaf_nr(dip, index, &leaf_no); + if (error) + return error; + + /* Get the old leaf block */ + error = get_leaf(dip, leaf_no, &obh); + if (error) + return error; + + oleaf = (struct gfs2_leaf *)obh->b_data; + if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) { + brelse(obh); + return 1; /* can't split */ + } + + gfs2_trans_add_bh(dip->i_gl, obh, 1); + + nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); + if (!nleaf) { + brelse(obh); + return -ENOSPC; + } + bn = nbh->b_blocknr; + + /* Compute the start and len of leaf pointers in the hash table. */ + len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth)); + half_len = len >> 1; + if (!half_len) { + printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index); + gfs2_consist_inode(dip); + error = -EIO; + goto fail_brelse; + } + + start = (index & ~(len - 1)); + + /* Change the pointers. + Don't bother distinguishing stuffed from non-stuffed. + This code is complicated enough already. */ + lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL); + /* Change the pointers */ + for (x = 0; x < half_len; x++) + lp[x] = cpu_to_be64(bn); + + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), + half_len * sizeof(u64)); + if (error != half_len * sizeof(u64)) { + if (error >= 0) + error = -EIO; + goto fail_lpfree; + } + + kfree(lp); + + /* Compute the divider */ + divider = (start + half_len) << (32 - dip->i_di.di_depth); + + /* Copy the entries */ + dirent_first(dip, obh, &dent); + + do { + next = dent; + if (dirent_next(dip, obh, &next)) + next = NULL; + + if (dent->de_inum.no_addr && + be32_to_cpu(dent->de_hash) < divider) { + struct qstr str; + str.name = (char*)(dent+1); + str.len = be16_to_cpu(dent->de_name_len); + str.hash = be32_to_cpu(dent->de_hash); + new = gfs2_dirent_alloc(inode, nbh, &str); + if (IS_ERR(new)) { + error = PTR_ERR(new); + break; + } + + new->de_inum = dent->de_inum; /* No endian worries */ + new->de_type = dent->de_type; /* No endian worries */ + nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1); + + dirent_del(dip, obh, prev, dent); + + if (!oleaf->lf_entries) + gfs2_consist_inode(dip); + oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1); + + if (!prev) + prev = dent; + + moved = 1; + } else { + prev = dent; + } + dent = next; + } while (dent); + + oleaf->lf_depth = nleaf->lf_depth; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { + dip->i_di.di_blocks++; + gfs2_dinode_out(&dip->i_di, dibh->b_data); + brelse(dibh); + } + + brelse(obh); + brelse(nbh); + + return error; + +fail_lpfree: + kfree(lp); + +fail_brelse: + brelse(obh); + brelse(nbh); + return error; +} + +/** + * dir_double_exhash - Double size of ExHash table + * @dip: The GFS2 dinode + * + * Returns: 0 on success, error code on failure + */ + +static int dir_double_exhash(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *dibh; + u32 hsize; + u64 *buf; + u64 *from, *to; + u64 block; + int x; + int error = 0; + + hsize = 1 << dip->i_di.di_depth; + if (hsize * sizeof(u64) != dip->i_di.di_size) { + gfs2_consist_inode(dip); + return -EIO; + } + + /* Allocate both the "from" and "to" buffers in one big chunk */ + + buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL); + + for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { + error = gfs2_dir_read_data(dip, (char *)buf, + block * sdp->sd_hash_bsize, + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + + from = buf; + to = (u64 *)((char *)buf + sdp->sd_hash_bsize); + + for (x = sdp->sd_hash_ptrs; x--; from++) { + *to++ = *from; /* No endianess worries */ + *to++ = *from; + } + + error = gfs2_dir_write_data(dip, + (char *)buf + sdp->sd_hash_bsize, + block * sdp->sd_sb.sb_bsize, + sdp->sd_sb.sb_bsize); + if (error != sdp->sd_sb.sb_bsize) { + if (error >= 0) + error = -EIO; + goto fail; + } + } + + kfree(buf); + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(sdp, !error)) { + dip->i_di.di_depth++; + gfs2_dinode_out(&dip->i_di, dibh->b_data); + brelse(dibh); + } + + return error; + +fail: + kfree(buf); + return error; +} + +/** + * compare_dents - compare directory entries by hash value + * @a: first dent + * @b: second dent + * + * When comparing the hash entries of @a to @b: + * gt: returns 1 + * lt: returns -1 + * eq: returns 0 + */ + +static int compare_dents(const void *a, const void *b) +{ + const struct gfs2_dirent *dent_a, *dent_b; + u32 hash_a, hash_b; + int ret = 0; + + dent_a = *(const struct gfs2_dirent **)a; + hash_a = be32_to_cpu(dent_a->de_hash); + + dent_b = *(const struct gfs2_dirent **)b; + hash_b = be32_to_cpu(dent_b->de_hash); + + if (hash_a > hash_b) + ret = 1; + else if (hash_a < hash_b) + ret = -1; + else { + unsigned int len_a = be16_to_cpu(dent_a->de_name_len); + unsigned int len_b = be16_to_cpu(dent_b->de_name_len); + + if (len_a > len_b) + ret = 1; + else if (len_a < len_b) + ret = -1; + else + ret = memcmp(dent_a + 1, dent_b + 1, len_a); + } + + return ret; +} + +/** + * do_filldir_main - read out directory entries + * @dip: The GFS2 inode + * @offset: The offset in the file to read from + * @opaque: opaque data to pass to filldir + * @filldir: The function to pass entries to + * @darr: an array of struct gfs2_dirent pointers to read + * @entries: the number of entries in darr + * @copied: pointer to int that's non-zero if a entry has been copied out + * + * Jump through some hoops to make sure that if there are hash collsions, + * they are read out at the beginning of a buffer. We want to minimize + * the possibility that they will fall into different readdir buffers or + * that someone will want to seek to that location. + * + * Returns: errno, >0 on exception from filldir + */ + +static int do_filldir_main(struct gfs2_inode *dip, u64 *offset, + void *opaque, gfs2_filldir_t filldir, + const struct gfs2_dirent **darr, u32 entries, + int *copied) +{ + const struct gfs2_dirent *dent, *dent_next; + struct gfs2_inum inum; + u64 off, off_next; + unsigned int x, y; + int run = 0; + int error = 0; + + sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); + + dent_next = darr[0]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + for (x = 0, y = 1; x < entries; x++, y++) { + dent = dent_next; + off = off_next; + + if (y < entries) { + dent_next = darr[y]; + off_next = be32_to_cpu(dent_next->de_hash); + off_next = gfs2_disk_hash2offset(off_next); + + if (off < *offset) + continue; + *offset = off; + + if (off_next == off) { + if (*copied && !run) + return 1; + run = 1; + } else + run = 0; + } else { + if (off < *offset) + continue; + *offset = off; + } + + gfs2_inum_in(&inum, (char *)&dent->de_inum); + + error = filldir(opaque, (const char *)(dent + 1), + be16_to_cpu(dent->de_name_len), + off, &inum, + be16_to_cpu(dent->de_type)); + if (error) + return 1; + + *copied = 1; + } + + /* Increment the *offset by one, so the next time we come into the + do_filldir fxn, we get the next entry instead of the last one in the + current leaf */ + + (*offset)++; + + return 0; +} + +static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque, + gfs2_filldir_t filldir, int *copied, + unsigned *depth, u64 leaf_no) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_leaf *lf; + unsigned entries = 0; + unsigned leaves = 0; + const struct gfs2_dirent **darr, *dent; + struct dirent_gather g; + struct buffer_head **larr; + int leaf = 0; + int error, i; + u64 lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out; + lf = (struct gfs2_leaf *)bh->b_data; + if (leaves == 0) + *depth = be16_to_cpu(lf->lf_depth); + entries += be16_to_cpu(lf->lf_entries); + leaves++; + lfn = be64_to_cpu(lf->lf_next); + brelse(bh); + } while(lfn); + + if (!entries) + return 0; + + error = -ENOMEM; + larr = vmalloc((leaves + entries) * sizeof(void *)); + if (!larr) + goto out; + darr = (const struct gfs2_dirent **)(larr + leaves); + g.pdent = darr; + g.offset = 0; + lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out_kfree; + lf = (struct gfs2_leaf *)bh->b_data; + lfn = be64_to_cpu(lf->lf_next); + if (lf->lf_entries) { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_gather, NULL, &g); + error = PTR_ERR(dent); + if (IS_ERR(dent)) { + goto out_kfree; + } + error = 0; + larr[leaf++] = bh; + } else { + brelse(bh); + } + } while(lfn); + + error = do_filldir_main(ip, offset, opaque, filldir, darr, + entries, copied); +out_kfree: + for(i = 0; i < leaf; i++) + brelse(larr[i]); + vfree(larr); +out: + return error; +} + +/** + * dir_e_read - Reads the entries from a directory into a filldir buffer + * @dip: dinode pointer + * @offset: the hash of the last entry read shifted to the right once + * @opaque: buffer for the filldir function to fill + * @filldir: points to the filldir function to use + * + * Returns: errno + */ + +static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, + gfs2_filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u32 hsize, len = 0; + u32 ht_offset, lp_offset, ht_offset_cur = -1; + u32 hash, index; + u64 *lp; + int copied = 0; + int error = 0; + unsigned depth = 0; + + hsize = 1 << dip->i_di.di_depth; + if (hsize * sizeof(u64) != dip->i_di.di_size) { + gfs2_consist_inode(dip); + return -EIO; + } + + hash = gfs2_dir_offset2hash(*offset); + index = hash >> (32 - dip->i_di.di_depth); + + lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(u64), + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + error = gfs2_dir_read_leaf(inode, offset, opaque, filldir, + &copied, &depth, + be64_to_cpu(lp[lp_offset])); + if (error) + break; + + len = 1 << (dip->i_di.di_depth - depth); + index = (index & ~(len - 1)) + len; + } + +out: + kfree(lp); + if (error > 0) + error = 0; + return error; +} + +int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, + gfs2_filldir_t filldir) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct dirent_gather g; + const struct gfs2_dirent **darr, *dent; + struct buffer_head *dibh; + int copied = 0; + int error; + + if (!dip->i_di.di_entries) + return 0; + + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) + return dir_e_read(inode, offset, opaque, filldir); + + if (!gfs2_is_stuffed(dip)) { + gfs2_consist_inode(dip); + return -EIO; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + error = -ENOMEM; + darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *), + GFP_KERNEL); + if (darr) { + g.pdent = darr; + g.offset = 0; + dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, + gfs2_dirent_gather, NULL, &g); + if (IS_ERR(dent)) { + error = PTR_ERR(dent); + goto out; + } + error = do_filldir_main(dip, offset, opaque, filldir, darr, + dip->i_di.di_entries, &copied); +out: + kfree(darr); + } + + if (error > 0) + error = 0; + + brelse(dibh); + + return error; +} + +/** + * gfs2_dir_search - Search a directory + * @dip: The GFS2 inode + * @filename: + * @inode: + * + * This routine searches a directory for a file or another directory. + * Assumes a glock is held on dip. + * + * Returns: errno + */ + +int gfs2_dir_search(struct inode *dir, const struct qstr *name, + struct gfs2_inum *inum, unsigned int *type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + if (inum) + gfs2_inum_in(inum, (char *)&dent->de_inum); + if (type) + *type = be16_to_cpu(dent->de_type); + brelse(bh); + return 0; + } + return -ENOENT; +} + +static int dir_new_leaf(struct inode *inode, const struct qstr *name) +{ + struct buffer_head *bh, *obh; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_leaf *leaf, *oleaf; + int error; + u32 index; + u64 bn; + + index = name->hash >> (32 - ip->i_di.di_depth); + error = get_first_leaf(ip, index, &obh); + if (error) + return error; + do { + oleaf = (struct gfs2_leaf *)obh->b_data; + bn = be64_to_cpu(oleaf->lf_next); + if (!bn) + break; + brelse(obh); + error = get_leaf(ip, bn, &obh); + if (error) + return error; + } while(1); + + gfs2_trans_add_bh(ip->i_gl, obh, 1); + + leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); + if (!leaf) { + brelse(obh); + return -ENOSPC; + } + oleaf->lf_next = cpu_to_be64(bh->b_blocknr); + brelse(bh); + brelse(obh); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_di.di_blocks++; + gfs2_dinode_out(&ip->i_di, bh->b_data); + brelse(bh); + return 0; +} + +/** + * gfs2_dir_add - Add new filename into directory + * @dip: The GFS2 inode + * @filename: The new name + * @inode: The inode number of the entry + * @type: The type of the entry + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_add(struct inode *inode, const struct qstr *name, + const struct gfs2_inum *inum, unsigned type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_leaf *leaf; + int error; + + while(1) { + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, + &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + dent = gfs2_init_dirent(inode, dent, name, bh); + gfs2_inum_out(inum, (char *)&dent->de_inum); + dent->de_type = cpu_to_be16(type); + if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1); + } + brelse(bh); + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + break; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_di.di_entries++; + ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&ip->i_di, bh->b_data); + brelse(bh); + error = 0; + break; + } + if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + error = dir_make_exhash(inode); + if (error) + break; + continue; + } + error = dir_split_leaf(inode, name); + if (error == 0) + continue; + if (error < 0) + break; + if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) { + error = dir_double_exhash(ip); + if (error) + break; + error = dir_split_leaf(inode, name); + if (error < 0) + break; + if (error == 0) + continue; + } + error = dir_new_leaf(inode, name); + if (!error) + continue; + error = -ENOSPC; + break; + } + return error; +} + + +/** + * gfs2_dir_del - Delete a directory entry + * @dip: The GFS2 inode + * @filename: The filename + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) +{ + struct gfs2_dirent *dent, *prev = NULL; + struct buffer_head *bh; + int error; + + /* Returns _either_ the entry (if its first in block) or the + previous entry otherwise */ + dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) { + gfs2_consist_inode(dip); + return PTR_ERR(dent); + } + /* If not first in block, adjust pointers accordingly */ + if (gfs2_dirent_find(dent, name, NULL) == 0) { + prev = dent; + dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len)); + } + + dirent_del(dip, bh, prev, dent); + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + u16 entries = be16_to_cpu(leaf->lf_entries); + if (!entries) + gfs2_consist_inode(dip); + leaf->lf_entries = cpu_to_be16(--entries); + } + brelse(bh); + + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + + if (!dip->i_di.di_entries) + gfs2_consist_inode(dip); + gfs2_trans_add_bh(dip->i_gl, bh, 1); + dip->i_di.di_entries--; + dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&dip->i_di, bh->b_data); + brelse(bh); + mark_inode_dirty(&dip->i_inode); + + return error; +} + +/** + * gfs2_dir_mvino - Change inode number of directory entry + * @dip: The GFS2 inode + * @filename: + * @new_inode: + * + * This routine changes the inode number of a directory entry. It's used + * by rename to change ".." when a directory is moved. + * Assumes a glock is held on dvp. + * + * Returns: errno + */ + +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + struct gfs2_inum *inum, unsigned int new_type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int error; + + dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + gfs2_trans_add_bh(dip->i_gl, bh, 1); + gfs2_inum_out(inum, (char *)&dent->de_inum); + dent->de_type = cpu_to_be16(new_type); + + if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + brelse(bh); + error = gfs2_meta_inode_buffer(dip, &bh); + if (error) + return error; + gfs2_trans_add_bh(dip->i_gl, bh, 1); + } + + dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds(); + gfs2_dinode_out(&dip->i_di, bh->b_data); + brelse(bh); + return 0; +} + +/** + * foreach_leaf - call a function for each leaf in a directory + * @dip: the directory + * @lc: the function to call for each each + * @data: private data to pass to it + * + * Returns: errno + */ + +static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *bh; + struct gfs2_leaf *leaf; + u32 hsize, len; + u32 ht_offset, lp_offset, ht_offset_cur = -1; + u32 index = 0; + u64 *lp; + u64 leaf_no; + int error = 0; + + hsize = 1 << dip->i_di.di_depth; + if (hsize * sizeof(u64) != dip->i_di.di_size) { + gfs2_consist_inode(dip); + return -EIO; + } + + lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); + if (!lp) + return -ENOMEM; + + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; + + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(u64), + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + leaf_no = be64_to_cpu(lp[lp_offset]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth)); + brelse(bh); + + error = lc(dip, index, len, leaf_no, data); + if (error) + goto out; + + index = (index & ~(len - 1)) + len; + } else + index++; + } + + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: + kfree(lp); + + return error; +} + +/** + * leaf_dealloc - Deallocate a directory leaf + * @dip: the directory + * @index: the hash table offset in the directory + * @len: the number of pointers to this leaf + * @leaf_no: the leaf number + * @data: not used + * + * Returns: errno + */ + +static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, + u64 leaf_no, void *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_leaf *tmp_leaf; + struct gfs2_rgrp_list rlist; + struct buffer_head *bh, *dibh; + u64 blk, nblk; + unsigned int rg_blocks = 0, l_blocks = 0; + char *ht; + unsigned int x, size = len * sizeof(u64); + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + ht = kzalloc(size, GFP_KERNEL); + if (!ht) + return -ENOMEM; + + gfs2_alloc_get(dip); + + error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh); + if (error) + goto out_qs; + + /* Count the number of leaves */ + + for (blk = leaf_no; blk; blk = nblk) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + brelse(bh); + + gfs2_rlist_add(sdp, &rlist, blk); + l_blocks++; + } + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_ri.ri_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, + rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) + + RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks); + if (error) + goto out_rg_gunlock; + + for (blk = leaf_no; blk; blk = nblk) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + brelse(bh); + + gfs2_free_meta(dip, blk, 1); + + if (!dip->i_di.di_blocks) + gfs2_consist_inode(dip); + dip->i_di.di_blocks--; + } + + error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); + if (error != size) { + if (error >= 0) + error = -EIO; + goto out_end_trans; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_bh(dip->i_gl, dibh, 1); + gfs2_dinode_out(&dip->i_di, dibh->b_data); + brelse(dibh); + +out_end_trans: + gfs2_trans_end(sdp); +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); + gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh); +out_qs: + gfs2_quota_unhold(dip); +out: + gfs2_alloc_put(dip); + kfree(ht); + return error; +} + +/** + * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory + * @dip: the directory + * + * Dealloc all on-disk directory leaves to FREEMETA state + * Change on-disk inode type to "regular file" + * + * Returns: errno + */ + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct buffer_head *bh; + int error; + + /* Dealloc on-disk leaves to FREEMETA state */ + error = foreach_leaf(dip, leaf_dealloc, NULL); + if (error) + return error; + + /* Make this a regular file in case we crash. + (We don't want to free these blocks a second time.) */ + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(dip, &bh); + if (!error) { + gfs2_trans_add_bh(dip->i_gl, bh, 1); + ((struct gfs2_dinode *)bh->b_data)->di_mode = + cpu_to_be32(S_IFREG); + brelse(bh); + } + + gfs2_trans_end(sdp); + + return error; +} + +/** + * gfs2_diradd_alloc_required - find if adding entry will require an allocation + * @ip: the file being written to + * @filname: the filename that's going to be added + * + * Returns: 1 if alloc required, 0 if not, -ve on error + */ + +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +{ + struct gfs2_dirent *dent; + struct buffer_head *bh; + + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); + if (!dent) { + return 1; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + brelse(bh); + return 0; +} + diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h new file mode 100644 index 000000000000..371233419b07 --- /dev/null +++ b/fs/gfs2/dir.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +#include <linux/dcache.h> + +struct inode; +struct gfs2_inode; +struct gfs2_inum; + +/** + * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read() + * @opaque: opaque data used by the function + * @name: the name of the directory entry + * @length: the length of the name + * @offset: the entry's offset in the directory + * @inum: the inode number the entry points to + * @type: the type of inode the entry points to + * + * Returns: 0 on success, 1 if buffer full + */ + +typedef int (*gfs2_filldir_t) (void *opaque, + const char *name, unsigned int length, + u64 offset, + struct gfs2_inum *inum, unsigned int type); + +int gfs2_dir_search(struct inode *dir, const struct qstr *filename, + struct gfs2_inum *inum, unsigned int *type); +int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inum *inum, unsigned int type); +int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); +int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque, + gfs2_filldir_t filldir); +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + struct gfs2_inum *new_inum, unsigned int new_type); + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); + +int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename); +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); + +static inline u32 gfs2_disk_hash(const char *data, int len) +{ + return crc32_le((u32)~0, data, len) ^ (u32)~0; +} + + +static inline void gfs2_str2qstr(struct qstr *name, const char *fname) +{ + name->name = fname; + name->len = strlen(fname); + name->hash = gfs2_disk_hash(name->name, name->len); +} + +/* N.B. This probably ought to take inum & type as args as well */ +static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent) +{ + dent->de_inum.no_addr = cpu_to_be64(0); + dent->de_inum.no_formal_ino = cpu_to_be64(0); + dent->de_hash = cpu_to_be32(name->hash); + dent->de_rec_len = cpu_to_be16(reclen); + dent->de_name_len = cpu_to_be16(name->len); + dent->de_type = cpu_to_be16(0); + memset(dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent + 1, name->name, name->len); +} + +#endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c new file mode 100644 index 000000000000..92c54e9b0dc3 --- /dev/null +++ b/fs/gfs2/eaops.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/xattr.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> +#include <asm/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "eaops.h" +#include "eattr.h" +#include "util.h" + +/** + * gfs2_ea_name2type - get the type of the ea, and truncate type from the name + * @namep: ea name, possibly with type appended + * + * Returns: GFS2_EATYPE_XXX + */ + +unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name) +{ + unsigned int type; + + if (strncmp(name, "system.", 7) == 0) { + type = GFS2_EATYPE_SYS; + if (truncated_name) + *truncated_name = name + sizeof("system.") - 1; + } else if (strncmp(name, "user.", 5) == 0) { + type = GFS2_EATYPE_USR; + if (truncated_name) + *truncated_name = name + sizeof("user.") - 1; + } else if (strncmp(name, "security.", 9) == 0) { + type = GFS2_EATYPE_SECURITY; + if (truncated_name) + *truncated_name = name + sizeof("security.") - 1; + } else { + type = GFS2_EATYPE_UNUSED; + if (truncated_name) + *truncated_name = NULL; + } + + return type; +} + +static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_READ, NULL); + if (error) + return error; + + return gfs2_ea_get_i(ip, er); +} + +static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + + if (S_ISREG(inode->i_mode) || + (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) { + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + } else + return -EPERM; + + return gfs2_ea_set_i(ip, er); +} + +static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + + if (S_ISREG(inode->i_mode) || + (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) { + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + } else + return -EPERM; + + return gfs2_ea_remove_i(ip, er); +} + +static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && + !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 && + (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) || + GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) + return -EOPNOTSUPP; + + + + return gfs2_ea_get_i(ip, er); +} + +static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + int remove = 0; + int error; + + if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { + if (!(er->er_flags & GFS2_ERF_MODE)) { + er->er_mode = ip->i_di.di_mode; + er->er_flags |= GFS2_ERF_MODE; + } + error = gfs2_acl_validate_set(ip, 1, er, + &remove, &er->er_mode); + if (error) + return error; + error = gfs2_ea_set_i(ip, er); + if (error) + return error; + if (remove) + gfs2_ea_remove_i(ip, er); + return 0; + + } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { + error = gfs2_acl_validate_set(ip, 0, er, + &remove, NULL); + if (error) + return error; + if (!remove) + error = gfs2_ea_set_i(ip, er); + else { + error = gfs2_ea_remove_i(ip, er); + if (error == -ENODATA) + error = 0; + } + return error; + } + + return -EPERM; +} + +static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { + int error = gfs2_acl_validate_remove(ip, 1); + if (error) + return error; + + } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { + int error = gfs2_acl_validate_remove(ip, 0); + if (error) + return error; + + } else + return -EPERM; + + return gfs2_ea_remove_i(ip, er); +} + +static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_READ, NULL); + if (error) + return error; + + return gfs2_ea_get_i(ip, er); +} + +static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + + return gfs2_ea_set_i(ip, er); +} + +static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct inode *inode = &ip->i_inode; + int error = permission(inode, MAY_WRITE, NULL); + if (error) + return error; + + return gfs2_ea_remove_i(ip, er); +} + +static struct gfs2_eattr_operations gfs2_user_eaops = { + .eo_get = user_eo_get, + .eo_set = user_eo_set, + .eo_remove = user_eo_remove, + .eo_name = "user", +}; + +struct gfs2_eattr_operations gfs2_system_eaops = { + .eo_get = system_eo_get, + .eo_set = system_eo_set, + .eo_remove = system_eo_remove, + .eo_name = "system", +}; + +static struct gfs2_eattr_operations gfs2_security_eaops = { + .eo_get = security_eo_get, + .eo_set = security_eo_set, + .eo_remove = security_eo_remove, + .eo_name = "security", +}; + +struct gfs2_eattr_operations *gfs2_ea_ops[] = { + NULL, + &gfs2_user_eaops, + &gfs2_system_eaops, + &gfs2_security_eaops, +}; + diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h new file mode 100644 index 000000000000..508b4f7a2449 --- /dev/null +++ b/fs/gfs2/eaops.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __EAOPS_DOT_H__ +#define __EAOPS_DOT_H__ + +struct gfs2_ea_request; +struct gfs2_inode; + +struct gfs2_eattr_operations { + int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); + int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); + int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); + char *eo_name; +}; + +unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); + +extern struct gfs2_eattr_operations gfs2_system_eaops; + +extern struct gfs2_eattr_operations *gfs2_ea_ops[]; + +#endif /* __EAOPS_DOT_H__ */ + diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c new file mode 100644 index 000000000000..a65a4ccfd4dd --- /dev/null +++ b/fs/gfs2/eattr.c @@ -0,0 +1,1501 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/xattr.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> +#include <asm/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "eaops.h" +#include "eattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * ea_calc_size - returns the acutal number of bytes the request will take up + * (not counting any unstuffed data blocks) + * @sdp: + * @er: + * @size: + * + * Returns: 1 if the EA should be stuffed + */ + +static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, + unsigned int *size) +{ + *size = GFS2_EAREQ_SIZE_STUFFED(er); + if (*size <= sdp->sd_jbsize) + return 1; + + *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); + + return 0; +} + +static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) +{ + unsigned int size; + + if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) + return -ERANGE; + + ea_calc_size(sdp, er, &size); + + /* This can only happen with 512 byte blocks */ + if (size > sdp->sd_jbsize) + return -ERANGE; + + return 0; +} + +typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private); + +static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, + ea_call_t ea_call, void *data) +{ + struct gfs2_ea_header *ea, *prev = NULL; + int error = 0; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA)) + return -EIO; + + for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) { + if (!GFS2_EA_REC_LEN(ea)) + goto fail; + if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <= + bh->b_data + bh->b_size)) + goto fail; + if (!GFS2_EATYPE_VALID(ea->ea_type)) + goto fail; + + error = ea_call(ip, bh, ea, prev, data); + if (error) + return error; + + if (GFS2_EA_IS_LAST(ea)) { + if ((char *)GFS2_EA2NEXT(ea) != + bh->b_data + bh->b_size) + goto fail; + break; + } + } + + return error; + +fail: + gfs2_consist_inode(ip); + return -EIO; +} + +static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) +{ + struct buffer_head *bh, *eabh; + u64 *eablk, *end; + int error; + + error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); + if (error) + return error; + + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { + error = ea_foreach_i(ip, bh, ea_call, data); + goto out; + } + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + if (error) + break; + error = ea_foreach_i(ip, eabh, ea_call, data); + brelse(eabh); + if (error) + break; + } +out: + brelse(bh); + return error; +} + +struct ea_find { + struct gfs2_ea_request *ef_er; + struct gfs2_ea_location *ef_el; +}; + +static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_find *ef = private; + struct gfs2_ea_request *er = ef->ef_er; + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (ea->ea_type == er->er_type) { + if (ea->ea_name_len == er->er_name_len && + !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { + struct gfs2_ea_location *el = ef->ef_el; + get_bh(bh); + el->el_bh = bh; + el->el_ea = ea; + el->el_prev = prev; + return 1; + } + } + + return 0; +} + +int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, + struct gfs2_ea_location *el) +{ + struct ea_find ef; + int error; + + ef.ef_er = er; + ef.ef_el = el; + + memset(el, 0, sizeof(struct gfs2_ea_location)); + + error = ea_foreach(ip, ea_find_i, &ef); + if (error > 0) + return 0; + + return error; +} + +/** + * ea_dealloc_unstuffed - + * @ip: + * @bh: + * @ea: + * @prev: + * @private: + * + * Take advantage of the fact that all unstuffed blocks are + * allocated from the same RG. But watch, this may not always + * be true. + * + * Returns: errno + */ + +static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, void *private) +{ + int *leave = private; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder rg_gh; + struct buffer_head *dibh; + u64 *dataptrs, bn = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + if (GFS2_EA_IS_STUFFED(ea)) + return 0; + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (*dataptrs) { + blks++; + bn = be64_to_cpu(*dataptrs); + } + } + if (!blks) + return 0; + + rgd = gfs2_blk2rgrpd(sdp, bn); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE + + RES_EATTR + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + dataptrs = GFS2_EA2DATAPTRS(ea); + for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { + if (!*dataptrs) + break; + bn = be64_to_cpu(*dataptrs); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *dataptrs = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + if (prev && !leave) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else { + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_num_ptrs = 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&rg_gh); + return error; +} + +static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, + struct gfs2_ea_header *prev, int leave) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL); + + gfs2_glock_dq_uninit(&al->al_ri_gh); + +out_quota: + gfs2_quota_unhold(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +struct ea_list { + struct gfs2_ea_request *ei_er; + unsigned int ei_size; +}; + +static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_list *ei = private; + struct gfs2_ea_request *er = ei->ei_er; + unsigned int ea_size = gfs2_ea_strlen(ea); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) + return 0; + + if (er->er_data_len) { + char *prefix = NULL; + unsigned int l = 0; + char c = 0; + + if (ei->ei_size + ea_size > er->er_data_len) + return -ERANGE; + + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + prefix = "user."; + l = 5; + break; + case GFS2_EATYPE_SYS: + prefix = "system."; + l = 7; + break; + case GFS2_EATYPE_SECURITY: + prefix = "security."; + l = 9; + break; + } + + BUG_ON(l == 0); + + memcpy(er->er_data + ei->ei_size, prefix, l); + memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea), + ea->ea_name_len); + memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1); + } + + ei->ei_size += ea_size; + + return 0; +} + +/** + * gfs2_ea_list - + * @ip: + * @er: + * + * Returns: actual size of data on success, -errno on error + */ + +int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_data || !er->er_data_len) { + er->er_data = NULL; + er->er_data_len = 0; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + + if (ip->i_di.di_eattr) { + struct ea_list ei = { .ei_er = er, .ei_size = 0 }; + + error = ea_foreach(ip, ea_list_i, &ei); + if (!error) + error = ei.ei_size; + } + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * ea_get_unstuffed - actually copies the unstuffed data into the + * request buffer + * @ip: The GFS2 inode + * @ea: The extended attribute header structure + * @data: The data to be copied + * + * Returns: errno + */ + +static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + u64 *dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error = 0; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bh) + return -ENOMEM; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto out; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_wait(sdp, bh[x]); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto out; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto out; + } + + memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header), + (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + +out: + kfree(bh); + return error; +} + +int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data) +{ + if (GFS2_EA_IS_STUFFED(el->el_ea)) { + memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); + return 0; + } else + return ea_get_unstuffed(ip, el->el_ea, data); +} + +/** + * gfs2_ea_get_i - + * @ip: The GFS2 inode + * @er: The request structure + * + * Returns: actual size of data on success, -errno on error + */ + +int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_di.di_eattr) + return -ENODATA; + + error = gfs2_ea_find(ip, er, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + + if (er->er_data_len) { + if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) + error = -ERANGE; + else + error = gfs2_ea_get_copy(ip, &el, er->er_data); + } + if (!error) + error = GFS2_EA_DATA_LEN(el.el_ea); + + brelse(el.el_bh); + + return error; +} + +/** + * gfs2_ea_get - + * @ip: The GFS2 inode + * @er: The request structure + * + * Returns: actual size of data on success, -errno on error + */ + +int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_name_len || + er->er_name_len > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + if (!er->er_data || !er->er_data_len) { + er->er_data = NULL; + er->er_data_len = 0; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + + error = gfs2_ea_ops[er->er_type]->eo_get(ip, er); + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * ea_alloc_blk - allocates a new block for extended attributes. + * @ip: A pointer to the inode that's getting extended attributes + * @bhp: Pointer to pointer to a struct buffer_head + * + * Returns: errno + */ + +static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_ea_header *ea; + u64 block; + + block = gfs2_alloc_meta(ip); + + *bhp = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, *bhp, 1); + gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(*bhp); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + ea->ea_num_ptrs = 0; + + ip->i_di.di_blocks++; + + return 0; +} + +/** + * ea_write - writes the request info to an ea, creating new blocks if + * necessary + * @ip: inode that is being modified + * @ea: the location of the new ea in a block + * @er: the write request + * + * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags + * + * returns : errno + */ + +static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, + struct gfs2_ea_request *er) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + ea->ea_data_len = cpu_to_be32(er->er_data_len); + ea->ea_name_len = er->er_name_len; + ea->ea_type = er->er_type; + ea->__pad = 0; + + memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len); + + if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) { + ea->ea_num_ptrs = 0; + memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len); + } else { + u64 *dataptr = GFS2_EA2DATAPTRS(ea); + const char *data = er->er_data; + unsigned int data_len = er->er_data_len; + unsigned int copy; + unsigned int x; + + ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize); + for (x = 0; x < ea->ea_num_ptrs; x++) { + struct buffer_head *bh; + u64 block; + int mh_size = sizeof(struct gfs2_meta_header); + + block = gfs2_alloc_meta(ip); + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); + + ip->i_di.di_blocks++; + + copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : + data_len; + memcpy(bh->b_data + mh_size, data, copy); + if (copy < sdp->sd_jbsize) + memset(bh->b_data + mh_size + copy, 0, + sdp->sd_jbsize - copy); + + *dataptr++ = cpu_to_be64(bh->b_blocknr); + data += copy; + data_len -= copy; + + brelse(bh); + } + + gfs2_assert_withdraw(sdp, !data_len); + } + + return 0; +} + +typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private); + +static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, + unsigned int blks, + ea_skeleton_call_t skeleton_call, void *private) +{ + struct gfs2_alloc *al; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = blks; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), + blks + al->al_rgd->rd_ri.ri_length + + RES_DINODE + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + error = skeleton_call(ip, er, private); + if (error) + goto out_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + if (er->er_flags & GFS2_ERF_MODE) { + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + (ip->i_di.di_mode & S_IFMT) == + (er->er_mode & S_IFMT)); + ip->i_di.di_mode = er->er_mode; + } + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + +out_end_trans: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); +out_ipres: + gfs2_inplace_release(ip); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct buffer_head *bh; + int error; + + error = ea_alloc_blk(ip, &bh); + if (error) + return error; + + ip->i_di.di_eattr = bh->b_blocknr; + error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); + + brelse(bh); + + return error; +} + +/** + * ea_init - initializes a new eattr block + * @ip: + * @er: + * + * Returns: errno + */ + +static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; + unsigned int blks = 1; + + if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) + blks += DIV_ROUND_UP(er->er_data_len, jbsize); + + return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); +} + +static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) +{ + u32 ea_size = GFS2_EA_SIZE(ea); + struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea + + ea_size); + u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size; + int last = ea->ea_flags & GFS2_EAFLAG_LAST; + + ea->ea_rec_len = cpu_to_be32(ea_size); + ea->ea_flags ^= last; + + new->ea_rec_len = cpu_to_be32(new_size); + new->ea_flags = last; + + return new; +} + +static void ea_set_remove_stuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + u32 len; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (!prev || !GFS2_EA_IS_STUFFED(ea)) { + ea->ea_type = GFS2_EATYPE_UNUSED; + return; + } else if (GFS2_EA2NEXT(prev) != ea) { + prev = GFS2_EA2NEXT(prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea); + } + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; +} + +struct ea_set { + int ea_split; + + struct gfs2_ea_request *es_er; + struct gfs2_ea_location *es_el; + + struct buffer_head *es_bh; + struct gfs2_ea_header *es_ea; +}; + +static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct ea_set *es) +{ + struct gfs2_ea_request *er = es->es_er; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + ea_write(ip, ea, er); + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (er->er_flags & GFS2_ERF_MODE) { + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + (ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT)); + ip->i_di.di_mode = er->er_mode; + } + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); +out: + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + +static int ea_set_simple_alloc(struct gfs2_inode *ip, + struct gfs2_ea_request *er, void *private) +{ + struct ea_set *es = private; + struct gfs2_ea_header *ea = es->es_ea; + int error; + + gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); + + if (es->ea_split) + ea = ea_split_ea(ea); + + error = ea_write(ip, ea, er); + if (error) + return error; + + if (es->es_el) + ea_set_remove_stuffed(ip, es->es_el); + + return 0; +} + +static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, + struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, + void *private) +{ + struct ea_set *es = private; + unsigned int size; + int stuffed; + int error; + + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); + + if (ea->ea_type == GFS2_EATYPE_UNUSED) { + if (GFS2_EA_REC_LEN(ea) < size) + return 0; + if (!GFS2_EA_IS_STUFFED(ea)) { + error = ea_remove_unstuffed(ip, bh, ea, prev, 1); + if (error) + return error; + } + es->ea_split = 0; + } else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size) + es->ea_split = 1; + else + return 0; + + if (stuffed) { + error = ea_set_simple_noalloc(ip, bh, ea, es); + if (error) + return error; + } else { + unsigned int blks; + + es->es_bh = bh; + es->es_ea = ea; + blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len, + GFS2_SB(&ip->i_inode)->sd_jbsize); + + error = ea_alloc_skeleton(ip, es->es_er, blks, + ea_set_simple_alloc, es); + if (error) + return error; + } + + return 1; +} + +static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, + void *private) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *indbh, *newbh; + u64 *eablk; + int error; + int mh_size = sizeof(struct gfs2_meta_header); + + if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + u64 *end; + + error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, + &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (u64 *)(indbh->b_data + mh_size); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) + if (!*eablk) + break; + + if (eablk == end) { + error = -ENOSPC; + goto out; + } + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + } else { + u64 blk; + + blk = gfs2_alloc_meta(ip); + + indbh = gfs2_meta_new(ip->i_gl, blk); + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(indbh, mh_size); + + eablk = (u64 *)(indbh->b_data + mh_size); + *eablk = cpu_to_be64(ip->i_di.di_eattr); + ip->i_di.di_eattr = blk; + ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; + ip->i_di.di_blocks++; + + eablk++; + } + + error = ea_alloc_blk(ip, &newbh); + if (error) + goto out; + + *eablk = cpu_to_be64((u64)newbh->b_blocknr); + error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er); + brelse(newbh); + if (error) + goto out; + + if (private) + ea_set_remove_stuffed(ip, private); + +out: + brelse(indbh); + return error; +} + +static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, + struct gfs2_ea_location *el) +{ + struct ea_set es; + unsigned int blks = 2; + int error; + + memset(&es, 0, sizeof(struct ea_set)); + es.es_er = er; + es.es_el = el; + + error = ea_foreach(ip, ea_set_simple, &es); + if (error > 0) + return 0; + if (error) + return error; + + if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) + blks++; + if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + + return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); +} + +static int ea_set_remove_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_location *el) +{ + if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) { + el->el_prev = GFS2_EA2NEXT(el->el_prev); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), + GFS2_EA2NEXT(el->el_prev) == el->el_ea); + } + + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); +} + +int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_di.di_eattr) { + if (er->er_flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, er); + } + + error = gfs2_ea_find(ip, er, &el); + if (error) + return error; + + if (el.el_ea) { + if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } + + error = -EEXIST; + if (!(er->er_flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, er, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + } else { + error = -ENODATA; + if (!(er->er_flags & XATTR_REPLACE)) + error = ea_set_i(ip, er, NULL); + } + + return error; +} + +int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + if (!er->er_data || !er->er_data_len) { + er->er_data = NULL; + er->er_data_len = 0; + } + error = ea_check_size(GFS2_SB(&ip->i_inode), er); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + if (IS_IMMUTABLE(&ip->i_inode)) + error = -EPERM; + else + error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) +{ + struct gfs2_ea_header *ea = el->el_ea; + struct gfs2_ea_header *prev = el->el_prev; + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + + if (prev) { + u32 len; + + len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea); + prev->ea_rec_len = cpu_to_be32(len); + + if (GFS2_EA_IS_LAST(ea)) + prev->ea_flags |= GFS2_EAFLAG_LAST; + } else + ea->ea_type = GFS2_EATYPE_UNUSED; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_ea_location el; + int error; + + if (!ip->i_di.di_eattr) + return -ENODATA; + + error = gfs2_ea_find(ip, er, &el); + if (error) + return error; + if (!el.el_ea) + return -ENODATA; + + if (GFS2_EA_IS_STUFFED(el.el_ea)) + error = ea_remove_stuffed(ip, &el); + else + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, + 0); + + brelse(el.el_bh); + + return error; +} + +/** + * gfs2_ea_remove - sets (or creates or replaces) an extended attribute + * @ip: pointer to the inode of the target file + * @er: request information + * + * Returns: errno + */ + +int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +{ + struct gfs2_holder i_gh; + int error; + + if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + error = -EPERM; + else + error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, + struct gfs2_ea_header *ea, char *data) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head **bh; + unsigned int amount = GFS2_EA_DATA_LEN(ea); + unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); + u64 *dataptrs = GFS2_EA2DATAPTRS(ea); + unsigned int x; + int error; + + bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bh) + return -ENOMEM; + + error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); + if (error) + goto out; + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + bh + x); + if (error) { + while (x--) + brelse(bh[x]); + goto fail; + } + dataptrs++; + } + + for (x = 0; x < nptrs; x++) { + error = gfs2_meta_wait(sdp, bh[x]); + if (error) { + for (; x < nptrs; x++) + brelse(bh[x]); + goto fail; + } + if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) { + for (; x < nptrs; x++) + brelse(bh[x]); + error = -EIO; + goto fail; + } + + gfs2_trans_add_bh(ip->i_gl, bh[x], 1); + + memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data, + (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize); + + amount -= sdp->sd_jbsize; + data += sdp->sd_jbsize; + + brelse(bh[x]); + } + +out: + kfree(bh); + return error; + +fail: + gfs2_trans_end(sdp); + kfree(bh); + return error; +} + +int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, + struct iattr *attr, char *data) +{ + struct buffer_head *dibh; + int error; + + if (GFS2_EA_IS_STUFFED(el->el_ea)) { + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); + if (error) + return error; + + gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); + memcpy(GFS2_EA2DATA(el->el_ea), data, + GFS2_EA_DATA_LEN(el->el_ea)); + } else + error = ea_acl_chmod_unstuffed(ip, el->el_ea, data); + + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + error = inode_setattr(&ip->i_inode, attr); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + gfs2_inode_attr_out(ip); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + + return error; +} + +static int ea_dealloc_indirect(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrp_list rlist; + struct buffer_head *indbh, *dibh; + u64 *eablk, *end; + unsigned int rg_blocks = 0; + u64 bstart = 0; + unsigned int blen = 0; + unsigned int blks = 0; + unsigned int x; + int error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); + if (error) + return error; + + if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) { + error = -EIO; + goto out; + } + + eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + end = eablk + sdp->sd_inptrs; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + bstart = bn; + blen = 1; + } + blks++; + } + if (bstart) + gfs2_rlist_add(sdp, &rlist, bstart); + else + goto out; + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd; + rgd = rlist.rl_ghs[x].gh_gl->gl_object; + rg_blocks += rgd->rd_ri.ri_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist_free; + + error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + + RES_STATFS + RES_QUOTA, blks); + if (error) + goto out_gunlock; + + gfs2_trans_add_bh(ip->i_gl, indbh, 1); + + eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); + bstart = 0; + blen = 0; + + for (; eablk < end; eablk++) { + u64 bn; + + if (!*eablk) + break; + bn = be64_to_cpu(*eablk); + + if (bstart + blen == bn) + blen++; + else { + if (bstart) + gfs2_free_meta(ip, bstart, blen); + bstart = bn; + blen = 1; + } + + *eablk = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + } + if (bstart) + gfs2_free_meta(ip, bstart, blen); + + ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist_free: + gfs2_rlist_free(&rlist); +out: + brelse(indbh); + return error; +} + +static int ea_dealloc_block(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_rgrpd *rgd; + struct buffer_head *dibh; + int error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); + if (!rgd) { + gfs2_consist_inode(ip); + return -EIO; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS + + RES_QUOTA, 1); + if (error) + goto out_gunlock; + + gfs2_free_meta(ip, ip->i_di.di_eattr, 1); + + ip->i_di.di_eattr = 0; + if (!ip->i_di.di_blocks) + gfs2_consist_inode(ip); + ip->i_di.di_blocks--; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); + return error; +} + +/** + * gfs2_ea_dealloc - deallocate the extended attribute fork + * @ip: the inode + * + * Returns: errno + */ + +int gfs2_ea_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh); + if (error) + goto out_quota; + + error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); + if (error) + goto out_rindex; + + if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + error = ea_dealloc_indirect(ip); + if (error) + goto out_rindex; + } + + error = ea_dealloc_block(ip); + +out_rindex: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_quota: + gfs2_quota_unhold(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h new file mode 100644 index 000000000000..ffa65947d686 --- /dev/null +++ b/fs/gfs2/eattr.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __EATTR_DOT_H__ +#define __EATTR_DOT_H__ + +struct gfs2_inode; +struct iattr; + +#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len) +#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len) + +#define GFS2_EA_SIZE(ea) \ +ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ + ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ + (sizeof(u64) * (ea)->ea_num_ptrs)), 8) + +#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) +#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) + +#define GFS2_EAREQ_SIZE_STUFFED(er) \ +ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) + +#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \ +ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ + sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8) + +#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) +#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) + +#define GFS2_EA2DATAPTRS(ea) \ +((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8))) + +#define GFS2_EA2NEXT(ea) \ +((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea))) + +#define GFS2_EA_BH2FIRST(bh) \ +((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) + +#define GFS2_ERF_MODE 0x80000000 + +struct gfs2_ea_request { + const char *er_name; + char *er_data; + unsigned int er_name_len; + unsigned int er_data_len; + unsigned int er_type; /* GFS2_EATYPE_... */ + int er_flags; + mode_t er_mode; +}; + +struct gfs2_ea_location { + struct buffer_head *el_bh; + struct gfs2_ea_header *el_ea; + struct gfs2_ea_header *el_prev; +}; + +int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); + +int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er); +int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er); + +int gfs2_ea_dealloc(struct gfs2_inode *ip); + +/* Exported to acl.c */ + +int gfs2_ea_find(struct gfs2_inode *ip, + struct gfs2_ea_request *er, + struct gfs2_ea_location *el); +int gfs2_ea_get_copy(struct gfs2_inode *ip, + struct gfs2_ea_location *el, + char *data); +int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, + struct iattr *attr, char *data); + +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + +#endif /* __EATTR_DOT_H__ */ diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h new file mode 100644 index 000000000000..3bb11c0f8b56 --- /dev/null +++ b/fs/gfs2/gfs2.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GFS2_DOT_H__ +#define __GFS2_DOT_H__ + +enum { + NO_CREATE = 0, + CREATE = 1, +}; + +enum { + NO_WAIT = 0, + WAIT = 1, +}; + +enum { + NO_FORCE = 0, + FORCE = 1, +}; + +#define GFS2_FAST_NAME_SIZE 8 + +#endif /* __GFS2_DOT_H__ */ + diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c new file mode 100644 index 000000000000..78fe0fae23ff --- /dev/null +++ b/fs/gfs2/glock.c @@ -0,0 +1,2231 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/delay.h> +#include <linux/sort.h> +#include <linux/jhash.h> +#include <linux/kallsyms.h> +#include <linux/gfs2_ondisk.h> +#include <linux/list.h> +#include <linux/lm_interface.h> +#include <asm/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lm.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "super.h" +#include "util.h" + +struct greedy { + struct gfs2_holder gr_gh; + struct work_struct gr_work; +}; + +struct gfs2_gl_hash_bucket { + struct hlist_head hb_list; +}; + +typedef void (*glock_examiner) (struct gfs2_glock * gl); + +static int gfs2_dump_lockstate(struct gfs2_sbd *sdp); +static int dump_glock(struct gfs2_glock *gl); +static int dump_inode(struct gfs2_inode *ip); + +#define GFS2_GL_HASH_SHIFT 15 +#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) +#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) + +static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; + +/* + * Despite what you might think, the numbers below are not arbitrary :-) + * They are taken from the ipv4 routing hash code, which is well tested + * and thus should be nearly optimal. Later on we might tweek the numbers + * but for now this should be fine. + * + * The reason for putting the locks in a separate array from the list heads + * is that we can have fewer locks than list heads and save memory. We use + * the same hash function for both, but with a different hash mask. + */ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ + defined(CONFIG_PROVE_LOCKING) + +#ifdef CONFIG_LOCKDEP +# define GL_HASH_LOCK_SZ 256 +#else +# if NR_CPUS >= 32 +# define GL_HASH_LOCK_SZ 4096 +# elif NR_CPUS >= 16 +# define GL_HASH_LOCK_SZ 2048 +# elif NR_CPUS >= 8 +# define GL_HASH_LOCK_SZ 1024 +# elif NR_CPUS >= 4 +# define GL_HASH_LOCK_SZ 512 +# else +# define GL_HASH_LOCK_SZ 256 +# endif +#endif + +/* We never want more locks than chains */ +#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ +# undef GL_HASH_LOCK_SZ +# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE +#endif + +static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ]; + +static inline rwlock_t *gl_lock_addr(unsigned int x) +{ + return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)]; +} +#else /* not SMP, so no spinlocks required */ +static inline rwlock_t *gl_lock_addr(x) +{ + return NULL; +} +#endif + +/** + * relaxed_state_ok - is a requested lock compatible with the current lock mode? + * @actual: the current state of the lock + * @requested: the lock state that was requested by the caller + * @flags: the modifier flags passed in by the caller + * + * Returns: 1 if the locks are compatible, 0 otherwise + */ + +static inline int relaxed_state_ok(unsigned int actual, unsigned requested, + int flags) +{ + if (actual == requested) + return 1; + + if (flags & GL_EXACT) + return 0; + + if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED) + return 1; + + if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY)) + return 1; + + return 0; +} + +/** + * gl_hash() - Turn glock number into hash bucket number + * @lock: The glock number + * + * Returns: The number of the corresponding hash bucket + */ + +static unsigned int gl_hash(const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + unsigned int h; + + h = jhash(&name->ln_number, sizeof(u64), 0); + h = jhash(&name->ln_type, sizeof(unsigned int), h); + h = jhash(&sdp, sizeof(struct gfs2_sbd *), h); + h &= GFS2_GL_HASH_MASK; + + return h; +} + +/** + * glock_free() - Perform a few checks and then release struct gfs2_glock + * @gl: The glock to release + * + * Also calls lock module to release its internal structure for this glock. + * + */ + +static void glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct inode *aspace = gl->gl_aspace; + + gfs2_lm_put_lock(sdp, gl->gl_lock); + + if (aspace) + gfs2_aspace_put(aspace); + + kmem_cache_free(gfs2_glock_cachep, gl); +} + +/** + * gfs2_glock_hold() - increment reference count on glock + * @gl: The glock to hold + * + */ + +void gfs2_glock_hold(struct gfs2_glock *gl) +{ + atomic_inc(&gl->gl_ref); +} + +/** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +int gfs2_glock_put(struct gfs2_glock *gl) +{ + int rv = 0; + struct gfs2_sbd *sdp = gl->gl_sbd; + + write_lock(gl_lock_addr(gl->gl_hash)); + if (atomic_dec_and_test(&gl->gl_ref)) { + hlist_del(&gl->gl_list); + write_unlock(gl_lock_addr(gl->gl_hash)); + BUG_ON(spin_is_locked(&gl->gl_spin)); + gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED); + gfs2_assert(sdp, list_empty(&gl->gl_reclaim)); + gfs2_assert(sdp, list_empty(&gl->gl_holders)); + gfs2_assert(sdp, list_empty(&gl->gl_waiters1)); + gfs2_assert(sdp, list_empty(&gl->gl_waiters2)); + gfs2_assert(sdp, list_empty(&gl->gl_waiters3)); + glock_free(gl); + rv = 1; + goto out; + } + write_unlock(gl_lock_addr(gl->gl_hash)); +out: + return rv; +} + +/** + * queue_empty - check to see if a glock's queue is empty + * @gl: the glock + * @head: the head of the queue to check + * + * This function protects the list in the event that a process already + * has a holder on the list and is adding a second holder for itself. + * The glmutex lock is what generally prevents processes from working + * on the same glock at once, but the special case of adding a second + * holder for yourself ("recursive" locking) doesn't involve locking + * glmutex, making the spin lock necessary. + * + * Returns: 1 if the queue is empty + */ + +static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head) +{ + int empty; + spin_lock(&gl->gl_spin); + empty = list_empty(head); + spin_unlock(&gl->gl_spin); + return empty; +} + +/** + * search_bucket() - Find struct gfs2_glock by lock number + * @bucket: the bucket to search + * @name: The lock name + * + * Returns: NULL, or the struct gfs2_glock with the requested number + */ + +static struct gfs2_glock *search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + struct gfs2_glock *gl; + struct hlist_node *h; + + hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { + if (!lm_name_equal(&gl->gl_name, name)) + continue; + if (gl->gl_sbd != sdp) + continue; + + atomic_inc(&gl->gl_ref); + + return gl; + } + + return NULL; +} + +/** + * gfs2_glock_find() - Find glock by lock number + * @sdp: The GFS2 superblock + * @name: The lock name + * + * Returns: NULL, or the struct gfs2_glock with the requested number + */ + +static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp, + const struct lm_lockname *name) +{ + unsigned int hash = gl_hash(sdp, name); + struct gfs2_glock *gl; + + read_lock(gl_lock_addr(hash)); + gl = search_bucket(hash, sdp, name); + read_unlock(gl_lock_addr(hash)); + + return gl; +} + +/** + * gfs2_glock_get() - Get a glock, or create one if one doesn't exist + * @sdp: The GFS2 superblock + * @number: the lock number + * @glops: The glock_operations to use + * @create: If 0, don't create the glock if it doesn't exist + * @glp: the glock is returned here + * + * This does not lock a glock, just finds/creates structures for one. + * + * Returns: errno + */ + +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, int create, + struct gfs2_glock **glp) +{ + struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type }; + struct gfs2_glock *gl, *tmp; + unsigned int hash = gl_hash(sdp, &name); + int error; + + read_lock(gl_lock_addr(hash)); + gl = search_bucket(hash, sdp, &name); + read_unlock(gl_lock_addr(hash)); + + if (gl || !create) { + *glp = gl; + return 0; + } + + gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); + if (!gl) + return -ENOMEM; + + gl->gl_flags = 0; + gl->gl_name = name; + atomic_set(&gl->gl_ref, 1); + gl->gl_state = LM_ST_UNLOCKED; + gl->gl_hash = hash; + gl->gl_owner = NULL; + gl->gl_ip = 0; + gl->gl_ops = glops; + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + gl->gl_vn = 0; + gl->gl_stamp = jiffies; + gl->gl_object = NULL; + gl->gl_sbd = sdp; + gl->gl_aspace = NULL; + lops_init_le(&gl->gl_le, &gfs2_glock_lops); + + /* If this glock protects actual on-disk data or metadata blocks, + create a VFS inode to manage the pages/buffers holding them. */ + if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) { + gl->gl_aspace = gfs2_aspace_get(sdp); + if (!gl->gl_aspace) { + error = -ENOMEM; + goto fail; + } + } + + error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock); + if (error) + goto fail_aspace; + + write_lock(gl_lock_addr(hash)); + tmp = search_bucket(hash, sdp, &name); + if (tmp) { + write_unlock(gl_lock_addr(hash)); + glock_free(gl); + gl = tmp; + } else { + hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); + write_unlock(gl_lock_addr(hash)); + } + + *glp = gl; + + return 0; + +fail_aspace: + if (gl->gl_aspace) + gfs2_aspace_put(gl->gl_aspace); +fail: + kmem_cache_free(gfs2_glock_cachep, gl); + return error; +} + +/** + * gfs2_holder_init - initialize a struct gfs2_holder in the default way + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + */ + +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh) +{ + INIT_LIST_HEAD(&gh->gh_list); + gh->gh_gl = gl; + gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_owner = current; + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_error = 0; + gh->gh_iflags = 0; + init_completion(&gh->gh_wait); + + if (gh->gh_state == LM_ST_EXCLUSIVE) + gh->gh_flags |= GL_LOCAL_EXCL; + + gfs2_glock_hold(gl); +} + +/** + * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Don't mess with the glock. + * + */ + +void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) +{ + gh->gh_state = state; + gh->gh_flags = flags; + if (gh->gh_state == LM_ST_EXCLUSIVE) + gh->gh_flags |= GL_LOCAL_EXCL; + + gh->gh_iflags &= 1 << HIF_ALLOCED; + gh->gh_ip = (unsigned long)__builtin_return_address(0); +} + +/** + * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) + * @gh: the holder structure + * + */ + +void gfs2_holder_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_put(gh->gh_gl); + gh->gh_gl = NULL; + gh->gh_ip = 0; +} + +/** + * gfs2_holder_get - get a struct gfs2_holder structure + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gfp_flags: + * + * Figure out how big an impact this function has. Either: + * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd + * 2) Leave it like it is + * + * Returns: the holder structure, NULL on ENOMEM + */ + +static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl, + unsigned int state, + int flags, gfp_t gfp_flags) +{ + struct gfs2_holder *gh; + + gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags); + if (!gh) + return NULL; + + gfs2_holder_init(gl, state, flags, gh); + set_bit(HIF_ALLOCED, &gh->gh_iflags); + gh->gh_ip = (unsigned long)__builtin_return_address(0); + return gh; +} + +/** + * gfs2_holder_put - get rid of a struct gfs2_holder structure + * @gh: the holder structure + * + */ + +static void gfs2_holder_put(struct gfs2_holder *gh) +{ + gfs2_holder_uninit(gh); + kfree(gh); +} + +/** + * rq_mutex - process a mutex request in the queue + * @gh: the glock holder + * + * Returns: 1 if the queue is blocked + */ + +static int rq_mutex(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + list_del_init(&gh->gh_list); + /* gh->gh_error never examined. */ + set_bit(GLF_LOCK, &gl->gl_flags); + complete(&gh->gh_wait); + + return 1; +} + +/** + * rq_promote - process a promote request in the queue + * @gh: the glock holder + * + * Acquire a new inter-node lock, or change a lock state to more restrictive. + * + * Returns: 1 if the queue is blocked + */ + +static int rq_promote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { + if (list_empty(&gl->gl_holders)) { + gl->gl_req_gh = gh; + set_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + if (atomic_read(&sdp->sd_reclaim_count) > + gfs2_tune_get(sdp, gt_reclaim_limit) && + !(gh->gh_flags & LM_FLAG_PRIORITY)) { + gfs2_reclaim_glock(sdp); + gfs2_reclaim_glock(sdp); + } + + glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + spin_lock(&gl->gl_spin); + } + return 1; + } + + if (list_empty(&gl->gl_holders)) { + set_bit(HIF_FIRST, &gh->gh_iflags); + set_bit(GLF_LOCK, &gl->gl_flags); + } else { + struct gfs2_holder *next_gh; + if (gh->gh_flags & GL_LOCAL_EXCL) + return 1; + next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder, + gh_list); + if (next_gh->gh_flags & GL_LOCAL_EXCL) + return 1; + } + + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh->gh_error = 0; + set_bit(HIF_HOLDER, &gh->gh_iflags); + + complete(&gh->gh_wait); + + return 0; +} + +/** + * rq_demote - process a demote request in the queue + * @gh: the glock holder + * + * Returns: 1 if the queue is blocked + */ + +static int rq_demote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (!list_empty(&gl->gl_holders)) + return 1; + + if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) { + list_del_init(&gh->gh_list); + gh->gh_error = 0; + spin_unlock(&gl->gl_spin); + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) + gfs2_holder_put(gh); + else + complete(&gh->gh_wait); + spin_lock(&gl->gl_spin); + } else { + gl->gl_req_gh = gh; + set_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + if (gh->gh_state == LM_ST_UNLOCKED || + gl->gl_state != LM_ST_EXCLUSIVE) + glops->go_drop_th(gl); + else + glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags); + + spin_lock(&gl->gl_spin); + } + + return 0; +} + +/** + * rq_greedy - process a queued request to drop greedy status + * @gh: the glock holder + * + * Returns: 1 if the queue is blocked + */ + +static int rq_greedy(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + list_del_init(&gh->gh_list); + /* gh->gh_error never examined. */ + clear_bit(GLF_GREEDY, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + gfs2_holder_uninit(gh); + kfree(container_of(gh, struct greedy, gr_gh)); + + spin_lock(&gl->gl_spin); + + return 0; +} + +/** + * run_queue - process holder structures on a glock + * @gl: the glock + * + */ +static void run_queue(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + int blocked = 1; + + for (;;) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) + break; + + if (!list_empty(&gl->gl_waiters1)) { + gh = list_entry(gl->gl_waiters1.next, + struct gfs2_holder, gh_list); + + if (test_bit(HIF_MUTEX, &gh->gh_iflags)) + blocked = rq_mutex(gh); + else + gfs2_assert_warn(gl->gl_sbd, 0); + + } else if (!list_empty(&gl->gl_waiters2) && + !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) { + gh = list_entry(gl->gl_waiters2.next, + struct gfs2_holder, gh_list); + + if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) + blocked = rq_demote(gh); + else if (test_bit(HIF_GREEDY, &gh->gh_iflags)) + blocked = rq_greedy(gh); + else + gfs2_assert_warn(gl->gl_sbd, 0); + + } else if (!list_empty(&gl->gl_waiters3)) { + gh = list_entry(gl->gl_waiters3.next, + struct gfs2_holder, gh_list); + + if (test_bit(HIF_PROMOTE, &gh->gh_iflags)) + blocked = rq_promote(gh); + else + gfs2_assert_warn(gl->gl_sbd, 0); + + } else + break; + + if (blocked) + break; + } +} + +/** + * gfs2_glmutex_lock - acquire a local lock on a glock + * @gl: the glock + * + * Gives caller exclusive access to manipulate a glock structure. + */ + +static void gfs2_glmutex_lock(struct gfs2_glock *gl) +{ + struct gfs2_holder gh; + + gfs2_holder_init(gl, 0, 0, &gh); + set_bit(HIF_MUTEX, &gh.gh_iflags); + + spin_lock(&gl->gl_spin); + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + list_add_tail(&gh.gh_list, &gl->gl_waiters1); + } else { + gl->gl_owner = current; + gl->gl_ip = (unsigned long)__builtin_return_address(0); + complete(&gh.gh_wait); + } + spin_unlock(&gl->gl_spin); + + wait_for_completion(&gh.gh_wait); + gfs2_holder_uninit(&gh); +} + +/** + * gfs2_glmutex_trylock - try to acquire a local lock on a glock + * @gl: the glock + * + * Returns: 1 if the glock is acquired + */ + +static int gfs2_glmutex_trylock(struct gfs2_glock *gl) +{ + int acquired = 1; + + spin_lock(&gl->gl_spin); + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + acquired = 0; + } else { + gl->gl_owner = current; + gl->gl_ip = (unsigned long)__builtin_return_address(0); + } + spin_unlock(&gl->gl_spin); + + return acquired; +} + +/** + * gfs2_glmutex_unlock - release a local lock on a glock + * @gl: the glock + * + */ + +static void gfs2_glmutex_unlock(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + gl->gl_owner = NULL; + gl->gl_ip = 0; + run_queue(gl); + BUG_ON(!spin_is_locked(&gl->gl_spin)); + spin_unlock(&gl->gl_spin); +} + +/** + * handle_callback - add a demote request to a lock's queue + * @gl: the glock + * @state: the state the caller wants us to change to + * + * Note: This may fail sliently if we are out of memory. + */ + +static void handle_callback(struct gfs2_glock *gl, unsigned int state) +{ + struct gfs2_holder *gh, *new_gh = NULL; + +restart: + spin_lock(&gl->gl_spin); + + list_for_each_entry(gh, &gl->gl_waiters2, gh_list) { + if (test_bit(HIF_DEMOTE, &gh->gh_iflags) && + gl->gl_req_gh != gh) { + if (gh->gh_state != state) + gh->gh_state = LM_ST_UNLOCKED; + goto out; + } + } + + if (new_gh) { + list_add_tail(&new_gh->gh_list, &gl->gl_waiters2); + new_gh = NULL; + } else { + spin_unlock(&gl->gl_spin); + + new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL); + if (!new_gh) + return; + set_bit(HIF_DEMOTE, &new_gh->gh_iflags); + set_bit(HIF_DEALLOC, &new_gh->gh_iflags); + + goto restart; + } + +out: + spin_unlock(&gl->gl_spin); + + if (new_gh) + gfs2_holder_put(new_gh); +} + +void gfs2_glock_inode_squish(struct inode *inode) +{ + struct gfs2_holder gh; + struct gfs2_glock *gl = GFS2_I(inode)->i_gl; + gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh); + set_bit(HIF_DEMOTE, &gh.gh_iflags); + spin_lock(&gl->gl_spin); + gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders)); + list_add_tail(&gh.gh_list, &gl->gl_waiters2); + run_queue(gl); + spin_unlock(&gl->gl_spin); + wait_for_completion(&gh.gh_wait); + gfs2_holder_uninit(&gh); +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state the new state + * + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + if (held2) + gfs2_glock_hold(gl); + else + gfs2_glock_put(gl); + } + + gl->gl_state = new_state; +} + +/** + * xmote_bh - Called after the lock module is done acquiring a lock + * @gl: The glock in question + * @ret: the int returned from the lock module + * + */ + +static void xmote_bh(struct gfs2_glock *gl, unsigned int ret) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh = gl->gl_req_gh; + int prev_state = gl->gl_state; + int op_done = 1; + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); + + state_change(gl, ret & LM_OUT_ST_MASK); + + if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) { + if (glops->go_inval) + glops->go_inval(gl, DIO_METADATA | DIO_DATA); + } else if (gl->gl_state == LM_ST_DEFERRED) { + /* We might not want to do this here. + Look at moving to the inode glops. */ + if (glops->go_inval) + glops->go_inval(gl, DIO_DATA); + } + + /* Deal with each possible exit condition */ + + if (!gh) + gl->gl_stamp = jiffies; + else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = -EIO; + spin_unlock(&gl->gl_spin); + } else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + if (gl->gl_state == gh->gh_state || + gl->gl_state == LM_ST_UNLOCKED) { + gh->gh_error = 0; + } else { + if (gfs2_assert_warn(sdp, gh->gh_flags & + (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1) + fs_warn(sdp, "ret = 0x%.8X\n", ret); + gh->gh_error = GLR_TRYFAILED; + } + spin_unlock(&gl->gl_spin); + + if (ret & LM_OUT_CANCELED) + handle_callback(gl, LM_ST_UNLOCKED); + + } else if (ret & LM_OUT_CANCELED) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = GLR_CANCELED; + spin_unlock(&gl->gl_spin); + + } else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) { + spin_lock(&gl->gl_spin); + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh->gh_error = 0; + set_bit(HIF_HOLDER, &gh->gh_iflags); + spin_unlock(&gl->gl_spin); + + set_bit(HIF_FIRST, &gh->gh_iflags); + + op_done = 0; + + } else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = GLR_TRYFAILED; + spin_unlock(&gl->gl_spin); + + } else { + if (gfs2_assert_withdraw(sdp, 0) == -1) + fs_err(sdp, "ret = 0x%.8X\n", ret); + } + + if (glops->go_xmote_bh) + glops->go_xmote_bh(gl); + + if (op_done) { + spin_lock(&gl->gl_spin); + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); + } + + gfs2_glock_put(gl); + + if (gh) { + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) + gfs2_holder_put(gh); + else + complete(&gh->gh_wait); + } +} + +/** + * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock + * @gl: The glock in question + * @state: the requested state + * @flags: modifier flags to the lock call + * + */ + +void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB | + LM_FLAG_NOEXP | LM_FLAG_ANY | + LM_FLAG_PRIORITY); + unsigned int lck_ret; + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED); + gfs2_assert_warn(sdp, state != gl->gl_state); + + if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync) + glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE); + + gfs2_glock_hold(gl); + gl->gl_req_bh = xmote_bh; + + lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); + + if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR))) + return; + + if (lck_ret & LM_OUT_ASYNC) + gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC); + else + xmote_bh(gl, lck_ret); +} + +/** + * drop_bh - Called after a lock module unlock completes + * @gl: the glock + * @ret: the return status + * + * Doesn't wake up the process waiting on the struct gfs2_holder (if any) + * Doesn't drop the reference on the glock the top half took out + * + */ + +static void drop_bh(struct gfs2_glock *gl, unsigned int ret) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh = gl->gl_req_gh; + + clear_bit(GLF_PREFETCH, &gl->gl_flags); + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, !ret); + + state_change(gl, LM_ST_UNLOCKED); + + if (glops->go_inval) + glops->go_inval(gl, DIO_METADATA | DIO_DATA); + + if (gh) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + gh->gh_error = 0; + spin_unlock(&gl->gl_spin); + } + + if (glops->go_drop_bh) + glops->go_drop_bh(gl); + + spin_lock(&gl->gl_spin); + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); + + gfs2_glock_put(gl); + + if (gh) { + if (test_bit(HIF_DEALLOC, &gh->gh_iflags)) + gfs2_holder_put(gh); + else + complete(&gh->gh_wait); + } +} + +/** + * gfs2_glock_drop_th - call into the lock module to unlock a lock + * @gl: the glock + * + */ + +void gfs2_glock_drop_th(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned int ret; + + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders)); + gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); + + if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync) + glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE); + + gfs2_glock_hold(gl); + gl->gl_req_bh = drop_bh; + + ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); + + if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR))) + return; + + if (!ret) + drop_bh(gl, ret); + else + gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC); +} + +/** + * do_cancels - cancel requests for locks stuck waiting on an expire flag + * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock + * + * Don't cancel GL_NOCANCEL requests. + */ + +static void do_cancels(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_spin); + + while (gl->gl_req_gh != gh && + !test_bit(HIF_HOLDER, &gh->gh_iflags) && + !list_empty(&gh->gh_list)) { + if (gl->gl_req_bh && !(gl->gl_req_gh && + (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) { + spin_unlock(&gl->gl_spin); + gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock); + msleep(100); + spin_lock(&gl->gl_spin); + } else { + spin_unlock(&gl->gl_spin); + msleep(100); + spin_lock(&gl->gl_spin); + } + } + + spin_unlock(&gl->gl_spin); +} + +/** + * glock_wait_internal - wait on a glock acquisition + * @gh: the glock holder + * + * Returns: 0 on success + */ + +static int glock_wait_internal(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (test_bit(HIF_ABORTED, &gh->gh_iflags)) + return -EIO; + + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + spin_lock(&gl->gl_spin); + if (gl->gl_req_gh != gh && + !test_bit(HIF_HOLDER, &gh->gh_iflags) && + !list_empty(&gh->gh_list)) { + list_del_init(&gh->gh_list); + gh->gh_error = GLR_TRYFAILED; + run_queue(gl); + spin_unlock(&gl->gl_spin); + return gh->gh_error; + } + spin_unlock(&gl->gl_spin); + } + + if (gh->gh_flags & LM_FLAG_PRIORITY) + do_cancels(gh); + + wait_for_completion(&gh->gh_wait); + + if (gh->gh_error) + return gh->gh_error; + + gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags)); + gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state, + gh->gh_flags)); + + if (test_bit(HIF_FIRST, &gh->gh_iflags)) { + gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); + + if (glops->go_lock) { + gh->gh_error = glops->go_lock(gh); + if (gh->gh_error) { + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + spin_unlock(&gl->gl_spin); + } + } + + spin_lock(&gl->gl_spin); + gl->gl_req_gh = NULL; + gl->gl_req_bh = NULL; + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); + } + + return gh->gh_error; +} + +static inline struct gfs2_holder * +find_holder_by_owner(struct list_head *head, struct task_struct *owner) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, head, gh_list) { + if (gh->gh_owner == owner) + return gh; + } + + return NULL; +} + +/** + * add_to_queue - Add a holder to the wait queue (but look for recursion) + * @gh: the holder structure to add + * + */ + +static void add_to_queue(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_holder *existing; + + BUG_ON(!gh->gh_owner); + + existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner); + if (existing) { + print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); + printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid); + printk(KERN_INFO "lock type : %d lock state : %d\n", + existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state); + print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); + printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid); + printk(KERN_INFO "lock type : %d lock state : %d\n", + gl->gl_name.ln_type, gl->gl_state); + BUG(); + } + + existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner); + if (existing) { + print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip); + print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip); + BUG(); + } + + if (gh->gh_flags & LM_FLAG_PRIORITY) + list_add(&gh->gh_list, &gl->gl_waiters3); + else + list_add_tail(&gh->gh_list, &gl->gl_waiters3); +} + +/** + * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) + * @gh: the holder structure + * + * if (gh->gh_flags & GL_ASYNC), this never returns an error + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_nq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + int error = 0; + +restart: + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + set_bit(HIF_ABORTED, &gh->gh_iflags); + return -EIO; + } + + set_bit(HIF_PROMOTE, &gh->gh_iflags); + + spin_lock(&gl->gl_spin); + add_to_queue(gh); + run_queue(gl); + spin_unlock(&gl->gl_spin); + + if (!(gh->gh_flags & GL_ASYNC)) { + error = glock_wait_internal(gh); + if (error == GLR_CANCELED) { + msleep(100); + goto restart; + } + } + + clear_bit(GLF_PREFETCH, &gl->gl_flags); + + if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP)) + dump_glock(gl); + + return error; +} + +/** + * gfs2_glock_poll - poll to see if an async request has been completed + * @gh: the holder + * + * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on + */ + +int gfs2_glock_poll(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + int ready = 0; + + spin_lock(&gl->gl_spin); + + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + ready = 1; + else if (list_empty(&gh->gh_list)) { + if (gh->gh_error == GLR_CANCELED) { + spin_unlock(&gl->gl_spin); + msleep(100); + if (gfs2_glock_nq(gh)) + return 1; + return 0; + } else + ready = 1; + } + + spin_unlock(&gl->gl_spin); + + return ready; +} + +/** + * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC + * @gh: the holder structure + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_wait(struct gfs2_holder *gh) +{ + int error; + + error = glock_wait_internal(gh); + if (error == GLR_CANCELED) { + msleep(100); + gh->gh_flags &= ~GL_ASYNC; + error = gfs2_glock_nq(gh); + } + + return error; +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ + +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED); + + gfs2_glmutex_lock(gl); + + spin_lock(&gl->gl_spin); + list_del_init(&gh->gh_list); + + if (list_empty(&gl->gl_holders)) { + spin_unlock(&gl->gl_spin); + + if (glops->go_unlock) + glops->go_unlock(gh); + + gl->gl_stamp = jiffies; + + spin_lock(&gl->gl_spin); + } + + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl); + spin_unlock(&gl->gl_spin); +} + +/** + * gfs2_glock_prefetch - Try to prefetch a glock + * @gl: the glock + * @state: the state to prefetch in + * @flags: flags passed to go_xmote_th() + * + */ + +static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state, + int flags) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + spin_lock(&gl->gl_spin); + + if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) || + !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) || + !list_empty(&gl->gl_waiters3) || + relaxed_state_ok(gl->gl_state, state, flags)) { + spin_unlock(&gl->gl_spin); + return; + } + + set_bit(GLF_PREFETCH, &gl->gl_flags); + set_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + + glops->go_xmote_th(gl, state, flags); +} + +static void greedy_work(void *data) +{ + struct greedy *gr = data; + struct gfs2_holder *gh = &gr->gr_gh; + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + + clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); + + if (glops->go_greedy) + glops->go_greedy(gl); + + spin_lock(&gl->gl_spin); + + if (list_empty(&gl->gl_waiters2)) { + clear_bit(GLF_GREEDY, &gl->gl_flags); + spin_unlock(&gl->gl_spin); + gfs2_holder_uninit(gh); + kfree(gr); + } else { + gfs2_glock_hold(gl); + list_add_tail(&gh->gh_list, &gl->gl_waiters2); + run_queue(gl); + spin_unlock(&gl->gl_spin); + gfs2_glock_put(gl); + } +} + +/** + * gfs2_glock_be_greedy - + * @gl: + * @time: + * + * Returns: 0 if go_greedy will be called, 1 otherwise + */ + +int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time) +{ + struct greedy *gr; + struct gfs2_holder *gh; + + if (!time || gl->gl_sbd->sd_args.ar_localcaching || + test_and_set_bit(GLF_GREEDY, &gl->gl_flags)) + return 1; + + gr = kmalloc(sizeof(struct greedy), GFP_KERNEL); + if (!gr) { + clear_bit(GLF_GREEDY, &gl->gl_flags); + return 1; + } + gh = &gr->gr_gh; + + gfs2_holder_init(gl, 0, 0, gh); + set_bit(HIF_GREEDY, &gh->gh_iflags); + INIT_WORK(&gr->gr_work, greedy_work, gr); + + set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags); + schedule_delayed_work(&gr->gr_work, time); + + return 0; +} + +/** + * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it + * @gh: the holder structure + * + */ + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_dq(gh); + gfs2_holder_uninit(gh); +} + +/** + * gfs2_glock_nq_num - acquire a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the aquisition + * @gh: the struct gfs2_holder + * + * Returns: errno + */ + +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh) +{ + struct gfs2_glock *gl; + int error; + + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + error = gfs2_glock_nq_init(gl, state, flags, gh); + gfs2_glock_put(gl); + } + + return error; +} + +/** + * glock_compare - Compare two struct gfs2_glock structures for sorting + * @arg_a: the first structure + * @arg_b: the second structure + * + */ + +static int glock_compare(const void *arg_a, const void *arg_b) +{ + const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; + const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; + const struct lm_lockname *a = &gh_a->gh_gl->gl_name; + const struct lm_lockname *b = &gh_b->gh_gl->gl_name; + + if (a->ln_number > b->ln_number) + return 1; + if (a->ln_number < b->ln_number) + return -1; + if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE) + return 1; + if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL)) + return 1; + return 0; +} + +/** + * nq_m_sync - synchonously acquire more than one glock in deadlock free order + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, + struct gfs2_holder **p) +{ + unsigned int x; + int error = 0; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); + + for (x = 0; x < num_gh; x++) { + p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + + error = gfs2_glock_nq(p[x]); + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + return error; +} + +/** + * gfs2_glock_nq_m - acquire multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Figure out how big an impact this function has. Either: + * 1) Replace this code with code that calls gfs2_glock_prefetch() + * 2) Forget async stuff and just call nq_m_sync() + * 3) Leave it like it is + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + int *e; + unsigned int x; + int borked = 0, serious = 0; + int error = 0; + + if (!num_gh) + return 0; + + if (num_gh == 1) { + ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + return gfs2_glock_nq(ghs); + } + + e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL); + if (!e) + return -ENOMEM; + + for (x = 0; x < num_gh; x++) { + ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC; + error = gfs2_glock_nq(&ghs[x]); + if (error) { + borked = 1; + serious = error; + num_gh = x; + break; + } + } + + for (x = 0; x < num_gh; x++) { + error = e[x] = glock_wait_internal(&ghs[x]); + if (error) { + borked = 1; + if (error != GLR_TRYFAILED && error != GLR_CANCELED) + serious = error; + } + } + + if (!borked) { + kfree(e); + return 0; + } + + for (x = 0; x < num_gh; x++) + if (!e[x]) + gfs2_glock_dq(&ghs[x]); + + if (serious) + error = serious; + else { + for (x = 0; x < num_gh; x++) + gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags, + &ghs[x]); + error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e); + } + + kfree(e); + + return error; +} + +/** + * gfs2_glock_dq_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + unsigned int x; + + for (x = 0; x < num_gh; x++) + gfs2_glock_dq(&ghs[x]); +} + +/** + * gfs2_glock_dq_uninit_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + unsigned int x; + + for (x = 0; x < num_gh; x++) + gfs2_glock_dq_uninit(&ghs[x]); +} + +/** + * gfs2_glock_prefetch_num - prefetch a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the aquisition + * + * Returns: errno + */ + +void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags) +{ + struct gfs2_glock *gl; + int error; + + if (atomic_read(&sdp->sd_reclaim_count) < + gfs2_tune_get(sdp, gt_reclaim_limit)) { + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + gfs2_glock_prefetch(gl, state, flags); + gfs2_glock_put(gl); + } + } +} + +/** + * gfs2_lvb_hold - attach a LVB from a glock + * @gl: The glock in question + * + */ + +int gfs2_lvb_hold(struct gfs2_glock *gl) +{ + int error; + + gfs2_glmutex_lock(gl); + + if (!atomic_read(&gl->gl_lvb_count)) { + error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb); + if (error) { + gfs2_glmutex_unlock(gl); + return error; + } + gfs2_glock_hold(gl); + } + atomic_inc(&gl->gl_lvb_count); + + gfs2_glmutex_unlock(gl); + + return 0; +} + +/** + * gfs2_lvb_unhold - detach a LVB from a glock + * @gl: The glock in question + * + */ + +void gfs2_lvb_unhold(struct gfs2_glock *gl) +{ + gfs2_glock_hold(gl); + gfs2_glmutex_lock(gl); + + gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); + if (atomic_dec_and_test(&gl->gl_lvb_count)) { + gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb); + gl->gl_lvb = NULL; + gfs2_glock_put(gl); + } + + gfs2_glmutex_unlock(gl); + gfs2_glock_put(gl); +} + +static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, + unsigned int state) +{ + struct gfs2_glock *gl; + + gl = gfs2_glock_find(sdp, name); + if (!gl) + return; + + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl, state); + handle_callback(gl, state); + + spin_lock(&gl->gl_spin); + run_queue(gl); + spin_unlock(&gl->gl_spin); + + gfs2_glock_put(gl); +} + +/** + * gfs2_glock_cb - Callback used by locking module + * @sdp: Pointer to the superblock + * @type: Type of callback + * @data: Type dependent data pointer + * + * Called by the locking module when it wants to tell us something. + * Either we need to drop a lock, one of our ASYNC requests completed, or + * a journal from another client needs to be recovered. + */ + +void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) +{ + struct gfs2_sbd *sdp = cb_data; + + switch (type) { + case LM_CB_NEED_E: + blocking_cb(sdp, data, LM_ST_UNLOCKED); + return; + + case LM_CB_NEED_D: + blocking_cb(sdp, data, LM_ST_DEFERRED); + return; + + case LM_CB_NEED_S: + blocking_cb(sdp, data, LM_ST_SHARED); + return; + + case LM_CB_ASYNC: { + struct lm_async_cb *async = data; + struct gfs2_glock *gl; + + gl = gfs2_glock_find(sdp, &async->lc_name); + if (gfs2_assert_warn(sdp, gl)) + return; + if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) + gl->gl_req_bh(gl, async->lc_ret); + gfs2_glock_put(gl); + return; + } + + case LM_CB_NEED_RECOVERY: + gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data); + if (sdp->sd_recoverd_process) + wake_up_process(sdp->sd_recoverd_process); + return; + + case LM_CB_DROPLOCKS: + gfs2_gl_hash_clear(sdp, NO_WAIT); + gfs2_quota_scan(sdp); + return; + + default: + gfs2_assert_warn(sdp, 0); + return; + } +} + +/** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + const struct gfs2_glock_operations *glops = gl->gl_ops; + int demote = 1; + + if (test_bit(GLF_STICKY, &gl->gl_flags)) + demote = 0; + else if (test_bit(GLF_PREFETCH, &gl->gl_flags)) + demote = time_after_eq(jiffies, gl->gl_stamp + + gfs2_tune_get(sdp, gt_prefetch_secs) * HZ); + else if (glops->go_demote_ok) + demote = glops->go_demote_ok(gl); + + return demote; +} + +/** + * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * @gl: the glock + * + */ + +void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + spin_lock(&sdp->sd_reclaim_lock); + if (list_empty(&gl->gl_reclaim)) { + gfs2_glock_hold(gl); + list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); + atomic_inc(&sdp->sd_reclaim_count); + } + spin_unlock(&sdp->sd_reclaim_lock); + + wake_up(&sdp->sd_reclaim_wq); +} + +/** + * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list + * @sdp: the filesystem + * + * Called from gfs2_glockd() glock reclaim daemon, or when promoting a + * different glock and we notice that there are a lot of glocks in the + * reclaim list. + * + */ + +void gfs2_reclaim_glock(struct gfs2_sbd *sdp) +{ + struct gfs2_glock *gl; + + spin_lock(&sdp->sd_reclaim_lock); + if (list_empty(&sdp->sd_reclaim_list)) { + spin_unlock(&sdp->sd_reclaim_lock); + return; + } + gl = list_entry(sdp->sd_reclaim_list.next, + struct gfs2_glock, gl_reclaim); + list_del_init(&gl->gl_reclaim); + spin_unlock(&sdp->sd_reclaim_lock); + + atomic_dec(&sdp->sd_reclaim_count); + atomic_inc(&sdp->sd_reclaimed); + + if (gfs2_glmutex_trylock(gl)) { + if (queue_empty(gl, &gl->gl_holders) && + gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED); + gfs2_glmutex_unlock(gl); + } + + gfs2_glock_put(gl); +} + +/** + * examine_bucket - Call a function for glock in a hash bucket + * @examiner: the function + * @sdp: the filesystem + * @bucket: the bucket + * + * Returns: 1 if the bucket has entries + */ + +static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, + unsigned int hash) +{ + struct gfs2_glock *gl, *prev = NULL; + int has_entries = 0; + struct hlist_head *head = &gl_hash_table[hash].hb_list; + + read_lock(gl_lock_addr(hash)); + /* Can't use hlist_for_each_entry - don't want prefetch here */ + if (hlist_empty(head)) + goto out; + gl = list_entry(head->first, struct gfs2_glock, gl_list); + while(1) { + if (gl->gl_sbd == sdp) { + gfs2_glock_hold(gl); + read_unlock(gl_lock_addr(hash)); + if (prev) + gfs2_glock_put(prev); + prev = gl; + examiner(gl); + has_entries = 1; + read_lock(gl_lock_addr(hash)); + } + if (gl->gl_list.next == NULL) + break; + gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list); + } +out: + read_unlock(gl_lock_addr(hash)); + if (prev) + gfs2_glock_put(prev); + return has_entries; +} + +/** + * scan_glock - look at a glock and see if we can reclaim it + * @gl: the glock to look at + * + */ + +static void scan_glock(struct gfs2_glock *gl) +{ + if (gl->gl_ops == &gfs2_inode_glops) + return; + + if (gfs2_glmutex_trylock(gl)) { + if (queue_empty(gl, &gl->gl_holders) && + gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) + goto out_schedule; + gfs2_glmutex_unlock(gl); + } + return; + +out_schedule: + gfs2_glmutex_unlock(gl); + gfs2_glock_schedule_for_reclaim(gl); +} + +/** + * gfs2_scand_internal - Look for glocks and inodes to toss from memory + * @sdp: the filesystem + * + */ + +void gfs2_scand_internal(struct gfs2_sbd *sdp) +{ + unsigned int x; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) + examine_bucket(scan_glock, sdp, x); +} + +/** + * clear_glock - look at a glock and see if we can free it from glock cache + * @gl: the glock to look at + * + */ + +static void clear_glock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + int released; + + spin_lock(&sdp->sd_reclaim_lock); + if (!list_empty(&gl->gl_reclaim)) { + list_del_init(&gl->gl_reclaim); + atomic_dec(&sdp->sd_reclaim_count); + spin_unlock(&sdp->sd_reclaim_lock); + released = gfs2_glock_put(gl); + gfs2_assert(sdp, !released); + } else { + spin_unlock(&sdp->sd_reclaim_lock); + } + + if (gfs2_glmutex_trylock(gl)) { + if (queue_empty(gl, &gl->gl_holders) && + gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED); + gfs2_glmutex_unlock(gl); + } +} + +/** + * gfs2_gl_hash_clear - Empty out the glock hash table + * @sdp: the filesystem + * @wait: wait until it's all gone + * + * Called when unmounting the filesystem, or when inter-node lock manager + * requests DROPLOCKS because it is running out of capacity. + */ + +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) +{ + unsigned long t; + unsigned int x; + int cont; + + t = jiffies; + + for (;;) { + cont = 0; + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { + if (examine_bucket(clear_glock, sdp, x)) + cont = 1; + } + + if (!wait || !cont) + break; + + if (time_after_eq(jiffies, + t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) { + fs_warn(sdp, "Unmount seems to be stalled. " + "Dumping lock state...\n"); + gfs2_dump_lockstate(sdp); + t = jiffies; + } + + invalidate_inodes(sdp->sd_vfs); + msleep(10); + } +} + +/* + * Diagnostic routines to help debug distributed deadlock + */ + +/** + * dump_holder - print information about a glock holder + * @str: a string naming the type of holder + * @gh: the glock holder + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_holder(char *str, struct gfs2_holder *gh) +{ + unsigned int x; + int error = -ENOBUFS; + + printk(KERN_INFO " %s\n", str); + printk(KERN_INFO " owner = %ld\n", + (gh->gh_owner) ? (long)gh->gh_owner->pid : -1); + printk(KERN_INFO " gh_state = %u\n", gh->gh_state); + printk(KERN_INFO " gh_flags ="); + for (x = 0; x < 32; x++) + if (gh->gh_flags & (1 << x)) + printk(" %u", x); + printk(" \n"); + printk(KERN_INFO " error = %d\n", gh->gh_error); + printk(KERN_INFO " gh_iflags ="); + for (x = 0; x < 32; x++) + if (test_bit(x, &gh->gh_iflags)) + printk(" %u", x); + printk(" \n"); + print_symbol(KERN_INFO " initialized at: %s\n", gh->gh_ip); + + error = 0; + + return error; +} + +/** + * dump_inode - print information about an inode + * @ip: the inode + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_inode(struct gfs2_inode *ip) +{ + unsigned int x; + int error = -ENOBUFS; + + printk(KERN_INFO " Inode:\n"); + printk(KERN_INFO " num = %llu %llu\n", + (unsigned long long)ip->i_num.no_formal_ino, + (unsigned long long)ip->i_num.no_addr); + printk(KERN_INFO " type = %u\n", IF2DT(ip->i_di.di_mode)); + printk(KERN_INFO " i_flags ="); + for (x = 0; x < 32; x++) + if (test_bit(x, &ip->i_flags)) + printk(" %u", x); + printk(" \n"); + + error = 0; + + return error; +} + +/** + * dump_glock - print information about a glock + * @gl: the glock + * @count: where we are in the buffer + * + * Returns: 0 on success, -ENOBUFS when we run out of space + */ + +static int dump_glock(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + unsigned int x; + int error = -ENOBUFS; + + spin_lock(&gl->gl_spin); + + printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + printk(KERN_INFO " gl_flags ="); + for (x = 0; x < 32; x++) { + if (test_bit(x, &gl->gl_flags)) + printk(" %u", x); + } + printk(" \n"); + printk(KERN_INFO " gl_ref = %d\n", atomic_read(&gl->gl_ref)); + printk(KERN_INFO " gl_state = %u\n", gl->gl_state); + printk(KERN_INFO " gl_owner = %s\n", gl->gl_owner->comm); + print_symbol(KERN_INFO " gl_ip = %s\n", gl->gl_ip); + printk(KERN_INFO " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); + printk(KERN_INFO " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no"); + printk(KERN_INFO " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); + printk(KERN_INFO " object = %s\n", (gl->gl_object) ? "yes" : "no"); + printk(KERN_INFO " le = %s\n", + (list_empty(&gl->gl_le.le_list)) ? "no" : "yes"); + printk(KERN_INFO " reclaim = %s\n", + (list_empty(&gl->gl_reclaim)) ? "no" : "yes"); + if (gl->gl_aspace) + printk(KERN_INFO " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace, + gl->gl_aspace->i_mapping->nrpages); + else + printk(KERN_INFO " aspace = no\n"); + printk(KERN_INFO " ail = %d\n", atomic_read(&gl->gl_ail_count)); + if (gl->gl_req_gh) { + error = dump_holder("Request", gl->gl_req_gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + error = dump_holder("Holder", gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_waiters1, gh_list) { + error = dump_holder("Waiter1", gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_waiters2, gh_list) { + error = dump_holder("Waiter2", gh); + if (error) + goto out; + } + list_for_each_entry(gh, &gl->gl_waiters3, gh_list) { + error = dump_holder("Waiter3", gh); + if (error) + goto out; + } + if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) { + if (!test_bit(GLF_LOCK, &gl->gl_flags) && + list_empty(&gl->gl_holders)) { + error = dump_inode(gl->gl_object); + if (error) + goto out; + } else { + error = -ENOBUFS; + printk(KERN_INFO " Inode: busy\n"); + } + } + + error = 0; + +out: + spin_unlock(&gl->gl_spin); + return error; +} + +/** + * gfs2_dump_lockstate - print out the current lockstate + * @sdp: the filesystem + * @ub: the buffer to copy the information into + * + * If @ub is NULL, dump the lockstate to the console. + * + */ + +static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) +{ + struct gfs2_glock *gl; + struct hlist_node *h; + unsigned int x; + int error = 0; + + for (x = 0; x < GFS2_GL_HASH_SIZE; x++) { + + read_lock(gl_lock_addr(x)); + + hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) { + if (gl->gl_sbd != sdp) + continue; + + error = dump_glock(gl); + if (error) + break; + } + + read_unlock(gl_lock_addr(x)); + + if (error) + break; + } + + + return error; +} + +int __init gfs2_glock_init(void) +{ + unsigned i; + for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); + } +#ifdef GL_HASH_LOCK_SZ + for(i = 0; i < GL_HASH_LOCK_SZ; i++) { + rwlock_init(&gl_hash_locks[i]); + } +#endif + return 0; +} + diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h new file mode 100644 index 000000000000..2b2a889ee2cc --- /dev/null +++ b/fs/gfs2/glock.h @@ -0,0 +1,153 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOCK_DOT_H__ +#define __GLOCK_DOT_H__ + +#include "incore.h" + +/* Flags for lock requests; used in gfs2_holder gh_flag field. + From lm_interface.h: +#define LM_FLAG_TRY 0x00000001 +#define LM_FLAG_TRY_1CB 0x00000002 +#define LM_FLAG_NOEXP 0x00000004 +#define LM_FLAG_ANY 0x00000008 +#define LM_FLAG_PRIORITY 0x00000010 */ + +#define GL_LOCAL_EXCL 0x00000020 +#define GL_ASYNC 0x00000040 +#define GL_EXACT 0x00000080 +#define GL_SKIP 0x00000100 +#define GL_ATIME 0x00000200 +#define GL_NOCACHE 0x00000400 +#define GL_NOCANCEL 0x00001000 +#define GL_AOP 0x00004000 +#define GL_DUMP 0x00008000 + +#define GLR_TRYFAILED 13 +#define GLR_CANCELED 14 + +static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + int locked = 0; + + /* Look in glock's list of holders for one with current task as owner */ + spin_lock(&gl->gl_spin); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (gh->gh_owner == current) { + locked = 1; + break; + } + } + spin_unlock(&gl->gl_spin); + + return locked; +} + +static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_EXCLUSIVE; +} + +static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_DEFERRED; +} + +static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_SHARED; +} + +static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) +{ + int ret; + spin_lock(&gl->gl_spin); + ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3); + spin_unlock(&gl->gl_spin); + return ret; +} + +int gfs2_glock_get(struct gfs2_sbd *sdp, + u64 number, const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +void gfs2_glock_hold(struct gfs2_glock *gl); +int gfs2_glock_put(struct gfs2_glock *gl); +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_reinit(unsigned int state, unsigned flags, + struct gfs2_holder *gh); +void gfs2_holder_uninit(struct gfs2_holder *gh); + +void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags); +void gfs2_glock_drop_th(struct gfs2_glock *gl); + +int gfs2_glock_nq(struct gfs2_holder *gh); +int gfs2_glock_poll(struct gfs2_holder *gh); +int gfs2_glock_wait(struct gfs2_holder *gh); +void gfs2_glock_dq(struct gfs2_holder *gh); + +int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time); + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, + u64 number, const struct gfs2_glock_operations *glops, + unsigned int state, int flags, struct gfs2_holder *gh); + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); + +void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, int flags); +void gfs2_glock_inode_squish(struct inode *inode); + +/** + * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Returns: 0, GLR_*, or errno + */ + +static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, + unsigned int state, int flags, + struct gfs2_holder *gh) +{ + int error; + + gfs2_holder_init(gl, state, flags, gh); + + error = gfs2_glock_nq(gh); + if (error) + gfs2_holder_uninit(gh); + + return error; +} + +/* Lock Value Block functions */ + +int gfs2_lvb_hold(struct gfs2_glock *gl); +void gfs2_lvb_unhold(struct gfs2_glock *gl); + +void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); + +void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); +void gfs2_reclaim_glock(struct gfs2_sbd *sdp); + +void gfs2_scand_internal(struct gfs2_sbd *sdp); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); + +int __init gfs2_glock_init(void); + +#endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c new file mode 100644 index 000000000000..41a6b6818a50 --- /dev/null +++ b/fs/gfs2/glops.c @@ -0,0 +1,615 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "util.h" +#include "trans.h" + +/** + * ail_empty_gl - remove all buffers for a given lock from the AIL + * @gl: the glock + * + * None of the buffers should be dirty, locked, or pinned. + */ + +static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int blocks; + struct list_head *head = &gl->gl_ail_list; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + u64 blkno; + int error; + + blocks = atomic_read(&gl->gl_ail_count); + if (!blocks) + return; + + error = gfs2_trans_begin(sdp, 0, blocks); + if (gfs2_assert_withdraw(sdp, !error)) + return; + + gfs2_log_lock(sdp); + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, + bd_ail_gl_list); + bh = bd->bd_bh; + blkno = bh->b_blocknr; + gfs2_assert_withdraw(sdp, !buffer_busy(bh)); + + bd->bd_ail = NULL; + list_del(&bd->bd_ail_st_list); + list_del(&bd->bd_ail_gl_list); + atomic_dec(&gl->gl_ail_count); + brelse(bh); + gfs2_log_unlock(sdp); + + gfs2_trans_add_revoke(sdp, blkno); + + gfs2_log_lock(sdp); + } + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + gfs2_log_unlock(sdp); + + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL); +} + +/** + * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock + * @gl: the glock + * + */ + +static void gfs2_pte_inval(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + struct inode *inode; + + ip = gl->gl_object; + inode = &ip->i_inode; + if (!ip || !S_ISREG(ip->i_di.di_mode)) + return; + + if (!test_bit(GIF_PAGED, &ip->i_flags)) + return; + + unmap_shared_mapping_range(inode->i_mapping, 0, 0); + + if (test_bit(GIF_SW_PAGED, &ip->i_flags)) + set_bit(GLF_DIRTY, &gl->gl_flags); + + clear_bit(GIF_SW_PAGED, &ip->i_flags); +} + +/** + * gfs2_page_inval - Invalidate all pages associated with a glock + * @gl: the glock + * + */ + +static void gfs2_page_inval(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + struct inode *inode; + + ip = gl->gl_object; + inode = &ip->i_inode; + if (!ip || !S_ISREG(ip->i_di.di_mode)) + return; + + truncate_inode_pages(inode->i_mapping, 0); + gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages); + clear_bit(GIF_PAGED, &ip->i_flags); +} + +/** + * gfs2_page_wait - Wait for writeback of data + * @gl: the glock + * + * Syncs data (not metadata) for a regular file. + * No-op for all other types. + */ + +static void gfs2_page_wait(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + struct inode *inode = &ip->i_inode; + struct address_space *mapping = inode->i_mapping; + int error; + + if (!S_ISREG(ip->i_di.di_mode)) + return; + + error = filemap_fdatawait(mapping); + + /* Put back any errors cleared by filemap_fdatawait() + so they can be caught by someone who can pass them + up to user space. */ + + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else if (error) + set_bit(AS_EIO, &mapping->flags); + +} + +static void gfs2_page_writeback(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + struct inode *inode = &ip->i_inode; + struct address_space *mapping = inode->i_mapping; + + if (!S_ISREG(ip->i_di.di_mode)) + return; + + filemap_fdatawrite(mapping); +} + +/** + * meta_go_sync - sync out the metadata for this glock + * @gl: the glock + * @flags: DIO_* + * + * Called when demoting or unlocking an EX glock. We must flush + * to disk all dirty buffers/pages relating to this glock, and must not + * not return to caller to demote/unlock the glock until I/O is complete. + */ + +static void meta_go_sync(struct gfs2_glock *gl, int flags) +{ + if (!(flags & DIO_METADATA)) + return; + + if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) { + gfs2_log_flush(gl->gl_sbd, gl); + gfs2_meta_sync(gl); + if (flags & DIO_RELEASE) + gfs2_ail_empty_gl(gl); + } + +} + +/** + * meta_go_inval - invalidate the metadata for this glock + * @gl: the glock + * @flags: + * + */ + +static void meta_go_inval(struct gfs2_glock *gl, int flags) +{ + if (!(flags & DIO_METADATA)) + return; + + gfs2_meta_inval(gl); + gl->gl_vn++; +} + +/** + * inode_go_xmote_th - promote/demote a glock + * @gl: the glock + * @state: the requested state + * @flags: + * + */ + +static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state, + int flags) +{ + if (gl->gl_state != LM_ST_UNLOCKED) + gfs2_pte_inval(gl); + gfs2_glock_xmote_th(gl, state, flags); +} + +/** + * inode_go_xmote_bh - After promoting/demoting a glock + * @gl: the glock + * + */ + +static void inode_go_xmote_bh(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh = gl->gl_req_gh; + struct buffer_head *bh; + int error; + + if (gl->gl_state != LM_ST_UNLOCKED && + (!gh || !(gh->gh_flags & GL_SKIP))) { + error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh); + if (!error) + brelse(bh); + } +} + +/** + * inode_go_drop_th - unlock a glock + * @gl: the glock + * + * Invoked from rq_demote(). + * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long) + * is being purged from our node's glock cache; we're dropping lock. + */ + +static void inode_go_drop_th(struct gfs2_glock *gl) +{ + gfs2_pte_inval(gl); + gfs2_glock_drop_th(gl); +} + +/** + * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * @gl: the glock protecting the inode + * @flags: + * + */ + +static void inode_go_sync(struct gfs2_glock *gl, int flags) +{ + int meta = (flags & DIO_METADATA); + int data = (flags & DIO_DATA); + + if (test_bit(GLF_DIRTY, &gl->gl_flags)) { + if (meta && data) { + gfs2_page_writeback(gl); + gfs2_log_flush(gl->gl_sbd, gl); + gfs2_meta_sync(gl); + gfs2_page_wait(gl); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } else if (meta) { + gfs2_log_flush(gl->gl_sbd, gl); + gfs2_meta_sync(gl); + } else if (data) { + gfs2_page_writeback(gl); + gfs2_page_wait(gl); + } + if (flags & DIO_RELEASE) + gfs2_ail_empty_gl(gl); + } +} + +/** + * inode_go_inval - prepare a inode glock to be released + * @gl: the glock + * @flags: + * + */ + +static void inode_go_inval(struct gfs2_glock *gl, int flags) +{ + int meta = (flags & DIO_METADATA); + int data = (flags & DIO_DATA); + + if (meta) { + gfs2_meta_inval(gl); + gl->gl_vn++; + } + if (data) + gfs2_page_inval(gl); +} + +/** + * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int inode_go_demote_ok(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + int demote = 0; + + if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) + demote = 1; + else if (!sdp->sd_args.ar_localcaching && + time_after_eq(jiffies, gl->gl_stamp + + gfs2_tune_get(sdp, gt_demote_secs) * HZ)) + demote = 1; + + return demote; +} + +/** + * inode_go_lock - operation done after an inode lock is locked by a process + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int inode_go_lock(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_inode *ip = gl->gl_object; + int error = 0; + + if (!ip) + return 0; + + if (ip->i_vn != gl->gl_vn) { + error = gfs2_inode_refresh(ip); + if (error) + return error; + gfs2_inode_attr_in(ip); + } + + if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && + (gl->gl_state == LM_ST_EXCLUSIVE) && + (gh->gh_flags & GL_LOCAL_EXCL)) + error = gfs2_truncatei_resume(ip); + + return error; +} + +/** + * inode_go_unlock - operation done before an inode lock is unlocked by a + * process + * @gl: the glock + * @flags: + * + */ + +static void inode_go_unlock(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_inode *ip = gl->gl_object; + + if (ip == NULL) + return; + if (test_bit(GLF_DIRTY, &gl->gl_flags)) + gfs2_inode_attr_in(ip); + gfs2_meta_cache_flush(ip); +} + +/** + * inode_greedy - + * @gl: the glock + * + */ + +static void inode_greedy(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = gl->gl_object; + unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum); + unsigned int max = gfs2_tune_get(sdp, gt_greedy_max); + unsigned int new_time; + + spin_lock(&ip->i_spin); + + if (time_after(ip->i_last_pfault + quantum, jiffies)) { + new_time = ip->i_greedy + quantum; + if (new_time > max) + new_time = max; + } else { + new_time = ip->i_greedy - quantum; + if (!new_time || new_time > max) + new_time = 1; + } + + ip->i_greedy = new_time; + + spin_unlock(&ip->i_spin); + + iput(&ip->i_inode); +} + +/** + * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int rgrp_go_demote_ok(struct gfs2_glock *gl) +{ + return !gl->gl_aspace->i_mapping->nrpages; +} + +/** + * rgrp_go_lock - operation done after an rgrp lock is locked by + * a first holder on this node. + * @gl: the glock + * @flags: + * + * Returns: errno + */ + +static int rgrp_go_lock(struct gfs2_holder *gh) +{ + return gfs2_rgrp_bh_get(gh->gh_gl->gl_object); +} + +/** + * rgrp_go_unlock - operation done before an rgrp lock is unlocked by + * a last holder on this node. + * @gl: the glock + * @flags: + * + */ + +static void rgrp_go_unlock(struct gfs2_holder *gh) +{ + gfs2_rgrp_bh_put(gh->gh_gl->gl_object); +} + +/** + * trans_go_xmote_th - promote/demote the transaction glock + * @gl: the glock + * @state: the requested state + * @flags: + * + */ + +static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state, + int flags) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (gl->gl_state != LM_ST_UNLOCKED && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + } + + gfs2_glock_xmote_th(gl, state, flags); +} + +/** + * trans_go_xmote_bh - After promoting/demoting the transaction glock + * @gl: the glock + * + */ + +static void trans_go_xmote_bh(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_log_header head; + int error; + + if (gl->gl_state != LM_ST_UNLOCKED && + test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode)); + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + gfs2_consist(sdp); + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) + gfs2_consist(sdp); + + /* Initialize some head of the log stuff */ + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + } + } +} + +/** + * trans_go_drop_th - unlock the transaction glock + * @gl: the glock + * + * We want to sync the device even with localcaching. Remember + * that localcaching journal replay only marks buffers dirty. + */ + +static void trans_go_drop_th(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + } + + gfs2_glock_drop_th(gl); +} + +/** + * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int quota_go_demote_ok(struct gfs2_glock *gl) +{ + return !atomic_read(&gl->gl_lvb_count); +} + +const struct gfs2_glock_operations gfs2_meta_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_META, +}; + +const struct gfs2_glock_operations gfs2_inode_glops = { + .go_xmote_th = inode_go_xmote_th, + .go_xmote_bh = inode_go_xmote_bh, + .go_drop_th = inode_go_drop_th, + .go_sync = inode_go_sync, + .go_inval = inode_go_inval, + .go_demote_ok = inode_go_demote_ok, + .go_lock = inode_go_lock, + .go_unlock = inode_go_unlock, + .go_greedy = inode_greedy, + .go_type = LM_TYPE_INODE, +}; + +const struct gfs2_glock_operations gfs2_rgrp_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_sync = meta_go_sync, + .go_inval = meta_go_inval, + .go_demote_ok = rgrp_go_demote_ok, + .go_lock = rgrp_go_lock, + .go_unlock = rgrp_go_unlock, + .go_type = LM_TYPE_RGRP, +}; + +const struct gfs2_glock_operations gfs2_trans_glops = { + .go_xmote_th = trans_go_xmote_th, + .go_xmote_bh = trans_go_xmote_bh, + .go_drop_th = trans_go_drop_th, + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_iopen_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_IOPEN, +}; + +const struct gfs2_glock_operations gfs2_flock_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_FLOCK, +}; + +const struct gfs2_glock_operations gfs2_nondisk_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_quota_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_demote_ok = quota_go_demote_ok, + .go_type = LM_TYPE_QUOTA, +}; + +const struct gfs2_glock_operations gfs2_journal_glops = { + .go_xmote_th = gfs2_glock_xmote_th, + .go_drop_th = gfs2_glock_drop_th, + .go_type = LM_TYPE_JOURNAL, +}; + diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h new file mode 100644 index 000000000000..a1d9b5b024e6 --- /dev/null +++ b/fs/gfs2/glops.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __GLOPS_DOT_H__ +#define __GLOPS_DOT_H__ + +#include "incore.h" + +extern const struct gfs2_glock_operations gfs2_meta_glops; +extern const struct gfs2_glock_operations gfs2_inode_glops; +extern const struct gfs2_glock_operations gfs2_rgrp_glops; +extern const struct gfs2_glock_operations gfs2_trans_glops; +extern const struct gfs2_glock_operations gfs2_iopen_glops; +extern const struct gfs2_glock_operations gfs2_flock_glops; +extern const struct gfs2_glock_operations gfs2_nondisk_glops; +extern const struct gfs2_glock_operations gfs2_quota_glops; +extern const struct gfs2_glock_operations gfs2_journal_glops; + +#endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h new file mode 100644 index 000000000000..118dc693d111 --- /dev/null +++ b/fs/gfs2/incore.h @@ -0,0 +1,634 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __INCORE_DOT_H__ +#define __INCORE_DOT_H__ + +#include <linux/fs.h> + +#define DIO_WAIT 0x00000010 +#define DIO_METADATA 0x00000020 +#define DIO_DATA 0x00000040 +#define DIO_RELEASE 0x00000080 +#define DIO_ALL 0x00000100 + +struct gfs2_log_operations; +struct gfs2_log_element; +struct gfs2_holder; +struct gfs2_glock; +struct gfs2_quota_data; +struct gfs2_trans; +struct gfs2_ail; +struct gfs2_jdesc; +struct gfs2_sbd; + +typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); + +/* + * Structure of operations that are associated with each + * type of element in the log. + */ + +struct gfs2_log_operations { + void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); + void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_before_commit) (struct gfs2_sbd *sdp); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); + void (*lo_before_scan) (struct gfs2_jdesc *jd, + struct gfs2_log_header *head, int pass); + int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass); + void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); + const char *lo_name; +}; + +struct gfs2_log_element { + struct list_head le_list; + const struct gfs2_log_operations *le_ops; +}; + +struct gfs2_bitmap { + struct buffer_head *bi_bh; + char *bi_clone; + u32 bi_offset; + u32 bi_start; + u32 bi_len; +}; + +struct gfs2_rgrpd { + struct list_head rd_list; /* Link with superblock */ + struct list_head rd_list_mru; + struct list_head rd_recent; /* Recently used rgrps */ + struct gfs2_glock *rd_gl; /* Glock for this rgrp */ + struct gfs2_rindex rd_ri; + struct gfs2_rgrp rd_rg; + u64 rd_rg_vn; + struct gfs2_bitmap *rd_bits; + unsigned int rd_bh_count; + struct mutex rd_mutex; + u32 rd_free_clone; + struct gfs2_log_element rd_le; + u32 rd_last_alloc_data; + u32 rd_last_alloc_meta; + struct gfs2_sbd *rd_sbd; +}; + +enum gfs2_state_bits { + BH_Pinned = BH_PrivateStart, + BH_Escaped = BH_PrivateStart + 1, +}; + +BUFFER_FNS(Pinned, pinned) +TAS_BUFFER_FNS(Pinned, pinned) +BUFFER_FNS(Escaped, escaped) +TAS_BUFFER_FNS(Escaped, escaped) + +struct gfs2_bufdata { + struct buffer_head *bd_bh; + struct gfs2_glock *bd_gl; + + struct list_head bd_list_tr; + struct gfs2_log_element bd_le; + + struct gfs2_ail *bd_ail; + struct list_head bd_ail_st_list; + struct list_head bd_ail_gl_list; +}; + +struct gfs2_glock_operations { + void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state, + int flags); + void (*go_xmote_bh) (struct gfs2_glock * gl); + void (*go_drop_th) (struct gfs2_glock * gl); + void (*go_drop_bh) (struct gfs2_glock * gl); + void (*go_sync) (struct gfs2_glock * gl, int flags); + void (*go_inval) (struct gfs2_glock * gl, int flags); + int (*go_demote_ok) (struct gfs2_glock * gl); + int (*go_lock) (struct gfs2_holder * gh); + void (*go_unlock) (struct gfs2_holder * gh); + void (*go_callback) (struct gfs2_glock * gl, unsigned int state); + void (*go_greedy) (struct gfs2_glock * gl); + const int go_type; +}; + +enum { + /* Actions */ + HIF_MUTEX = 0, + HIF_PROMOTE = 1, + HIF_DEMOTE = 2, + HIF_GREEDY = 3, + + /* States */ + HIF_ALLOCED = 4, + HIF_DEALLOC = 5, + HIF_HOLDER = 6, + HIF_FIRST = 7, + HIF_ABORTED = 9, +}; + +struct gfs2_holder { + struct list_head gh_list; + + struct gfs2_glock *gh_gl; + struct task_struct *gh_owner; + unsigned int gh_state; + unsigned gh_flags; + + int gh_error; + unsigned long gh_iflags; + struct completion gh_wait; + unsigned long gh_ip; +}; + +enum { + GLF_LOCK = 1, + GLF_STICKY = 2, + GLF_PREFETCH = 3, + GLF_DIRTY = 5, + GLF_SKIP_WAITERS2 = 6, + GLF_GREEDY = 7, +}; + +struct gfs2_glock { + struct hlist_node gl_list; + unsigned long gl_flags; /* GLF_... */ + struct lm_lockname gl_name; + atomic_t gl_ref; + + spinlock_t gl_spin; + + unsigned int gl_state; + unsigned int gl_hash; + struct task_struct *gl_owner; + unsigned long gl_ip; + struct list_head gl_holders; + struct list_head gl_waiters1; /* HIF_MUTEX */ + struct list_head gl_waiters2; /* HIF_DEMOTE, HIF_GREEDY */ + struct list_head gl_waiters3; /* HIF_PROMOTE */ + + const struct gfs2_glock_operations *gl_ops; + + struct gfs2_holder *gl_req_gh; + gfs2_glop_bh_t gl_req_bh; + + void *gl_lock; + char *gl_lvb; + atomic_t gl_lvb_count; + + u64 gl_vn; + unsigned long gl_stamp; + void *gl_object; + + struct list_head gl_reclaim; + + struct gfs2_sbd *gl_sbd; + + struct inode *gl_aspace; + struct gfs2_log_element gl_le; + struct list_head gl_ail_list; + atomic_t gl_ail_count; +}; + +struct gfs2_alloc { + /* Quota stuff */ + + struct gfs2_quota_data *al_qd[2*MAXQUOTAS]; + struct gfs2_holder al_qd_ghs[2*MAXQUOTAS]; + unsigned int al_qd_num; + + u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */ + u32 al_alloced; /* Filled in by gfs2_alloc_*() */ + + /* Filled in by gfs2_inplace_reserve() */ + + unsigned int al_line; + char *al_file; + struct gfs2_holder al_ri_gh; + struct gfs2_holder al_rgd_gh; + struct gfs2_rgrpd *al_rgd; + +}; + +enum { + GIF_QD_LOCKED = 1, + GIF_PAGED = 2, + GIF_SW_PAGED = 3, +}; + +struct gfs2_inode { + struct inode i_inode; + struct gfs2_inum i_num; + + unsigned long i_flags; /* GIF_... */ + + u64 i_vn; + struct gfs2_dinode i_di; /* To be replaced by ref to block */ + + struct gfs2_glock *i_gl; /* Move into i_gh? */ + struct gfs2_holder i_iopen_gh; + struct gfs2_holder i_gh; /* for prepare/commit_write only */ + struct gfs2_alloc i_alloc; + u64 i_last_rg_alloc; + + spinlock_t i_spin; + struct rw_semaphore i_rw_mutex; + unsigned int i_greedy; + unsigned long i_last_pfault; + + struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT]; +}; + +/* + * Since i_inode is the first element of struct gfs2_inode, + * this is effectively a cast. + */ +static inline struct gfs2_inode *GFS2_I(struct inode *inode) +{ + return container_of(inode, struct gfs2_inode, i_inode); +} + +/* To be removed? */ +static inline struct gfs2_sbd *GFS2_SB(struct inode *inode) +{ + return inode->i_sb->s_fs_info; +} + +enum { + GFF_DID_DIRECT_ALLOC = 0, + GFF_EXLOCK = 1, +}; + +struct gfs2_file { + unsigned long f_flags; /* GFF_... */ + struct mutex f_fl_mutex; + struct gfs2_holder f_fl_gh; +}; + +struct gfs2_revoke { + struct gfs2_log_element rv_le; + u64 rv_blkno; +}; + +struct gfs2_revoke_replay { + struct list_head rr_list; + u64 rr_blkno; + unsigned int rr_where; +}; + +enum { + QDF_USER = 0, + QDF_CHANGE = 1, + QDF_LOCKED = 2, +}; + +struct gfs2_quota_lvb { + __be32 qb_magic; + u32 __pad; + __be64 qb_limit; /* Hard limit of # blocks to alloc */ + __be64 qb_warn; /* Warn user when alloc is above this # */ + __be64 qb_value; /* Current # blocks allocated */ +}; + +struct gfs2_quota_data { + struct list_head qd_list; + unsigned int qd_count; + + u32 qd_id; + unsigned long qd_flags; /* QDF_... */ + + s64 qd_change; + s64 qd_change_sync; + + unsigned int qd_slot; + unsigned int qd_slot_count; + + struct buffer_head *qd_bh; + struct gfs2_quota_change *qd_bh_qc; + unsigned int qd_bh_count; + + struct gfs2_glock *qd_gl; + struct gfs2_quota_lvb qd_qb; + + u64 qd_sync_gen; + unsigned long qd_last_warn; + unsigned long qd_last_touched; +}; + +struct gfs2_log_buf { + struct list_head lb_list; + struct buffer_head *lb_bh; + struct buffer_head *lb_real; +}; + +struct gfs2_trans { + unsigned long tr_ip; + + unsigned int tr_blocks; + unsigned int tr_revokes; + unsigned int tr_reserved; + + struct gfs2_holder tr_t_gh; + + int tr_touched; + + unsigned int tr_num_buf; + unsigned int tr_num_buf_new; + unsigned int tr_num_buf_rm; + struct list_head tr_list_buf; + + unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; +}; + +struct gfs2_ail { + struct list_head ai_list; + + unsigned int ai_first; + struct list_head ai_ail1_list; + struct list_head ai_ail2_list; + + u64 ai_sync_gen; +}; + +struct gfs2_jdesc { + struct list_head jd_list; + + struct inode *jd_inode; + unsigned int jd_jid; + int jd_dirty; + + unsigned int jd_blocks; +}; + +#define GFS2_GLOCKD_DEFAULT 1 +#define GFS2_GLOCKD_MAX 16 + +#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF +#define GFS2_QUOTA_OFF 0 +#define GFS2_QUOTA_ACCOUNT 1 +#define GFS2_QUOTA_ON 2 + +#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED +#define GFS2_DATA_WRITEBACK 1 +#define GFS2_DATA_ORDERED 2 + +struct gfs2_args { + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + int ar_spectator; /* Don't get a journal because we're always RO */ + int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ + int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ + int ar_localcaching; /* Local-style caching (dangerous on multihost) */ + int ar_debug; /* Oops on errors instead of trying to be graceful */ + int ar_upgrade; /* Upgrade ondisk/multihost format */ + unsigned int ar_num_glockd; /* Number of glockd threads */ + int ar_posix_acl; /* Enable posix acls */ + int ar_quota; /* off/account/on */ + int ar_suiddir; /* suiddir support */ + int ar_data; /* ordered/writeback */ +}; + +struct gfs2_tune { + spinlock_t gt_spin; + + unsigned int gt_ilimit; + unsigned int gt_ilimit_tries; + unsigned int gt_ilimit_min; + unsigned int gt_demote_secs; /* Cache retention for unheld glock */ + unsigned int gt_incore_log_blocks; + unsigned int gt_log_flush_secs; + unsigned int gt_jindex_refresh_secs; /* Check for new journal index */ + + unsigned int gt_scand_secs; + unsigned int gt_recoverd_secs; + unsigned int gt_logd_secs; + unsigned int gt_quotad_secs; + + unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ + unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ + unsigned int gt_quota_scale_num; /* Numerator */ + unsigned int gt_quota_scale_den; /* Denominator */ + unsigned int gt_quota_cache_secs; + unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ + unsigned int gt_atime_quantum; /* Min secs between atime updates */ + unsigned int gt_new_files_jdata; + unsigned int gt_new_files_directio; + unsigned int gt_max_atomic_write; /* Split big writes into this size */ + unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ + unsigned int gt_lockdump_size; + unsigned int gt_stall_secs; /* Detects trouble! */ + unsigned int gt_complain_secs; + unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */ + unsigned int gt_entries_per_readdir; + unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */ + unsigned int gt_greedy_default; + unsigned int gt_greedy_quantum; + unsigned int gt_greedy_max; + unsigned int gt_statfs_quantum; + unsigned int gt_statfs_slow; +}; + +enum { + SDF_JOURNAL_CHECKED = 0, + SDF_JOURNAL_LIVE = 1, + SDF_SHUTDOWN = 2, + SDF_NOATIME = 3, +}; + +#define GFS2_FSNAME_LEN 256 + +struct gfs2_sbd { + struct super_block *sd_vfs; + struct super_block *sd_vfs_meta; + struct kobject sd_kobj; + unsigned long sd_flags; /* SDF_... */ + struct gfs2_sb sd_sb; + + /* Constants computed on mount */ + + u32 sd_fsb2bb; + u32 sd_fsb2bb_shift; + u32 sd_diptrs; /* Number of pointers in a dinode */ + u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_jbsize; /* Size of a journaled data block */ + u32 sd_hash_bsize; /* sizeof(exhash block) */ + u32 sd_hash_bsize_shift; + u32 sd_hash_ptrs; /* Number of pointers in a hash block */ + u32 sd_qc_per_block; + u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ + u32 sd_max_height; /* Max height of a file's metadata tree */ + u64 sd_heightsize[GFS2_MAX_META_HEIGHT]; + u32 sd_max_jheight; /* Max height of journaled file's meta tree */ + u64 sd_jheightsize[GFS2_MAX_META_HEIGHT]; + + struct gfs2_args sd_args; /* Mount arguments */ + struct gfs2_tune sd_tune; /* Filesystem tuning structure */ + + /* Lock Stuff */ + + struct lm_lockstruct sd_lockstruct; + struct list_head sd_reclaim_list; + spinlock_t sd_reclaim_lock; + wait_queue_head_t sd_reclaim_wq; + atomic_t sd_reclaim_count; + struct gfs2_holder sd_live_gh; + struct gfs2_glock *sd_rename_gl; + struct gfs2_glock *sd_trans_gl; + + /* Inode Stuff */ + + struct inode *sd_master_dir; + struct inode *sd_jindex; + struct inode *sd_inum_inode; + struct inode *sd_statfs_inode; + struct inode *sd_ir_inode; + struct inode *sd_sc_inode; + struct inode *sd_qc_inode; + struct inode *sd_rindex; + struct inode *sd_quota_inode; + + /* Inum stuff */ + + struct mutex sd_inum_mutex; + + /* StatFS stuff */ + + spinlock_t sd_statfs_spin; + struct mutex sd_statfs_mutex; + struct gfs2_statfs_change sd_statfs_master; + struct gfs2_statfs_change sd_statfs_local; + unsigned long sd_statfs_sync_time; + + /* Resource group stuff */ + + u64 sd_rindex_vn; + spinlock_t sd_rindex_spin; + struct mutex sd_rindex_mutex; + struct list_head sd_rindex_list; + struct list_head sd_rindex_mru_list; + struct list_head sd_rindex_recent_list; + struct gfs2_rgrpd *sd_rindex_forward; + unsigned int sd_rgrps; + + /* Journal index stuff */ + + struct list_head sd_jindex_list; + spinlock_t sd_jindex_spin; + struct mutex sd_jindex_mutex; + unsigned int sd_journals; + unsigned long sd_jindex_refresh_time; + + struct gfs2_jdesc *sd_jdesc; + struct gfs2_holder sd_journal_gh; + struct gfs2_holder sd_jinode_gh; + + struct gfs2_holder sd_ir_gh; + struct gfs2_holder sd_sc_gh; + struct gfs2_holder sd_qc_gh; + + /* Daemon stuff */ + + struct task_struct *sd_scand_process; + struct task_struct *sd_recoverd_process; + struct task_struct *sd_logd_process; + struct task_struct *sd_quotad_process; + struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX]; + unsigned int sd_glockd_num; + + /* Quota stuff */ + + struct list_head sd_quota_list; + atomic_t sd_quota_count; + spinlock_t sd_quota_spin; + struct mutex sd_quota_mutex; + + unsigned int sd_quota_slots; + unsigned int sd_quota_chunks; + unsigned char **sd_quota_bitmap; + + u64 sd_quota_sync_gen; + unsigned long sd_quota_sync_time; + + /* Log stuff */ + + spinlock_t sd_log_lock; + + unsigned int sd_log_blks_reserved; + unsigned int sd_log_commited_buf; + unsigned int sd_log_commited_revoke; + + unsigned int sd_log_num_gl; + unsigned int sd_log_num_buf; + unsigned int sd_log_num_revoke; + unsigned int sd_log_num_rg; + unsigned int sd_log_num_databuf; + unsigned int sd_log_num_jdata; + unsigned int sd_log_num_hdrs; + + struct list_head sd_log_le_gl; + struct list_head sd_log_le_buf; + struct list_head sd_log_le_revoke; + struct list_head sd_log_le_rg; + struct list_head sd_log_le_databuf; + + unsigned int sd_log_blks_free; + struct mutex sd_log_reserve_mutex; + + u64 sd_log_sequence; + unsigned int sd_log_head; + unsigned int sd_log_tail; + int sd_log_idle; + + unsigned long sd_log_flush_time; + struct rw_semaphore sd_log_flush_lock; + struct list_head sd_log_flush_list; + + unsigned int sd_log_flush_head; + u64 sd_log_flush_wrapped; + + struct list_head sd_ail1_list; + struct list_head sd_ail2_list; + u64 sd_ail_sync_gen; + + /* Replay stuff */ + + struct list_head sd_revoke_list; + unsigned int sd_replay_tail; + + unsigned int sd_found_blocks; + unsigned int sd_found_revokes; + unsigned int sd_replayed_blocks; + + /* For quiescing the filesystem */ + + struct gfs2_holder sd_freeze_gh; + struct mutex sd_freeze_lock; + unsigned int sd_freeze_count; + + /* Counters */ + + atomic_t sd_glock_count; + atomic_t sd_glock_held_count; + atomic_t sd_inode_count; + atomic_t sd_reclaimed; + + char sd_fsname[GFS2_FSNAME_LEN]; + char sd_table_name[GFS2_FSNAME_LEN]; + char sd_proto_name[GFS2_FSNAME_LEN]; + + /* Debugging crud */ + + unsigned long sd_last_warning; + struct vfsmount *sd_gfs2mnt; +}; + +#endif /* __INCORE_DOT_H__ */ + diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c new file mode 100644 index 000000000000..57c43ac47925 --- /dev/null +++ b/fs/gfs2/inode.c @@ -0,0 +1,1379 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/posix_acl.h> +#include <linux/sort.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> +#include <linux/security.h> + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "eattr.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "ops_address.h" +#include "ops_file.h" +#include "ops_inode.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode + * @ip: The GFS2 inode (with embedded disk inode data) + * @inode: The Linux VFS inode + * + */ + +void gfs2_inode_attr_in(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_dinode *di = &ip->i_di; + + inode->i_ino = ip->i_num.no_addr; + + switch (di->di_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = MKDEV(di->di_major, di->di_minor); + break; + default: + inode->i_rdev = 0; + break; + }; + + inode->i_mode = di->di_mode; + inode->i_nlink = di->di_nlink; + inode->i_uid = di->di_uid; + inode->i_gid = di->di_gid; + i_size_write(inode, di->di_size); + inode->i_atime.tv_sec = di->di_atime; + inode->i_mtime.tv_sec = di->di_mtime; + inode->i_ctime.tv_sec = di->di_ctime; + inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_blocks = di->di_blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); + + if (di->di_flags & GFS2_DIF_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + + if (di->di_flags & GFS2_DIF_APPENDONLY) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; +} + +/** + * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode + * @ip: The GFS2 inode + * + * Only copy out the attributes that we want the VFS layer + * to be able to modify. + */ + +void gfs2_inode_attr_out(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_dinode *di = &ip->i_di; + gfs2_assert_withdraw(GFS2_SB(inode), + (di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT)); + di->di_mode = inode->i_mode; + di->di_uid = inode->i_uid; + di->di_gid = inode->i_gid; + di->di_atime = inode->i_atime.tv_sec; + di->di_mtime = inode->i_mtime.tv_sec; + di->di_ctime = inode->i_ctime.tv_sec; +} + +static int iget_test(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inum *inum = opaque; + + if (ip && ip->i_num.no_addr == inum->no_addr) + return 1; + + return 0; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_inum *inum = opaque; + + ip->i_num = *inum; + return 0; +} + +struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum) +{ + return ilookup5(sb, (unsigned long)inum->no_formal_ino, + iget_test, inum); +} + +static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum) +{ + return iget5_locked(sb, (unsigned long)inum->no_formal_ino, + iget_test, iget_set, inum); +} + +/** + * gfs2_inode_lookup - Lookup an inode + * @sb: The super block + * @inum: The inode number + * @type: The type of the inode + * + * Returns: A VFS inode, or an error + */ + +struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type) +{ + struct inode *inode = gfs2_iget(sb, inum); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *io_gl; + int error; + + if (inode->i_state & I_NEW) { + struct gfs2_sbd *sdp = GFS2_SB(inode); + umode_t mode = DT2IF(type); + inode->i_private = ip; + inode->i_mode = mode; + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + inode->i_fop = &gfs2_file_fops; + inode->i_mapping->a_ops = &gfs2_file_aops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_dev_iops; + } + + error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (unlikely(error)) + goto fail; + ip->i_gl->gl_object = ip; + + error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (unlikely(error)) + goto fail_put; + + ip->i_vn = ip->i_gl->gl_vn - 1; + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); + if (unlikely(error)) + goto fail_iopen; + + gfs2_glock_put(io_gl); + unlock_new_inode(inode); + } + + return inode; +fail_iopen: + gfs2_glock_put(io_gl); +fail_put: + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); +fail: + iput(inode); + return ERR_PTR(error); +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { + brelse(dibh); + return -EIO; + } + + gfs2_dinode_in(&ip->i_di, dibh->b_data); + + brelse(dibh); + + if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino) + return -ESTALE; + + ip->i_vn = ip->i_gl->gl_vn; + + return 0; +} + +int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al; + struct gfs2_rgrpd *rgd; + int error; + + if (ip->i_di.di_blocks != 1) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + if (error) + goto out_qs; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_rindex_relse; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + goto out_rindex_relse; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1); + if (error) + goto out_rg_gunlock; + + gfs2_trans_add_gl(ip->i_gl); + + gfs2_free_di(rgd, ip); + + gfs2_trans_end(sdp); + clear_bit(GLF_STICKY, &ip->i_gl->gl_flags); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); +out_rindex_relse: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_qs: + gfs2_quota_unhold(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_change_nlink - Change nlink count on inode + * @ip: The GFS2 inode + * @diff: The change in the nlink count required + * + * Returns: errno + */ + +int gfs2_change_nlink(struct gfs2_inode *ip, int diff) +{ + struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info; + struct buffer_head *dibh; + u32 nlink; + int error; + + BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink); + nlink = ip->i_di.di_nlink + diff; + + /* If we are reducing the nlink count, but the new value ends up being + bigger than the old one, we must have underflowed. */ + if (diff < 0 && nlink > ip->i_di.di_nlink) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + ip->i_di.di_nlink = nlink; + ip->i_di.di_ctime = get_seconds(); + ip->i_inode.i_nlink = nlink; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + mark_inode_dirty(&ip->i_inode); + + if (ip->i_di.di_nlink == 0) { + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh, rg_gh; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + error = -EIO; + rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr); + if (!rgd) + goto out_norgrp; + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + if (error) + goto out_norgrp; + + clear_nlink(&ip->i_inode); + gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */ + gfs2_glock_dq_uninit(&rg_gh); +out_norgrp: + gfs2_glock_dq_uninit(&ri_gh); + } +out: + return error; +} + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +{ + struct qstr qstr; + gfs2_str2qstr(&qstr, name); + return gfs2_lookupi(dip, &qstr, 1, NULL); +} + + +/** + * gfs2_lookupi - Look up a filename in a directory and return its inode + * @d_gh: An initialized holder for the directory glock + * @name: The name of the inode to look for + * @is_root: If 1, ignore the caller's permissions + * @i_gh: An uninitialized holder for the new inode glock + * + * There will always be a vnode (Linux VFS inode) for the d_gh inode unless + * @is_root is true. + * + * Returns: errno + */ + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root, struct nameidata *nd) +{ + struct super_block *sb = dir->i_sb; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + struct gfs2_inum inum; + unsigned int type; + int error = 0; + struct inode *inode = NULL; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || + (name->len == 2 && memcmp(name->name, "..", 2) == 0 && + dir == sb->s_root->d_inode)) { + igrab(dir); + return dir; + } + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + + if (!is_root) { + error = permission(dir, MAY_EXEC, NULL); + if (error) + goto out; + } + + error = gfs2_dir_search(dir, name, &inum, &type); + if (error) + goto out; + + inode = gfs2_inode_lookup(sb, &inum, type); + +out: + gfs2_glock_dq_uninit(&d_gh); + if (error == -ENOENT) + return NULL; + return inode; +} + +static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); + struct buffer_head *bh; + struct gfs2_inum_range ir; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + mutex_lock(&sdp->sd_inum_mutex); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) { + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); + return error; + } + + gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); + + if (ir.ir_length) { + *formal_ino = ir.ir_start++; + ir.ir_length--; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_inum_range_out(&ir, + bh->b_data + sizeof(struct gfs2_dinode)); + brelse(bh); + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); + return 0; + } + + brelse(bh); + + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); + + return 1; +} + +static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode); + struct gfs2_holder gh; + struct buffer_head *bh; + struct gfs2_inum_range ir; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + return error; + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out; + mutex_lock(&sdp->sd_inum_mutex); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_end_trans; + + gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); + + if (!ir.ir_length) { + struct buffer_head *m_bh; + u64 x, y; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out_brelse; + + x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)); + x = y = be64_to_cpu(x); + ir.ir_start = x; + ir.ir_length = GFS2_INUM_QUANTUM; + x += GFS2_INUM_QUANTUM; + if (x < y) + gfs2_consist_inode(m_ip); + x = cpu_to_be64(x); + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x; + + brelse(m_bh); + } + + *formal_ino = ir.ir_start++; + ir.ir_length--; + + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode)); + +out_brelse: + brelse(bh); +out_end_trans: + mutex_unlock(&sdp->sd_inum_mutex); + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum) +{ + int error; + + error = pick_formal_ino_1(sdp, inum); + if (error <= 0) + return error; + + error = pick_formal_ino_2(sdp, inum); + + return error; +} + +/** + * create_ok - OK to create a new on-disk inode here? + * @dip: Directory in which dinode is to be created + * @name: Name of new dinode + * @mode: + * + * Returns: errno + */ + +static int create_ok(struct gfs2_inode *dip, const struct qstr *name, + unsigned int mode) +{ + int error; + + error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + if (error) + return error; + + /* Don't create entries in an unlinked directory */ + if (!dip->i_di.di_nlink) + return -EPERM; + + error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + return -EEXIST; + default: + return error; + } + + if (dip->i_di.di_entries == (u32)-1) + return -EFBIG; + if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1) + return -EMLINK; + + return 0; +} + +static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode, + unsigned int *uid, unsigned int *gid) +{ + if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && + (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) { + if (S_ISDIR(*mode)) + *mode |= S_ISUID; + else if (dip->i_di.di_uid != current->fsuid) + *mode &= ~07111; + *uid = dip->i_di.di_uid; + } else + *uid = current->fsuid; + + if (dip->i_di.di_mode & S_ISGID) { + if (S_ISDIR(*mode)) + *mode |= S_ISGID; + *gid = dip->i_di.di_gid; + } else + *gid = current->fsgid; +} + +static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum, + u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + int error; + + gfs2_alloc_get(dip); + + dip->i_alloc.al_requested = RES_DINODE; + error = gfs2_inplace_reserve(dip); + if (error) + goto out; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0); + if (error) + goto out_ipreserv; + + inum->no_addr = gfs2_alloc_di(dip, generation); + + gfs2_trans_end(sdp); + +out_ipreserv: + gfs2_inplace_release(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +/** + * init_dinode - Fill in a new dinode structure + * @dip: the directory this inode is being created in + * @gl: The glock covering the new inode + * @inum: the inode number + * @mode: the file permissions + * @uid: + * @gid: + * + */ + +static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + const struct gfs2_inum *inum, unsigned int mode, + unsigned int uid, unsigned int gid, + const u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_dinode *di; + struct buffer_head *dibh; + + dibh = gfs2_meta_new(gl, inum->no_addr); + gfs2_trans_add_bh(gl, dibh, 1); + gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + di = (struct gfs2_dinode *)dibh->b_data; + + di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); + di->di_num.no_addr = cpu_to_be64(inum->no_addr); + di->di_mode = cpu_to_be32(mode); + di->di_uid = cpu_to_be32(uid); + di->di_gid = cpu_to_be32(gid); + di->di_nlink = cpu_to_be32(0); + di->di_size = cpu_to_be64(0); + di->di_blocks = cpu_to_be64(1); + di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds()); + di->di_major = di->di_minor = cpu_to_be32(0); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); + di->di_generation = cpu_to_be64(*generation); + di->di_flags = cpu_to_be32(0); + + if (S_ISREG(mode)) { + if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) || + gfs2_tune_get(sdp, gt_new_files_directio)) + di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO); + } else if (S_ISDIR(mode)) { + di->di_flags |= cpu_to_be32(dip->i_di.di_flags & + GFS2_DIF_INHERIT_DIRECTIO); + di->di_flags |= cpu_to_be32(dip->i_di.di_flags & + GFS2_DIF_INHERIT_JDATA); + } + + di->__pad1 = 0; + di->di_payload_format = cpu_to_be32(0); + di->di_height = cpu_to_be32(0); + di->__pad2 = 0; + di->__pad3 = 0; + di->di_depth = cpu_to_be16(0); + di->di_entries = cpu_to_be32(0); + memset(&di->__pad4, 0, sizeof(di->__pad4)); + di->di_eattr = cpu_to_be64(0); + memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + + brelse(dibh); +} + +static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, + unsigned int mode, const struct gfs2_inum *inum, + const u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + unsigned int uid, gid; + int error; + + munge_mode_uid_gid(dip, &mode, &uid, &gid); + gfs2_alloc_get(dip); + + error = gfs2_quota_lock(dip, uid, gid); + if (error) + goto out; + + error = gfs2_quota_check(dip, uid, gid); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0); + if (error) + goto out_quota; + + init_dinode(dip, gl, inum, mode, uid, gid, generation); + gfs2_quota_change(dip, +1, uid, gid); + gfs2_trans_end(sdp); + +out_quota: + gfs2_quota_unlock(dip); +out: + gfs2_alloc_put(dip); + return error; +} + +static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al; + int alloc_required; + struct buffer_head *dibh; + int error; + + al = gfs2_alloc_get(dip); + + error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto fail; + + error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name); + if (alloc_required < 0) + goto fail; + if (alloc_required) { + error = gfs2_quota_check(dip, dip->i_di.di_uid, + dip->i_di.di_gid); + if (error) + goto fail_quota_locks; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto fail_quota_locks; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + al->al_rgd->rd_ri.ri_length + + 2 * RES_DINODE + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto fail_ipreserv; + } else { + error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0); + if (error) + goto fail_quota_locks; + } + + error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode)); + if (error) + goto fail_end_trans; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_end_trans; + ip->i_di.di_nlink = 1; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + return 0; + +fail_end_trans: + gfs2_trans_end(sdp); + +fail_ipreserv: + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + +fail_quota_locks: + gfs2_quota_unlock(dip); + +fail: + gfs2_alloc_put(dip); + return error; +} + +static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) +{ + int err; + size_t len; + void *value; + char *name; + struct gfs2_ea_request er; + + err = security_inode_init_security(&ip->i_inode, &dip->i_inode, + &name, &value, &len); + + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + + er.er_type = GFS2_EATYPE_SECURITY; + er.er_name = name; + er.er_data = value; + er.er_name_len = strlen(name); + er.er_data_len = len; + + err = gfs2_ea_set_i(ip, &er); + + kfree(value); + kfree(name); + + return err; +} + +/** + * gfs2_createi - Create a new inode + * @ghs: An array of two holders + * @name: The name of the new file + * @mode: the permissions on the new inode + * + * @ghs[0] is an initialized holder for the directory + * @ghs[1] is the holder for the inode lock + * + * If the return value is not NULL, the glocks on both the directory and the new + * file are held. A transaction has been started and an inplace reservation + * is held, as well. + * + * Returns: An inode + */ + +struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, + unsigned int mode) +{ + struct inode *inode; + struct gfs2_inode *dip = ghs->gh_gl->gl_object; + struct inode *dir = &dip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_inum inum; + int error; + u64 generation; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs); + error = gfs2_glock_nq(ghs); + if (error) + goto fail; + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock; + + error = pick_formal_ino(sdp, &inum.no_formal_ino); + if (error) + goto fail_gunlock; + + error = alloc_dinode(dip, &inum, &generation); + if (error) + goto fail_gunlock; + + if (inum.no_addr < dip->i_num.no_addr) { + gfs2_glock_dq(ghs); + + error = gfs2_glock_nq_num(sdp, inum.no_addr, + &gfs2_inode_glops, LM_ST_EXCLUSIVE, + GL_SKIP, ghs + 1); + if (error) { + return ERR_PTR(error); + } + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs); + error = gfs2_glock_nq(ghs); + if (error) { + gfs2_glock_dq_uninit(ghs + 1); + return ERR_PTR(error); + } + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock2; + } else { + error = gfs2_glock_nq_num(sdp, inum.no_addr, + &gfs2_inode_glops, LM_ST_EXCLUSIVE, + GL_SKIP, ghs + 1); + if (error) + goto fail_gunlock; + } + + error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation); + if (error) + goto fail_gunlock2; + + inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode)); + if (IS_ERR(inode)) + goto fail_gunlock2; + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_iput; + + error = gfs2_acl_create(dip, GFS2_I(inode)); + if (error) + goto fail_iput; + + error = gfs2_security_init(dip, GFS2_I(inode)); + if (error) + goto fail_iput; + + error = link_dinode(dip, name, GFS2_I(inode)); + if (error) + goto fail_iput; + + if (!inode) + return ERR_PTR(-ENOMEM); + return inode; + +fail_iput: + iput(inode); +fail_gunlock2: + gfs2_glock_dq_uninit(ghs + 1); +fail_gunlock: + gfs2_glock_dq(ghs); +fail: + return ERR_PTR(error); +} + +/** + * gfs2_rmdiri - Remove a directory + * @dip: The parent directory of the directory to be removed + * @name: The name of the directory to be removed + * @ip: The GFS2 inode of the directory to be removed + * + * Assumes Glocks on dip and ip are held + * + * Returns: errno + */ + +int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct qstr dotname; + int error; + + if (ip->i_di.di_entries != 2) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + return -EIO; + } + + error = gfs2_dir_del(dip, name); + if (error) + return error; + + error = gfs2_change_nlink(dip, -1); + if (error) + return error; + + gfs2_str2qstr(&dotname, "."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + gfs2_str2qstr(&dotname, ".."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + error = gfs2_change_nlink(ip, -2); + if (error) + return error; + + return error; +} + +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. + * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct gfs2_inum inum; + unsigned int type; + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_di.di_mode & S_ISVTX) && + dip->i_di.di_uid != current->fsuid && + ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL); + if (error) + return error; + + error = gfs2_dir_search(&dip->i_inode, name, &inum, &type); + if (error) + return error; + + if (!gfs2_inum_equal(&inum, &ip->i_num)) + return -ENOENT; + + if (IF2DT(ip->i_di.di_mode) != type) { + gfs2_consist_inode(dip); + return -EIO; + } + + return 0; +} + +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. + * + * Returns: errno + */ + +int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + struct qstr dotdot; + int error = 0; + + gfs2_str2qstr(&dotdot, ".."); + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == sb->s_root->d_inode) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &dotdot, 1, NULL); + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + +/** + * gfs2_readlinki - return the contents of a symlink + * @ip: the symlink's inode + * @buf: a pointer to the buffer to be filled + * @len: a pointer to the length of @buf + * + * If @buf is too small, a piece of memory is kmalloc()ed and needs + * to be freed by the caller. + * + * Returns: errno + */ + +int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) +{ + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int x; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); + error = gfs2_glock_nq_atime(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return error; + } + + if (!ip->i_di.di_size) { + gfs2_consist_inode(ip); + error = -EIO; + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + x = ip->i_di.di_size + 1; + if (x > *len) { + *buf = kmalloc(x, GFP_KERNEL); + if (!*buf) { + error = -ENOMEM; + goto out_brelse; + } + } + + memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); + *len = x; + +out_brelse: + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + return error; +} + +/** + * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and + * conditionally update the inode's atime + * @gh: the holder to acquire + * + * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap + * Update if the difference between the current time and the inode's current + * atime is greater than an interval specified at mount. + * + * Returns: errno + */ + +int gfs2_glock_nq_atime(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = gl->gl_object; + s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum); + unsigned int state; + int flags; + int error; + + if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) || + gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) || + gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops)) + return -EINVAL; + + state = gh->gh_state; + flags = gh->gh_flags; + + error = gfs2_glock_nq(gh); + if (error) + return error; + + if (test_bit(SDF_NOATIME, &sdp->sd_flags) || + (sdp->sd_vfs->s_flags & MS_RDONLY)) + return 0; + + curtime = get_seconds(); + if (curtime - ip->i_di.di_atime >= quantum) { + gfs2_glock_dq(gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, + gh); + error = gfs2_glock_nq(gh); + if (error) + return error; + + /* Verify that atime hasn't been updated while we were + trying to get exclusive lock. */ + + curtime = get_seconds(); + if (curtime - ip->i_di.di_atime >= quantum) { + struct buffer_head *dibh; + struct gfs2_dinode *di; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error == -EROFS) + return 0; + if (error) + goto fail; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_end_trans; + + ip->i_di.di_atime = curtime; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di = (struct gfs2_dinode *)dibh->b_data; + di->di_atime = cpu_to_be64(ip->i_di.di_atime); + brelse(dibh); + + gfs2_trans_end(sdp); + } + + /* If someone else has asked for the glock, + unlock and let them have it. Then reacquire + in the original state. */ + if (gfs2_glock_is_blocking(gl)) { + gfs2_glock_dq(gh); + gfs2_holder_reinit(state, flags, gh); + return gfs2_glock_nq(gh); + } + } + + return 0; + +fail_end_trans: + gfs2_trans_end(sdp); +fail: + gfs2_glock_dq(gh); + return error; +} + +/** + * glock_compare_atime - Compare two struct gfs2_glock structures for sort + * @arg_a: the first structure + * @arg_b: the second structure + * + * Returns: 1 if A > B + * -1 if A < B + * 0 if A == B + */ + +static int glock_compare_atime(const void *arg_a, const void *arg_b) +{ + const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; + const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; + const struct lm_lockname *a = &gh_a->gh_gl->gl_name; + const struct lm_lockname *b = &gh_b->gh_gl->gl_name; + + if (a->ln_number > b->ln_number) + return 1; + if (a->ln_number < b->ln_number) + return -1; + if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE) + return 1; + if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME)) + return 1; + + return 0; +} + +/** + * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an + * atime update + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_holder **p; + unsigned int x; + int error = 0; + + if (!num_gh) + return 0; + + if (num_gh == 1) { + ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + if (ghs->gh_flags & GL_ATIME) + error = gfs2_glock_nq_atime(ghs); + else + error = gfs2_glock_nq(ghs); + return error; + } + + p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL); + if (!p) + return -ENOMEM; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL); + + for (x = 0; x < num_gh; x++) { + p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC); + + if (p[x]->gh_flags & GL_ATIME) + error = gfs2_glock_nq_atime(p[x]); + else + error = gfs2_glock_nq(p[x]); + + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + kfree(p); + return error; +} + + +static int +__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + error = inode_setattr(&ip->i_inode, attr); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + gfs2_inode_attr_out(ip); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + return error; +} + +/** + * gfs2_setattr_simple - + * @ip: + * @attr: + * + * Called with a reference on the vnode. + * + * Returns: errno + */ + +int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +{ + int error; + + if (current->journal_info) + return __gfs2_setattr_simple(ip, attr); + + error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0); + if (error) + return error; + + error = __gfs2_setattr_simple(ip, attr); + gfs2_trans_end(GFS2_SB(&ip->i_inode)); + return error; +} + diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h new file mode 100644 index 000000000000..f5d861760579 --- /dev/null +++ b/fs/gfs2/inode.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __INODE_DOT_H__ +#define __INODE_DOT_H__ + +static inline int gfs2_is_stuffed(struct gfs2_inode *ip) +{ + return !ip->i_di.di_height; +} + +static inline int gfs2_is_jdata(struct gfs2_inode *ip) +{ + return ip->i_di.di_flags & GFS2_DIF_JDATA; +} + +static inline int gfs2_is_dir(struct gfs2_inode *ip) +{ + return S_ISDIR(ip->i_di.di_mode); +} + +void gfs2_inode_attr_in(struct gfs2_inode *ip); +void gfs2_inode_attr_out(struct gfs2_inode *ip); +struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type); +struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum); + +int gfs2_inode_refresh(struct gfs2_inode *ip); + +int gfs2_dinode_dealloc(struct gfs2_inode *inode); +int gfs2_change_nlink(struct gfs2_inode *ip, int diff); +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root, struct nameidata *nd); +struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, + unsigned int mode); +int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip); +int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip); +int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); +int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); + +int gfs2_glock_nq_atime(struct gfs2_holder *gh); +int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs); + +int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); + +#endif /* __INODE_DOT_H__ */ + diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c new file mode 100644 index 000000000000..effe4a337c1d --- /dev/null +++ b/fs/gfs2/lm.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/delay.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "lm.h" +#include "super.h" +#include "util.h" + +/** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @args: mount arguements + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ + +int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + char *proto = sdp->sd_proto_name; + char *table = sdp->sd_table_name; + int flags = 0; + int error; + + if (sdp->sd_args.ar_spectator) + flags |= LM_MFLAG_SPECTATOR; + + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); + + error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata, + gfs2_glock_cb, sdp, + GFS2_MIN_LVB_SIZE, flags, + &sdp->sd_lockstruct, &sdp->sd_kobj); + if (error) { + fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n", + proto, table, sdp->sd_args.ar_hostdata); + goto out; + } + + if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) || + gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) || + gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >= + GFS2_MIN_LVB_SIZE)) { + gfs2_unmount_lockproto(&sdp->sd_lockstruct); + goto out; + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table); + else + snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table, + sdp->sd_lockstruct.ls_jid); + + fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + + if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) && + !sdp->sd_args.ar_ignore_local_fs) { + sdp->sd_args.ar_localflocks = 1; + sdp->sd_args.ar_localcaching = 1; + } + +out: + return error; +} + +void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_others_may_mount( + sdp->sd_lockstruct.ls_lockspace); +} + +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + gfs2_unmount_lockproto(&sdp->sd_lockstruct); +} + +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) +{ + va_list args; + + if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return 0; + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + fs_err(sdp, "about to withdraw from the cluster\n"); + BUG_ON(sdp->sd_args.ar_debug); + + + fs_err(sdp, "waiting for outstanding I/O\n"); + + /* FIXME: suspend dm device so oustanding bio's complete + and all further io requests fail */ + + fs_err(sdp, "telling LM to withdraw\n"); + gfs2_withdraw_lockproto(&sdp->sd_lockstruct); + fs_err(sdp, "withdrawn\n"); + dump_stack(); + + return -1; +} + +int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, + void **lockp) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_get_lock( + sdp->sd_lockstruct.ls_lockspace, name, lockp); + return error; +} + +void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_put_lock(lock); +} + +unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state, unsigned int req_state, + unsigned int flags) +{ + int ret = 0; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state, + req_state, flags); + return ret; +} + +unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state) +{ + int ret = 0; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state); + return ret; +} + +void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_cancel(lock); +} + +int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp); + return error; +} + +void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb); +} + +int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_plock_get( + sdp->sd_lockstruct.ls_lockspace, name, file, fl); + return error; +} + +int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_plock( + sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl); + return error; +} + +int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error = -EIO; + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = sdp->sd_lockstruct.ls_ops->lm_punlock( + sdp->sd_lockstruct.ls_lockspace, name, file, fl); + return error; +} + +void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message) +{ + if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + sdp->sd_lockstruct.ls_ops->lm_recovery_done( + sdp->sd_lockstruct.ls_lockspace, jid, message); +} + diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h new file mode 100644 index 000000000000..21cdc30ee08c --- /dev/null +++ b/fs/gfs2/lm.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LM_DOT_H__ +#define __LM_DOT_H__ + +struct gfs2_sbd; + +#define GFS2_MIN_LVB_SIZE 32 + +int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent); +void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp); +void gfs2_lm_unmount(struct gfs2_sbd *sdp); +int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) + __attribute__ ((format(printf, 2, 3))); +int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name, + void **lockp); +void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock); +unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state, unsigned int req_state, + unsigned int flags); +unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock, + unsigned int cur_state); +void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock); +int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp); +void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb); +int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl); +int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl); +int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name, + struct file *file, struct file_lock *fl); +void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int message); + +#endif /* __LM_DOT_H__ */ diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c new file mode 100644 index 000000000000..663fee728783 --- /dev/null +++ b/fs/gfs2/locking.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/kmod.h> +#include <linux/fs.h> +#include <linux/delay.h> +#include <linux/lm_interface.h> + +struct lmh_wrapper { + struct list_head lw_list; + const struct lm_lockops *lw_ops; +}; + +/* List of registered low-level locking protocols. A file system selects one + of them by name at mount time, e.g. lock_nolock, lock_dlm. */ + +static LIST_HEAD(lmh_list); +static DEFINE_MUTEX(lmh_lock); + +/** + * gfs2_register_lockproto - Register a low-level locking protocol + * @proto: the protocol definition + * + * Returns: 0 on success, -EXXX on failure + */ + +int gfs2_register_lockproto(const struct lm_lockops *proto) +{ + struct lmh_wrapper *lw; + + mutex_lock(&lmh_lock); + + list_for_each_entry(lw, &lmh_list, lw_list) { + if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { + mutex_unlock(&lmh_lock); + printk(KERN_INFO "GFS2: protocol %s already exists\n", + proto->lm_proto_name); + return -EEXIST; + } + } + + lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL); + if (!lw) { + mutex_unlock(&lmh_lock); + return -ENOMEM; + } + + lw->lw_ops = proto; + list_add(&lw->lw_list, &lmh_list); + + mutex_unlock(&lmh_lock); + + return 0; +} + +/** + * gfs2_unregister_lockproto - Unregister a low-level locking protocol + * @proto: the protocol definition + * + */ + +void gfs2_unregister_lockproto(const struct lm_lockops *proto) +{ + struct lmh_wrapper *lw; + + mutex_lock(&lmh_lock); + + list_for_each_entry(lw, &lmh_list, lw_list) { + if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) { + list_del(&lw->lw_list); + mutex_unlock(&lmh_lock); + kfree(lw); + return; + } + } + + mutex_unlock(&lmh_lock); + + printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n", + proto->lm_proto_name); +} + +/** + * gfs2_mount_lockproto - Mount a lock protocol + * @proto_name - the name of the protocol + * @table_name - the name of the lock space + * @host_data - data specific to this host + * @cb - the callback to the code using the lock module + * @sdp - The GFS2 superblock + * @min_lvb_size - the mininum LVB size that the caller can deal with + * @flags - LM_MFLAG_* + * @lockstruct - a structure returned describing the mount + * + * Returns: 0 on success, -EXXX on failure + */ + +int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data, + lm_callback_t cb, void *cb_data, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + struct lmh_wrapper *lw = NULL; + int try = 0; + int error, found; + +retry: + mutex_lock(&lmh_lock); + + found = 0; + list_for_each_entry(lw, &lmh_list, lw_list) { + if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) { + found = 1; + break; + } + } + + if (!found) { + if (!try && capable(CAP_SYS_MODULE)) { + try = 1; + mutex_unlock(&lmh_lock); + request_module(proto_name); + goto retry; + } + printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name); + error = -ENOENT; + goto out; + } + + if (!try_module_get(lw->lw_ops->lm_owner)) { + try = 0; + mutex_unlock(&lmh_lock); + msleep(1000); + goto retry; + } + + error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data, + min_lvb_size, flags, lockstruct, fskobj); + if (error) + module_put(lw->lw_ops->lm_owner); +out: + mutex_unlock(&lmh_lock); + return error; +} + +void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct) +{ + mutex_lock(&lmh_lock); + lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace); + if (lockstruct->ls_ops->lm_owner) + module_put(lockstruct->ls_ops->lm_owner); + mutex_unlock(&lmh_lock); +} + +/** + * gfs2_withdraw_lockproto - abnormally unmount a lock module + * @lockstruct: the lockstruct passed into mount + * + */ + +void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct) +{ + mutex_lock(&lmh_lock); + lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace); + if (lockstruct->ls_ops->lm_owner) + module_put(lockstruct->ls_ops->lm_owner); + mutex_unlock(&lmh_lock); +} + +EXPORT_SYMBOL_GPL(gfs2_register_lockproto); +EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto); + diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile new file mode 100644 index 000000000000..89b93b6b45cf --- /dev/null +++ b/fs/gfs2/locking/dlm/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o +lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o + diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c new file mode 100644 index 000000000000..b167addf9fd1 --- /dev/null +++ b/fs/gfs2/locking/dlm/lock.c @@ -0,0 +1,524 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include "lock_dlm.h" + +static char junk_lvb[GDLM_LVB_SIZE]; + +static void queue_complete(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + clear_bit(LFL_ACTIVE, &lp->flags); + + spin_lock(&ls->async_lock); + list_add_tail(&lp->clist, &ls->complete); + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +static inline void gdlm_ast(void *astarg) +{ + queue_complete(astarg); +} + +static inline void gdlm_bast(void *astarg, int mode) +{ + struct gdlm_lock *lp = astarg; + struct gdlm_ls *ls = lp->ls; + + if (!mode) { + printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + return; + } + + spin_lock(&ls->async_lock); + if (!lp->bast_mode) { + list_add_tail(&lp->blist, &ls->blocking); + lp->bast_mode = mode; + } else if (lp->bast_mode < mode) + lp->bast_mode = mode; + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +void gdlm_queue_delayed(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + spin_lock(&ls->async_lock); + list_add_tail(&lp->delay_list, &ls->delayed); + spin_unlock(&ls->async_lock); +} + +/* convert gfs lock-state to dlm lock-mode */ + +static s16 make_mode(s16 lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + gdlm_assert(0, "unknown LM state %d", lmstate); + return -1; +} + +/* convert dlm lock-mode to gfs lock-state */ + +s16 gdlm_make_lmstate(s16 dlmmode) +{ + switch (dlmmode) { + case DLM_LOCK_IV: + case DLM_LOCK_NL: + return LM_ST_UNLOCKED; + case DLM_LOCK_EX: + return LM_ST_EXCLUSIVE; + case DLM_LOCK_CW: + return LM_ST_DEFERRED; + case DLM_LOCK_PR: + return LM_ST_SHARED; + } + gdlm_assert(0, "unknown DLM mode %d", dlmmode); + return -1; +} + +/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and + DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */ + +static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state) +{ + s16 cur = make_mode(cur_state); + if (lp->cur != DLM_LOCK_IV) + gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur); +} + +static inline unsigned int make_flags(struct gdlm_lock *lp, + unsigned int gfs_flags, + s16 cur, s16 req) +{ + unsigned int lkf = 0; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if (gfs_flags & LM_FLAG_PRIORITY) { + lkf |= DLM_LKF_NOORDER; + lkf |= DLM_LKF_HEADQUE; + } + + if (gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + } + + if (lp->lksb.sb_lkid != 0) { + lkf |= DLM_LKF_CONVERT; + + /* Conversion deadlock avoidance by DLM */ + + if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) && + !(lkf & DLM_LKF_NOQUEUE) && + cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) + lkf |= DLM_LKF_CONVDEADLK; + } + + if (lp->lvb) + lkf |= DLM_LKF_VALBLK; + + return lkf; +} + +/* make_strname - convert GFS lock numbers to a string */ + +static inline void make_strname(struct lm_lockname *lockname, + struct gdlm_strname *str) +{ + sprintf(str->name, "%8x%16llx", lockname->ln_type, + (unsigned long long)lockname->ln_number); + str->namelen = GDLM_STRNAME_BYTES; +} + +static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name, + struct gdlm_lock **lpp) +{ + struct gdlm_lock *lp; + + lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL); + if (!lp) + return -ENOMEM; + + lp->lockname = *name; + lp->ls = ls; + lp->cur = DLM_LOCK_IV; + lp->lvb = NULL; + lp->hold_null = NULL; + init_completion(&lp->ast_wait); + INIT_LIST_HEAD(&lp->clist); + INIT_LIST_HEAD(&lp->blist); + INIT_LIST_HEAD(&lp->delay_list); + + spin_lock(&ls->async_lock); + list_add(&lp->all_list, &ls->all_locks); + ls->all_locks_count++; + spin_unlock(&ls->async_lock); + + *lpp = lp; + return 0; +} + +void gdlm_delete_lp(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + spin_lock(&ls->async_lock); + if (!list_empty(&lp->clist)) + list_del_init(&lp->clist); + if (!list_empty(&lp->blist)) + list_del_init(&lp->blist); + if (!list_empty(&lp->delay_list)) + list_del_init(&lp->delay_list); + gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + list_del_init(&lp->all_list); + ls->all_locks_count--; + spin_unlock(&ls->async_lock); + + kfree(lp); +} + +int gdlm_get_lock(void *lockspace, struct lm_lockname *name, + void **lockp) +{ + struct gdlm_lock *lp; + int error; + + error = gdlm_create_lp(lockspace, name, &lp); + + *lockp = lp; + return error; +} + +void gdlm_put_lock(void *lock) +{ + gdlm_delete_lp(lock); +} + +unsigned int gdlm_do_lock(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + struct gdlm_strname str; + int error, bast = 1; + + /* + * When recovery is in progress, delay lock requests for submission + * once recovery is done. Requests for recovery (NOEXP) and unlocks + * can pass. + */ + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) { + gdlm_queue_delayed(lp); + return LM_OUT_ASYNC; + } + + /* + * Submit the actual lock request. + */ + + if (test_bit(LFL_NOBAST, &lp->flags)) + bast = 0; + + make_strname(&lp->lockname, &str); + + set_bit(LFL_ACTIVE, &lp->flags); + + log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid, + lp->cur, lp->req, lp->lkf); + + error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf, + str.name, str.namelen, 0, gdlm_ast, lp, + bast ? gdlm_bast : NULL); + + if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) { + lp->lksb.sb_status = -EAGAIN; + queue_complete(lp); + error = 0; + } + + if (error) { + log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x " + "flags=%lx", ls->fsname, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, error, + lp->cur, lp->req, lp->lkf, lp->flags); + return LM_OUT_ERROR; + } + return LM_OUT_ASYNC; +} + +static unsigned int gdlm_do_unlock(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + unsigned int lkf = 0; + int error; + + set_bit(LFL_DLM_UNLOCK, &lp->flags); + set_bit(LFL_ACTIVE, &lp->flags); + + if (lp->lvb) + lkf = DLM_LKF_VALBLK; + + log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->lksb.sb_lkid, lp->cur, lkf); + + error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp); + + if (error) { + log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x " + "flags=%lx", ls->fsname, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, error, + lp->cur, lp->req, lp->lkf, lp->flags); + return LM_OUT_ERROR; + } + return LM_OUT_ASYNC; +} + +unsigned int gdlm_lock(void *lock, unsigned int cur_state, + unsigned int req_state, unsigned int flags) +{ + struct gdlm_lock *lp = lock; + + clear_bit(LFL_DLM_CANCEL, &lp->flags); + if (flags & LM_FLAG_NOEXP) + set_bit(LFL_NOBLOCK, &lp->flags); + + check_cur_state(lp, cur_state); + lp->req = make_mode(req_state); + lp->lkf = make_flags(lp, flags, lp->cur, lp->req); + + return gdlm_do_lock(lp); +} + +unsigned int gdlm_unlock(void *lock, unsigned int cur_state) +{ + struct gdlm_lock *lp = lock; + + clear_bit(LFL_DLM_CANCEL, &lp->flags); + if (lp->cur == DLM_LOCK_IV) + return 0; + return gdlm_do_unlock(lp); +} + +void gdlm_cancel(void *lock) +{ + struct gdlm_lock *lp = lock; + struct gdlm_ls *ls = lp->ls; + int error, delay_list = 0; + + if (test_bit(LFL_DLM_CANCEL, &lp->flags)) + return; + + log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->flags); + + spin_lock(&ls->async_lock); + if (!list_empty(&lp->delay_list)) { + list_del_init(&lp->delay_list); + delay_list = 1; + } + spin_unlock(&ls->async_lock); + + if (delay_list) { + set_bit(LFL_CANCEL, &lp->flags); + set_bit(LFL_ACTIVE, &lp->flags); + queue_complete(lp); + return; + } + + if (!test_bit(LFL_ACTIVE, &lp->flags) || + test_bit(LFL_DLM_UNLOCK, &lp->flags)) { + log_info("gdlm_cancel skip %x,%llx flags %lx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->flags); + return; + } + + /* the lock is blocked in the dlm */ + + set_bit(LFL_DLM_CANCEL, &lp->flags); + set_bit(LFL_ACTIVE, &lp->flags); + + error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL, + NULL, lp); + + log_info("gdlm_cancel rv %d %x,%llx flags %lx", error, + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, lp->flags); + + if (error == -EBUSY) + clear_bit(LFL_DLM_CANCEL, &lp->flags); +} + +static int gdlm_add_lvb(struct gdlm_lock *lp) +{ + char *lvb; + + lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL); + if (!lvb) + return -ENOMEM; + + lp->lksb.sb_lvbptr = lvb; + lp->lvb = lvb; + return 0; +} + +static void gdlm_del_lvb(struct gdlm_lock *lp) +{ + kfree(lp->lvb); + lp->lvb = NULL; + lp->lksb.sb_lvbptr = NULL; +} + +/* This can do a synchronous dlm request (requiring a lock_dlm thread to get + the completion) because gfs won't call hold_lvb() during a callback (from + the context of a lock_dlm thread). */ + +static int hold_null_lock(struct gdlm_lock *lp) +{ + struct gdlm_lock *lpn = NULL; + int error; + + if (lp->hold_null) { + printk(KERN_INFO "lock_dlm: lvb already held\n"); + return 0; + } + + error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn); + if (error) + goto out; + + lpn->lksb.sb_lvbptr = junk_lvb; + lpn->lvb = junk_lvb; + + lpn->req = DLM_LOCK_NL; + lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE; + set_bit(LFL_NOBAST, &lpn->flags); + set_bit(LFL_INLOCK, &lpn->flags); + + init_completion(&lpn->ast_wait); + gdlm_do_lock(lpn); + wait_for_completion(&lpn->ast_wait); + error = lpn->lksb.sb_status; + if (error) { + printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n", + error); + gdlm_delete_lp(lpn); + lpn = NULL; + } +out: + lp->hold_null = lpn; + return error; +} + +/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get + the completion) because gfs may call unhold_lvb() during a callback (from + the context of a lock_dlm thread) which could cause a deadlock since the + other lock_dlm thread could be engaged in recovery. */ + +static void unhold_null_lock(struct gdlm_lock *lp) +{ + struct gdlm_lock *lpn = lp->hold_null; + + gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + lpn->lksb.sb_lvbptr = NULL; + lpn->lvb = NULL; + set_bit(LFL_UNLOCK_DELETE, &lpn->flags); + gdlm_do_unlock(lpn); + lp->hold_null = NULL; +} + +/* Acquire a NL lock because gfs requires the value block to remain + intact on the resource while the lvb is "held" even if it's holding no locks + on the resource. */ + +int gdlm_hold_lvb(void *lock, char **lvbp) +{ + struct gdlm_lock *lp = lock; + int error; + + error = gdlm_add_lvb(lp); + if (error) + return error; + + *lvbp = lp->lvb; + + error = hold_null_lock(lp); + if (error) + gdlm_del_lvb(lp); + + return error; +} + +void gdlm_unhold_lvb(void *lock, char *lvb) +{ + struct gdlm_lock *lp = lock; + + unhold_null_lock(lp); + gdlm_del_lvb(lp); +} + +void gdlm_submit_delayed(struct gdlm_ls *ls) +{ + struct gdlm_lock *lp, *safe; + + spin_lock(&ls->async_lock); + list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) { + list_del_init(&lp->delay_list); + list_add_tail(&lp->delay_list, &ls->submit); + } + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +int gdlm_release_all_locks(struct gdlm_ls *ls) +{ + struct gdlm_lock *lp, *safe; + int count = 0; + + spin_lock(&ls->async_lock); + list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) { + list_del_init(&lp->all_list); + + if (lp->lvb && lp->lvb != junk_lvb) + kfree(lp->lvb); + kfree(lp); + count++; + } + spin_unlock(&ls->async_lock); + + return count; +} + diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h new file mode 100644 index 000000000000..33af707a4d3f --- /dev/null +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -0,0 +1,187 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef LOCK_DLM_DOT_H +#define LOCK_DLM_DOT_H + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/list.h> +#include <linux/socket.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/kobject.h> +#include <linux/fcntl.h> +#include <linux/wait.h> +#include <net/sock.h> + +#include <linux/dlm.h> +#include <linux/lm_interface.h> + +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. Externally, GFS refers to this module + * as "lock_dlm". + */ + +#define GDLM_STRNAME_BYTES 24 +#define GDLM_LVB_SIZE 32 +#define GDLM_DROP_COUNT 50000 +#define GDLM_DROP_PERIOD 60 +#define GDLM_NAME_LEN 128 + +/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number). + We sprintf these numbers into a 24 byte string of hex values to make them + human-readable (to make debugging simpler.) */ + +struct gdlm_strname { + unsigned char name[GDLM_STRNAME_BYTES]; + unsigned short namelen; +}; + +enum { + DFL_BLOCK_LOCKS = 0, + DFL_SPECTATOR = 1, + DFL_WITHDRAW = 2, +}; + +struct gdlm_ls { + u32 id; + int jid; + int first; + int first_done; + unsigned long flags; + struct kobject kobj; + char clustername[GDLM_NAME_LEN]; + char fsname[GDLM_NAME_LEN]; + int fsflags; + dlm_lockspace_t *dlm_lockspace; + lm_callback_t fscb; + struct gfs2_sbd *sdp; + int recover_jid; + int recover_jid_done; + int recover_jid_status; + spinlock_t async_lock; + struct list_head complete; + struct list_head blocking; + struct list_head delayed; + struct list_head submit; + struct list_head all_locks; + u32 all_locks_count; + wait_queue_head_t wait_control; + struct task_struct *thread1; + struct task_struct *thread2; + wait_queue_head_t thread_wait; + unsigned long drop_time; + int drop_locks_count; + int drop_locks_period; +}; + +enum { + LFL_NOBLOCK = 0, + LFL_NOCACHE = 1, + LFL_DLM_UNLOCK = 2, + LFL_DLM_CANCEL = 3, + LFL_SYNC_LVB = 4, + LFL_FORCE_PROMOTE = 5, + LFL_REREQUEST = 6, + LFL_ACTIVE = 7, + LFL_INLOCK = 8, + LFL_CANCEL = 9, + LFL_NOBAST = 10, + LFL_HEADQUE = 11, + LFL_UNLOCK_DELETE = 12, +}; + +struct gdlm_lock { + struct gdlm_ls *ls; + struct lm_lockname lockname; + char *lvb; + struct dlm_lksb lksb; + + s16 cur; + s16 req; + s16 prev_req; + u32 lkf; /* dlm flags DLM_LKF_ */ + unsigned long flags; /* lock_dlm flags LFL_ */ + + int bast_mode; /* protected by async_lock */ + struct completion ast_wait; + + struct list_head clist; /* complete */ + struct list_head blist; /* blocking */ + struct list_head delay_list; /* delayed */ + struct list_head all_list; /* all locks for the fs */ + struct gdlm_lock *hold_null; /* NL lock for hold_lvb */ +}; + +#define gdlm_assert(assertion, fmt, args...) \ +do { \ + if (unlikely(!(assertion))) { \ + printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \ + "lock_dlm: " fmt "\n", \ + #assertion, ##args); \ + BUG(); \ + } \ +} while (0) + +#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg) +#define log_info(fmt, arg...) log_print(KERN_INFO , fmt , ## arg) +#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg) +#ifdef LOCK_DLM_LOG_DEBUG +#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg) +#else +#define log_debug(fmt, arg...) +#endif + +/* sysfs.c */ + +int gdlm_sysfs_init(void); +void gdlm_sysfs_exit(void); +int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *); +void gdlm_kobject_release(struct gdlm_ls *); + +/* thread.c */ + +int gdlm_init_threads(struct gdlm_ls *); +void gdlm_release_threads(struct gdlm_ls *); + +/* lock.c */ + +s16 gdlm_make_lmstate(s16); +void gdlm_queue_delayed(struct gdlm_lock *); +void gdlm_submit_delayed(struct gdlm_ls *); +int gdlm_release_all_locks(struct gdlm_ls *); +void gdlm_delete_lp(struct gdlm_lock *); +unsigned int gdlm_do_lock(struct gdlm_lock *); + +int gdlm_get_lock(void *, struct lm_lockname *, void **); +void gdlm_put_lock(void *); +unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int); +unsigned int gdlm_unlock(void *, unsigned int); +void gdlm_cancel(void *); +int gdlm_hold_lvb(void *, char **); +void gdlm_unhold_lvb(void *, char *); + +/* plock.c */ + +int gdlm_plock_init(void); +void gdlm_plock_exit(void); +int gdlm_plock(void *, struct lm_lockname *, struct file *, int, + struct file_lock *); +int gdlm_plock_get(void *, struct lm_lockname *, struct file *, + struct file_lock *); +int gdlm_punlock(void *, struct lm_lockname *, struct file *, + struct file_lock *); +#endif + diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c new file mode 100644 index 000000000000..2194b1d5b5ec --- /dev/null +++ b/fs/gfs2/locking/dlm/main.c @@ -0,0 +1,64 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/init.h> + +#include "lock_dlm.h" + +extern int gdlm_drop_count; +extern int gdlm_drop_period; + +extern struct lm_lockops gdlm_ops; + +static int __init init_lock_dlm(void) +{ + int error; + + error = gfs2_register_lockproto(&gdlm_ops); + if (error) { + printk(KERN_WARNING "lock_dlm: can't register protocol: %d\n", + error); + return error; + } + + error = gdlm_sysfs_init(); + if (error) { + gfs2_unregister_lockproto(&gdlm_ops); + return error; + } + + error = gdlm_plock_init(); + if (error) { + gdlm_sysfs_exit(); + gfs2_unregister_lockproto(&gdlm_ops); + return error; + } + + gdlm_drop_count = GDLM_DROP_COUNT; + gdlm_drop_period = GDLM_DROP_PERIOD; + + printk(KERN_INFO + "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); + return 0; +} + +static void __exit exit_lock_dlm(void) +{ + gdlm_plock_exit(); + gdlm_sysfs_exit(); + gfs2_unregister_lockproto(&gdlm_ops); +} + +module_init(init_lock_dlm); +module_exit(exit_lock_dlm); + +MODULE_DESCRIPTION("GFS DLM Locking Module"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c new file mode 100644 index 000000000000..1f94dd35a943 --- /dev/null +++ b/fs/gfs2/locking/dlm/mount.c @@ -0,0 +1,255 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include "lock_dlm.h" + +int gdlm_drop_count; +int gdlm_drop_period; +const struct lm_lockops gdlm_ops; + + +static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, + int flags, char *table_name) +{ + struct gdlm_ls *ls; + char buf[256], *p; + + ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL); + if (!ls) + return NULL; + + ls->drop_locks_count = gdlm_drop_count; + ls->drop_locks_period = gdlm_drop_period; + ls->fscb = cb; + ls->sdp = sdp; + ls->fsflags = flags; + spin_lock_init(&ls->async_lock); + INIT_LIST_HEAD(&ls->complete); + INIT_LIST_HEAD(&ls->blocking); + INIT_LIST_HEAD(&ls->delayed); + INIT_LIST_HEAD(&ls->submit); + INIT_LIST_HEAD(&ls->all_locks); + init_waitqueue_head(&ls->thread_wait); + init_waitqueue_head(&ls->wait_control); + ls->thread1 = NULL; + ls->thread2 = NULL; + ls->drop_time = jiffies; + ls->jid = -1; + + strncpy(buf, table_name, 256); + buf[255] = '\0'; + + p = strstr(buf, ":"); + if (!p) { + log_info("invalid table_name \"%s\"", table_name); + kfree(ls); + return NULL; + } + *p = '\0'; + p++; + + strncpy(ls->clustername, buf, GDLM_NAME_LEN); + strncpy(ls->fsname, p, GDLM_NAME_LEN); + + return ls; +} + +static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir) +{ + char data[256]; + char *options, *x, *y; + int error = 0; + + memset(data, 0, 256); + strncpy(data, data_arg, 255); + + for (options = data; (x = strsep(&options, ":")); ) { + if (!*x) + continue; + + y = strchr(x, '='); + if (y) + *y++ = 0; + + if (!strcmp(x, "jid")) { + if (!y) { + log_error("need argument to jid"); + error = -EINVAL; + break; + } + sscanf(y, "%u", &ls->jid); + + } else if (!strcmp(x, "first")) { + if (!y) { + log_error("need argument to first"); + error = -EINVAL; + break; + } + sscanf(y, "%u", &ls->first); + + } else if (!strcmp(x, "id")) { + if (!y) { + log_error("need argument to id"); + error = -EINVAL; + break; + } + sscanf(y, "%u", &ls->id); + + } else if (!strcmp(x, "nodir")) { + if (!y) { + log_error("need argument to nodir"); + error = -EINVAL; + break; + } + sscanf(y, "%u", nodir); + + } else { + log_error("unkonwn option: %s", x); + error = -EINVAL; + break; + } + } + + return error; +} + +static int gdlm_mount(char *table_name, char *host_data, + lm_callback_t cb, void *cb_data, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + struct gdlm_ls *ls; + int error = -ENOMEM, nodir = 0; + + if (min_lvb_size > GDLM_LVB_SIZE) + goto out; + + ls = init_gdlm(cb, cb_data, flags, table_name); + if (!ls) + goto out; + + error = make_args(ls, host_data, &nodir); + if (error) + goto out; + + error = gdlm_init_threads(ls); + if (error) + goto out_free; + + error = gdlm_kobject_setup(ls, fskobj); + if (error) + goto out_thread; + + error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), + &ls->dlm_lockspace, + nodir ? DLM_LSFL_NODIR : 0, + GDLM_LVB_SIZE); + if (error) { + log_error("dlm_new_lockspace error %d", error); + goto out_kobj; + } + + lockstruct->ls_jid = ls->jid; + lockstruct->ls_first = ls->first; + lockstruct->ls_lockspace = ls; + lockstruct->ls_ops = &gdlm_ops; + lockstruct->ls_flags = 0; + lockstruct->ls_lvb_size = GDLM_LVB_SIZE; + return 0; + +out_kobj: + gdlm_kobject_release(ls); +out_thread: + gdlm_release_threads(ls); +out_free: + kfree(ls); +out: + return error; +} + +static void gdlm_unmount(void *lockspace) +{ + struct gdlm_ls *ls = lockspace; + int rv; + + log_debug("unmount flags %lx", ls->flags); + + /* FIXME: serialize unmount and withdraw in case they + happen at once. Also, if unmount follows withdraw, + wait for withdraw to finish. */ + + if (test_bit(DFL_WITHDRAW, &ls->flags)) + goto out; + + gdlm_kobject_release(ls); + dlm_release_lockspace(ls->dlm_lockspace, 2); + gdlm_release_threads(ls); + rv = gdlm_release_all_locks(ls); + if (rv) + log_info("gdlm_unmount: %d stray locks freed", rv); +out: + kfree(ls); +} + +static void gdlm_recovery_done(void *lockspace, unsigned int jid, + unsigned int message) +{ + struct gdlm_ls *ls = lockspace; + ls->recover_jid_done = jid; + ls->recover_jid_status = message; + kobject_uevent(&ls->kobj, KOBJ_CHANGE); +} + +static void gdlm_others_may_mount(void *lockspace) +{ + struct gdlm_ls *ls = lockspace; + ls->first_done = 1; + kobject_uevent(&ls->kobj, KOBJ_CHANGE); +} + +/* Userspace gets the offline uevent, blocks new gfs locks on + other mounters, and lets us know (sets WITHDRAW flag). Then, + userspace leaves the mount group while we leave the lockspace. */ + +static void gdlm_withdraw(void *lockspace) +{ + struct gdlm_ls *ls = lockspace; + + kobject_uevent(&ls->kobj, KOBJ_OFFLINE); + + wait_event_interruptible(ls->wait_control, + test_bit(DFL_WITHDRAW, &ls->flags)); + + dlm_release_lockspace(ls->dlm_lockspace, 2); + gdlm_release_threads(ls); + gdlm_release_all_locks(ls); + gdlm_kobject_release(ls); +} + +const struct lm_lockops gdlm_ops = { + .lm_proto_name = "lock_dlm", + .lm_mount = gdlm_mount, + .lm_others_may_mount = gdlm_others_may_mount, + .lm_unmount = gdlm_unmount, + .lm_withdraw = gdlm_withdraw, + .lm_get_lock = gdlm_get_lock, + .lm_put_lock = gdlm_put_lock, + .lm_lock = gdlm_lock, + .lm_unlock = gdlm_unlock, + .lm_plock = gdlm_plock, + .lm_punlock = gdlm_punlock, + .lm_plock_get = gdlm_plock_get, + .lm_cancel = gdlm_cancel, + .lm_hold_lvb = gdlm_hold_lvb, + .lm_unhold_lvb = gdlm_unhold_lvb, + .lm_recovery_done = gdlm_recovery_done, + .lm_owner = THIS_MODULE, +}; + diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c new file mode 100644 index 000000000000..7365aec9511b --- /dev/null +++ b/fs/gfs2/locking/dlm/plock.c @@ -0,0 +1,301 @@ +/* + * Copyright (C) 2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/miscdevice.h> +#include <linux/lock_dlm_plock.h> + +#include "lock_dlm.h" + + +static spinlock_t ops_lock; +static struct list_head send_list; +static struct list_head recv_list; +static wait_queue_head_t send_wq; +static wait_queue_head_t recv_wq; + +struct plock_op { + struct list_head list; + int done; + struct gdlm_plock_info info; +}; + +static inline void set_version(struct gdlm_plock_info *info) +{ + info->version[0] = GDLM_PLOCK_VERSION_MAJOR; + info->version[1] = GDLM_PLOCK_VERSION_MINOR; + info->version[2] = GDLM_PLOCK_VERSION_PATCH; +} + +static int check_version(struct gdlm_plock_info *info) +{ + if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) || + (GDLM_PLOCK_VERSION_MINOR < info->version[1])) { + log_error("plock device version mismatch: " + "kernel (%u.%u.%u), user (%u.%u.%u)", + GDLM_PLOCK_VERSION_MAJOR, + GDLM_PLOCK_VERSION_MINOR, + GDLM_PLOCK_VERSION_PATCH, + info->version[0], + info->version[1], + info->version[2]); + return -EINVAL; + } + return 0; +} + +static void send_op(struct plock_op *op) +{ + set_version(&op->info); + INIT_LIST_HEAD(&op->list); + spin_lock(&ops_lock); + list_add_tail(&op->list, &send_list); + spin_unlock(&ops_lock); + wake_up(&send_wq); +} + +int gdlm_plock(void *lockspace, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + struct gdlm_ls *ls = lockspace; + struct plock_op *op; + int rv; + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->info.optype = GDLM_PLOCK_OP_LOCK; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.wait = IS_SETLKW(cmd); + op->info.fsid = ls->id; + op->info.number = name->ln_number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + printk(KERN_INFO "plock op on list\n"); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (!rv) { + if (posix_lock_file_wait(file, fl) < 0) + log_error("gdlm_plock: vfs lock error %x,%llx", + name->ln_type, + (unsigned long long)name->ln_number); + } + + kfree(op); + return rv; +} + +int gdlm_punlock(void *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct gdlm_ls *ls = lockspace; + struct plock_op *op; + int rv; + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return -ENOMEM; + + if (posix_lock_file_wait(file, fl) < 0) + log_error("gdlm_punlock: vfs unlock error %x,%llx", + name->ln_type, (unsigned long long)name->ln_number); + + op->info.optype = GDLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->id; + op->info.number = name->ln_number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + op->info.owner = (__u64)(long) fl->fl_owner; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + printk(KERN_INFO "punlock op on list\n"); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + kfree(op); + return rv; +} + +int gdlm_plock_get(void *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct gdlm_ls *ls = lockspace; + struct plock_op *op; + int rv; + + op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) + return -ENOMEM; + + op->info.optype = GDLM_PLOCK_OP_GET; + op->info.pid = fl->fl_pid; + op->info.ex = (fl->fl_type == F_WRLCK); + op->info.fsid = ls->id; + op->info.number = name->ln_number; + op->info.start = fl->fl_start; + op->info.end = fl->fl_end; + + send_op(op); + wait_event(recv_wq, (op->done != 0)); + + spin_lock(&ops_lock); + if (!list_empty(&op->list)) { + printk(KERN_INFO "plock_get op on list\n"); + list_del(&op->list); + } + spin_unlock(&ops_lock); + + rv = op->info.rv; + + if (rv == 0) + fl->fl_type = F_UNLCK; + else if (rv > 0) { + fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; + fl->fl_pid = op->info.pid; + fl->fl_start = op->info.start; + fl->fl_end = op->info.end; + } + + kfree(op); + return rv; +} + +/* a read copies out one plock request from the send list */ +static ssize_t dev_read(struct file *file, char __user *u, size_t count, + loff_t *ppos) +{ + struct gdlm_plock_info info; + struct plock_op *op = NULL; + + if (count < sizeof(info)) + return -EINVAL; + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + op = list_entry(send_list.next, struct plock_op, list); + list_move(&op->list, &recv_list); + memcpy(&info, &op->info, sizeof(info)); + } + spin_unlock(&ops_lock); + + if (!op) + return -EAGAIN; + + if (copy_to_user(u, &info, sizeof(info))) + return -EFAULT; + return sizeof(info); +} + +/* a write copies in one plock result that should match a plock_op + on the recv list */ +static ssize_t dev_write(struct file *file, const char __user *u, size_t count, + loff_t *ppos) +{ + struct gdlm_plock_info info; + struct plock_op *op; + int found = 0; + + if (count != sizeof(info)) + return -EINVAL; + + if (copy_from_user(&info, u, sizeof(info))) + return -EFAULT; + + if (check_version(&info)) + return -EINVAL; + + spin_lock(&ops_lock); + list_for_each_entry(op, &recv_list, list) { + if (op->info.fsid == info.fsid && op->info.number == info.number && + op->info.owner == info.owner) { + list_del_init(&op->list); + found = 1; + op->done = 1; + memcpy(&op->info, &info, sizeof(info)); + break; + } + } + spin_unlock(&ops_lock); + + if (found) + wake_up(&recv_wq); + else + printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid, + (unsigned long long)info.number); + return count; +} + +static unsigned int dev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &send_wq, wait); + + spin_lock(&ops_lock); + if (!list_empty(&send_list)) { + spin_unlock(&ops_lock); + return POLLIN | POLLRDNORM; + } + spin_unlock(&ops_lock); + return 0; +} + +static struct file_operations dev_fops = { + .read = dev_read, + .write = dev_write, + .poll = dev_poll, + .owner = THIS_MODULE +}; + +static struct miscdevice plock_dev_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = GDLM_PLOCK_MISC_NAME, + .fops = &dev_fops +}; + +int gdlm_plock_init(void) +{ + int rv; + + spin_lock_init(&ops_lock); + INIT_LIST_HEAD(&send_list); + INIT_LIST_HEAD(&recv_list); + init_waitqueue_head(&send_wq); + init_waitqueue_head(&recv_wq); + + rv = misc_register(&plock_dev_misc); + if (rv) + printk(KERN_INFO "gdlm_plock_init: misc_register failed %d", + rv); + return rv; +} + +void gdlm_plock_exit(void) +{ + if (misc_deregister(&plock_dev_misc) < 0) + printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed"); +} + diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c new file mode 100644 index 000000000000..29ae06f94944 --- /dev/null +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -0,0 +1,226 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/ctype.h> +#include <linux/stat.h> + +#include "lock_dlm.h" + +extern struct lm_lockops gdlm_ops; + +static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); +} + +static ssize_t block_show(struct gdlm_ls *ls, char *buf) +{ + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->flags); + gdlm_submit_delayed(ls); + } else { + ret = -EINVAL; + } + return ret; +} + +static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf) +{ + ssize_t ret; + int val = 0; + + if (test_bit(DFL_WITHDRAW, &ls->flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ssize_t ret = len; + int val; + + val = simple_strtol(buf, NULL, 0); + + if (val == 1) + set_bit(DFL_WITHDRAW, &ls->flags); + else + ret = -EINVAL; + wake_up(&ls->wait_control); + return ret; +} + +static ssize_t id_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%u\n", ls->id); +} + +static ssize_t jid_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->jid); +} + +static ssize_t first_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->first); +} + +static ssize_t first_done_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->first_done); +} + +static ssize_t recover_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->recover_jid); +} + +static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len) +{ + ls->recover_jid = simple_strtol(buf, NULL, 0); + ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid); + return len; +} + +static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->recover_jid_done); +} + +static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) +{ + return sprintf(buf, "%d\n", ls->recover_jid_status); +} + +struct gdlm_attr { + struct attribute attr; + ssize_t (*show)(struct gdlm_ls *, char *); + ssize_t (*store)(struct gdlm_ls *, const char *, size_t); +}; + +#define GDLM_ATTR(_name,_mode,_show,_store) \ +static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) + +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, id_show, NULL); +GDLM_ATTR(jid, 0444, jid_show, NULL); +GDLM_ATTR(first, 0444, first_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0644, recover_show, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); + +static struct attribute *gdlm_attrs[] = { + &gdlm_attr_proto_name.attr, + &gdlm_attr_block.attr, + &gdlm_attr_withdraw.attr, + &gdlm_attr_id.attr, + &gdlm_attr_jid.attr, + &gdlm_attr_first.attr, + &gdlm_attr_first_done.attr, + &gdlm_attr_recover.attr, + &gdlm_attr_recover_done.attr, + &gdlm_attr_recover_status.attr, + NULL, +}; + +static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); + struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); + return a->show ? a->show(ls, buf) : 0; +} + +static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); + struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr); + return a->store ? a->store(ls, buf, len) : len; +} + +static struct sysfs_ops gdlm_attr_ops = { + .show = gdlm_attr_show, + .store = gdlm_attr_store, +}; + +static struct kobj_type gdlm_ktype = { + .default_attrs = gdlm_attrs, + .sysfs_ops = &gdlm_attr_ops, +}; + +static struct kset gdlm_kset = { + .subsys = &kernel_subsys, + .kobj = {.name = "lock_dlm",}, + .ktype = &gdlm_ktype, +}; + +int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj) +{ + int error; + + error = kobject_set_name(&ls->kobj, "%s", "lock_module"); + if (error) { + log_error("can't set kobj name %d", error); + return error; + } + + ls->kobj.kset = &gdlm_kset; + ls->kobj.ktype = &gdlm_ktype; + ls->kobj.parent = fskobj; + + error = kobject_register(&ls->kobj); + if (error) + log_error("can't register kobj %d", error); + + return error; +} + +void gdlm_kobject_release(struct gdlm_ls *ls) +{ + kobject_unregister(&ls->kobj); +} + +int gdlm_sysfs_init(void) +{ + int error; + + error = kset_register(&gdlm_kset); + if (error) + printk("lock_dlm: cannot register kset %d\n", error); + + return error; +} + +void gdlm_sysfs_exit(void) +{ + kset_unregister(&gdlm_kset); +} + diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c new file mode 100644 index 000000000000..9cf1f168eaf8 --- /dev/null +++ b/fs/gfs2/locking/dlm/thread.c @@ -0,0 +1,359 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include "lock_dlm.h" + +/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm + thread gets to it. */ + +static void queue_submit(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + + spin_lock(&ls->async_lock); + list_add_tail(&lp->delay_list, &ls->submit); + spin_unlock(&ls->async_lock); + wake_up(&ls->thread_wait); +} + +static void process_blocking(struct gdlm_lock *lp, int bast_mode) +{ + struct gdlm_ls *ls = lp->ls; + unsigned int cb = 0; + + switch (gdlm_make_lmstate(bast_mode)) { + case LM_ST_EXCLUSIVE: + cb = LM_CB_NEED_E; + break; + case LM_ST_DEFERRED: + cb = LM_CB_NEED_D; + break; + case LM_ST_SHARED: + cb = LM_CB_NEED_S; + break; + default: + gdlm_assert(0, "unknown bast mode %u", lp->bast_mode); + } + + ls->fscb(ls->sdp, cb, &lp->lockname); +} + +static void process_complete(struct gdlm_lock *lp) +{ + struct gdlm_ls *ls = lp->ls; + struct lm_async_cb acb; + s16 prev_mode = lp->cur; + + memset(&acb, 0, sizeof(acb)); + + if (lp->lksb.sb_status == -DLM_ECANCEL) { + log_info("complete dlm cancel %x,%llx flags %lx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) { + if (lp->lksb.sb_status != -DLM_EUNLOCK) { + log_info("unlock sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + return; + } + + lp->cur = DLM_LOCK_IV; + lp->req = DLM_LOCK_IV; + lp->lksb.sb_lkid = 0; + + if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) { + gdlm_delete_lp(lp); + return; + } + goto out; + } + + if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID) + memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); + + if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) { + if (lp->req == DLM_LOCK_PR) + lp->req = DLM_LOCK_CW; + else if (lp->req == DLM_LOCK_CW) + lp->req = DLM_LOCK_PR; + } + + /* + * A canceled lock request. The lock was just taken off the delayed + * list and was never even submitted to dlm. + */ + + if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) { + log_info("complete internal cancel %x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + lp->req = lp->cur; + acb.lc_ret |= LM_OUT_CANCELED; + goto out; + } + + /* + * An error occured. + */ + + if (lp->lksb.sb_status) { + /* a "normal" error */ + if ((lp->lksb.sb_status == -EAGAIN) && + (lp->lkf & DLM_LKF_NOQUEUE)) { + lp->req = lp->cur; + if (lp->cur == DLM_LOCK_IV) + lp->lksb.sb_lkid = 0; + goto out; + } + + /* this could only happen with cancels I think */ + log_info("ast sb_status %d %x,%llx flags %lx", + lp->lksb.sb_status, lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->flags); + return; + } + + /* + * This is an AST for an EX->EX conversion for sync_lvb from GFS. + */ + + if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) { + complete(&lp->ast_wait); + return; + } + + /* + * A lock has been demoted to NL because it initially completed during + * BLOCK_LOCKS. Now it must be requested in the originally requested + * mode. + */ + + if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) { + gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number); + + lp->cur = DLM_LOCK_NL; + lp->req = lp->prev_req; + lp->prev_req = DLM_LOCK_IV; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + set_bit(LFL_NOCACHE, &lp->flags); + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags)) + gdlm_queue_delayed(lp); + else + queue_submit(lp); + return; + } + + /* + * A request is granted during dlm recovery. It may be granted + * because the locks of a failed node were cleared. In that case, + * there may be inconsistent data beneath this lock and we must wait + * for recovery to complete to use it. When gfs recovery is done this + * granted lock will be converted to NL and then reacquired in this + * granted state. + */ + + if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) && + !test_bit(LFL_NOBLOCK, &lp->flags) && + lp->req != DLM_LOCK_NL) { + + lp->cur = lp->req; + lp->prev_req = lp->req; + lp->req = DLM_LOCK_NL; + lp->lkf |= DLM_LKF_CONVERT; + lp->lkf &= ~DLM_LKF_CONVDEADLK; + + log_debug("rereq %x,%llx id %x %d,%d", + lp->lockname.ln_type, + (unsigned long long)lp->lockname.ln_number, + lp->lksb.sb_lkid, lp->cur, lp->req); + + set_bit(LFL_REREQUEST, &lp->flags); + queue_submit(lp); + return; + } + + /* + * DLM demoted the lock to NL before it was granted so GFS must be + * told it cannot cache data for this lock. + */ + + if (lp->lksb.sb_flags & DLM_SBF_DEMOTED) + set_bit(LFL_NOCACHE, &lp->flags); + +out: + /* + * This is an internal lock_dlm lock + */ + + if (test_bit(LFL_INLOCK, &lp->flags)) { + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + complete(&lp->ast_wait); + return; + } + + /* + * Normal completion of a lock request. Tell GFS it now has the lock. + */ + + clear_bit(LFL_NOBLOCK, &lp->flags); + lp->cur = lp->req; + + acb.lc_name = lp->lockname; + acb.lc_ret |= gdlm_make_lmstate(lp->cur); + + if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && + (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) + acb.lc_ret |= LM_OUT_CACHEABLE; + + ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); +} + +static inline int no_work(struct gdlm_ls *ls, int blocking) +{ + int ret; + + spin_lock(&ls->async_lock); + ret = list_empty(&ls->complete) && list_empty(&ls->submit); + if (ret && blocking) + ret = list_empty(&ls->blocking); + spin_unlock(&ls->async_lock); + + return ret; +} + +static inline int check_drop(struct gdlm_ls *ls) +{ + if (!ls->drop_locks_count) + return 0; + + if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { + ls->drop_time = jiffies; + if (ls->all_locks_count >= ls->drop_locks_count) + return 1; + } + return 0; +} + +static int gdlm_thread(void *data) +{ + struct gdlm_ls *ls = (struct gdlm_ls *) data; + struct gdlm_lock *lp = NULL; + int blist = 0; + uint8_t complete, blocking, submit, drop; + DECLARE_WAITQUEUE(wait, current); + + /* Only thread1 is allowed to do blocking callbacks since gfs + may wait for a completion callback within a blocking cb. */ + + if (current == ls->thread1) + blist = 1; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ls->thread_wait, &wait); + if (no_work(ls, blist)) + schedule(); + remove_wait_queue(&ls->thread_wait, &wait); + set_current_state(TASK_RUNNING); + + complete = blocking = submit = drop = 0; + + spin_lock(&ls->async_lock); + + if (blist && !list_empty(&ls->blocking)) { + lp = list_entry(ls->blocking.next, struct gdlm_lock, + blist); + list_del_init(&lp->blist); + blocking = lp->bast_mode; + lp->bast_mode = 0; + } else if (!list_empty(&ls->complete)) { + lp = list_entry(ls->complete.next, struct gdlm_lock, + clist); + list_del_init(&lp->clist); + complete = 1; + } else if (!list_empty(&ls->submit)) { + lp = list_entry(ls->submit.next, struct gdlm_lock, + delay_list); + list_del_init(&lp->delay_list); + submit = 1; + } + + drop = check_drop(ls); + spin_unlock(&ls->async_lock); + + if (complete) + process_complete(lp); + + else if (blocking) + process_blocking(lp, blocking); + + else if (submit) + gdlm_do_lock(lp); + + if (drop) + ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); + + schedule(); + } + + return 0; +} + +int gdlm_init_threads(struct gdlm_ls *ls) +{ + struct task_struct *p; + int error; + + p = kthread_run(gdlm_thread, ls, "lock_dlm1"); + error = IS_ERR(p); + if (error) { + log_error("can't start lock_dlm1 thread %d", error); + return error; + } + ls->thread1 = p; + + p = kthread_run(gdlm_thread, ls, "lock_dlm2"); + error = IS_ERR(p); + if (error) { + log_error("can't start lock_dlm2 thread %d", error); + kthread_stop(ls->thread1); + return error; + } + ls->thread2 = p; + + return 0; +} + +void gdlm_release_threads(struct gdlm_ls *ls) +{ + kthread_stop(ls->thread1); + kthread_stop(ls->thread2); +} + diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile new file mode 100644 index 000000000000..35e9730bc3a8 --- /dev/null +++ b/fs/gfs2/locking/nolock/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o +lock_nolock-y := main.o + diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c new file mode 100644 index 000000000000..acfbc941f319 --- /dev/null +++ b/fs/gfs2/locking/nolock/main.c @@ -0,0 +1,246 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/smp_lock.h> +#include <linux/lm_interface.h> + +struct nolock_lockspace { + unsigned int nl_lvb_size; +}; + +static const struct lm_lockops nolock_ops; + +static int nolock_mount(char *table_name, char *host_data, + lm_callback_t cb, void *cb_data, + unsigned int min_lvb_size, int flags, + struct lm_lockstruct *lockstruct, + struct kobject *fskobj) +{ + char *c; + unsigned int jid; + struct nolock_lockspace *nl; + + c = strstr(host_data, "jid="); + if (!c) + jid = 0; + else { + c += 4; + sscanf(c, "%u", &jid); + } + + nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL); + if (!nl) + return -ENOMEM; + + nl->nl_lvb_size = min_lvb_size; + + lockstruct->ls_jid = jid; + lockstruct->ls_first = 1; + lockstruct->ls_lvb_size = min_lvb_size; + lockstruct->ls_lockspace = nl; + lockstruct->ls_ops = &nolock_ops; + lockstruct->ls_flags = LM_LSFLAG_LOCAL; + + return 0; +} + +static void nolock_others_may_mount(void *lockspace) +{ +} + +static void nolock_unmount(void *lockspace) +{ + struct nolock_lockspace *nl = lockspace; + kfree(nl); +} + +static void nolock_withdraw(void *lockspace) +{ +} + +/** + * nolock_get_lock - get a lm_lock_t given a descripton of the lock + * @lockspace: the lockspace the lock lives in + * @name: the name of the lock + * @lockp: return the lm_lock_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_get_lock(void *lockspace, struct lm_lockname *name, + void **lockp) +{ + *lockp = lockspace; + return 0; +} + +/** + * nolock_put_lock - get rid of a lock structure + * @lock: the lock to throw away + * + */ + +static void nolock_put_lock(void *lock) +{ +} + +/** + * nolock_lock - acquire a lock + * @lock: the lock to manipulate + * @cur_state: the current state + * @req_state: the requested state + * @flags: modifier flags + * + * Returns: A bitmap of LM_OUT_* + */ + +static unsigned int nolock_lock(void *lock, unsigned int cur_state, + unsigned int req_state, unsigned int flags) +{ + return req_state | LM_OUT_CACHEABLE; +} + +/** + * nolock_unlock - unlock a lock + * @lock: the lock to manipulate + * @cur_state: the current state + * + * Returns: 0 + */ + +static unsigned int nolock_unlock(void *lock, unsigned int cur_state) +{ + return 0; +} + +static void nolock_cancel(void *lock) +{ +} + +/** + * nolock_hold_lvb - hold on to a lock value block + * @lock: the lock the LVB is associated with + * @lvbp: return the lm_lvb_t here + * + * Returns: 0 on success, -EXXX on failure + */ + +static int nolock_hold_lvb(void *lock, char **lvbp) +{ + struct nolock_lockspace *nl = lock; + int error = 0; + + *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); + if (!*lvbp) + error = -ENOMEM; + + return error; +} + +/** + * nolock_unhold_lvb - release a LVB + * @lock: the lock the LVB is associated with + * @lvb: the lock value block + * + */ + +static void nolock_unhold_lvb(void *lock, char *lvb) +{ + kfree(lvb); +} + +static int nolock_plock_get(void *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + struct file_lock tmp; + int ret; + + ret = posix_test_lock(file, fl, &tmp); + fl->fl_type = F_UNLCK; + if (ret) + memcpy(fl, &tmp, sizeof(struct file_lock)); + + return 0; +} + +static int nolock_plock(void *lockspace, struct lm_lockname *name, + struct file *file, int cmd, struct file_lock *fl) +{ + int error; + error = posix_lock_file_wait(file, fl); + return error; +} + +static int nolock_punlock(void *lockspace, struct lm_lockname *name, + struct file *file, struct file_lock *fl) +{ + int error; + error = posix_lock_file_wait(file, fl); + return error; +} + +static void nolock_recovery_done(void *lockspace, unsigned int jid, + unsigned int message) +{ +} + +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_mount = nolock_mount, + .lm_others_may_mount = nolock_others_may_mount, + .lm_unmount = nolock_unmount, + .lm_withdraw = nolock_withdraw, + .lm_get_lock = nolock_get_lock, + .lm_put_lock = nolock_put_lock, + .lm_lock = nolock_lock, + .lm_unlock = nolock_unlock, + .lm_cancel = nolock_cancel, + .lm_hold_lvb = nolock_hold_lvb, + .lm_unhold_lvb = nolock_unhold_lvb, + .lm_plock_get = nolock_plock_get, + .lm_plock = nolock_plock, + .lm_punlock = nolock_punlock, + .lm_recovery_done = nolock_recovery_done, + .lm_owner = THIS_MODULE, +}; + +static int __init init_nolock(void) +{ + int error; + + error = gfs2_register_lockproto(&nolock_ops); + if (error) { + printk(KERN_WARNING + "lock_nolock: can't register protocol: %d\n", error); + return error; + } + + printk(KERN_INFO + "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__); + return 0; +} + +static void __exit exit_nolock(void) +{ + gfs2_unregister_lockproto(&nolock_ops); +} + +module_init(init_nolock); +module_exit(exit_nolock); + +MODULE_DESCRIPTION("GFS Nolock Locking Module"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c new file mode 100644 index 000000000000..554fe5bd1b72 --- /dev/null +++ b/fs/gfs2/log.c @@ -0,0 +1,687 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "util.h" +#include "dir.h" + +#define PULL 1 + +/** + * gfs2_struct2blk - compute stuff + * @sdp: the filesystem + * @nstruct: the number of structures + * @ssize: the size of the structures + * + * Compute the number of log descriptor blocks needed to hold a certain number + * of structures of a certain size. + * + * Returns: the number of blocks needed (minimum is always 1) + */ + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize) +{ + unsigned int blks; + unsigned int first, second; + + blks = 1; + first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + + if (nstruct > first) { + second = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / ssize; + blks += DIV_ROUND_UP(nstruct - first, second); + } + + return blks; +} + +/** + * gfs2_ail1_start_one - Start I/O on a part of the AIL + * @sdp: the filesystem + * @tr: the part of the AIL + * + */ + +static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + int retry; + + BUG_ON(!spin_is_locked(&sdp->sd_log_lock)); + + do { + retry = 0; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_ail == ai); + + if (!buffer_busy(bh)) { + if (!buffer_uptodate(bh)) { + gfs2_log_unlock(sdp); + gfs2_io_error_bh(sdp, bh); + gfs2_log_lock(sdp); + } + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + continue; + } + + if (!buffer_dirty(bh)) + continue; + + list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + + gfs2_log_unlock(sdp); + wait_on_buffer(bh); + ll_rw_block(WRITE, 1, &bh); + gfs2_log_lock(sdp); + + retry = 1; + break; + } + } while (retry); +} + +/** + * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_ail == ai); + + if (buffer_busy(bh)) { + if (flags & DIO_ALL) + continue; + else + break; + } + + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + } + + return list_empty(&ai->ai_ail1_list); +} + +void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags) +{ + struct list_head *head = &sdp->sd_ail1_list; + u64 sync_gen; + struct list_head *first; + struct gfs2_ail *first_ai, *ai, *tmp; + int done = 0; + + gfs2_log_lock(sdp); + if (list_empty(head)) { + gfs2_log_unlock(sdp); + return; + } + sync_gen = sdp->sd_ail_sync_gen++; + + first = head->prev; + first_ai = list_entry(first, struct gfs2_ail, ai_list); + first_ai->ai_sync_gen = sync_gen; + gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */ + + if (flags & DIO_ALL) + first = NULL; + + while(!done) { + if (first && (head->prev != first || + gfs2_ail1_empty_one(sdp, first_ai, 0))) + break; + + done = 1; + list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) { + if (ai->ai_sync_gen >= sync_gen) + continue; + ai->ai_sync_gen = sync_gen; + gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ + done = 0; + break; + } + } + + gfs2_log_unlock(sdp); +} + +int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) +{ + struct gfs2_ail *ai, *s; + int ret; + + gfs2_log_lock(sdp); + + list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { + if (gfs2_ail1_empty_one(sdp, ai, flags)) + list_move(&ai->ai_list, &sdp->sd_ail2_list); + else if (!(flags & DIO_ALL)) + break; + } + + ret = list_empty(&sdp->sd_ail1_list); + + gfs2_log_unlock(sdp); + + return ret; +} + + +/** + * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @ai: the AIL entry + * + */ + +static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &ai->ai_ail2_list; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->prev, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_ail == ai); + bd->bd_ail = NULL; + list_del(&bd->bd_ail_st_list); + list_del(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bd->bd_bh); + } +} + +static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + struct gfs2_ail *ai, *safe; + unsigned int old_tail = sdp->sd_log_tail; + int wrap = (new_tail < old_tail); + int a, b, rm; + + gfs2_log_lock(sdp); + + list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { + a = (old_tail <= ai->ai_first); + b = (ai->ai_first < new_tail); + rm = (wrap) ? (a || b) : (a && b); + if (!rm) + continue; + + gfs2_ail2_empty_one(sdp, ai); + list_del(&ai->ai_list); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list)); + kfree(ai); + } + + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * + * Returns: errno + */ + +int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) +{ + unsigned int try = 0; + + if (gfs2_assert_warn(sdp, blks) || + gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; + + mutex_lock(&sdp->sd_log_reserve_mutex); + gfs2_log_lock(sdp); + while(sdp->sd_log_blks_free <= blks) { + gfs2_log_unlock(sdp); + gfs2_ail1_empty(sdp, 0); + gfs2_log_flush(sdp, NULL); + + if (try++) + gfs2_ail1_start(sdp, 0); + gfs2_log_lock(sdp); + } + sdp->sd_log_blks_free -= blks; + gfs2_log_unlock(sdp); + mutex_unlock(&sdp->sd_log_reserve_mutex); + + down_read(&sdp->sd_log_flush_lock); + + return 0; +} + +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + + gfs2_log_lock(sdp); + sdp->sd_log_blks_free += blks; + gfs2_assert_withdraw(sdp, + sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); + gfs2_log_unlock(sdp); + up_read(&sdp->sd_log_flush_lock); +} + +static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) +{ + int error; + struct buffer_head bh_map; + + error = gfs2_block_map(sdp->sd_jdesc->jd_inode, lbn, 0, &bh_map, 1); + if (error || !bh_map.b_blocknr) + printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, bh_map.b_blocknr, lbn); + gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr); + + return bh_map.b_blocknr; +} + +/** + * log_distance - Compute distance between two journal blocks + * @sdp: The GFS2 superblock + * @newer: The most recent journal block of the pair + * @older: The older journal block of the pair + * + * Compute the distance (in the journal direction) between two + * blocks in the journal + * + * Returns: the distance in blocks + */ + +static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer, + unsigned int older) +{ + int dist; + + dist = newer - older; + if (dist < 0) + dist += sdp->sd_jdesc->jd_blocks; + + return dist; +} + +static unsigned int current_tail(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + unsigned int tail; + + gfs2_log_lock(sdp); + + if (list_empty(&sdp->sd_ail1_list)) { + tail = sdp->sd_log_head; + } else { + ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list); + tail = ai->ai_first; + } + + gfs2_log_unlock(sdp); + + return tail; +} + +static inline void log_incr_head(struct gfs2_sbd *sdp) +{ + if (sdp->sd_log_flush_head == sdp->sd_log_tail) + gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { + sdp->sd_log_flush_head = 0; + sdp->sd_log_flush_wrapped = 1; + } +} + +/** + * gfs2_log_get_buf - Get and initialize a buffer to use for log control data + * @sdp: The GFS2 superblock + * + * Returns: the buffer_head + */ + +struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct gfs2_log_buf *lb; + struct buffer_head *bh; + + lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); + list_add(&lb->lb_list, &sdp->sd_log_flush_list); + + bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + log_incr_head(sdp); + + return bh; +} + +/** + * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log + * @sdp: the filesystem + * @data: the data the buffer_head should point to + * + * Returns: the log buffer descriptor + */ + +struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct gfs2_log_buf *lb; + struct buffer_head *bh; + + lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL); + list_add(&lb->lb_list, &sdp->sd_log_flush_list); + lb->lb_real = real; + + bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); + atomic_set(&bh->b_count, 1); + bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate); + set_bh_page(bh, real->b_page, bh_offset(real)); + bh->b_blocknr = blkno; + bh->b_size = sdp->sd_sb.sb_bsize; + bh->b_bdev = sdp->sd_vfs->s_bdev; + + log_incr_head(sdp); + + return bh; +} + +static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull) +{ + unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + + ail2_empty(sdp, new_tail); + + gfs2_log_lock(sdp); + sdp->sd_log_blks_free += dist - (pull ? 1 : 0); + gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks); + gfs2_log_unlock(sdp); + + sdp->sd_log_tail = new_tail; +} + +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) +{ + u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + struct gfs2_log_header *lh; + unsigned int tail; + u32 hash; + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + gfs2_ail1_empty(sdp, 0); + tail = current_tail(sdp); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); + lh->lh_flags = cpu_to_be32(flags); + lh->lh_tail = cpu_to_be32(tail); + lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); + hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + set_buffer_dirty(bh); + if (sync_dirty_buffer(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail, pull); + else + gfs2_assert_withdraw(sdp, !pull); + + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + log_incr_head(sdp); +} + +static void log_flush_commit(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_log_flush_list; + struct gfs2_log_buf *lb; + struct buffer_head *bh; + + while (!list_empty(head)) { + lb = list_entry(head->next, struct gfs2_log_buf, lb_list); + list_del(&lb->lb_list); + bh = lb->lb_bh; + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + if (lb->lb_real) { + while (atomic_read(&bh->b_count) != 1) /* Grrrr... */ + schedule(); + free_buffer_head(bh); + } else + brelse(bh); + kfree(lb); + } + + log_write_header(sdp, 0, 0); +} + +/** + * gfs2_log_flush - flush incore transaction(s) + * @sdp: the filesystem + * @gl: The glock structure to flush. If NULL, flush the whole incore log + * + */ + +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) +{ + struct gfs2_ail *ai; + + down_write(&sdp->sd_log_flush_lock); + + if (gl) { + gfs2_log_lock(sdp); + if (list_empty(&gl->gl_le.le_list)) { + gfs2_log_unlock(sdp); + up_write(&sdp->sd_log_flush_lock); + return; + } + gfs2_log_unlock(sdp); + } + + ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL); + INIT_LIST_HEAD(&ai->ai_ail1_list); + INIT_LIST_HEAD(&ai->ai_ail2_list); + + gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf); + gfs2_assert_withdraw(sdp, + sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + ai->ai_first = sdp->sd_log_flush_head; + + lops_before_commit(sdp); + if (!list_empty(&sdp->sd_log_flush_list)) + log_flush_commit(sdp); + else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle) + log_write_header(sdp, 0, PULL); + lops_after_commit(sdp, ai); + sdp->sd_log_head = sdp->sd_log_flush_head; + + sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs; + + sdp->sd_log_blks_reserved = 0; + sdp->sd_log_commited_buf = 0; + sdp->sd_log_num_hdrs = 0; + sdp->sd_log_commited_revoke = 0; + + gfs2_log_lock(sdp); + if (!list_empty(&ai->ai_ail1_list)) { + list_add(&ai->ai_list, &sdp->sd_ail1_list); + ai = NULL; + } + gfs2_log_unlock(sdp); + + sdp->sd_vfs->s_dirt = 0; + up_write(&sdp->sd_log_flush_lock); + + kfree(ai); +} + +static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int reserved = 0; + unsigned int old; + + gfs2_log_lock(sdp); + + sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; + gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0); + sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; + gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); + + if (sdp->sd_log_commited_buf) + reserved += sdp->sd_log_commited_buf; + if (sdp->sd_log_commited_revoke) + reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, + sizeof(u64)); + if (reserved) + reserved++; + + old = sdp->sd_log_blks_free; + sdp->sd_log_blks_free += tr->tr_reserved - + (reserved - sdp->sd_log_blks_reserved); + + gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old); + gfs2_assert_withdraw(sdp, + sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks + + sdp->sd_log_num_hdrs); + + sdp->sd_log_blks_reserved = reserved; + + gfs2_log_unlock(sdp); +} + +/** + * gfs2_log_commit - Commit a transaction to the log + * @sdp: the filesystem + * @tr: the transaction + * + * Returns: errno + */ + +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + log_refund(sdp, tr); + lops_incore_commit(sdp, tr); + + sdp->sd_vfs->s_dirt = 1; + up_read(&sdp->sd_log_flush_lock); + + gfs2_log_lock(sdp); + if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) { + gfs2_log_unlock(sdp); + gfs2_log_flush(sdp, NULL); + } else { + gfs2_log_unlock(sdp); + } +} + +/** + * gfs2_log_shutdown - write a shutdown header into a journal + * @sdp: the filesystem + * + */ + +void gfs2_log_shutdown(struct gfs2_sbd *sdp) +{ + down_write(&sdp->sd_log_flush_lock); + + gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs); + gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); + + sdp->sd_log_flush_head = sdp->sd_log_head; + sdp->sd_log_flush_wrapped = 0; + + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0); + + gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks); + gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); + gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); + + sdp->sd_log_head = sdp->sd_log_flush_head; + sdp->sd_log_tail = sdp->sd_log_head; + + up_write(&sdp->sd_log_flush_lock); +} + diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h new file mode 100644 index 000000000000..7f5737d55612 --- /dev/null +++ b/fs/gfs2/log.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOG_DOT_H__ +#define __LOG_DOT_H__ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include "incore.h" + +/** + * gfs2_log_lock - acquire the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +{ + spin_lock(&sdp->sd_log_lock); +} + +/** + * gfs2_log_unlock - release the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +{ + spin_unlock(&sdp->sd_log_lock); +} + +static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + unsigned int value) +{ + if (++value == sdp->sd_jdesc->jd_blocks) { + value = 0; + } + sdp->sd_log_head = sdp->sd_log_tail = value; +} + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, + unsigned int ssize); + +void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags); +int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags); + +int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); + +struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); +struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real); +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); + +void gfs2_log_shutdown(struct gfs2_sbd *sdp); + +#endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c new file mode 100644 index 000000000000..881e337b6a70 --- /dev/null +++ b/fs/gfs2/lops.c @@ -0,0 +1,809 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_glock *gl; + struct gfs2_trans *tr = current->journal_info; + + tr->tr_touched = 1; + + if (!list_empty(&le->le_list)) + return; + + gl = container_of(le, struct gfs2_glock, gl_le); + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl))) + return; + gfs2_glock_hold(gl); + set_bit(GLF_DIRTY, &gl->gl_flags); + + gfs2_log_lock(sdp); + sdp->sd_log_num_gl++; + list_add(&le->le_list, &sdp->sd_log_le_gl); + gfs2_log_unlock(sdp); +} + +static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_gl; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list); + list_del_init(&gl->gl_le.le_list); + sdp->sd_log_num_gl--; + + gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)); + gfs2_glock_put(gl); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_gl); +} + +static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_trans *tr; + + if (!list_empty(&bd->bd_list_tr)) + return; + + tr = current->journal_info; + tr->tr_touched = 1; + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + + if (!list_empty(&le->le_list)) + return; + + gfs2_trans_add_gl(bd->bd_gl); + + gfs2_meta_check(sdp, bd->bd_bh); + gfs2_pin(sdp, bd->bd_bh); + + gfs2_log_lock(sdp); + sdp->sd_log_num_buf++; + list_add(&le->le_list, &sdp->sd_log_le_buf); + gfs2_log_unlock(sdp); + + tr->tr_num_buf_new++; +} + +static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head = &tr->tr_list_buf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr); + list_del_init(&bd->bd_list_tr); + tr->tr_num_buf--; + } + gfs2_assert_warn(sdp, !tr->tr_num_buf); +} + +static void buf_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd1 = NULL, *bd2; + unsigned int total = sdp->sd_log_num_buf; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + unsigned int limit; + unsigned int num; + unsigned n; + __be64 *ptr; + + offset += sizeof(__be64) - 1; + offset &= ~(sizeof(__be64) - 1); + limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64); + /* for 4k blocks, limit = 503 */ + + bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list); + while(total) { + num = total; + if (total > limit) + num = limit; + bh = gfs2_log_get_buf(sdp); + sdp->sd_log_num_hdrs++; + ld = (struct gfs2_log_descriptor *)bh->b_data; + ptr = (__be64 *)(bh->b_data + offset); + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA); + ld->ld_length = cpu_to_be32(num + 1); + ld->ld_data1 = cpu_to_be32(num); + ld->ld_data2 = cpu_to_be32(0); + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + + n = 0; + list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf, + bd_le.le_list) { + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (++n >= num) + break; + } + + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + + n = 0; + list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf, + bd_le.le_list) { + bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + if (++n >= num) + break; + } + + total -= num; + } +} + +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_buf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_buf--; + + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_buf); +} + +static void buf_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_blocks = 0; + sdp->sd_replayed_blocks = 0; +} + +static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + if (gfs2_meta_check(sdp, bh_ip)) + error = -EIO; + else + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_trans *tr; + + tr = current->journal_info; + tr->tr_touched = 1; + tr->tr_num_revoke++; + + gfs2_log_lock(sdp); + sdp->sd_log_num_revoke++; + list_add(&le->le_list, &sdp->sd_log_le_revoke); + gfs2_log_unlock(sdp); +} + +static void revoke_lo_before_commit(struct gfs2_sbd *sdp) +{ + struct gfs2_log_descriptor *ld; + struct gfs2_meta_header *mh; + struct buffer_head *bh; + unsigned int offset; + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_revoke *rv; + + if (!sdp->sd_log_num_revoke) + return; + + bh = gfs2_log_get_buf(sdp); + ld = (struct gfs2_log_descriptor *)bh->b_data; + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE); + ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, + sizeof(u64))); + ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); + ld->ld_data2 = cpu_to_be32(0); + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + offset = sizeof(struct gfs2_log_descriptor); + + while (!list_empty(head)) { + rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list); + list_del_init(&rv->rv_le.le_list); + sdp->sd_log_num_revoke--; + + if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + + bh = gfs2_log_get_buf(sdp); + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); + mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); + mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); + offset = sizeof(struct gfs2_meta_header); + } + + *(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno); + kfree(rv); + + offset += sizeof(u64); + } + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); +} + +static void revoke_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header *head, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (pass != 0) + return; + + sdp->sd_found_revokes = 0; + sdp->sd_replay_tail = head->lh_tail; +} + +static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int blks = be32_to_cpu(ld->ld_length); + unsigned int revokes = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh; + unsigned int offset; + u64 blkno; + int first = 1; + int error; + + if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE) + return 0; + + offset = sizeof(struct gfs2_log_descriptor); + + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + + if (!first) + gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB); + + while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { + blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); + + error = gfs2_revoke_add(sdp, blkno, start); + if (error < 0) + return error; + else if (error) + sdp->sd_found_revokes++; + + if (!--revokes) + break; + offset += sizeof(u64); + } + + brelse(bh); + offset = sizeof(struct gfs2_meta_header); + first = 0; + } + + return 0; +} + +static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_revoke_clean(sdp); + return; + } + if (pass != 1) + return; + + fs_info(sdp, "jid=%u: Found %u revoke tags\n", + jd->jd_jid, sdp->sd_found_revokes); + + gfs2_revoke_clean(sdp); +} + +static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr = current->journal_info; + + tr->tr_touched = 1; + + if (!list_empty(&le->le_list)) + return; + + rgd = container_of(le, struct gfs2_rgrpd, rd_le); + gfs2_rgrp_bh_hold(rgd); + + gfs2_log_lock(sdp); + sdp->sd_log_num_rg++; + list_add(&le->le_list, &sdp->sd_log_le_rg); + gfs2_log_unlock(sdp); +} + +static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_rg; + struct gfs2_rgrpd *rgd; + + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list); + list_del_init(&rgd->rd_le.le_list); + sdp->sd_log_num_rg--; + + gfs2_rgrp_repolish_clones(rgd); + gfs2_rgrp_bh_put(rgd); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); +} + +/** + * databuf_lo_add - Add a databuf to the transaction. + * + * This is used in two distinct cases: + * i) In ordered write mode + * We put the data buffer on a list so that we can ensure that its + * synced to disk at the right time + * ii) In journaled data mode + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. + */ +static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_trans *tr = current->journal_info; + struct address_space *mapping = bd->bd_bh->b_page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + + tr->tr_touched = 1; + if (list_empty(&bd->bd_list_tr) && + (ip->i_di.di_flags & GFS2_DIF_JDATA)) { + tr->tr_num_buf++; + list_add(&bd->bd_list_tr, &tr->tr_list_buf); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_buf_new++; + } + gfs2_trans_add_gl(bd->bd_gl); + gfs2_log_lock(sdp); + if (list_empty(&le->le_list)) { + if (ip->i_di.di_flags & GFS2_DIF_JDATA) + sdp->sd_log_num_jdata++; + sdp->sd_log_num_databuf++; + list_add(&le->le_list, &sdp->sd_log_le_databuf); + } + gfs2_log_unlock(sdp); +} + +static int gfs2_check_magic(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + void *kaddr; + __be32 *ptr; + int rv = 0; + + kaddr = kmap_atomic(page, KM_USER0); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + rv = 1; + kunmap_atomic(page, KM_USER0); + + return rv; +} + +/** + * databuf_lo_before_commit - Scan the data buffers, writing as we go + * + * Here we scan through the lists of buffers and make the assumption + * that any buffer thats been pinned is being journaled, and that + * any unpinned buffer is an ordered write data buffer and therefore + * will be written back rather than journaled. + */ +static void databuf_lo_before_commit(struct gfs2_sbd *sdp) +{ + LIST_HEAD(started); + struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt; + struct buffer_head *bh = NULL; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + struct gfs2_log_descriptor *ld; + unsigned int limit; + unsigned int total_dbuf = sdp->sd_log_num_databuf; + unsigned int total_jdata = sdp->sd_log_num_jdata; + unsigned int num, n; + __be64 *ptr = NULL; + + offset += 2*sizeof(__be64) - 1; + offset &= ~(2*sizeof(__be64) - 1); + limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64); + + /* + * Start writing ordered buffers, write journaled buffers + * into the log along with a header + */ + gfs2_log_lock(sdp); + bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, + bd_le.le_list); + while(total_dbuf) { + num = total_jdata; + if (num > limit) + num = limit; + n = 0; + list_for_each_entry_safe_continue(bd1, bdt, + &sdp->sd_log_le_databuf, + bd_le.le_list) { + /* An ordered write buffer */ + if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) { + list_move(&bd1->bd_le.le_list, &started); + if (bd1 == bd2) { + bd2 = NULL; + bd2 = list_prepare_entry(bd2, + &sdp->sd_log_le_databuf, + bd_le.le_list); + } + total_dbuf--; + if (bd1->bd_bh) { + get_bh(bd1->bd_bh); + if (buffer_dirty(bd1->bd_bh)) { + gfs2_log_unlock(sdp); + wait_on_buffer(bd1->bd_bh); + ll_rw_block(WRITE, 1, + &bd1->bd_bh); + gfs2_log_lock(sdp); + } + brelse(bd1->bd_bh); + continue; + } + continue; + } else if (bd1->bd_bh) { /* A journaled buffer */ + int magic; + gfs2_log_unlock(sdp); + if (!bh) { + bh = gfs2_log_get_buf(sdp); + sdp->sd_log_num_hdrs++; + ld = (struct gfs2_log_descriptor *) + bh->b_data; + ptr = (__be64 *)(bh->b_data + offset); + ld->ld_header.mh_magic = + cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = + cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = + cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = + cpu_to_be32(GFS2_LOG_DESC_JDATA); + ld->ld_length = cpu_to_be32(num + 1); + ld->ld_data1 = cpu_to_be32(num); + ld->ld_data2 = cpu_to_be32(0); + memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved)); + } + magic = gfs2_check_magic(bd1->bd_bh); + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + *ptr++ = cpu_to_be64((__u64)magic); + clear_buffer_escaped(bd1->bd_bh); + if (unlikely(magic != 0)) + set_buffer_escaped(bd1->bd_bh); + gfs2_log_lock(sdp); + if (n++ > num) + break; + } else if (!bd1->bd_bh) { + total_dbuf--; + sdp->sd_log_num_databuf--; + list_del_init(&bd1->bd_le.le_list); + if (bd1 == bd2) { + bd2 = NULL; + bd2 = list_prepare_entry(bd2, + &sdp->sd_log_le_databuf, + bd_le.le_list); + } + kmem_cache_free(gfs2_bufdata_cachep, bd1); + } + } + gfs2_log_unlock(sdp); + if (bh) { + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + bh = NULL; + } + n = 0; + gfs2_log_lock(sdp); + list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, + bd_le.le_list) { + if (!bd2->bd_bh) + continue; + /* copy buffer if it needs escaping */ + gfs2_log_unlock(sdp); + if (unlikely(buffer_escaped(bd2->bd_bh))) { + void *kaddr; + struct page *page = bd2->bd_bh->b_page; + bh = gfs2_log_get_buf(sdp); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(bh->b_data, + kaddr + bh_offset(bd2->bd_bh), + sdp->sd_sb.sb_bsize); + kunmap_atomic(page, KM_USER0); + *(__be32 *)bh->b_data = 0; + } else { + bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); + } + set_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + gfs2_log_lock(sdp); + if (++n >= num) + break; + } + bh = NULL; + total_dbuf -= num; + total_jdata -= num; + } + gfs2_log_unlock(sdp); + + /* Wait on all ordered buffers */ + while (!list_empty(&started)) { + gfs2_log_lock(sdp); + bd1 = list_entry(started.next, struct gfs2_bufdata, + bd_le.le_list); + list_del_init(&bd1->bd_le.le_list); + sdp->sd_log_num_databuf--; + bh = bd1->bd_bh; + if (bh) { + bh->b_private = NULL; + get_bh(bh); + gfs2_log_unlock(sdp); + wait_on_buffer(bh); + brelse(bh); + } else + gfs2_log_unlock(sdp); + + kmem_cache_free(gfs2_bufdata_cachep, bd1); + } + + /* We've removed all the ordered write bufs here, so only jdata left */ + gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata); +} + +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + u64 esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(sdp, &start); + for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + esc = be64_to_cpu(*ptr++); + + sdp->sd_found_blocks++; + + if (gfs2_revoke_check(sdp, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + if (error) + break; + + sdp->sd_replayed_blocks++; + } + + return error; +} + +/* FIXME: sort out accounting for log blocks etc. */ + +static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_meta_sync(ip->i_gl); + return; + } + if (pass != 1) + return; + + /* data sync? */ + gfs2_meta_sync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", + jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); +} + +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_databuf; + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + sdp->sd_log_num_databuf--; + sdp->sd_log_num_jdata--; + gfs2_unpin(sdp, bd->bd_bh, ai); + } + gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); + gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata); +} + + +const struct gfs2_log_operations gfs2_glock_lops = { + .lo_add = glock_lo_add, + .lo_after_commit = glock_lo_after_commit, + .lo_name = "glock", +}; + +const struct gfs2_log_operations gfs2_buf_lops = { + .lo_add = buf_lo_add, + .lo_incore_commit = buf_lo_incore_commit, + .lo_before_commit = buf_lo_before_commit, + .lo_after_commit = buf_lo_after_commit, + .lo_before_scan = buf_lo_before_scan, + .lo_scan_elements = buf_lo_scan_elements, + .lo_after_scan = buf_lo_after_scan, + .lo_name = "buf", +}; + +const struct gfs2_log_operations gfs2_revoke_lops = { + .lo_add = revoke_lo_add, + .lo_before_commit = revoke_lo_before_commit, + .lo_before_scan = revoke_lo_before_scan, + .lo_scan_elements = revoke_lo_scan_elements, + .lo_after_scan = revoke_lo_after_scan, + .lo_name = "revoke", +}; + +const struct gfs2_log_operations gfs2_rg_lops = { + .lo_add = rg_lo_add, + .lo_after_commit = rg_lo_after_commit, + .lo_name = "rg", +}; + +const struct gfs2_log_operations gfs2_databuf_lops = { + .lo_add = databuf_lo_add, + .lo_incore_commit = buf_lo_incore_commit, + .lo_before_commit = databuf_lo_before_commit, + .lo_after_commit = databuf_lo_after_commit, + .lo_scan_elements = databuf_lo_scan_elements, + .lo_after_scan = databuf_lo_after_scan, + .lo_name = "databuf", +}; + +const struct gfs2_log_operations *gfs2_log_ops[] = { + &gfs2_glock_lops, + &gfs2_buf_lops, + &gfs2_revoke_lops, + &gfs2_rg_lops, + &gfs2_databuf_lops, + NULL, +}; + diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h new file mode 100644 index 000000000000..5839c05ae6be --- /dev/null +++ b/fs/gfs2/lops.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __LOPS_DOT_H__ +#define __LOPS_DOT_H__ + +#include <linux/list.h> +#include "incore.h" + +extern const struct gfs2_log_operations gfs2_glock_lops; +extern const struct gfs2_log_operations gfs2_buf_lops; +extern const struct gfs2_log_operations gfs2_revoke_lops; +extern const struct gfs2_log_operations gfs2_rg_lops; +extern const struct gfs2_log_operations gfs2_databuf_lops; + +extern const struct gfs2_log_operations *gfs2_log_ops[]; + +static inline void lops_init_le(struct gfs2_log_element *le, + const struct gfs2_log_operations *lops) +{ + INIT_LIST_HEAD(&le->le_list); + le->le_ops = lops; +} + +static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) +{ + if (le->le_ops->lo_add) + le->le_ops->lo_add(sdp, le); +} + +static inline void lops_incore_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_incore_commit) + gfs2_log_ops[x]->lo_incore_commit(sdp, tr); +} + +static inline void lops_before_commit(struct gfs2_sbd *sdp) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_commit) + gfs2_log_ops[x]->lo_before_commit(sdp); +} + +static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_after_commit) + gfs2_log_ops[x]->lo_after_commit(sdp, ai); +} + +static inline void lops_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header *head, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_before_scan(jd, head, pass); +} + +static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, + unsigned int pass) +{ + int x, error; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_scan_elements) { + error = gfs2_log_ops[x]->lo_scan_elements(jd, start, + ld, ptr, pass); + if (error) + return error; + } + + return 0; +} + +static inline void lops_after_scan(struct gfs2_jdesc *jd, int error, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_after_scan(jd, error, pass); +} + +#endif /* __LOPS_DOT_H__ */ + diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c new file mode 100644 index 000000000000..21508a13bb78 --- /dev/null +++ b/fs/gfs2/main.c @@ -0,0 +1,150 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> +#include <asm/atomic.h> + +#include "gfs2.h" +#include "incore.h" +#include "ops_fstype.h" +#include "sys.h" +#include "util.h" +#include "glock.h" + +static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct gfs2_inode *ip = foo; + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + inode_init_once(&ip->i_inode); + spin_lock_init(&ip->i_spin); + init_rwsem(&ip->i_rw_mutex); + memset(ip->i_cache, 0, sizeof(ip->i_cache)); + } +} + +static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct gfs2_glock *gl = foo; + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + INIT_HLIST_NODE(&gl->gl_list); + spin_lock_init(&gl->gl_spin); + INIT_LIST_HEAD(&gl->gl_holders); + INIT_LIST_HEAD(&gl->gl_waiters1); + INIT_LIST_HEAD(&gl->gl_waiters2); + INIT_LIST_HEAD(&gl->gl_waiters3); + gl->gl_lvb = NULL; + atomic_set(&gl->gl_lvb_count, 0); + INIT_LIST_HEAD(&gl->gl_reclaim); + INIT_LIST_HEAD(&gl->gl_ail_list); + atomic_set(&gl->gl_ail_count, 0); + } +} + +/** + * init_gfs2_fs - Register GFS2 as a filesystem + * + * Returns: 0 on success, error code on failure + */ + +static int __init init_gfs2_fs(void) +{ + int error; + + error = gfs2_sys_init(); + if (error) + return error; + + error = gfs2_glock_init(); + if (error) + goto fail; + + error = -ENOMEM; + gfs2_glock_cachep = kmem_cache_create("gfs2_glock", + sizeof(struct gfs2_glock), + 0, 0, + gfs2_init_glock_once, NULL); + if (!gfs2_glock_cachep) + goto fail; + + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", + sizeof(struct gfs2_inode), + 0, (SLAB_RECLAIM_ACCOUNT| + SLAB_PANIC|SLAB_MEM_SPREAD), + gfs2_init_inode_once, NULL); + if (!gfs2_inode_cachep) + goto fail; + + gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", + sizeof(struct gfs2_bufdata), + 0, 0, NULL, NULL); + if (!gfs2_bufdata_cachep) + goto fail; + + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_unregister; + + printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); + + return 0; + +fail_unregister: + unregister_filesystem(&gfs2_fs_type); +fail: + if (gfs2_bufdata_cachep) + kmem_cache_destroy(gfs2_bufdata_cachep); + + if (gfs2_inode_cachep) + kmem_cache_destroy(gfs2_inode_cachep); + + if (gfs2_glock_cachep) + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); + return error; +} + +/** + * exit_gfs2_fs - Unregister the file system + * + */ + +static void __exit exit_gfs2_fs(void) +{ + unregister_filesystem(&gfs2_fs_type); + unregister_filesystem(&gfs2meta_fs_type); + + kmem_cache_destroy(gfs2_bufdata_cachep); + kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); +} + +MODULE_DESCRIPTION("Global File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_gfs2_fs); +module_exit(exit_gfs2_fs); + diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c new file mode 100644 index 000000000000..3912d6a4b1e6 --- /dev/null +++ b/fs/gfs2/meta_io.c @@ -0,0 +1,590 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/delay.h> +#include <linux/bio.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "ops_address.h" + +static int aspace_get_block(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + gfs2_assert_warn(inode->i_sb->s_fs_info, 0); + return -EOPNOTSUPP; +} + +static int gfs2_aspace_writepage(struct page *page, + struct writeback_control *wbc) +{ + return block_write_full_page(page, aspace_get_block, wbc); +} + +static const struct address_space_operations aspace_aops = { + .writepage = gfs2_aspace_writepage, + .releasepage = gfs2_releasepage, +}; + +/** + * gfs2_aspace_get - Create and initialize a struct inode structure + * @sdp: the filesystem the aspace is in + * + * Right now a struct inode is just a struct inode. Maybe Linux + * will supply a more lightweight address space construct (that works) + * in the future. + * + * Make sure pages/buffers in this aspace aren't in high memory. + * + * Returns: the aspace + */ + +struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) +{ + struct inode *aspace; + + aspace = new_inode(sdp->sd_vfs); + if (aspace) { + mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); + aspace->i_mapping->a_ops = &aspace_aops; + aspace->i_size = ~0ULL; + aspace->i_private = NULL; + insert_inode_hash(aspace); + } + return aspace; +} + +void gfs2_aspace_put(struct inode *aspace) +{ + remove_inode_hash(aspace); + iput(aspace); +} + +/** + * gfs2_meta_inval - Invalidate all buffers associated with a glock + * @gl: the glock + * + */ + +void gfs2_meta_inval(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct inode *aspace = gl->gl_aspace; + struct address_space *mapping = gl->gl_aspace->i_mapping; + + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + + atomic_inc(&aspace->i_writecount); + truncate_inode_pages(mapping, 0); + atomic_dec(&aspace->i_writecount); + + gfs2_assert_withdraw(sdp, !mapping->nrpages); +} + +/** + * gfs2_meta_sync - Sync all buffers associated with a glock + * @gl: The glock + * + */ + +void gfs2_meta_sync(struct gfs2_glock *gl) +{ + struct address_space *mapping = gl->gl_aspace->i_mapping; + int error; + + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + + if (error) + gfs2_io_error(gl->gl_sbd); +} + +/** + * getbuf - Get a buffer with a given address space + * @sdp: the filesystem + * @aspace: the address space + * @blkno: the block number (filesystem scope) + * @create: 1 if the buffer should be created + * + * Returns: the buffer + */ + +static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace, + u64 blkno, int create) +{ + struct page *page; + struct buffer_head *bh; + unsigned int shift; + unsigned long index; + unsigned int bufnum; + + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; + index = blkno >> shift; /* convert block to page */ + bufnum = blkno - (index << shift); /* block buf index within page */ + + if (create) { + for (;;) { + page = grab_cache_page(aspace->i_mapping, index); + if (page) + break; + yield(); + } + } else { + page = find_lock_page(aspace->i_mapping, index); + if (!page) + return NULL; + } + + if (!page_has_buffers(page)) + create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + + if (!buffer_mapped(bh)) + map_bh(bh, sdp->sd_vfs, blkno); + + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + return bh; +} + +static void meta_prep_new(struct buffer_head *bh) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); +} + +/** + * gfs2_meta_new - Get a block + * @gl: The glock associated with this block + * @blkno: The block number + * + * Returns: The buffer + */ + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) +{ + struct buffer_head *bh; + bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE); + meta_prep_new(bh); + return bh; +} + +/** + * gfs2_meta_read - Read a block from disk + * @gl: The glock covering the block + * @blkno: The block number + * @flags: flags + * @bhp: the place where the buffer is returned (NULL on failure) + * + * Returns: errno + */ + +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + struct buffer_head **bhp) +{ + *bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE); + if (!buffer_uptodate(*bhp)) + ll_rw_block(READ_META, 1, bhp); + if (flags & DIO_WAIT) { + int error = gfs2_meta_wait(gl->gl_sbd, *bhp); + if (error) { + brelse(*bhp); + return error; + } + } + + return 0; +} + +/** + * gfs2_meta_wait - Reread a block from disk + * @sdp: the filesystem + * @bh: The block to wait for + * + * Returns: errno + */ + +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + return -EIO; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return 0; +} + +/** + * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer + * @gl: the glock the buffer belongs to + * @bh: The buffer to be attached to + * @meta: Flag to indicate whether its metadata or not + */ + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta) +{ + struct gfs2_bufdata *bd; + + if (meta) + lock_page(bh->b_page); + + if (bh->b_private) { + if (meta) + unlock_page(bh->b_page); + return; + } + + bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL), + memset(bd, 0, sizeof(struct gfs2_bufdata)); + bd->bd_bh = bh; + bd->bd_gl = gl; + + INIT_LIST_HEAD(&bd->bd_list_tr); + if (meta) + lops_init_le(&bd->bd_le, &gfs2_buf_lops); + else + lops_init_le(&bd->bd_le, &gfs2_databuf_lops); + bh->b_private = bd; + + if (meta) + unlock_page(bh->b_page); +} + +/** + * gfs2_pin - Pin a buffer in memory + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to be pinned + * + */ + +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd = bh->b_private; + + gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); + + if (test_set_buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + + wait_on_buffer(bh); + + /* If this buffer is in the AIL and it has already been written + to in-place disk block, remove it from the AIL. */ + + gfs2_log_lock(sdp); + if (bd->bd_ail && !buffer_in_io(bh)) + list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); + gfs2_log_unlock(sdp); + + clear_buffer_dirty(bh); + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + + get_bh(bh); +} + +/** + * gfs2_unpin - Unpin a buffer + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to unpin + * @ai: + * + */ + +void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_ail *ai) +{ + struct gfs2_bufdata *bd = bh->b_private; + + gfs2_assert_withdraw(sdp, buffer_uptodate(bh)); + + if (!buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + + mark_buffer_dirty(bh); + clear_buffer_pinned(bh); + + gfs2_log_lock(sdp); + if (bd->bd_ail) { + list_del(&bd->bd_ail_st_list); + brelse(bh); + } else { + struct gfs2_glock *gl = bd->bd_gl; + list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); + atomic_inc(&gl->gl_ail_count); + } + bd->bd_ail = ai; + list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); + gfs2_log_unlock(sdp); +} + +/** + * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore + * @ip: the inode who owns the buffers + * @bstart: the first buffer in the run + * @blen: the number of buffers in the run + * + */ + +void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *aspace = ip->i_gl->gl_aspace; + struct buffer_head *bh; + + while (blen) { + bh = getbuf(sdp, aspace, bstart, NO_CREATE); + if (bh) { + struct gfs2_bufdata *bd = bh->b_private; + + if (test_clear_buffer_pinned(bh)) { + struct gfs2_trans *tr = current->journal_info; + gfs2_log_lock(sdp); + list_del_init(&bd->bd_le.le_list); + gfs2_assert_warn(sdp, sdp->sd_log_num_buf); + sdp->sd_log_num_buf--; + gfs2_log_unlock(sdp); + tr->tr_num_buf_rm++; + brelse(bh); + } + if (bd) { + gfs2_log_lock(sdp); + if (bd->bd_ail) { + u64 blkno = bh->b_blocknr; + bd->bd_ail = NULL; + list_del(&bd->bd_ail_st_list); + list_del(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bh); + gfs2_log_unlock(sdp); + gfs2_trans_add_revoke(sdp, blkno); + } else + gfs2_log_unlock(sdp); + } + + lock_buffer(bh); + clear_buffer_dirty(bh); + clear_buffer_uptodate(bh); + unlock_buffer(bh); + + brelse(bh); + } + + bstart++; + blen--; + } +} + +/** + * gfs2_meta_cache_flush - get rid of any references on buffers for this inode + * @ip: The GFS2 inode + * + * This releases buffers that are in the most-recently-used array of + * blocks used for indirect block addressing for this inode. + */ + +void gfs2_meta_cache_flush(struct gfs2_inode *ip) +{ + struct buffer_head **bh_slot; + unsigned int x; + + spin_lock(&ip->i_spin); + + for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) { + bh_slot = &ip->i_cache[x]; + if (!*bh_slot) + break; + brelse(*bh_slot); + *bh_slot = NULL; + } + + spin_unlock(&ip->i_spin); +} + +/** + * gfs2_meta_indirect_buffer - Get a metadata buffer + * @ip: The GFS2 inode + * @height: The level of this buf in the metadata (indir addr) tree (if any) + * @num: The block number (device relative) of the buffer + * @new: Non-zero if we may create a new buffer + * @bhp: the buffer is returned here + * + * Try to use the gfs2_inode's MRU metadata tree cache. + * + * Returns: errno + */ + +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + int new, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height; + int in_cache = 0; + + spin_lock(&ip->i_spin); + if (*bh_slot && (*bh_slot)->b_blocknr == num) { + bh = *bh_slot; + get_bh(bh); + in_cache = 1; + } + spin_unlock(&ip->i_spin); + + if (!bh) + bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE); + + if (!bh) + return -ENOBUFS; + + if (new) { + if (gfs2_assert_warn(sdp, height)) + goto err; + meta_prep_new(bh); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + } else { + u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + if (!buffer_uptodate(bh)) { + ll_rw_block(READ_META, 1, &bh); + if (gfs2_meta_wait(sdp, bh)) + goto err; + } + if (gfs2_metatype_check(sdp, bh, mtype)) + goto err; + } + + if (!in_cache) { + spin_lock(&ip->i_spin); + if (*bh_slot) + brelse(*bh_slot); + *bh_slot = bh; + get_bh(bh); + spin_unlock(&ip->i_spin); + } + + *bhp = bh; + return 0; +err: + brelse(bh); + return -EIO; +} + +/** + * gfs2_meta_ra - start readahead on an extent of a file + * @gl: the glock the blocks belong to + * @dblock: the starting disk block + * @extlen: the number of blocks in the extent + * + * returns: the first buffer in the extent + */ + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct inode *aspace = gl->gl_aspace; + struct buffer_head *first_bh, *bh; + u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> + sdp->sd_sb.sb_bsize_shift; + + BUG_ON(!extlen); + + if (max_ra < 1) + max_ra = 1; + if (extlen > max_ra) + extlen = max_ra; + + first_bh = getbuf(sdp, aspace, dblock, CREATE); + + if (buffer_uptodate(first_bh)) + goto out; + if (!buffer_locked(first_bh)) + ll_rw_block(READ_META, 1, &first_bh); + + dblock++; + extlen--; + + while (extlen) { + bh = getbuf(sdp, aspace, dblock, CREATE); + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) + ll_rw_block(READA, 1, &bh); + brelse(bh); + dblock++; + extlen--; + if (!buffer_locked(first_bh) && buffer_uptodate(first_bh)) + goto out; + } + + wait_on_buffer(first_bh); +out: + return first_bh; +} + +/** + * gfs2_meta_syncfs - sync all the buffers in a filesystem + * @sdp: the filesystem + * + */ + +void gfs2_meta_syncfs(struct gfs2_sbd *sdp) +{ + gfs2_log_flush(sdp, NULL); + for (;;) { + gfs2_ail1_start(sdp, DIO_ALL); + if (gfs2_ail1_empty(sdp, DIO_ALL)) + break; + msleep(10); + } +} + diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h new file mode 100644 index 000000000000..3ec939e20dff --- /dev/null +++ b/fs/gfs2/meta_io.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __DIO_DOT_H__ +#define __DIO_DOT_H__ + +#include <linux/buffer_head.h> +#include <linux/string.h> +#include "incore.h" + +static inline void gfs2_buffer_clear(struct buffer_head *bh) +{ + memset(bh->b_data, 0, bh->b_size); +} + +static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head) +{ + BUG_ON(head > bh->b_size); + memset(bh->b_data + head, 0, bh->b_size - head); +} + +static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, + int to_head, + struct buffer_head *from_bh, + int from_head) +{ + BUG_ON(from_head < to_head); + memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head, + from_bh->b_size - from_head); + memset(to_bh->b_data + to_bh->b_size + to_head - from_head, + 0, from_head - to_head); +} + +struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp); +void gfs2_aspace_put(struct inode *aspace); + +void gfs2_meta_inval(struct gfs2_glock *gl); +void gfs2_meta_sync(struct gfs2_glock *gl); + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, + int flags, struct buffer_head **bhp); +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); + +void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, + int meta); +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_ail *ai); + +void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); + +void gfs2_meta_cache_flush(struct gfs2_inode *ip); +int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, + int new, struct buffer_head **bhp); + +static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, + struct buffer_head **bhp) +{ + return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp); +} + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); +void gfs2_meta_syncfs(struct gfs2_sbd *sdp); + +#define buffer_busy(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) +#define buffer_in_io(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock))) + +#endif /* __DIO_DOT_H__ */ + diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c new file mode 100644 index 000000000000..ef3092e29607 --- /dev/null +++ b/fs/gfs2/mount.c @@ -0,0 +1,214 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "mount.h" +#include "sys.h" +#include "util.h" + +/** + * gfs2_mount_args - Parse mount options + * @sdp: + * @data: + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) +{ + struct gfs2_args *args = &sdp->sd_args; + char *data = data_arg; + char *options, *o, *v; + int error = 0; + + if (!remount) { + /* If someone preloaded options, use those instead */ + spin_lock(&gfs2_sys_margs_lock); + if (gfs2_sys_margs) { + data = gfs2_sys_margs; + gfs2_sys_margs = NULL; + } + spin_unlock(&gfs2_sys_margs_lock); + + /* Set some defaults */ + args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + } + + /* Split the options into tokens with the "," character and + process them */ + + for (options = data; (o = strsep(&options, ",")); ) { + if (!*o) + continue; + + v = strchr(o, '='); + if (v) + *v++ = 0; + + if (!strcmp(o, "lockproto")) { + if (!v) + goto need_value; + if (remount && strcmp(v, args->ar_lockproto)) + goto cant_remount; + strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN); + args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0; + } + + else if (!strcmp(o, "locktable")) { + if (!v) + goto need_value; + if (remount && strcmp(v, args->ar_locktable)) + goto cant_remount; + strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN); + args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0; + } + + else if (!strcmp(o, "hostdata")) { + if (!v) + goto need_value; + if (remount && strcmp(v, args->ar_hostdata)) + goto cant_remount; + strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN); + args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0; + } + + else if (!strcmp(o, "spectator")) { + if (remount && !args->ar_spectator) + goto cant_remount; + args->ar_spectator = 1; + sdp->sd_vfs->s_flags |= MS_RDONLY; + } + + else if (!strcmp(o, "ignore_local_fs")) { + if (remount && !args->ar_ignore_local_fs) + goto cant_remount; + args->ar_ignore_local_fs = 1; + } + + else if (!strcmp(o, "localflocks")) { + if (remount && !args->ar_localflocks) + goto cant_remount; + args->ar_localflocks = 1; + } + + else if (!strcmp(o, "localcaching")) { + if (remount && !args->ar_localcaching) + goto cant_remount; + args->ar_localcaching = 1; + } + + else if (!strcmp(o, "debug")) + args->ar_debug = 1; + + else if (!strcmp(o, "nodebug")) + args->ar_debug = 0; + + else if (!strcmp(o, "upgrade")) { + if (remount && !args->ar_upgrade) + goto cant_remount; + args->ar_upgrade = 1; + } + + else if (!strcmp(o, "num_glockd")) { + unsigned int x; + if (!v) + goto need_value; + sscanf(v, "%u", &x); + if (remount && x != args->ar_num_glockd) + goto cant_remount; + if (!x || x > GFS2_GLOCKD_MAX) { + fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", + GFS2_GLOCKD_MAX, x); + error = -EINVAL; + break; + } + args->ar_num_glockd = x; + } + + else if (!strcmp(o, "acl")) { + args->ar_posix_acl = 1; + sdp->sd_vfs->s_flags |= MS_POSIXACL; + } + + else if (!strcmp(o, "noacl")) { + args->ar_posix_acl = 0; + sdp->sd_vfs->s_flags &= ~MS_POSIXACL; + } + + else if (!strcmp(o, "quota")) { + if (!v) + goto need_value; + if (!strcmp(v, "off")) + args->ar_quota = GFS2_QUOTA_OFF; + else if (!strcmp(v, "account")) + args->ar_quota = GFS2_QUOTA_ACCOUNT; + else if (!strcmp(v, "on")) + args->ar_quota = GFS2_QUOTA_ON; + else { + fs_info(sdp, "invalid value for quota\n"); + error = -EINVAL; + break; + } + } + + else if (!strcmp(o, "suiddir")) + args->ar_suiddir = 1; + + else if (!strcmp(o, "nosuiddir")) + args->ar_suiddir = 0; + + else if (!strcmp(o, "data")) { + if (!v) + goto need_value; + if (!strcmp(v, "writeback")) + args->ar_data = GFS2_DATA_WRITEBACK; + else if (!strcmp(v, "ordered")) + args->ar_data = GFS2_DATA_ORDERED; + else { + fs_info(sdp, "invalid value for data\n"); + error = -EINVAL; + break; + } + } + + else { + fs_info(sdp, "unknown option: %s\n", o); + error = -EINVAL; + break; + } + } + + if (error) + fs_info(sdp, "invalid mount option(s)\n"); + + if (data != data_arg) + kfree(data); + + return error; + +need_value: + fs_info(sdp, "need value for option %s\n", o); + return -EINVAL; + +cant_remount: + fs_info(sdp, "can't remount with option %s\n", o); + return -EINVAL; +} + diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h new file mode 100644 index 000000000000..401288acfdf3 --- /dev/null +++ b/fs/gfs2/mount.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __MOUNT_DOT_H__ +#define __MOUNT_DOT_H__ + +struct gfs2_sbd; + +int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount); + +#endif /* __MOUNT_DOT_H__ */ diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c new file mode 100644 index 000000000000..1025960b0e6e --- /dev/null +++ b/fs/gfs2/ondisk.c @@ -0,0 +1,308 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> + +#include "gfs2.h" +#include <linux/gfs2_ondisk.h> + +#define pv(struct, member, fmt) printk(KERN_INFO " "#member" = "fmt"\n", \ + struct->member); + +/* + * gfs2_xxx_in - read in an xxx struct + * first arg: the cpu-order structure + * buf: the disk-order buffer + * + * gfs2_xxx_out - write out an xxx struct + * first arg: the cpu-order structure + * buf: the disk-order buffer + * + * gfs2_xxx_print - print out an xxx struct + * first arg: the cpu-order structure + */ + +void gfs2_inum_in(struct gfs2_inum *no, const void *buf) +{ + const struct gfs2_inum *str = buf; + + no->no_formal_ino = be64_to_cpu(str->no_formal_ino); + no->no_addr = be64_to_cpu(str->no_addr); +} + +void gfs2_inum_out(const struct gfs2_inum *no, void *buf) +{ + struct gfs2_inum *str = buf; + + str->no_formal_ino = cpu_to_be64(no->no_formal_ino); + str->no_addr = cpu_to_be64(no->no_addr); +} + +static void gfs2_inum_print(const struct gfs2_inum *no) +{ + printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino); + printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)no->no_addr); +} + +static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf) +{ + const struct gfs2_meta_header *str = buf; + + mh->mh_magic = be32_to_cpu(str->mh_magic); + mh->mh_type = be32_to_cpu(str->mh_type); + mh->mh_format = be32_to_cpu(str->mh_format); +} + +static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf) +{ + struct gfs2_meta_header *str = buf; + + str->mh_magic = cpu_to_be32(mh->mh_magic); + str->mh_type = cpu_to_be32(mh->mh_type); + str->mh_format = cpu_to_be32(mh->mh_format); +} + +static void gfs2_meta_header_print(const struct gfs2_meta_header *mh) +{ + pv(mh, mh_magic, "0x%.8X"); + pv(mh, mh_type, "%u"); + pv(mh, mh_format, "%u"); +} + +void gfs2_sb_in(struct gfs2_sb *sb, const void *buf) +{ + const struct gfs2_sb *str = buf; + + gfs2_meta_header_in(&sb->sb_header, buf); + + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + + gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir); + gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); +} + +void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf) +{ + const struct gfs2_rindex *str = buf; + + ri->ri_addr = be64_to_cpu(str->ri_addr); + ri->ri_length = be32_to_cpu(str->ri_length); + ri->ri_data0 = be64_to_cpu(str->ri_data0); + ri->ri_data = be32_to_cpu(str->ri_data); + ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes); + +} + +void gfs2_rindex_print(const struct gfs2_rindex *ri) +{ + printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)ri->ri_addr); + pv(ri, ri_length, "%u"); + + printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)ri->ri_data0); + pv(ri, ri_data, "%u"); + + pv(ri, ri_bitbytes, "%u"); +} + +void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf) +{ + const struct gfs2_rgrp *str = buf; + + gfs2_meta_header_in(&rg->rg_header, buf); + rg->rg_flags = be32_to_cpu(str->rg_flags); + rg->rg_free = be32_to_cpu(str->rg_free); + rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); + rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); +} + +void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf) +{ + struct gfs2_rgrp *str = buf; + + gfs2_meta_header_out(&rg->rg_header, buf); + str->rg_flags = cpu_to_be32(rg->rg_flags); + str->rg_free = cpu_to_be32(rg->rg_free); + str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); + str->__pad = cpu_to_be32(0); + str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); + memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); +} + +void gfs2_quota_in(struct gfs2_quota *qu, const void *buf) +{ + const struct gfs2_quota *str = buf; + + qu->qu_limit = be64_to_cpu(str->qu_limit); + qu->qu_warn = be64_to_cpu(str->qu_warn); + qu->qu_value = be64_to_cpu(str->qu_value); +} + +void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf) +{ + const struct gfs2_dinode *str = buf; + + gfs2_meta_header_in(&di->di_header, buf); + gfs2_inum_in(&di->di_num, &str->di_num); + + di->di_mode = be32_to_cpu(str->di_mode); + di->di_uid = be32_to_cpu(str->di_uid); + di->di_gid = be32_to_cpu(str->di_gid); + di->di_nlink = be32_to_cpu(str->di_nlink); + di->di_size = be64_to_cpu(str->di_size); + di->di_blocks = be64_to_cpu(str->di_blocks); + di->di_atime = be64_to_cpu(str->di_atime); + di->di_mtime = be64_to_cpu(str->di_mtime); + di->di_ctime = be64_to_cpu(str->di_ctime); + di->di_major = be32_to_cpu(str->di_major); + di->di_minor = be32_to_cpu(str->di_minor); + + di->di_goal_meta = be64_to_cpu(str->di_goal_meta); + di->di_goal_data = be64_to_cpu(str->di_goal_data); + di->di_generation = be64_to_cpu(str->di_generation); + + di->di_flags = be32_to_cpu(str->di_flags); + di->di_payload_format = be32_to_cpu(str->di_payload_format); + di->di_height = be16_to_cpu(str->di_height); + + di->di_depth = be16_to_cpu(str->di_depth); + di->di_entries = be32_to_cpu(str->di_entries); + + di->di_eattr = be64_to_cpu(str->di_eattr); + +} + +void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf) +{ + struct gfs2_dinode *str = buf; + + gfs2_meta_header_out(&di->di_header, buf); + gfs2_inum_out(&di->di_num, (char *)&str->di_num); + + str->di_mode = cpu_to_be32(di->di_mode); + str->di_uid = cpu_to_be32(di->di_uid); + str->di_gid = cpu_to_be32(di->di_gid); + str->di_nlink = cpu_to_be32(di->di_nlink); + str->di_size = cpu_to_be64(di->di_size); + str->di_blocks = cpu_to_be64(di->di_blocks); + str->di_atime = cpu_to_be64(di->di_atime); + str->di_mtime = cpu_to_be64(di->di_mtime); + str->di_ctime = cpu_to_be64(di->di_ctime); + str->di_major = cpu_to_be32(di->di_major); + str->di_minor = cpu_to_be32(di->di_minor); + + str->di_goal_meta = cpu_to_be64(di->di_goal_meta); + str->di_goal_data = cpu_to_be64(di->di_goal_data); + str->di_generation = cpu_to_be64(di->di_generation); + + str->di_flags = cpu_to_be32(di->di_flags); + str->di_payload_format = cpu_to_be32(di->di_payload_format); + str->di_height = cpu_to_be16(di->di_height); + + str->di_depth = cpu_to_be16(di->di_depth); + str->di_entries = cpu_to_be32(di->di_entries); + + str->di_eattr = cpu_to_be64(di->di_eattr); + +} + +void gfs2_dinode_print(const struct gfs2_dinode *di) +{ + gfs2_meta_header_print(&di->di_header); + gfs2_inum_print(&di->di_num); + + pv(di, di_mode, "0%o"); + pv(di, di_uid, "%u"); + pv(di, di_gid, "%u"); + pv(di, di_nlink, "%u"); + printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); + printk(KERN_INFO " di_blocks = %llu\n", (unsigned long long)di->di_blocks); + printk(KERN_INFO " di_atime = %lld\n", (long long)di->di_atime); + printk(KERN_INFO " di_mtime = %lld\n", (long long)di->di_mtime); + printk(KERN_INFO " di_ctime = %lld\n", (long long)di->di_ctime); + pv(di, di_major, "%u"); + pv(di, di_minor, "%u"); + + printk(KERN_INFO " di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta); + printk(KERN_INFO " di_goal_data = %llu\n", (unsigned long long)di->di_goal_data); + + pv(di, di_flags, "0x%.8X"); + pv(di, di_payload_format, "%u"); + pv(di, di_height, "%u"); + + pv(di, di_depth, "%u"); + pv(di, di_entries, "%u"); + + printk(KERN_INFO " di_eattr = %llu\n", (unsigned long long)di->di_eattr); +} + +void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf) +{ + const struct gfs2_log_header *str = buf; + + gfs2_meta_header_in(&lh->lh_header, buf); + lh->lh_sequence = be64_to_cpu(str->lh_sequence); + lh->lh_flags = be32_to_cpu(str->lh_flags); + lh->lh_tail = be32_to_cpu(str->lh_tail); + lh->lh_blkno = be32_to_cpu(str->lh_blkno); + lh->lh_hash = be32_to_cpu(str->lh_hash); +} + +void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf) +{ + const struct gfs2_inum_range *str = buf; + + ir->ir_start = be64_to_cpu(str->ir_start); + ir->ir_length = be64_to_cpu(str->ir_length); +} + +void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf) +{ + struct gfs2_inum_range *str = buf; + + str->ir_start = cpu_to_be64(ir->ir_start); + str->ir_length = cpu_to_be64(ir->ir_length); +} + +void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf) +{ + const struct gfs2_statfs_change *str = buf; + + sc->sc_total = be64_to_cpu(str->sc_total); + sc->sc_free = be64_to_cpu(str->sc_free); + sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); +} + +void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf) +{ + struct gfs2_statfs_change *str = buf; + + str->sc_total = cpu_to_be64(sc->sc_total); + str->sc_free = cpu_to_be64(sc->sc_free); + str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); +} + +void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf) +{ + const struct gfs2_quota_change *str = buf; + + qc->qc_change = be64_to_cpu(str->qc_change); + qc->qc_flags = be32_to_cpu(str->qc_flags); + qc->qc_id = be32_to_cpu(str->qc_id); +} + diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c new file mode 100644 index 000000000000..4fb743f4e4a4 --- /dev/null +++ b/fs/gfs2/ops_address.c @@ -0,0 +1,790 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/fs.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "ops_address.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "ops_file.h" +#include "util.h" +#include "glops.h" + + +static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) +{ + struct buffer_head *head = page_buffers(page); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + unsigned int start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from || start >= to) + continue; + gfs2_trans_add_bh(ip->i_gl, bh, 0); + } +} + +/** + * gfs2_get_block - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +int gfs2_get_block(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + return gfs2_block_map(inode, lblock, create, bh_result, 32); +} + +/** + * gfs2_get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int error; + + error = gfs2_block_map(inode, lblock, 0, bh_result, 1); + if (error) + return error; + if (bh_result->b_blocknr == 0) + return -EIO; + return 0; +} + +static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + return gfs2_block_map(inode, lblock, 0, bh_result, 32); +} + +/** + * gfs2_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + * Some of this is copied from block_write_full_page() although we still + * call it to do most of the work. + */ + +static int gfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + int error; + int done_trans = 0; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) { + unlock_page(page); + return -EIO; + } + if (current->journal_info) + goto out_ignore; + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; /* don't care */ + } + + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) { + error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (error) + goto out_ignore; + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + done_trans = 1; + } + error = block_write_full_page(page, gfs2_get_block_noalloc, wbc); + if (done_trans) + gfs2_trans_end(sdp); + gfs2_meta_cache_flush(ip); + return error; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +static int zero_readpage(struct page *page) +{ + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(page, KM_USER0); + + SetPageUptodate(page); + + return 0; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ + +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + void *kaddr; + int error; + + /* Only the first page of a stuffed file might contain data */ + if (unlikely(page->index)) + return zero_readpage(page); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), + ip->i_di.di_size); + memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); + kunmap_atomic(page, KM_USER0); + + brelse(dibh); + + SetPageUptodate(page); + + return 0; +} + + +/** + * gfs2_readpage - readpage with locking + * @file: The file to read a page for. N.B. This may be NULL if we are + * reading an internal file. + * @page: The page to read + * + * Returns: errno + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct gfs2_file *gf = NULL; + struct gfs2_holder gh; + int error; + int do_unlock = 0; + + if (likely(file != &gfs2_internal_file_sentinel)) { + if (file) { + gf = file->private_data; + if (test_bit(GFF_EXLOCK, &gf->f_flags)) + /* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */ + goto skip_lock; + } + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh); + do_unlock = 1; + error = gfs2_glock_nq_m_atime(1, &gh); + if (unlikely(error)) + goto out_unlock; + } + +skip_lock: + if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, page); + unlock_page(page); + } else + error = mpage_readpage(page, gfs2_get_block); + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + error = -EIO; + + if (do_unlock) { + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + } +out: + return error; +out_unlock: + unlock_page(page); + if (do_unlock) + gfs2_holder_uninit(&gh); + goto out; +} + +/** + * gfs2_readpages - Read a bunch of pages at once + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore any things + * which are slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. Its + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We have to handle stuffed files here too. + * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places. + * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as + * well as read-ahead. + */ +static int gfs2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + unsigned page_idx; + int ret; + int do_unlock = 0; + + if (likely(file != &gfs2_internal_file_sentinel)) { + if (file) { + struct gfs2_file *gf = file->private_data; + if (test_bit(GFF_EXLOCK, &gf->f_flags)) + goto skip_lock; + } + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh); + do_unlock = 1; + ret = gfs2_glock_nq_m_atime(1, &gh); + if (ret == GLR_TRYFAILED) + goto out_noerror; + if (unlikely(ret)) + goto out_unlock; + } +skip_lock: + if (gfs2_is_stuffed(ip)) { + struct pagevec lru_pvec; + pagevec_init(&lru_pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + prefetchw(&page->flags); + list_del(&page->lru); + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + ret = stuffed_readpage(ip, page); + unlock_page(page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else { + page_cache_release(page); + } + } + pagevec_lru_add(&lru_pvec); + ret = 0; + } else { + /* What we really want to do .... */ + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block); + } + + if (do_unlock) { + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + } +out: + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = -EIO; + return ret; +out_noerror: + ret = 0; +out_unlock: + /* unlock all pages, we can't do any I/O right now */ + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + unlock_page(page); + page_cache_release(page); + } + if (do_unlock) + gfs2_holder_uninit(&gh); + goto out; +} + +/** + * gfs2_prepare_write - Prepare to write a page to a file + * @file: The file to write to + * @page: The page which is to be prepared for writing + * @from: From (byte range within page) + * @to: To (byte range within page) + * + * Returns: errno + */ + +static int gfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + unsigned int data_blocks, ind_blocks, rblocks; + int alloc_required; + int error = 0; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from; + loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + struct gfs2_alloc *al; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh); + error = gfs2_glock_nq_m_atime(1, &ip->i_gh); + if (error) + goto out_uninit; + + gfs2_write_calc_reserv(ip, to - from, &data_blocks, &ind_blocks); + + error = gfs2_write_alloc_required(ip, pos, from - to, &alloc_required); + if (error) + goto out_unlock; + + + if (alloc_required) { + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc_put; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_qunlock; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + + error = gfs2_trans_begin(sdp, rblocks, 0); + if (error) + goto out; + + if (gfs2_is_stuffed(ip)) { + if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) + error = stuffed_readpage(ip, page); + goto out; + } + +prepare_write: + error = block_prepare_write(page, from, to, gfs2_get_block); + +out: + if (error) { + gfs2_trans_end(sdp); + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + gfs2_glock_dq_m(1, &ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + } + + return error; +} + +/** + * gfs2_commit_write - Commit write to a file + * @file: The file to write to + * @page: The page containing the data + * @from: From (byte range within page) + * @to: To (byte range within page) + * + * Returns: errno + */ + +static int gfs2_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + int error = -EOPNOTSUPP; + struct buffer_head *dibh; + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_dinode *di; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl))) + goto fail_nounlock; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto fail_endtrans; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di = (struct gfs2_dinode *)dibh->b_data; + + if (gfs2_is_stuffed(ip)) { + u64 file_size; + void *kaddr; + + file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from, + kaddr + from, to - from); + kunmap_atomic(page, KM_USER0); + + SetPageUptodate(page); + + if (inode->i_size < file_size) + i_size_write(inode, file_size); + } else { + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || + gfs2_is_jdata(ip)) + gfs2_page_add_databufs(ip, page, from, to); + error = generic_commit_write(file, page, from, to); + if (error) + goto fail; + } + + if (ip->i_di.di_size < inode->i_size) { + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); + } + + di->di_mode = cpu_to_be32(inode->i_mode); + di->di_atime = cpu_to_be64(inode->i_atime.tv_sec); + di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); + di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec); + + brelse(dibh); + gfs2_trans_end(sdp); + if (al->al_requested) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return 0; + +fail: + brelse(dibh); +fail_endtrans: + gfs2_trans_end(sdp); + if (al->al_requested) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); +fail_nounlock: + ClearPageUptodate(page); + return error; +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = generic_block_bmap(mapping, lblock, gfs2_get_block); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + bd->bd_bh = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + unlock_buffer(bh); +} + +static void gfs2_invalidatepage(struct page *page, unsigned long offset) +{ + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + if (offset <= curr_off) + discard_buffer(sdp, bh); + + curr_off = next_off; + bh = next; + } while (bh != head); + + if (!offset) + try_to_release_page(page, 0); + + return; +} + +static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int rv; + + if (rw == READ) + mutex_lock(&inode->i_mutex); + /* + * Shared lock, even if its a write, since we do no allocation + * on this path. All we need change is atime. + */ + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + rv = gfs2_glock_nq_m_atime(1, &gh); + if (rv) + goto out; + + if (offset > i_size_read(inode)) + goto out; + + /* + * Should we return an error here? I can't see that O_DIRECT for + * a journaled file makes any sense. For now we'll silently fall + * back to buffered I/O, likewise we do the same for stuffed + * files since they are (a) small and (b) unaligned. + */ + if (gfs2_is_jdata(ip)) + goto out; + + if (gfs2_is_stuffed(ip)) + goto out; + + rv = blockdev_direct_IO_own_locking(rw, iocb, inode, + inode->i_sb->s_bdev, + iov, offset, nr_segs, + gfs2_get_block_direct, NULL); +out: + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + if (rw == READ) + mutex_unlock(&inode->i_mutex); + + return rv; +} + +/** + * stuck_releasepage - We're stuck in gfs2_releasepage(). Print stuff out. + * @bh: the buffer we're stuck on + * + */ + +static void stuck_releasepage(struct buffer_head *bh) +{ + struct inode *inode = bh->b_page->mapping->host; + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_bufdata *bd = bh->b_private; + struct gfs2_glock *gl; +static unsigned limit = 0; + + if (limit > 3) + return; + limit++; + + fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode); + fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n", + (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count)); + fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh)); + fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL"); + + if (!bd) + return; + + gl = bd->bd_gl; + + fs_warn(sdp, "gl = (%u, %llu)\n", + gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); + + fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n", + (list_empty(&bd->bd_list_tr)) ? "no" : "yes", + (list_empty(&bd->bd_le.le_list)) ? "no" : "yes"); + + if (gl->gl_ops == &gfs2_inode_glops) { + struct gfs2_inode *ip = gl->gl_object; + unsigned int x; + + if (!ip) + return; + + fs_warn(sdp, "ip = %llu %llu\n", + (unsigned long long)ip->i_num.no_formal_ino, + (unsigned long long)ip->i_num.no_addr); + + for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) + fs_warn(sdp, "ip->i_cache[%u] = %s\n", + x, (ip->i_cache[x]) ? "!NULL" : "NULL"); + } +} + +/** + * gfs2_releasepage - free the metadata associated with a page + * @page: the page that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Call try_to_free_buffers() if the buffers in this page can be + * released. + * + * Returns: 0 + */ + +int gfs2_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct inode *aspace = page->mapping->host; + struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ; + + if (!page_has_buffers(page)) + goto out; + + head = bh = page_buffers(page); + do { + while (atomic_read(&bh->b_count)) { + if (!atomic_read(&aspace->i_writecount)) + return 0; + + if (time_after_eq(jiffies, t)) { + stuck_releasepage(bh); + /* should we withdraw here? */ + return 0; + } + + yield(); + } + + gfs2_assert_warn(sdp, !buffer_pinned(bh)); + gfs2_assert_warn(sdp, !buffer_dirty(bh)); + + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); + gfs2_assert_warn(sdp, !bd->bd_ail); + bd->bd_bh = NULL; + if (!list_empty(&bd->bd_le.le_list)) + bd = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + if (bd) + kmem_cache_free(gfs2_bufdata_cachep, bd); + + bh = bh->b_this_page; + } while (bh != head); + +out: + return try_to_free_buffers(page); +} + +const struct address_space_operations gfs2_file_aops = { + .writepage = gfs2_writepage, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .prepare_write = gfs2_prepare_write, + .commit_write = gfs2_commit_write, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, +}; + diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h new file mode 100644 index 000000000000..35aaee4aa7e1 --- /dev/null +++ b/fs/gfs2/ops_address.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_ADDRESS_DOT_H__ +#define __OPS_ADDRESS_DOT_H__ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> + +extern const struct address_space_operations gfs2_file_aops; +extern int gfs2_get_block(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create); +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); + +#endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c new file mode 100644 index 000000000000..00041b1b8025 --- /dev/null +++ b/fs/gfs2/ops_dentry.c @@ -0,0 +1,119 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/smp_lock.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "ops_dentry.h" +#include "util.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @nd: + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. + * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent = dget_parent(dentry); + struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); + struct gfs2_inode *dip = GFS2_I(parent->d_inode); + struct inode *inode = dentry->d_inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip; + struct gfs2_inum inum; + unsigned int type; + int error; + + if (inode && is_bad_inode(inode)) + goto invalid; + + if (sdp->sd_args.ar_localcaching) + goto valid; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + + error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type); + switch (error) { + case 0: + if (!inode) + goto invalid_gunlock; + break; + case -ENOENT: + if (!inode) + goto valid_gunlock; + goto invalid_gunlock; + default: + goto fail_gunlock; + } + + ip = GFS2_I(inode); + + if (!gfs2_inum_equal(&ip->i_num, &inum)) + goto invalid_gunlock; + + if (IF2DT(ip->i_di.di_mode) != type) { + gfs2_consist_inode(dip); + goto fail_gunlock; + } + +valid_gunlock: + gfs2_glock_dq_uninit(&d_gh); +valid: + dput(parent); + return 1; + +invalid_gunlock: + gfs2_glock_dq_uninit(&d_gh); +invalid: + if (inode && S_ISDIR(inode->i_mode)) { + if (have_submounts(dentry)) + goto valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + dput(parent); + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&d_gh); +fail: + dput(parent); + return 0; +} + +static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, +}; + diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h new file mode 100644 index 000000000000..5caa3db4d3f5 --- /dev/null +++ b/fs/gfs2/ops_dentry.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_DENTRY_DOT_H__ +#define __OPS_DENTRY_DOT_H__ + +#include <linux/dcache.h> + +extern struct dentry_operations gfs2_dops; + +#endif /* __OPS_DENTRY_DOT_H__ */ diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c new file mode 100644 index 000000000000..86127d93bd35 --- /dev/null +++ b/fs/gfs2/ops_export.c @@ -0,0 +1,298 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "ops_export.h" +#include "rgrp.h" +#include "util.h" + +static struct dentry *gfs2_decode_fh(struct super_block *sb, + __u32 *fh, + int fh_len, + int fh_type, + int (*acceptable)(void *context, + struct dentry *dentry), + void *context) +{ + struct gfs2_fh_obj fh_obj; + struct gfs2_inum *this, parent; + + if (fh_type != fh_len) + return NULL; + + this = &fh_obj.this; + fh_obj.imode = DT_UNKNOWN; + memset(&parent, 0, sizeof(struct gfs2_inum)); + + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + fh_obj.imode = be32_to_cpu(fh[8]); + case GFS2_SMALL_FH_SIZE: + this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this->no_formal_ino |= be32_to_cpu(fh[1]); + this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + this->no_addr |= be32_to_cpu(fh[3]); + break; + default: + return NULL; + } + + return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent, + acceptable, context); +} + +static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (*len < GFS2_SMALL_FH_SIZE || + (connectable && *len < GFS2_LARGE_FH_SIZE)) + return 255; + + fh[0] = ip->i_num.no_formal_ino >> 32; + fh[0] = cpu_to_be32(fh[0]); + fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF; + fh[1] = cpu_to_be32(fh[1]); + fh[2] = ip->i_num.no_addr >> 32; + fh[2] = cpu_to_be32(fh[2]); + fh[3] = ip->i_num.no_addr & 0xFFFFFFFF; + fh[3] = cpu_to_be32(fh[3]); + *len = GFS2_SMALL_FH_SIZE; + + if (!connectable || inode == sb->s_root->d_inode) + return *len; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + ip = GFS2_I(inode); + igrab(inode); + spin_unlock(&dentry->d_lock); + + fh[4] = ip->i_num.no_formal_ino >> 32; + fh[4] = cpu_to_be32(fh[4]); + fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF; + fh[5] = cpu_to_be32(fh[5]); + fh[6] = ip->i_num.no_addr >> 32; + fh[6] = cpu_to_be32(fh[6]); + fh[7] = ip->i_num.no_addr & 0xFFFFFFFF; + fh[7] = cpu_to_be32(fh[7]); + + fh[8] = cpu_to_be32(inode->i_mode); + fh[9] = 0; /* pad to double word */ + *len = GFS2_LARGE_FH_SIZE; + + iput(inode); + + return *len; +} + +struct get_name_filldir { + struct gfs2_inum inum; + char *name; +}; + +static int get_name_filldir(void *opaque, const char *name, unsigned int length, + u64 offset, struct gfs2_inum *inum, + unsigned int type) +{ + struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque; + + if (!gfs2_inum_equal(inum, &gnfd->inum)) + return 0; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return 1; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = parent->d_inode; + struct inode *inode = child->d_inode; + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd; + struct gfs2_holder gh; + u64 offset = 0; + int error; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum = ip->i_num; + gnfd.name = name; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + struct qstr dotdot; + struct inode *inode; + struct dentry *dentry; + + gfs2_str2qstr(&dotdot, ".."); + inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL); + + if (!inode) + return ERR_PTR(-ENOENT); + /* + * In case of an error, @inode carries the error value, and we + * have to return that as a(n invalid) pointer to dentry. + */ + if (IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + + dentry = d_alloc_anon(inode); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + + return dentry; +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj; + struct gfs2_inum *inum = &fh_obj->this; + struct gfs2_holder i_gh, ri_gh, rgd_gh; + struct gfs2_rgrpd *rgd; + struct inode *inode; + struct dentry *dentry; + int error; + + /* System files? */ + + inode = gfs2_ilookup(sb, inum); + if (inode) { + if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + goto out_inode; + } + + error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL, + &i_gh); + if (error) + return ERR_PTR(error); + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + error = -ESTALE; + if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) + goto fail_rgd; + + gfs2_glock_dq_uninit(&rgd_gh); + gfs2_glock_dq_uninit(&ri_gh); + + inode = gfs2_inode_lookup(sb, inum, fh_obj->imode); + if (!inode) + goto fail; + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto fail; + } + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) { + iput(inode); + goto fail; + } + + error = -EIO; + if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { + iput(inode); + goto fail; + } + + gfs2_glock_dq_uninit(&i_gh); + +out_inode: + dentry = d_alloc_anon(inode); + if (!dentry) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + + return dentry; + +fail_rgd: + gfs2_glock_dq_uninit(&rgd_gh); + +fail_rindex: + gfs2_glock_dq_uninit(&ri_gh); + +fail: + gfs2_glock_dq_uninit(&i_gh); + return ERR_PTR(error); +} + +struct export_operations gfs2_export_ops = { + .decode_fh = gfs2_decode_fh, + .encode_fh = gfs2_encode_fh, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, + .get_dentry = gfs2_get_dentry, +}; + diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h new file mode 100644 index 000000000000..09aca5046fb1 --- /dev/null +++ b/fs/gfs2/ops_export.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_EXPORT_DOT_H__ +#define __OPS_EXPORT_DOT_H__ + +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 10 + +extern struct export_operations gfs2_export_ops; +struct gfs2_fh_obj { + struct gfs2_inum this; + __u32 imode; +}; + +#endif /* __OPS_EXPORT_DOT_H__ */ diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c new file mode 100644 index 000000000000..3064f133bf3c --- /dev/null +++ b/fs/gfs2/ops_file.c @@ -0,0 +1,661 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/smp_lock.h> +#include <linux/fs.h> +#include <linux/gfs2_ondisk.h> +#include <linux/ext2_fs.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> +#include <asm/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lm.h" +#include "log.h" +#include "meta_io.h" +#include "ops_file.h" +#include "ops_vm.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "eaops.h" + +/* For regular, non-NFS */ +struct filldir_reg { + struct gfs2_sbd *fdr_sbd; + int fdr_prefetch; + + filldir_t fdr_filldir; + void *fdr_opaque; +}; + +/* + * Most fields left uninitialised to catch anybody who tries to + * use them. f_flags set to prevent file_accessed() from touching + * any other part of this. Its use is purely as a flag so that we + * know (in readpage()) whether or not do to locking. + */ +struct file gfs2_internal_file_sentinel = { + .f_flags = O_NOATIME|O_RDONLY, +}; + +static int gfs2_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long count = desc->count; + + if (size > count) + size = count; + + kaddr = kmap(page); + memcpy(desc->arg.buf, kaddr + offset, size); + kunmap(page); + + desc->count = count - size; + desc->written += size; + desc->arg.buf += size; + return size; +} + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct inode *inode = &ip->i_inode; + read_descriptor_t desc; + desc.written = 0; + desc.arg.buf = buf; + desc.count = size; + desc.error = 0; + do_generic_mapping_read(inode->i_mapping, ra_state, + &gfs2_internal_file_sentinel, pos, &desc, + gfs2_read_actor); + return desc.written ? desc.written : desc.error; +} + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. + * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + if (origin == 2) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = remote_llseek(file, offset, origin); + gfs2_glock_dq_uninit(&i_gh); + } + } else + error = remote_llseek(file, offset, origin); + + return error; +} + +/** + * filldir_func - Report a directory entry to the caller of gfs2_dir_read() + * @opaque: opaque data used by the function + * @name: the name of the directory entry + * @length: the length of the name + * @offset: the entry's offset in the directory + * @inum: the inode number the entry points to + * @type: the type of inode the entry points to + * + * Returns: 0 on success, 1 if buffer full + */ + +static int filldir_func(void *opaque, const char *name, unsigned int length, + u64 offset, struct gfs2_inum *inum, + unsigned int type) +{ + struct filldir_reg *fdr = (struct filldir_reg *)opaque; + struct gfs2_sbd *sdp = fdr->fdr_sbd; + int error; + + error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset, + inum->no_addr, type); + if (error) + return 1; + + if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) { + gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY); + gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops, + LM_ST_SHARED, LM_FLAG_TRY); + } + + return 0; +} + +/** + * gfs2_readdir - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct filldir_reg fdr; + struct gfs2_holder d_gh; + u64 offset = file->f_pos; + int error; + + fdr.fdr_sbd = GFS2_SB(dir); + fdr.fdr_prefetch = 1; + fdr.fdr_filldir = filldir; + fdr.fdr_opaque = dirent; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); + error = gfs2_glock_nq_atime(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + return error; + } + + error = gfs2_dir_read(dir, &offset, &fdr, filldir_func); + + gfs2_glock_dq_uninit(&d_gh); + + file->f_pos = offset; + + return error; +} + +/** + * fsflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between fsflags values and + * GFS2's own flags values. + * + * Returns: the converted flags + */ +static u32 fsflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} + +static const u32 fsflags_to_gfs2[32] = { + [3] = GFS2_DIF_SYNC, + [4] = GFS2_DIF_IMMUTABLE, + [5] = GFS2_DIF_APPENDONLY, + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_JDATA, + [20] = GFS2_DIF_DIRECTIO, +}; + +static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_Sync] = FS_SYNC_FL, + [gfs2fl_Immutable] = FS_IMMUTABLE_FL, + [gfs2fl_AppendOnly] = FS_APPEND_FL, + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_Jdata] = FS_JOURNAL_DATA_FL, + [gfs2fl_Directio] = FS_DIRECTIO_FL, + [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); + error = gfs2_glock_nq_m_atime(1, &gh); + if (error) + return error; + + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); + if (put_user(fsflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + return error; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_DIRECTIO| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_INHERIT_DIRECTIO| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * gfs2_set_flags - set flags on an inode + * @inode: The inode + * @flags: The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + return error; + + flags = ip->i_di.di_flags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + if (S_ISDIR(inode->i_mode)) { + if ((new_flags ^ flags) & GFS2_DIF_JDATA) + new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA); + if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO) + new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO); + } + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = permission(inode, MAY_WRITE, NULL); + if (error) + goto out; + } + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_di.di_flags = new_flags; + gfs2_dinode_out(&ip->i_di, bh->b_data); + brelse(bh); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + u32 fsflags, gfsflags; + if (get_user(fsflags, ptr)) + return -EFAULT; + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); + return do_gfs2_set_flags(filp, gfsflags, ~0); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FS_IOC_GETFLAGS: + return gfs2_get_flags(filp, (u32 __user *)arg); + case FS_IOC_SETFLAGS: + return gfs2_set_flags(filp, (u32 __user *)arg); + } + return -ENOTTY; +} + + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * Returns: 0 or error code + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); + error = gfs2_glock_nq_atime(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return error; + } + + /* This is VM_MAYWRITE instead of VM_WRITE because a call + to mprotect() can turn on VM_WRITE later. */ + + if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) == + (VM_MAYSHARE | VM_MAYWRITE)) + vma->vm_ops = &gfs2_vm_ops_sharewrite; + else + vma->vm_ops = &gfs2_vm_ops_private; + + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + struct gfs2_file *fp; + int error; + + fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + + if (S_ISREG(ip->i_di.di_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + goto fail; + + if (!(file->f_flags & O_LARGEFILE) && + ip->i_di.di_size > MAX_NON_LFS) { + error = -EFBIG; + goto fail_gunlock; + } + + /* Listen to the Direct I/O flag */ + + if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO) + file->f_flags |= O_DIRECT; + + gfs2_glock_dq_uninit(&i_gh); + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + file->private_data = NULL; + kfree(fp); + return error; +} + +/** + * gfs2_close - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_close(struct inode *inode, struct file *file) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_file *fp; + + fp = file->private_data; + file->private_data = NULL; + + if (gfs2_assert_warn(sdp, fp)) + return -EIO; + + kfree(fp); + + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry (we ignore this) + * @dentry: the dentry that points to the inode to sync + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + + return 0; +} + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockname name = + { .ln_number = ip->i_num.no_addr, + .ln_type = LM_TYPE_PLOCK }; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + return -ENOLCK; + + if (sdp->sd_args.ar_localflocks) { + if (IS_GETLK(cmd)) { + struct file_lock tmp; + int ret; + ret = posix_test_lock(file, fl, &tmp); + fl->fl_type = F_UNLCK; + if (ret) + memcpy(fl, &tmp, sizeof(struct file_lock)); + return 0; + } else { + return posix_lock_file_wait(file, fl); + } + } + + if (IS_GETLK(cmd)) + return gfs2_lm_plock_get(sdp, &name, file, fl); + else if (fl->fl_type == F_UNLCK) + return gfs2_lm_punlock(sdp, &name, file, fl); + else + return gfs2_lm_plock(sdp, &name, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + gfs2_glock_hold(gl); + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq_uninit(fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), + ip->i_num.no_addr, &gfs2_flock_glops, + CREATE, &gl); + if (error) + goto out; + } + + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + + error = gfs2_glock_nq(fl_gh); + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) + gfs2_glock_dq_uninit(fl_gh); + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + return -ENOLCK; + + if (sdp->sd_args.ar_localflocks) + return flock_lock_file_wait(file, fl); + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .sendfile = generic_file_sendfile, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +}; + +const struct file_operations gfs2_dir_fops = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, +}; + diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h new file mode 100644 index 000000000000..ce319f89ec8e --- /dev/null +++ b/fs/gfs2/ops_file.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_FILE_DOT_H__ +#define __OPS_FILE_DOT_H__ + +#include <linux/fs.h> +struct gfs2_inode; + +extern struct file gfs2_internal_file_sentinel; +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); + +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; + +#endif /* __OPS_FILE_DOT_H__ */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c new file mode 100644 index 000000000000..178b33911843 --- /dev/null +++ b/fs/gfs2/ops_fstype.c @@ -0,0 +1,928 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/kthread.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "daemon.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lm.h" +#include "mount.h" +#include "ops_export.h" +#include "ops_fstype.h" +#include "ops_super.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" + +#define DO 0 +#define UNDO 1 + +extern struct dentry_operations gfs2_dops; + +static struct gfs2_sbd *init_sbd(struct super_block *sb) +{ + struct gfs2_sbd *sdp; + + sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); + if (!sdp) + return NULL; + + sb->s_fs_info = sdp; + sdp->sd_vfs = sb; + + gfs2_tune_init(&sdp->sd_tune); + + INIT_LIST_HEAD(&sdp->sd_reclaim_list); + spin_lock_init(&sdp->sd_reclaim_lock); + init_waitqueue_head(&sdp->sd_reclaim_wq); + + mutex_init(&sdp->sd_inum_mutex); + spin_lock_init(&sdp->sd_statfs_spin); + mutex_init(&sdp->sd_statfs_mutex); + + spin_lock_init(&sdp->sd_rindex_spin); + mutex_init(&sdp->sd_rindex_mutex); + INIT_LIST_HEAD(&sdp->sd_rindex_list); + INIT_LIST_HEAD(&sdp->sd_rindex_mru_list); + INIT_LIST_HEAD(&sdp->sd_rindex_recent_list); + + INIT_LIST_HEAD(&sdp->sd_jindex_list); + spin_lock_init(&sdp->sd_jindex_spin); + mutex_init(&sdp->sd_jindex_mutex); + + INIT_LIST_HEAD(&sdp->sd_quota_list); + spin_lock_init(&sdp->sd_quota_spin); + mutex_init(&sdp->sd_quota_mutex); + + spin_lock_init(&sdp->sd_log_lock); + + INIT_LIST_HEAD(&sdp->sd_log_le_gl); + INIT_LIST_HEAD(&sdp->sd_log_le_buf); + INIT_LIST_HEAD(&sdp->sd_log_le_revoke); + INIT_LIST_HEAD(&sdp->sd_log_le_rg); + INIT_LIST_HEAD(&sdp->sd_log_le_databuf); + + mutex_init(&sdp->sd_log_reserve_mutex); + INIT_LIST_HEAD(&sdp->sd_ail1_list); + INIT_LIST_HEAD(&sdp->sd_ail2_list); + + init_rwsem(&sdp->sd_log_flush_lock); + INIT_LIST_HEAD(&sdp->sd_log_flush_list); + + INIT_LIST_HEAD(&sdp->sd_revoke_list); + + mutex_init(&sdp->sd_freeze_lock); + + return sdp; +} + +static void init_vfs(struct super_block *sb, unsigned noatime) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) + set_bit(noatime, &sdp->sd_flags); + + /* Don't let the VFS update atimes. GFS2 handles this itself. */ + sb->s_flags |= MS_NOATIME | MS_NODIRATIME; +} + +static int init_names(struct gfs2_sbd *sdp, int silent) +{ + struct page *page; + char *proto, *table; + int error = 0; + + proto = sdp->sd_args.ar_lockproto; + table = sdp->sd_args.ar_locktable; + + /* Try to autodetect */ + + if (!proto[0] || !table[0]) { + struct gfs2_sb *sb; + page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + if (!page) + return -ENOBUFS; + sb = kmap(page); + gfs2_sb_in(&sdp->sd_sb, sb); + kunmap(page); + __free_page(page); + + error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); + if (error) + goto out; + + if (!proto[0]) + proto = sdp->sd_sb.sb_lockproto; + if (!table[0]) + table = sdp->sd_sb.sb_locktable; + } + + if (!table[0]) + table = sdp->sd_vfs->s_id; + + snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto); + snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table); + +out: + return error; +} + +static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, + int undo) +{ + struct task_struct *p; + int error = 0; + + if (undo) + goto fail_trans; + + p = kthread_run(gfs2_scand, sdp, "gfs2_scand"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start scand thread: %d\n", error); + return error; + } + sdp->sd_scand_process = p; + + for (sdp->sd_glockd_num = 0; + sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; + sdp->sd_glockd_num++) { + p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start glockd thread: %d\n", error); + goto fail; + } + sdp->sd_glockd_process[sdp->sd_glockd_num] = p; + } + + error = gfs2_glock_nq_num(sdp, + GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, + mount_gh); + if (error) { + fs_err(sdp, "can't acquire mount glock: %d\n", error); + goto fail; + } + + error = gfs2_glock_nq_num(sdp, + GFS2_LIVE_LOCK, &gfs2_nondisk_glops, + LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_live_gh); + if (error) { + fs_err(sdp, "can't acquire live glock: %d\n", error); + goto fail_mount; + } + + error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops, + CREATE, &sdp->sd_rename_gl); + if (error) { + fs_err(sdp, "can't create rename glock: %d\n", error); + goto fail_live; + } + + error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops, + CREATE, &sdp->sd_trans_gl); + if (error) { + fs_err(sdp, "can't create transaction glock: %d\n", error); + goto fail_rename; + } + set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags); + + return 0; + +fail_trans: + gfs2_glock_put(sdp->sd_trans_gl); +fail_rename: + gfs2_glock_put(sdp->sd_rename_gl); +fail_live: + gfs2_glock_dq_uninit(&sdp->sd_live_gh); +fail_mount: + gfs2_glock_dq_uninit(mount_gh); +fail: + while (sdp->sd_glockd_num--) + kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); + + kthread_stop(sdp->sd_scand_process); + return error; +} + +static struct inode *gfs2_lookup_root(struct super_block *sb, + struct gfs2_inum *inum) +{ + return gfs2_inode_lookup(sb, inum, DT_DIR); +} + +static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder sb_gh; + struct gfs2_inum *inum; + struct inode *inode; + int error = 0; + + if (undo) { + if (sb->s_root) { + dput(sb->s_root); + sb->s_root = NULL; + } + return 0; + } + + error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (error) { + fs_err(sdp, "can't acquire superblock glock: %d\n", error); + return error; + } + + error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); + if (error) { + fs_err(sdp, "can't read superblock: %d\n", error); + goto out; + } + + /* Set up the buffer cache and SB for real */ + if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { + error = -EINVAL; + fs_err(sdp, "FS block size (%u) is too small for device " + "block size (%u)\n", + sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); + goto out; + } + if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { + error = -EINVAL; + fs_err(sdp, "FS block size (%u) is too big for machine " + "page size (%u)\n", + sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); + goto out; + } + sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); + + /* Get the root inode */ + inum = &sdp->sd_sb.sb_root_dir; + if (sb->s_type == &gfs2meta_fs_type) + inum = &sdp->sd_sb.sb_master_dir; + inode = gfs2_lookup_root(sb, inum); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + fs_err(sdp, "can't read in root inode: %d\n", error); + goto out; + } + + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + fs_err(sdp, "can't get root dentry\n"); + error = -ENOMEM; + iput(inode); + } + sb->s_root->d_op = &gfs2_dops; +out: + gfs2_glock_dq_uninit(&sb_gh); + return error; +} + +static int init_journal(struct gfs2_sbd *sdp, int undo) +{ + struct gfs2_holder ji_gh; + struct task_struct *p; + struct gfs2_inode *ip; + int jindex = 1; + int error = 0; + + if (undo) { + jindex = 0; + goto fail_recoverd; + } + + sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); + if (IS_ERR(sdp->sd_jindex)) { + fs_err(sdp, "can't lookup journal index: %d\n", error); + return PTR_ERR(sdp->sd_jindex); + } + ip = GFS2_I(sdp->sd_jindex); + set_bit(GLF_STICKY, &ip->i_gl->gl_flags); + + /* Load in the journal index special file */ + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) { + fs_err(sdp, "can't read journal index: %d\n", error); + goto fail; + } + + error = -EINVAL; + if (!gfs2_jindex_size(sdp)) { + fs_err(sdp, "no journals!\n"); + goto fail_jindex; + } + + if (sdp->sd_args.ar_spectator) { + sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); + sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; + } else { + if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { + fs_err(sdp, "can't mount journal #%u\n", + sdp->sd_lockstruct.ls_jid); + fs_err(sdp, "there are only %u journals (0 - %u)\n", + gfs2_jindex_size(sdp), + gfs2_jindex_size(sdp) - 1); + goto fail_jindex; + } + sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid); + + error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, + &gfs2_journal_glops, + LM_ST_EXCLUSIVE, LM_FLAG_NOEXP, + &sdp->sd_journal_gh); + if (error) { + fs_err(sdp, "can't acquire journal glock: %d\n", error); + goto fail_jindex; + } + + ip = GFS2_I(sdp->sd_jdesc->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_jinode_gh); + if (error) { + fs_err(sdp, "can't acquire journal inode glock: %d\n", + error); + goto fail_journal_gh; + } + + error = gfs2_jdesc_check(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "my journal (%u) is bad: %d\n", + sdp->sd_jdesc->jd_jid, error); + goto fail_jinode_gh; + } + sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks; + } + + if (sdp->sd_lockstruct.ls_first) { + unsigned int x; + for (x = 0; x < sdp->sd_journals; x++) { + error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); + if (error) { + fs_err(sdp, "error recovering journal %u: %d\n", + x, error); + goto fail_jinode_gh; + } + } + + gfs2_lm_others_may_mount(sdp); + } else if (!sdp->sd_args.ar_spectator) { + error = gfs2_recover_journal(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "error recovering my journal: %d\n", error); + goto fail_jinode_gh; + } + } + + set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); + gfs2_glock_dq_uninit(&ji_gh); + jindex = 0; + + p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start recoverd thread: %d\n", error); + goto fail_jinode_gh; + } + sdp->sd_recoverd_process = p; + + return 0; + +fail_recoverd: + kthread_stop(sdp->sd_recoverd_process); +fail_jinode_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); +fail_journal_gh: + if (!sdp->sd_args.ar_spectator) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); +fail_jindex: + gfs2_jindex_free(sdp); + if (jindex) + gfs2_glock_dq_uninit(&ji_gh); +fail: + iput(sdp->sd_jindex); + return error; +} + + +static int init_inodes(struct gfs2_sbd *sdp, int undo) +{ + int error = 0; + struct gfs2_inode *ip; + struct inode *inode; + + if (undo) + goto fail_qinode; + + inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + fs_err(sdp, "can't read in master directory: %d\n", error); + goto fail; + } + sdp->sd_master_dir = inode; + + error = init_journal(sdp, undo); + if (error) + goto fail_master; + + /* Read in the master inode number inode */ + sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); + if (IS_ERR(sdp->sd_inum_inode)) { + error = PTR_ERR(sdp->sd_inum_inode); + fs_err(sdp, "can't read in inum inode: %d\n", error); + goto fail_journal; + } + + + /* Read in the master statfs inode */ + sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail_inum; + } + + /* Read in the resource index inode */ + sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); + if (IS_ERR(sdp->sd_rindex)) { + error = PTR_ERR(sdp->sd_rindex); + fs_err(sdp, "can't get resource index inode: %d\n", error); + goto fail_statfs; + } + ip = GFS2_I(sdp->sd_rindex); + set_bit(GLF_STICKY, &ip->i_gl->gl_flags); + sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1; + + /* Read in the quota inode */ + sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); + if (IS_ERR(sdp->sd_quota_inode)) { + error = PTR_ERR(sdp->sd_quota_inode); + fs_err(sdp, "can't get quota file inode: %d\n", error); + goto fail_rindex; + } + return 0; + +fail_qinode: + iput(sdp->sd_quota_inode); +fail_rindex: + gfs2_clear_rgrpd(sdp); + iput(sdp->sd_rindex); +fail_statfs: + iput(sdp->sd_statfs_inode); +fail_inum: + iput(sdp->sd_inum_inode); +fail_journal: + init_journal(sdp, UNDO); +fail_master: + iput(sdp->sd_master_dir); +fail: + return error; +} + +static int init_per_node(struct gfs2_sbd *sdp, int undo) +{ + struct inode *pn = NULL; + char buf[30]; + int error = 0; + struct gfs2_inode *ip; + + if (sdp->sd_args.ar_spectator) + return 0; + + if (undo) + goto fail_qc_gh; + + pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + return error; + } + + sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid); + sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_ir_inode)) { + error = PTR_ERR(sdp->sd_ir_inode); + fs_err(sdp, "can't find local \"ir\" file: %d\n", error); + goto fail; + } + + sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_sc_inode)) { + error = PTR_ERR(sdp->sd_sc_inode); + fs_err(sdp, "can't find local \"sc\" file: %d\n", error); + goto fail_ir_i; + } + + sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_qc_inode)) { + error = PTR_ERR(sdp->sd_qc_inode); + fs_err(sdp, "can't find local \"qc\" file: %d\n", error); + goto fail_ut_i; + } + + iput(pn); + pn = NULL; + + ip = GFS2_I(sdp->sd_ir_inode); + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_EXCLUSIVE, 0, + &sdp->sd_ir_gh); + if (error) { + fs_err(sdp, "can't lock local \"ir\" file: %d\n", error); + goto fail_qc_i; + } + + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto fail_ir_gh; + } + + ip = GFS2_I(sdp->sd_qc_inode); + error = gfs2_glock_nq_init(ip->i_gl, + LM_ST_EXCLUSIVE, 0, + &sdp->sd_qc_gh); + if (error) { + fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); + goto fail_ut_gh; + } + + return 0; + +fail_qc_gh: + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); +fail_ut_gh: + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); +fail_ir_gh: + gfs2_glock_dq_uninit(&sdp->sd_ir_gh); +fail_qc_i: + iput(sdp->sd_qc_inode); +fail_ut_i: + iput(sdp->sd_sc_inode); +fail_ir_i: + iput(sdp->sd_ir_inode); +fail: + if (pn) + iput(pn); + return error; +} + +static int init_threads(struct gfs2_sbd *sdp, int undo) +{ + struct task_struct *p; + int error = 0; + + if (undo) + goto fail_quotad; + + sdp->sd_log_flush_time = jiffies; + sdp->sd_jindex_refresh_time = jiffies; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + sdp->sd_statfs_sync_time = jiffies; + sdp->sd_quota_sync_time = jiffies; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + error = IS_ERR(p); + if (error) { + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + + return 0; + + +fail_quotad: + kthread_stop(sdp->sd_quotad_process); +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + +/** + * fill_super - Read in superblock + * @sb: The VFS superblock + * @data: Mount options + * @silent: Don't complain if it's not a GFS2 filesystem + * + * Returns: errno + */ + +static int fill_super(struct super_block *sb, void *data, int silent) +{ + struct gfs2_sbd *sdp; + struct gfs2_holder mount_gh; + int error; + + sdp = init_sbd(sb); + if (!sdp) { + printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); + return -ENOMEM; + } + + error = gfs2_mount_args(sdp, (char *)data, 0); + if (error) { + printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); + goto fail; + } + + init_vfs(sb, SDF_NOATIME); + + /* Set up the buffer cache and fill in some fake block size values + to allow us to read-in the on-disk superblock. */ + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + + error = init_names(sdp, silent); + if (error) + goto fail; + + error = gfs2_sys_fs_add(sdp); + if (error) + goto fail; + + error = gfs2_lm_mount(sdp, silent); + if (error) + goto fail_sys; + + error = init_locking(sdp, &mount_gh, DO); + if (error) + goto fail_lm; + + error = init_sb(sdp, silent, DO); + if (error) + goto fail_locking; + + error = init_inodes(sdp, DO); + if (error) + goto fail_sb; + + error = init_per_node(sdp, DO); + if (error) + goto fail_inodes; + + error = gfs2_statfs_init(sdp); + if (error) { + fs_err(sdp, "can't initialize statfs subsystem: %d\n", error); + goto fail_per_node; + } + + error = init_threads(sdp, DO); + if (error) + goto fail_per_node; + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_rw(sdp); + if (error) { + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_threads; + } + } + + gfs2_glock_dq_uninit(&mount_gh); + + return 0; + +fail_threads: + init_threads(sdp, UNDO); +fail_per_node: + init_per_node(sdp, UNDO); +fail_inodes: + init_inodes(sdp, UNDO); +fail_sb: + init_sb(sdp, 0, UNDO); +fail_locking: + init_locking(sdp, &mount_gh, UNDO); +fail_lm: + gfs2_gl_hash_clear(sdp, WAIT); + gfs2_lm_unmount(sdp); + while (invalidate_inodes(sb)) + yield(); +fail_sys: + gfs2_sys_fs_del(sdp); +fail: + kfree(sdp); + sb->s_fs_info = NULL; + return error; +} + +static int gfs2_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + struct super_block *sb; + struct gfs2_sbd *sdp; + int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); + if (error) + goto out; + sb = mnt->mnt_sb; + sdp = sb->s_fs_info; + sdp->sd_gfs2mnt = mnt; +out: + return error; +} + +static int fill_super_meta(struct super_block *sb, struct super_block *new, + void *data, int silent) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct inode *inode; + int error = 0; + + new->s_fs_info = sdp; + sdp->sd_vfs_meta = sb; + + init_vfs(new, SDF_NOATIME); + + /* Get the master inode */ + inode = igrab(sdp->sd_master_dir); + + new->s_root = d_alloc_root(inode); + if (!new->s_root) { + fs_err(sdp, "can't get root dentry\n"); + error = -ENOMEM; + iput(inode); + } + new->s_root->d_op = &gfs2_dops; + + return error; +} + +static int set_bdev_super(struct super_block *s, void *data) +{ + s->s_bdev = data; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int test_bdev_super(struct super_block *s, void *data) +{ + return s->s_bdev == data; +} + +static struct super_block* get_gfs2_sb(const char *dev_name) +{ + struct kstat stat; + struct nameidata nd; + struct file_system_type *fstype; + struct super_block *sb = NULL, *s; + struct list_head *l; + int error; + + error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); + if (error) { + printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", + dev_name); + goto out; + } + error = vfs_getattr(nd.mnt, nd.dentry, &stat); + + fstype = get_fs_type("gfs2"); + list_for_each(l, &fstype->fs_supers) { + s = list_entry(l, struct super_block, s_instances); + if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || + (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) { + sb = s; + goto free_nd; + } + } + + printk(KERN_WARNING "GFS2: Unrecognized block device or " + "mount point %s", dev_name); + +free_nd: + path_release(&nd); +out: + return sb; +} + +static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + int error = 0; + struct super_block *sb = NULL, *new; + struct gfs2_sbd *sdp; + char *gfs2mnt = NULL; + + sb = get_gfs2_sb(dev_name); + if (!sb) { + printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); + error = -ENOENT; + goto error; + } + sdp = (struct gfs2_sbd*) sb->s_fs_info; + if (sdp->sd_vfs_meta) { + printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); + error = -EBUSY; + goto error; + } + mutex_lock(&sb->s_bdev->bd_mount_mutex); + new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev); + mutex_unlock(&sb->s_bdev->bd_mount_mutex); + if (IS_ERR(new)) { + error = PTR_ERR(new); + goto error; + } + module_put(fs_type->owner); + new->s_flags = flags; + strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); + sb_set_blocksize(new, sb->s_blocksize); + error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&new->s_umount); + deactivate_super(new); + goto error; + } + + new->s_flags |= MS_ACTIVE; + + /* Grab a reference to the gfs2 mount point */ + atomic_inc(&sdp->sd_gfs2mnt->mnt_count); + return simple_set_mnt(mnt, new); +error: + if (gfs2mnt) + kfree(gfs2mnt); + return error; +} + +static void gfs2_kill_sb(struct super_block *sb) +{ + kill_block_super(sb); +} + +static void gfs2_kill_sb_meta(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + generic_shutdown_super(sb); + sdp->sd_vfs_meta = NULL; + atomic_dec(&sdp->sd_gfs2mnt->mnt_count); +} + +struct file_system_type gfs2_fs_type = { + .name = "gfs2", + .fs_flags = FS_REQUIRES_DEV, + .get_sb = gfs2_get_sb, + .kill_sb = gfs2_kill_sb, + .owner = THIS_MODULE, +}; + +struct file_system_type gfs2meta_fs_type = { + .name = "gfs2meta", + .fs_flags = FS_REQUIRES_DEV, + .get_sb = gfs2_get_sb_meta, + .kill_sb = gfs2_kill_sb_meta, + .owner = THIS_MODULE, +}; + diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h new file mode 100644 index 000000000000..7cc2c296271b --- /dev/null +++ b/fs/gfs2/ops_fstype.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_FSTYPE_DOT_H__ +#define __OPS_FSTYPE_DOT_H__ + +#include <linux/fs.h> + +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; + +#endif /* __OPS_FSTYPE_DOT_H__ */ diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c new file mode 100644 index 000000000000..ef6e5ed70e94 --- /dev/null +++ b/fs/gfs2/ops_inode.c @@ -0,0 +1,1151 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/namei.h> +#include <linux/utsname.h> +#include <linux/mm.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> +#include <asm/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "eaops.h" +#include "eattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "ops_dentry.h" +#include "ops_inode.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_create - Create a file + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * + * Returns: errno + */ + +static int gfs2_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + for (;;) { + inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode); + if (!IS_ERR(inode)) { + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + gfs2_glock_dq_uninit_m(2, ghs); + mark_inode_dirty(inode); + break; + } else if (PTR_ERR(inode) != -EEXIST || + (nd->intent.open.flags & O_EXCL)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + if (inode) { + if (!IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + break; + } else { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + } + } + + d_instantiate(dentry, inode); + + return 0; +} + +/** + * gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @nd: passed from Linux VFS, ignored by us + * + * Called by the VFS layer. Lock dir and call gfs2_lookupi() + * + * Returns: errno + */ + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + + dentry->d_op = &gfs2_dops; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd); + if (inode && IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + + return NULL; +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". + * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = old_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[2]; + int alloc_required; + int error; + + if (S_ISDIR(ip->i_di.di_mode)) + return -EPERM; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq_m(2, ghs); + if (error) + goto out; + + error = permission(dir, MAY_WRITE | MAY_EXEC, NULL); + if (error) + goto out_gunlock; + + error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + } + + error = -EINVAL; + if (!dip->i_di.di_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_di.di_entries == (u32)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EINVAL; + if (!ip->i_di.di_nlink) + goto out_gunlock; + error = -EMLINK; + if (ip->i_di.di_nlink == (u32)-1) + goto out_gunlock; + + alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(dip); + + error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_quota_check(dip, dip->i_di.di_uid, + dip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + al->al_rgd->rd_ri.ri_length + + 2 * RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num, + IF2DT(ip->i_di.di_mode)); + if (error) + goto out_end_trans; + + error = gfs2_change_nlink(ip, +1); + +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (alloc_required) + gfs2_inplace_release(dip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(dip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(dip); +out_gunlock: + gfs2_glock_dq_m(2, ghs); +out: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + if (!error) { + atomic_inc(&inode->i_count); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + } + return error; +} + +/** + * gfs2_unlink - Unlink a file + * @dir: The inode of the directory containing the file to unlink + * @dentry: The file itself + * + * Unlink a file. Call gfs2_unlinki() + * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder ghs[2]; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq_m(2, ghs); + if (error) + goto out; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_del(dip, &dentry->d_name); + if (error) + goto out_end_trans; + + error = gfs2_change_nlink(ip, -1); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + gfs2_glock_dq_m(2, ghs); +out: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + struct buffer_head *dibh; + int size; + int error; + + /* Must be stuffed with a null terminator for gfs2_follow_link() */ + size = strlen(symname); + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + return -ENAMETOOLONG; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO); + if (IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + ip = ghs[1].gh_gl->gl_object; + + ip->i_di.di_size = size; + + error = gfs2_meta_inode_buffer(ip, &dibh); + + if (!gfs2_assert_withdraw(sdp, !error)) { + gfs2_dinode_out(&ip->i_di, dibh->b_data); + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, + size); + brelse(dibh); + } + + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + + gfs2_glock_dq_uninit_m(2, ghs); + + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + + return 0; +} + +/** + * gfs2_mkdir - Make a directory + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + struct buffer_head *dibh; + int error; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode); + if (IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + ip = ghs[1].gh_gl->gl_object; + + ip->i_di.di_nlink = 2; + ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + ip->i_di.di_flags |= GFS2_DIF_JDATA; + ip->i_di.di_payload_format = GFS2_FORMAT_DE; + ip->i_di.di_entries = 2; + + error = gfs2_meta_inode_buffer(ip, &dibh); + + if (!gfs2_assert_withdraw(sdp, !error)) { + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + struct qstr str; + + gfs2_str2qstr(&str, "."); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = cpu_to_be16(DT_DIR); + di->di_entries = cpu_to_be32(1); + + gfs2_str2qstr(&str, ".."); + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + + gfs2_inum_out(&dip->i_num, &dent->de_inum); + dent->de_type = cpu_to_be16(DT_DIR); + + gfs2_dinode_out(&ip->i_di, di); + + brelse(dibh); + } + + error = gfs2_change_nlink(dip, +1); + gfs2_assert_withdraw(sdp, !error); /* dip already pinned */ + + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + + gfs2_glock_dq_uninit_m(2, ghs); + + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + + return 0; +} + +/** + * gfs2_rmdir - Remove a directory + * @dir: The parent directory of the directory to be removed + * @dentry: The dentry of the directory to remove + * + * Remove a directory. Call gfs2_rmdiri() + * + * Returns: errno + */ + +static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder ghs[2]; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq_m(2, ghs); + if (error) + goto out; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + if (ip->i_di.di_entries < 2) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(&ip->i_di); + error = -EIO; + goto out_gunlock; + } + if (ip->i_di.di_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_rmdiri(dip, &dentry->d_name, ip); + + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq_m(2, ghs); +out: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + return error; +} + +/** + * gfs2_mknod - Make a special file + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @rdev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t dev) +{ + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct gfs2_holder ghs[2]; + struct inode *inode; + struct buffer_head *dibh; + u32 major = 0, minor = 0; + int error; + + switch (mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + major = MAJOR(dev); + minor = MINOR(dev); + break; + case S_IFIFO: + case S_IFSOCK: + break; + default: + return -EOPNOTSUPP; + }; + + gfs2_holder_init(dip->i_gl, 0, 0, ghs); + + inode = gfs2_createi(ghs, &dentry->d_name, mode); + if (IS_ERR(inode)) { + gfs2_holder_uninit(ghs); + return PTR_ERR(inode); + } + + ip = ghs[1].gh_gl->gl_object; + + ip->i_di.di_major = major; + ip->i_di.di_minor = minor; + + error = gfs2_meta_inode_buffer(ip, &dibh); + + if (!gfs2_assert_withdraw(sdp, !error)) { + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + gfs2_trans_end(sdp); + if (dip->i_alloc.al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + + gfs2_glock_dq_uninit_m(2, ghs); + + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + + return 0; +} + +/** + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * + * Returns: errno + */ + +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[4], r_gh; + unsigned int num_gh; + int dir_rename = 0; + int alloc_required; + unsigned int x; + int error; + + if (ndentry->d_inode) { + nip = GFS2_I(ndentry->d_inode); + if (ip == nip) + return 0; + } + + /* Make sure we aren't trying to move a dirctory into it's subdir */ + + if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) { + dir_rename = 1; + + error = gfs2_glock_nq_init(sdp->sd_rename_gl, + LM_ST_EXCLUSIVE, 0, + &r_gh); + if (error) + goto out; + + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + + error = gfs2_glock_nq_m(num_gh, ghs); + if (error) + goto out_uninit; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (S_ISDIR(nip->i_di.di_mode)) { + if (nip->i_di.di_entries < 2) { + if (gfs2_consist_inode(nip)) + gfs2_dinode_print(&nip->i_di); + error = -EIO; + goto out_gunlock; + } + if (nip->i_di.di_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL); + if (error) + goto out_gunlock; + + error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + }; + + if (odip != ndip) { + if (!ndip->i_di.di_nlink) { + error = -EINVAL; + goto out_gunlock; + } + if (ndip->i_di.di_entries == (u32)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_di.di_mode) && + ndip->i_di.di_nlink == (u32)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = permission(odentry->d_inode, MAY_WRITE, NULL); + if (error) + goto out_gunlock; + } + + alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(ndip); + + error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out_alloc; + + error = gfs2_quota_check(ndip, ndip->i_di.di_uid, + ndip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(ndip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + al->al_rgd->rd_ri.ri_length + + 4 * RES_DINODE + 4 * RES_LEAF + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) { + if (S_ISDIR(nip->i_di.di_mode)) + error = gfs2_rmdiri(ndip, &ndentry->d_name, nip); + else { + error = gfs2_dir_del(ndip, &ndentry->d_name); + if (error) + goto out_end_trans; + error = gfs2_change_nlink(nip, -1); + } + if (error) + goto out_end_trans; + } + + if (dir_rename) { + struct qstr name; + gfs2_str2qstr(&name, ".."); + + error = gfs2_change_nlink(ndip, +1); + if (error) + goto out_end_trans; + error = gfs2_change_nlink(odip, -1); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR); + if (error) + goto out_end_trans; + } else { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + ip->i_di.di_ctime = get_seconds(); + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + } + + error = gfs2_dir_del(odip, &odentry->d_name); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num, + IF2DT(ip->i_di.di_mode)); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (alloc_required) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(ndip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(ndip); +out_gunlock: + gfs2_glock_dq_m(num_gh, ghs); +out_uninit: + for (x = 0; x < num_gh; x++) + gfs2_holder_uninit(ghs + x); +out_gunlock_r: + if (dir_rename) + gfs2_glock_dq_uninit(&r_gh); +out: + return error; +} + +/** + * gfs2_readlink - Read the value of a symlink + * @dentry: the symlink + * @buf: the buffer to read the symlink data into + * @size: the size of the buffer + * + * Returns: errno + */ + +static int gfs2_readlink(struct dentry *dentry, char __user *user_buf, + int user_size) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + char array[GFS2_FAST_NAME_SIZE], *buf = array; + unsigned int len = GFS2_FAST_NAME_SIZE; + int error; + + error = gfs2_readlinki(ip, &buf, &len); + if (error) + return error; + + if (user_size > len - 1) + user_size = len - 1; + + if (copy_to_user(user_buf, buf, user_size)) + error = -EFAULT; + else + error = user_size; + + if (buf != array) + kfree(buf); + + return error; +} + +/** + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() + * + * This can handle symlinks of any size. It is optimised for symlinks + * under GFS2_FAST_NAME_SIZE. + * + * Returns: 0 on success or error code + */ + +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + char array[GFS2_FAST_NAME_SIZE], *buf = array; + unsigned int len = GFS2_FAST_NAME_SIZE; + int error; + + error = gfs2_readlinki(ip, &buf, &len); + if (!error) { + error = vfs_follow_link(nd, buf); + if (buf != array) + kfree(buf); + } + + return ERR_PTR(error); +} + +/** + * gfs2_permission - + * @inode: + * @mask: + * @nd: passed from Linux VFS, ignored by us + * + * Returns: errno + */ + +static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + if (ip->i_vn == ip->i_gl->gl_vn) + return generic_permission(inode, mask, gfs2_check_acl); + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (!error) { + error = generic_permission(inode, mask, gfs2_check_acl_locked); + gfs2_glock_dq_uninit(&i_gh); + } + + return error; +} + +static int setattr_size(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (attr->ia_size != ip->i_di.di_size) { + error = vmtruncate(inode, attr->ia_size); + if (error) + return error; + } + + error = gfs2_truncatei(ip, attr->ia_size); + if (error) + return error; + + return error; +} + +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh; + u32 ouid, ogid, nuid, ngid; + int error; + + ouid = ip->i_di.di_uid; + ogid = ip->i_di.di_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; + + if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) + ouid = nuid = NO_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) + ogid = ngid = NO_QUOTA_CHANGE; + + gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out_alloc; + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + error = gfs2_quota_check(ip, nuid, ngid); + if (error) + goto out_gunlock_q; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + error = inode_setattr(inode, attr); + gfs2_assert_warn(sdp, !error); + gfs2_inode_attr_out(ip); + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(&ip->i_di, dibh->b_data); + brelse(dibh); + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid); + gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid); + } + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock_q: + gfs2_quota_unlock(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_setattr - Change attributes on an inode + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. + * + * Returns: errno + */ + +static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + + error = inode_change_ok(inode, attr); + if (error) + goto out; + + if (attr->ia_valid & ATTR_SIZE) + error = setattr_size(inode, attr); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) + error = gfs2_acl_chmod(ip, attr); + else + error = gfs2_setattr_simple(ip, attr); + +out: + gfs2_glock_dq_uninit(&i_gh); + if (!error) + mark_inode_dirty(inode); + return error; +} + +/** + * gfs2_getattr - Read out an inode's attributes + * @mnt: The vfsmount the inode is being accessed from + * @dentry: The dentry to stat + * @stat: The inode's stats + * + * Returns: errno + */ + +static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (!error) { + generic_fillattr(inode, stat); + gfs2_glock_dq_uninit(&gh); + } + + return error; +} + +static int gfs2_setxattr(struct dentry *dentry, const char *name, + const void *data, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = gfs2_ea_name2type(name, &er.er_name); + if (er.er_type == GFS2_EATYPE_UNUSED) + return -EOPNOTSUPP; + er.er_data = (char *)data; + er.er_name_len = strlen(er.er_name); + er.er_data_len = size; + er.er_flags = flags; + + gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE)); + + return gfs2_ea_set(GFS2_I(inode), &er); +} + +static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, + void *data, size_t size) +{ + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = gfs2_ea_name2type(name, &er.er_name); + if (er.er_type == GFS2_EATYPE_UNUSED) + return -EOPNOTSUPP; + er.er_data = data; + er.er_name_len = strlen(er.er_name); + er.er_data_len = size; + + return gfs2_ea_get(GFS2_I(dentry->d_inode), &er); +} + +static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_data = (size) ? buffer : NULL; + er.er_data_len = size; + + return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); +} + +static int gfs2_removexattr(struct dentry *dentry, const char *name) +{ + struct gfs2_ea_request er; + + memset(&er, 0, sizeof(struct gfs2_ea_request)); + er.er_type = gfs2_ea_name2type(name, &er.er_name); + if (er.er_type == GFS2_EATYPE_UNUSED) + return -EOPNOTSUPP; + er.er_name_len = strlen(er.er_name); + + return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); +} + +struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + +struct inode_operations gfs2_dev_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + +struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_rmdir, + .mknod = gfs2_mknod, + .rename = gfs2_rename, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + +struct inode_operations gfs2_symlink_iops = { + .readlink = gfs2_readlink, + .follow_link = gfs2_follow_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, +}; + diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h new file mode 100644 index 000000000000..b15acb4fd34c --- /dev/null +++ b/fs/gfs2/ops_inode.h @@ -0,0 +1,20 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_INODE_DOT_H__ +#define __OPS_INODE_DOT_H__ + +#include <linux/fs.h> + +extern struct inode_operations gfs2_file_iops; +extern struct inode_operations gfs2_dir_iops; +extern struct inode_operations gfs2_symlink_iops; +extern struct inode_operations gfs2_dev_iops; + +#endif /* __OPS_INODE_DOT_H__ */ diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c new file mode 100644 index 000000000000..06f06f7773d0 --- /dev/null +++ b/fs/gfs2/ops_super.c @@ -0,0 +1,468 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "inode.h" +#include "lm.h" +#include "log.h" +#include "mount.h" +#include "ops_super.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "trans.h" +#include "dir.h" +#include "eattr.h" +#include "bmap.h" + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @sync: synchronous write flag + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, int sync) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + /* Check this is a "normal" inode */ + if (inode->i_private) { + if (current->flags & PF_MEMALLOC) + return 0; + if (sync) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + } + + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (!sdp) + return; + + if (!strncmp(sb->s_type->name, "gfs2meta", 8)) + return; /* Nothing to do */ + + /* Unfreeze the filesystem, if we need to */ + + mutex_lock(&sdp->sd_freeze_lock); + if (sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_lock); + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + kthread_stop(sdp->sd_recoverd_process); + while (sdp->sd_glockd_num--) + kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); + kthread_stop(sdp->sd_scand_process); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_master_dir); + iput(sdp->sd_jindex); + iput(sdp->sd_inum_inode); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_trans_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_ir_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_ir_inode); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp, WAIT); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); + kfree(sdp); +} + +/** + * gfs2_write_super - disk commit all incore transactions + * @sb: the filesystem + * + * This function is called every time sync(2) is called. + * After this exits, all dirty buffers are synced. + */ + +static void gfs2_write_super(struct super_block *sb) +{ + gfs2_log_flush(sb->s_fs_info, NULL); +} + +/** + * gfs2_write_super_lockfs - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static void gfs2_write_super_lockfs(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + for (;;) { + error = gfs2_freeze_fs(sdp); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } +} + +/** + * gfs2_unlockfs - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static void gfs2_unlockfs(struct super_block *sb) +{ + gfs2_unfreeze_fs(sb->s_fs_info); +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change sc; + int error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + error = gfs2_mount_args(sdp, data, 1); + if (error) + return error; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + else { + if (*flags & MS_RDONLY) { + if (!(sb->s_flags & MS_RDONLY)) + error = gfs2_make_fs_ro(sdp); + } else if (!(*flags & MS_RDONLY) && + (sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_rw(sdp); + } + } + + if (*flags & (MS_NOATIME | MS_NODIRATIME)) + set_bit(SDF_NOATIME, &sdp->sd_flags); + else + clear_bit(SDF_NOATIME, &sdp->sd_flags); + + /* Don't let the VFS update atimes. GFS2 handles this itself. */ + *flags |= MS_NOATIME | MS_NODIRATIME; + + return error; +} + +/** + * gfs2_clear_inode - Deallocate an inode when VFS is done with it + * @inode: The VFS inode + * + */ + +static void gfs2_clear_inode(struct inode *inode) +{ + /* This tells us its a "real" inode and not one which only + * serves to contain an address space (see rgrp.c, meta_io.c) + * which therefore doesn't have its own glocks. + */ + if (inode->i_private) { + struct gfs2_inode *ip = GFS2_I(inode); + gfs2_glock_inode_squish(inode); + gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED); + ip->i_gl->gl_object = NULL; + gfs2_glock_schedule_for_reclaim(ip->i_gl); + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @mnt: vfsmount + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_ignore_local_fs) + seq_printf(s, ",ignore_local_fs"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_localcaching) + seq_printf(s, ",localcaching"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_upgrade) + seq_printf(s, ",upgrade"); + if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT) + seq_printf(s, ",num_glockd=%u", args->ar_num_glockd); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + + return 0; +} + +/* + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. + */ +static void gfs2_delete_inode(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (!inode->i_private) + goto out; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_uninit; + + if (S_ISDIR(ip->i_di.di_mode) && + (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_di.di_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + +out_unlock: + gfs2_glock_dq(&ip->i_iopen_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error) + fs_warn(sdp, "gfs2_delete_inode: %d\n", error); +out: + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + + + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default); + ip->i_last_pfault = jiffies; + } + return &ip->i_inode; +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(gfs2_inode_cachep, inode); +} + +struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .delete_inode = gfs2_delete_inode, + .put_super = gfs2_put_super, + .write_super = gfs2_write_super, + .write_super_lockfs = gfs2_write_super_lockfs, + .unlockfs = gfs2_unlockfs, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .clear_inode = gfs2_clear_inode, + .show_options = gfs2_show_options, +}; + diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h new file mode 100644 index 000000000000..9de73f042f78 --- /dev/null +++ b/fs/gfs2/ops_super.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_SUPER_DOT_H__ +#define __OPS_SUPER_DOT_H__ + +#include <linux/fs.h> + +extern struct super_operations gfs2_super_ops; + +#endif /* __OPS_SUPER_DOT_H__ */ diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c new file mode 100644 index 000000000000..5453d2947ab3 --- /dev/null +++ b/fs/gfs2/ops_vm.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "ops_vm.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +static void pfault_be_greedy(struct gfs2_inode *ip) +{ + unsigned int time; + + spin_lock(&ip->i_spin); + time = ip->i_greedy; + ip->i_last_pfault = jiffies; + spin_unlock(&ip->i_spin); + + igrab(&ip->i_inode); + if (gfs2_glock_be_greedy(ip->i_gl, time)) + iput(&ip->i_inode); +} + +static struct page *gfs2_private_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host); + struct page *result; + + set_bit(GIF_PAGED, &ip->i_flags); + + result = filemap_nopage(area, address, type); + + if (result && result != NOPAGE_OOM) + pfault_be_greedy(ip); + + return result; +} + +static int alloc_page_backing(struct gfs2_inode *ip, struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned long index = page->index; + u64 lblock = index << (PAGE_CACHE_SHIFT - + sdp->sd_sb.sb_bsize_shift); + unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift; + struct gfs2_alloc *al; + unsigned int data_blocks, ind_blocks; + unsigned int x; + int error; + + al = gfs2_alloc_get(ip); + + error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid); + if (error) + goto out_gunlock_q; + + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + + al->al_requested = data_blocks + ind_blocks; + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length + + ind_blocks + RES_DINODE + + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipres; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip, NULL); + if (error) + goto out_trans; + } + + for (x = 0; x < blocks; ) { + u64 dblock; + unsigned int extlen; + int new = 1; + + error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); + if (error) + goto out_trans; + + lblock += extlen; + x += extlen; + } + + gfs2_assert_warn(sdp, al->al_alloced); + +out_trans: + gfs2_trans_end(sdp); +out_ipres: + gfs2_inplace_release(ip); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + gfs2_alloc_put(ip); + return error; +} + +static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area, + unsigned long address, int *type) +{ + struct file *file = area->vm_file; + struct gfs2_file *gf = file->private_data; + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + struct page *result = NULL; + unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + + area->vm_pgoff; + int alloc_required; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return NULL; + + set_bit(GIF_PAGED, &ip->i_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, &alloc_required); + if (error) + goto out; + + set_bit(GFF_EXLOCK, &gf->f_flags); + result = filemap_nopage(area, address, type); + clear_bit(GFF_EXLOCK, &gf->f_flags); + if (!result || result == NOPAGE_OOM) + goto out; + + if (alloc_required) { + error = alloc_page_backing(ip, result); + if (error) { + page_cache_release(result); + result = NULL; + goto out; + } + set_page_dirty(result); + } + + pfault_be_greedy(ip); +out: + gfs2_glock_dq_uninit(&i_gh); + + return result; +} + +struct vm_operations_struct gfs2_vm_ops_private = { + .nopage = gfs2_private_nopage, +}; + +struct vm_operations_struct gfs2_vm_ops_sharewrite = { + .nopage = gfs2_sharewrite_nopage, +}; + diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h new file mode 100644 index 000000000000..4ae8f43ed5e3 --- /dev/null +++ b/fs/gfs2/ops_vm.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __OPS_VM_DOT_H__ +#define __OPS_VM_DOT_H__ + +#include <linux/mm.h> + +extern struct vm_operations_struct gfs2_vm_ops_private; +extern struct vm_operations_struct gfs2_vm_ops_sharewrite; + +#endif /* __OPS_VM_DOT_H__ */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c new file mode 100644 index 000000000000..c69b94a55588 --- /dev/null +++ b/fs/gfs2/quota.c @@ -0,0 +1,1227 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +/* + * Quota change tags are associated with each transaction that allocates or + * deallocates space. Those changes are accumulated locally to each node (in a + * per-node file) and then are periodically synced to the quota file. This + * avoids the bottleneck of constantly touching the quota file, but introduces + * fuzziness in the current usage value of IDs that are being used on different + * nodes in the cluster simultaneously. So, it is possible for a user on + * multiple nodes to overrun their quota, but that overrun is controlable. + * Since quota tags are part of transactions, there is no need to a quota check + * program to be run on node crashes or anything like that. + * + * There are couple of knobs that let the administrator manage the quota + * fuzziness. "quota_quantum" sets the maximum time a quota change can be + * sitting on one node before being synced to the quota file. (The default is + * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency + * of quota file syncs increases as the user moves closer to their limit. The + * more frequent the syncs, the more accurate the quota enforcement, but that + * means that there is more contention between the nodes for the quota file. + * The default value is one. This sets the maximum theoretical quota overrun + * (with infinite node with infinite bandwidth) to twice the user's limit. (In + * practice, the maximum overrun you see should be much less.) A "quota_scale" + * number greater than one makes quota syncs more frequent and reduces the + * maximum overrun. Numbers less than one (but greater than zero) make quota + * syncs less frequent. + * + * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of + * the quota file, so it is not being constantly read. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/sort.h> +#include <linux/fs.h> +#include <linux/bio.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "inode.h" +#include "ops_file.h" +#include "ops_address.h" +#include "util.h" + +#define QUOTA_USER 1 +#define QUOTA_GROUP 0 + +static u64 qd2offset(struct gfs2_quota_data *qd) +{ + u64 offset; + + offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags); + offset *= sizeof(struct gfs2_quota); + + return offset; +} + +static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd; + int error; + + qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL); + if (!qd) + return -ENOMEM; + + qd->qd_count = 1; + qd->qd_id = id; + if (user) + set_bit(QDF_USER, &qd->qd_flags); + qd->qd_slot = -1; + + error = gfs2_glock_get(sdp, 2 * (u64)id + !user, + &gfs2_quota_glops, CREATE, &qd->qd_gl); + if (error) + goto fail; + + error = gfs2_lvb_hold(qd->qd_gl); + gfs2_glock_put(qd->qd_gl); + if (error) + goto fail; + + *qdp = qd; + + return 0; + +fail: + kfree(qd); + return error; +} + +static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL, *new_qd = NULL; + int error, found; + + *qdp = NULL; + + for (;;) { + found = 0; + spin_lock(&sdp->sd_quota_spin); + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (qd->qd_id == id && + !test_bit(QDF_USER, &qd->qd_flags) == !user) { + qd->qd_count++; + found = 1; + break; + } + } + + if (!found) + qd = NULL; + + if (!qd && new_qd) { + qd = new_qd; + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + new_qd = NULL; + } + + spin_unlock(&sdp->sd_quota_spin); + + if (qd || !create) { + if (new_qd) { + gfs2_lvb_unhold(new_qd->qd_gl); + kfree(new_qd); + } + *qdp = qd; + return 0; + } + + error = qd_alloc(sdp, user, id, &new_qd); + if (error) + return error; + } +} + +static void qd_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_count); + qd->qd_count++; + spin_unlock(&sdp->sd_quota_spin); +} + +static void qd_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_count); + if (!--qd->qd_count) + qd->qd_last_touched = jiffies; + spin_unlock(&sdp->sd_quota_spin); +} + +static int slot_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + unsigned int c, o = 0, b; + unsigned char byte = 0; + + spin_lock(&sdp->sd_quota_spin); + + if (qd->qd_slot_count++) { + spin_unlock(&sdp->sd_quota_spin); + return 0; + } + + for (c = 0; c < sdp->sd_quota_chunks; c++) + for (o = 0; o < PAGE_SIZE; o++) { + byte = sdp->sd_quota_bitmap[c][o]; + if (byte != 0xFF) + goto found; + } + + goto fail; + +found: + for (b = 0; b < 8; b++) + if (!(byte & (1 << b))) + break; + qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; + + if (qd->qd_slot >= sdp->sd_quota_slots) + goto fail; + + sdp->sd_quota_bitmap[c][o] |= 1 << b; + + spin_unlock(&sdp->sd_quota_spin); + + return 0; + +fail: + qd->qd_slot_count--; + spin_unlock(&sdp->sd_quota_spin); + return -ENOSPC; +} + +static void slot_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + spin_unlock(&sdp->sd_quota_spin); +} + +static void slot_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + spin_lock(&sdp->sd_quota_spin); + gfs2_assert(sdp, qd->qd_slot_count); + if (!--qd->qd_slot_count) { + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + qd->qd_slot = -1; + } + spin_unlock(&sdp->sd_quota_spin); +} + +static int bh_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + unsigned int block, offset; + struct buffer_head *bh; + int error; + struct buffer_head bh_map; + + mutex_lock(&sdp->sd_quota_mutex); + + if (qd->qd_bh_count++) { + mutex_unlock(&sdp->sd_quota_mutex); + return 0; + } + + block = qd->qd_slot / sdp->sd_qc_per_block; + offset = qd->qd_slot % sdp->sd_qc_per_block;; + + error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map, 1); + if (error) + goto fail; + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + if (error) + goto fail; + error = -EIO; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) + goto fail_brelse; + + qd->qd_bh = bh; + qd->qd_bh_qc = (struct gfs2_quota_change *) + (bh->b_data + sizeof(struct gfs2_meta_header) + + offset * sizeof(struct gfs2_quota_change)); + + mutex_lock(&sdp->sd_quota_mutex); + + return 0; + +fail_brelse: + brelse(bh); +fail: + qd->qd_bh_count--; + mutex_unlock(&sdp->sd_quota_mutex); + return error; +} + +static void bh_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_assert(sdp, qd->qd_bh_count); + if (!--qd->qd_bh_count) { + brelse(qd->qd_bh); + qd->qd_bh = NULL; + qd->qd_bh_qc = NULL; + } + mutex_unlock(&sdp->sd_quota_mutex); +} + +static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL; + int error; + int found = 0; + + *qdp = NULL; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&sdp->sd_quota_spin); + + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + qd->qd_sync_gen >= sdp->sd_quota_sync_gen) + continue; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, qd->qd_count); + qd->qd_count++; + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + found = 1; + + break; + } + + if (!found) + qd = NULL; + + spin_unlock(&sdp->sd_quota_spin); + + if (qd) { + gfs2_assert_warn(sdp, qd->qd_change_sync); + error = bh_get(qd); + if (error) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; + } + } + + *qdp = qd; + + return 0; +} + +static int qd_trylock(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + if (sdp->sd_vfs->s_flags & MS_RDONLY) + return 0; + + spin_lock(&sdp->sd_quota_spin); + + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags)) { + spin_unlock(&sdp->sd_quota_spin); + return 0; + } + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + + set_bit(QDF_LOCKED, &qd->qd_flags); + gfs2_assert_warn(sdp, qd->qd_count); + qd->qd_count++; + qd->qd_change_sync = qd->qd_change; + gfs2_assert_warn(sdp, qd->qd_slot_count); + qd->qd_slot_count++; + + spin_unlock(&sdp->sd_quota_spin); + + gfs2_assert_warn(sdp, qd->qd_change_sync); + if (bh_get(qd)) { + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return 0; + } + + return 1; +} + +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_gl->gl_sbd, + test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create, + struct gfs2_quota_data **qdp) +{ + int error; + + error = qd_get(sdp, user, id, create, qdp); + if (error) + return error; + + error = slot_get(*qdp); + if (error) + goto fail; + + error = bh_get(*qdp); + if (error) + goto fail_slot; + + return 0; + +fail_slot: + slot_put(*qdp); +fail: + qd_put(*qdp); + return error; +} + +static void qdsb_put(struct gfs2_quota_data *qd) +{ + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data **qd = al->al_qd; + int error; + + if (gfs2_assert_warn(sdp, !al->al_qd_num) || + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) + return -EIO; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + + error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + + if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) { + error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + + if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) { + error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd); + if (error) + goto out; + al->al_qd_num++; + qd++; + } + +out: + if (error) + gfs2_quota_unhold(ip); + return error; +} + +void gfs2_quota_unhold(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + unsigned int x; + + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); + + for (x = 0; x < al->al_qd_num; x++) { + qdsb_put(al->al_qd[x]); + al->al_qd[x] = NULL; + } + al->al_qd_num = 0; +} + +static int sort_qd(const void *a, const void *b) +{ + const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; + const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; + + if (!test_bit(QDF_USER, &qd_a->qd_flags) != + !test_bit(QDF_USER, &qd_b->qd_flags)) { + if (test_bit(QDF_USER, &qd_a->qd_flags)) + return -1; + else + return 1; + } + if (qd_a->qd_id < qd_b->qd_id) + return -1; + if (qd_a->qd_id > qd_b->qd_id) + return 1; + + return 0; +} + +static void do_qc(struct gfs2_quota_data *qd, s64 change) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + struct gfs2_quota_change *qc = qd->qd_bh_qc; + s64 x; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); + + if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { + qc->qc_change = 0; + qc->qc_flags = 0; + if (test_bit(QDF_USER, &qd->qd_flags)) + qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); + qc->qc_id = cpu_to_be32(qd->qd_id); + } + + x = qc->qc_change; + x = be64_to_cpu(x) + change; + qc->qc_change = cpu_to_be64(x); + + spin_lock(&sdp->sd_quota_spin); + qd->qd_change = x; + spin_unlock(&sdp->sd_quota_spin); + + if (!x) { + gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); + clear_bit(QDF_CHANGE, &qd->qd_flags); + qc->qc_flags = 0; + qc->qc_id = 0; + slot_put(qd); + qd_put(qd); + } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { + qd_hold(qd); + slot_hold(qd); + } + + mutex_unlock(&sdp->sd_quota_mutex); +} + +/** + * gfs2_adjust_quota + * + * This function was mostly borrowed from gfs2_block_truncate_page which was + * in turn mostly borrowed from ext3 + */ +static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, + s64 change, struct gfs2_quota_data *qd) +{ + struct inode *inode = &ip->i_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long index = loc >> PAGE_CACHE_SHIFT; + unsigned offset = loc & (PAGE_CACHE_SHIFT - 1); + unsigned blocksize, iblock, pos; + struct buffer_head *bh; + struct page *page; + void *kaddr; + __be64 *ptr; + s64 value; + int err = -EIO; + + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + blocksize = inode->i_sb->s_blocksize; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + if (!buffer_mapped(bh)) { + gfs2_get_block(inode, iblock, bh, 1); + if (!buffer_mapped(bh)) + goto unlock; + } + + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + ll_rw_block(READ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock; + } + + gfs2_trans_add_bh(ip->i_gl, bh, 0); + + kaddr = kmap_atomic(page, KM_USER0); + ptr = kaddr + offset; + value = (s64)be64_to_cpu(*ptr) + change; + *ptr = cpu_to_be64(value); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + err = 0; + qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC); + qd->qd_qb.qb_value = cpu_to_be64(value); +unlock: + unlock_page(page); + page_cache_release(page); + return err; +} + +static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) +{ + struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + unsigned int data_blocks, ind_blocks; + struct gfs2_holder *ghs, i_gh; + unsigned int qx, x; + struct gfs2_quota_data *qd; + loff_t offset; + unsigned int nalloc = 0; + struct gfs2_alloc *al = NULL; + int error; + + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + + ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!ghs) + return -ENOMEM; + + sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + for (qx = 0; qx < num_qd; qx++) { + error = gfs2_glock_nq_init(qda[qx]->qd_gl, + LM_ST_EXCLUSIVE, + GL_NOCACHE, &ghs[qx]); + if (error) + goto out; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out; + + for (x = 0; x < num_qd; x++) { + int alloc_required; + + offset = qd2offset(qda[x]); + error = gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota), + &alloc_required); + if (error) + goto out_gunlock; + if (alloc_required) + nalloc++; + } + + if (nalloc) { + al = gfs2_alloc_get(ip); + + al->al_requested = nalloc * (data_blocks + ind_blocks); + + error = gfs2_inplace_reserve(ip); + if (error) + goto out_alloc; + + error = gfs2_trans_begin(sdp, + al->al_rgd->rd_ri.ri_length + + num_qd * data_blocks + + nalloc * ind_blocks + + RES_DINODE + num_qd + + RES_STATFS, 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, + num_qd * data_blocks + + RES_DINODE + num_qd, 0); + if (error) + goto out_gunlock; + } + + for (x = 0; x < num_qd; x++) { + qd = qda[x]; + offset = qd2offset(qd); + error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, + (struct gfs2_quota_data *) + qd->qd_gl->gl_lvb); + if (error) + goto out_end_trans; + + do_qc(qd, -qd->qd_change_sync); + } + + error = 0; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (nalloc) + gfs2_inplace_release(ip); +out_alloc: + if (nalloc) + gfs2_alloc_put(ip); +out_gunlock: + gfs2_glock_dq_uninit(&i_gh); +out: + while (qx--) + gfs2_glock_dq_uninit(&ghs[qx]); + kfree(ghs); + gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl); + return error; +} + +static int do_glock(struct gfs2_quota_data *qd, int force_refresh, + struct gfs2_holder *q_gh) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_holder i_gh; + struct gfs2_quota q; + char buf[sizeof(struct gfs2_quota)]; + struct file_ra_state ra_state; + int error; + struct gfs2_quota_lvb *qlvb; + + file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping); +restart: + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); + if (error) + return error; + + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + + if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { + loff_t pos; + gfs2_glock_dq_uninit(q_gh); + error = gfs2_glock_nq_init(qd->qd_gl, + LM_ST_EXCLUSIVE, GL_NOCACHE, + q_gh); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + if (error) + goto fail; + + memset(buf, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, &ra_state, buf, + &pos, sizeof(struct gfs2_quota)); + if (error < 0) + goto fail_gunlock; + + gfs2_glock_dq_uninit(&i_gh); + + + gfs2_quota_in(&q, buf); + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; + qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); + qlvb->__pad = 0; + qlvb->qb_limit = cpu_to_be64(q.qu_limit); + qlvb->qb_warn = cpu_to_be64(q.qu_warn); + qlvb->qb_value = cpu_to_be64(q.qu_value); + qd->qd_qb = *qlvb; + + if (gfs2_glock_is_blocking(qd->qd_gl)) { + gfs2_glock_dq_uninit(q_gh); + force_refresh = 0; + goto restart; + } + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + gfs2_glock_dq_uninit(q_gh); + return error; +} + +int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + unsigned int x; + int error = 0; + + gfs2_quota_hold(ip, uid, gid); + + if (capable(CAP_SYS_RESOURCE) || + sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + + sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *), + sort_qd, NULL); + + for (x = 0; x < al->al_qd_num; x++) { + error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); + if (error) + break; + } + + if (!error) + set_bit(GIF_QD_LOCKED, &ip->i_flags); + else { + while (x--) + gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + gfs2_quota_unhold(ip); + } + + return error; +} + +static int need_sync(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_tune *gt = &sdp->sd_tune; + s64 value; + unsigned int num, den; + int do_sync = 1; + + if (!qd->qd_qb.qb_limit) + return 0; + + spin_lock(&sdp->sd_quota_spin); + value = qd->qd_change; + spin_unlock(&sdp->sd_quota_spin); + + spin_lock(>->gt_spin); + num = gt->gt_quota_scale_num; + den = gt->gt_quota_scale_den; + spin_unlock(>->gt_spin); + + if (value < 0) + do_sync = 0; + else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >= + (s64)be64_to_cpu(qd->qd_qb.qb_limit)) + do_sync = 0; + else { + value *= gfs2_jindex_size(sdp) * num; + do_div(value, den); + value += (s64)be64_to_cpu(qd->qd_qb.qb_value); + if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit)) + do_sync = 0; + } + + return do_sync; +} + +void gfs2_quota_unlock(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data *qda[4]; + unsigned int count = 0; + unsigned int x; + + if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) + goto out; + + for (x = 0; x < al->al_qd_num; x++) { + struct gfs2_quota_data *qd; + int sync; + + qd = al->al_qd[x]; + sync = need_sync(qd); + + gfs2_glock_dq_uninit(&al->al_qd_ghs[x]); + + if (sync && qd_trylock(qd)) + qda[count++] = qd; + } + + if (count) { + do_sync(count, qda); + for (x = 0; x < count; x++) + qd_unlock(qda[x]); + } + +out: + gfs2_quota_unhold(ip); +} + +#define MAX_LINE 256 + +static int print_message(struct gfs2_quota_data *qd, char *type) +{ + struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + + printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n", + sdp->sd_fsname, type, + (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group", + qd->qd_id); + + return 0; +} + +int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data *qd; + s64 value; + unsigned int x; + int error = 0; + + if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) + return 0; + + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + return 0; + + for (x = 0; x < al->al_qd_num; x++) { + qd = al->al_qd[x]; + + if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || + (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags)))) + continue; + + value = (s64)be64_to_cpu(qd->qd_qb.qb_value); + spin_lock(&sdp->sd_quota_spin); + value += qd->qd_change; + spin_unlock(&sdp->sd_quota_spin); + + if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { + print_message(qd, "exceeded"); + error = -EDQUOT; + break; + } else if (be64_to_cpu(qd->qd_qb.qb_warn) && + (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && + time_after_eq(jiffies, qd->qd_last_warn + + gfs2_tune_get(sdp, + gt_quota_warn_period) * HZ)) { + error = print_message(qd, "warning"); + qd->qd_last_warn = jiffies; + } + } + + return error; +} + +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid) +{ + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_quota_data *qd; + unsigned int x; + unsigned int found = 0; + + if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) + return; + if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) + return; + + for (x = 0; x < al->al_qd_num; x++) { + qd = al->al_qd[x]; + + if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) || + (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) { + do_qc(qd, change); + found++; + } + } +} + +int gfs2_quota_sync(struct gfs2_sbd *sdp) +{ + struct gfs2_quota_data **qda; + unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync); + unsigned int num_qd; + unsigned int x; + int error = 0; + + sdp->sd_quota_sync_gen++; + + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); + if (!qda) + return -ENOMEM; + + do { + num_qd = 0; + + for (;;) { + error = qd_fish(sdp, qda + num_qd); + if (error || !qda[num_qd]) + break; + if (++num_qd == max_qd) + break; + } + + if (num_qd) { + if (!error) + error = do_sync(num_qd, qda); + if (!error) + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = + sdp->sd_quota_sync_gen; + + for (x = 0; x < num_qd; x++) + qd_unlock(qda[x]); + } + } while (!error && num_qd == max_qd); + + kfree(qda); + + return error; +} + +int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id) +{ + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + error = qd_get(sdp, user, id, CREATE, &qd); + if (error) + return error; + + error = do_glock(qd, FORCE, &q_gh); + if (!error) + gfs2_glock_dq_uninit(&q_gh); + + qd_put(qd); + + return error; +} + +int gfs2_quota_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + unsigned int x, slot = 0; + unsigned int found = 0; + u64 dblock; + u32 extlen = 0; + int error; + + if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || + ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { + gfs2_consist_inode(ip); + return -EIO; + } + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; + sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); + + error = -ENOMEM; + + sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, + sizeof(unsigned char *), GFP_KERNEL); + if (!sdp->sd_quota_bitmap) + return error; + + for (x = 0; x < sdp->sd_quota_chunks; x++) { + sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!sdp->sd_quota_bitmap[x]) + goto fail; + } + + for (x = 0; x < blocks; x++) { + struct buffer_head *bh; + unsigned int y; + + if (!extlen) { + int new = 0; + error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen); + if (error) + goto fail; + } + error = -EIO; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + if (!bh) + goto fail; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) { + brelse(bh); + goto fail; + } + + for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; + y++, slot++) { + struct gfs2_quota_change qc; + struct gfs2_quota_data *qd; + + gfs2_quota_change_in(&qc, bh->b_data + + sizeof(struct gfs2_meta_header) + + y * sizeof(struct gfs2_quota_change)); + if (!qc.qc_change) + continue; + + error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER), + qc.qc_id, &qd); + if (error) { + brelse(bh); + goto fail; + } + + set_bit(QDF_CHANGE, &qd->qd_flags); + qd->qd_change = qc.qc_change; + qd->qd_slot = slot; + qd->qd_slot_count = 1; + qd->qd_last_touched = jiffies; + + spin_lock(&sdp->sd_quota_spin); + gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + list_add(&qd->qd_list, &sdp->sd_quota_list); + atomic_inc(&sdp->sd_quota_count); + spin_unlock(&sdp->sd_quota_spin); + + found++; + } + + brelse(bh); + dblock++; + extlen--; + } + + if (found) + fs_info(sdp, "found %u quota changes\n", found); + + return 0; + +fail: + gfs2_quota_cleanup(sdp); + return error; +} + +void gfs2_quota_scan(struct gfs2_sbd *sdp) +{ + struct gfs2_quota_data *qd, *safe; + LIST_HEAD(dead); + + spin_lock(&sdp->sd_quota_spin); + list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) { + if (!qd->qd_count && + time_after_eq(jiffies, qd->qd_last_touched + + gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) { + list_move(&qd->qd_list, &dead); + gfs2_assert_warn(sdp, + atomic_read(&sdp->sd_quota_count) > 0); + atomic_dec(&sdp->sd_quota_count); + } + } + spin_unlock(&sdp->sd_quota_spin); + + while (!list_empty(&dead)) { + qd = list_entry(dead.next, struct gfs2_quota_data, qd_list); + list_del(&qd->qd_list); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_lvb_unhold(qd->qd_gl); + kfree(qd); + } +} + +void gfs2_quota_cleanup(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_quota_list; + struct gfs2_quota_data *qd; + unsigned int x; + + spin_lock(&sdp->sd_quota_spin); + while (!list_empty(head)) { + qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); + + if (qd->qd_count > 1 || + (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { + list_move(&qd->qd_list, head); + spin_unlock(&sdp->sd_quota_spin); + schedule(); + spin_lock(&sdp->sd_quota_spin); + continue; + } + + list_del(&qd->qd_list); + atomic_dec(&sdp->sd_quota_count); + spin_unlock(&sdp->sd_quota_spin); + + if (!qd->qd_count) { + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); + } else + gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + + gfs2_lvb_unhold(qd->qd_gl); + kfree(qd); + + spin_lock(&sdp->sd_quota_spin); + } + spin_unlock(&sdp->sd_quota_spin); + + gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); + + if (sdp->sd_quota_bitmap) { + for (x = 0; x < sdp->sd_quota_chunks; x++) + kfree(sdp->sd_quota_bitmap[x]); + kfree(sdp->sd_quota_bitmap); + } +} + diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h new file mode 100644 index 000000000000..a8be1417051f --- /dev/null +++ b/fs/gfs2/quota.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __QUOTA_DOT_H__ +#define __QUOTA_DOT_H__ + +struct gfs2_inode; +struct gfs2_sbd; + +#define NO_QUOTA_CHANGE ((u32)-1) + +int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +void gfs2_quota_unhold(struct gfs2_inode *ip); + +int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +void gfs2_quota_unlock(struct gfs2_inode *ip); + +int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid); + +int gfs2_quota_sync(struct gfs2_sbd *sdp); +int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); + +int gfs2_quota_init(struct gfs2_sbd *sdp); +void gfs2_quota_scan(struct gfs2_sbd *sdp); +void gfs2_quota_cleanup(struct gfs2_sbd *sdp); + +#endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c new file mode 100644 index 000000000000..0a8a4b87dcc6 --- /dev/null +++ b/fs/gfs2/recovery.c @@ -0,0 +1,570 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "lm.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "super.h" +#include "util.h" +#include "dir.h" + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + int new = 0; + u64 dblock; + u32 extlen; + int error; + + error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen); + if (error) + return error; + if (!dblock) { + gfs2_consist_inode(ip); + return -EIO; + } + + *bh = gfs2_meta_ra(gl, dblock, extlen); + + return error; +} + +int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + int found = 0; + + list_for_each_entry(rr, head, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (found) { + rr->rr_where = where; + return 0; + } + + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL); + if (!rr) + return -ENOMEM; + + rr->rr_blkno = blkno; + rr->rr_where = where; + list_add(&rr->rr_list, head); + + return 1; +} + +int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) +{ + struct gfs2_revoke_replay *rr; + int wrap, a, b, revoke; + int found = 0; + + list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { + if (rr->rr_blkno == blkno) { + found = 1; + break; + } + } + + if (!found) + return 0; + + wrap = (rr->rr_where < sdp->sd_replay_tail); + a = (sdp->sd_replay_tail < where); + b = (where < rr->rr_where); + revoke = (wrap) ? (a || b) : (a && b); + + return revoke; +} + +void gfs2_revoke_clean(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_revoke_list; + struct gfs2_revoke_replay *rr; + + while (!list_empty(head)) { + rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list); + list_del(&rr->rr_list); + kfree(rr); + } +} + +/** + * get_log_header - read the log header for a given segment + * @jd: the journal + * @blk: the block to look at + * @lh: the log header to return + * + * Read the log header for a given segement in a given journal. Do a few + * sanity checks on it. + * + * Returns: 0 on success, + * 1 if the header was invalid or incomplete, + * errno on error + */ + +static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, + struct gfs2_log_header *head) +{ + struct buffer_head *bh; + struct gfs2_log_header lh; + u32 hash; + int error; + + error = gfs2_replay_read_block(jd, blk, &bh); + if (error) + return error; + + memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header)); + lh.lh_hash = 0; + hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header)); + gfs2_log_header_in(&lh, bh->b_data); + + brelse(bh); + + if (lh.lh_header.mh_magic != GFS2_MAGIC || + lh.lh_header.mh_type != GFS2_METATYPE_LH || + lh.lh_blkno != blk || lh.lh_hash != hash) + return 1; + + *head = lh; + + return 0; +} + +/** + * find_good_lh - find a good log header + * @jd: the journal + * @blk: the segment to start searching from + * @lh: the log header to fill in + * @forward: if true search forward in the log, else search backward + * + * Call get_log_header() to get a log header for a segment, but if the + * segment is bad, either scan forward or backward until we find a good one. + * + * Returns: errno + */ + +static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, + struct gfs2_log_header *head) +{ + unsigned int orig_blk = *blk; + int error; + + for (;;) { + error = get_log_header(jd, *blk, head); + if (error <= 0) + return error; + + if (++*blk == jd->jd_blocks) + *blk = 0; + + if (*blk == orig_blk) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + } +} + +/** + * jhead_scan - make sure we've found the head of the log + * @jd: the journal + * @head: this is filled in with the log descriptor of the head + * + * At this point, seg and lh should be either the head of the log or just + * before. Scan forward until we find the head. + * + * Returns: errno + */ + +static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head) +{ + unsigned int blk = head->lh_blkno; + struct gfs2_log_header lh; + int error; + + for (;;) { + if (++blk == jd->jd_blocks) + blk = 0; + + error = get_log_header(jd, blk, &lh); + if (error < 0) + return error; + if (error == 1) + continue; + + if (lh.lh_sequence == head->lh_sequence) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + return -EIO; + } + if (lh.lh_sequence < head->lh_sequence) + break; + + *head = lh; + } + + return 0; +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: the journal + * @head: the log descriptor for the head of the log is returned here + * + * Do a binary search of a journal and find the valid log entry with the + * highest sequence number. (i.e. the log head) + * + * Returns: errno + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head) +{ + struct gfs2_log_header lh_1, lh_m; + u32 blk_1, blk_2, blk_m; + int error; + + blk_1 = 0; + blk_2 = jd->jd_blocks - 1; + + for (;;) { + blk_m = (blk_1 + blk_2) / 2; + + error = find_good_lh(jd, &blk_1, &lh_1); + if (error) + return error; + + error = find_good_lh(jd, &blk_m, &lh_m); + if (error) + return error; + + if (blk_1 == blk_m || blk_m == blk_2) + break; + + if (lh_1.lh_sequence <= lh_m.lh_sequence) + blk_1 = blk_m; + else + blk_2 = blk_m; + } + + error = jhead_scan(jd, &lh_1); + if (error) + return error; + + *head = lh_1; + + return error; +} + +/** + * foreach_descriptor - go through the active part of the log + * @jd: the journal + * @start: the first log header in the active region + * @end: the last log header (don't process the contents of this entry)) + * + * Call a given function once for every log descriptor in the active + * portion of the log. + * + * Returns: errno + */ + +static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, + unsigned int end, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct buffer_head *bh; + struct gfs2_log_descriptor *ld; + int error = 0; + u32 length; + __be64 *ptr; + unsigned int offset = sizeof(struct gfs2_log_descriptor); + offset += sizeof(__be64) - 1; + offset &= ~(sizeof(__be64) - 1); + + while (start != end) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + if (gfs2_meta_check(sdp, bh)) { + brelse(bh); + return -EIO; + } + ld = (struct gfs2_log_descriptor *)bh->b_data; + length = be32_to_cpu(ld->ld_length); + + if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) { + struct gfs2_log_header lh; + error = get_log_header(jd, start, &lh); + if (!error) { + gfs2_replay_incr_blk(sdp, &start); + brelse(bh); + continue; + } + if (error == 1) { + gfs2_consist_inode(GFS2_I(jd->jd_inode)); + error = -EIO; + } + brelse(bh); + return error; + } else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) { + brelse(bh); + return -EIO; + } + ptr = (__be64 *)(bh->b_data + offset); + error = lops_scan_elements(jd, start, ld, ptr, pass); + if (error) { + brelse(bh); + return error; + } + + while (length--) + gfs2_replay_incr_blk(sdp, &start); + + brelse(bh); + } + + return 0; +} + +/** + * clean_journal - mark a dirty journal as being clean + * @sdp: the filesystem + * @jd: the journal + * @gl: the journal's glock + * @head: the head journal to start from + * + * Returns: errno + */ + +static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int lblock; + struct gfs2_log_header *lh; + u32 hash; + struct buffer_head *bh; + int error; + struct buffer_head bh_map; + + lblock = head->lh_blkno; + gfs2_replay_incr_blk(sdp, &lblock); + error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map, 1); + if (error) + return error; + if (!bh_map.b_blocknr) { + gfs2_consist_inode(ip); + return -EIO; + } + + bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + unlock_buffer(bh); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1); + lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT); + lh->lh_blkno = cpu_to_be32(lblock); + hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + set_buffer_dirty(bh); + if (sync_dirty_buffer(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + return error; +} + +/** + * gfs2_recover_journal - recovery a given journal + * @jd: the struct gfs2_jdesc describing the journal + * + * Acquire the journal's lock, check to see if the journal is clean, and + * do recovery if necessary. + * + * Returns: errno + */ + +int gfs2_recover_journal(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header head; + struct gfs2_holder j_gh, ji_gh, t_gh; + unsigned long t; + int ro = 0; + unsigned int pass; + int error; + + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", + jd->jd_jid); + + /* Aquire the journal lock so we can do recovery */ + + error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, + &j_gh); + switch (error) { + case 0: + break; + + case GLR_TRYFAILED: + fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); + error = 0; + + default: + goto fail; + }; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP, &ji_gh); + if (error) + goto fail_gunlock_j; + } else { + fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); + } + + fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); + + error = gfs2_jdesc_check(jd); + if (error) + goto fail_gunlock_ji; + + error = gfs2_find_jhead(jd, &head); + if (error) + goto fail_gunlock_ji; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", + jd->jd_jid); + + t = jiffies; + + /* Acquire a shared hold on the transaction lock */ + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | LM_FLAG_PRIORITY | + GL_NOCANCEL | GL_NOCACHE, &t_gh); + if (error) + goto fail_gunlock_ji; + + if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + ro = 1; + } else { + if (sdp->sd_vfs->s_flags & MS_RDONLY) + ro = 1; + } + + if (ro) { + fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n", + jd->jd_jid); + error = -EROFS; + goto fail_gunlock_tr; + } + + fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); + + for (pass = 0; pass < 2; pass++) { + lops_before_scan(jd, &head, pass); + error = foreach_descriptor(jd, head.lh_tail, + head.lh_blkno, pass); + lops_after_scan(jd, error, pass); + if (error) + goto fail_gunlock_tr; + } + + error = clean_journal(jd, &head); + if (error) + goto fail_gunlock_tr; + + gfs2_glock_dq_uninit(&t_gh); + t = DIV_ROUND_UP(jiffies - t, HZ); + fs_info(sdp, "jid=%u: Journal replayed in %lus\n", + jd->jd_jid, t); + } + + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) + gfs2_glock_dq_uninit(&ji_gh); + + gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) + gfs2_glock_dq_uninit(&j_gh); + + fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); + return 0; + +fail_gunlock_tr: + gfs2_glock_dq_uninit(&t_gh); +fail_gunlock_ji: + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + gfs2_glock_dq_uninit(&ji_gh); +fail_gunlock_j: + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); + +fail: + gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); + return error; +} + +/** + * gfs2_check_journals - Recover any dirty journals + * @sdp: the filesystem + * + */ + +void gfs2_check_journals(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd; + + for (;;) { + jd = gfs2_jdesc_find_dirty(sdp); + if (!jd) + break; + + if (jd != sdp->sd_jdesc) + gfs2_recover_journal(jd); + } +} + diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h new file mode 100644 index 000000000000..961feedf4d8b --- /dev/null +++ b/fs/gfs2/recovery.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RECOVERY_DOT_H__ +#define __RECOVERY_DOT_H__ + +#include "incore.h" + +static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) +{ + if (++*blk == sdp->sd_jdesc->jd_blocks) + *blk = 0; +} + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh); + +int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +void gfs2_revoke_clean(struct gfs2_sbd *sdp); + +int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header *head); +int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); +void gfs2_check_journals(struct gfs2_sbd *sdp); + +#endif /* __RECOVERY_DOT_H__ */ + diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c new file mode 100644 index 000000000000..b261385c0065 --- /dev/null +++ b/fs/gfs2/rgrp.c @@ -0,0 +1,1513 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "ops_file.h" +#include "util.h" + +#define BFITNOENT ((u32)~0) + +/* + * These routines are used by the resource group routines (rgrp.c) + * to keep track of block allocation. Each block is represented by two + * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks. + * + * 0 = Free + * 1 = Used (not metadata) + * 2 = Unlinked (still in use) inode + * 3 = Used (metadata) + */ + +static const char valid_change[16] = { + /* current */ + /* n */ 0, 1, 1, 1, + /* e */ 1, 0, 0, 0, + /* w */ 0, 0, 0, 1, + 1, 0, 0, 0 +}; + +/** + * gfs2_setbit - Set a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to set + * @new_state: the new state of the block + * + */ + +static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, u32 block, + unsigned char new_state) +{ + unsigned char *byte, *end, cur_state; + unsigned int bit; + + byte = buffer + (block / GFS2_NBBY); + bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + + gfs2_assert(rgd->rd_sbd, byte < end); + + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + + if (valid_change[new_state * 4 + cur_state]) { + *byte ^= cur_state << bit; + *byte |= new_state << bit; + } else + gfs2_consist_rgrpd(rgd); +} + +/** + * gfs2_testbit - test a bit in the bitmaps + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @block: the block to read + * + */ + +static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, u32 block) +{ + unsigned char *byte, *end, cur_state; + unsigned int bit; + + byte = buffer + (block / GFS2_NBBY); + bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + + gfs2_assert(rgd->rd_sbd, byte < end); + + cur_state = (*byte >> bit) & GFS2_BIT_MASK; + + return cur_state; +} + +/** + * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing + * a block in a given allocation state. + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @goal: start search at this block's bit-pair (within @buffer) + * @old_state: GFS2_BLKST_XXX the state of the block we're looking for; + * bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0) + * + * Scope of @goal and returned block number is only within this bitmap buffer, + * not entire rgrp or filesystem. @buffer will be offset from the actual + * beginning of a bitmap block buffer, skipping any header structures. + * + * Return: the block number (bitmap buffer scope) that was found + */ + +static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, u32 goal, + unsigned char old_state) +{ + unsigned char *byte, *end, alloc; + u32 blk = goal; + unsigned int bit; + + byte = buffer + (goal / GFS2_NBBY); + bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; + end = buffer + buflen; + alloc = (old_state & 1) ? 0 : 0x55; + + while (byte < end) { + if ((*byte & 0x55) == alloc) { + blk += (8 - bit) >> 1; + + bit = 0; + byte++; + + continue; + } + + if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) + return blk; + + bit += GFS2_BIT_SIZE; + if (bit >= 8) { + bit = 0; + byte++; + } + + blk++; + } + + return BFITNOENT; +} + +/** + * gfs2_bitcount - count the number of bits in a certain state + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @state: the state of the block we're looking for + * + * Returns: The number of bits + */ + +static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer, + unsigned int buflen, unsigned char state) +{ + unsigned char *byte = buffer; + unsigned char *end = buffer + buflen; + unsigned char state1 = state << 2; + unsigned char state2 = state << 4; + unsigned char state3 = state << 6; + u32 count = 0; + + for (; byte < end; byte++) { + if (((*byte) & 0x03) == state) + count++; + if (((*byte) & 0x0C) == state1) + count++; + if (((*byte) & 0x30) == state2) + count++; + if (((*byte) & 0xC0) == state3) + count++; + } + + return count; +} + +/** + * gfs2_rgrp_verify - Verify that a resource group is consistent + * @sdp: the filesystem + * @rgd: the rgrp + * + */ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi = NULL; + u32 length = rgd->rd_ri.ri_length; + u32 count[4], tmp; + int buf, x; + + memset(count, 0, 4 * sizeof(u32)); + + /* Count # blocks in each of 4 possible allocation states */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + for (x = 0; x < 4; x++) + count[x] += gfs2_bitcount(rgd, + bi->bi_bh->b_data + + bi->bi_offset, + bi->bi_len, x); + } + + if (count[0] != rgd->rd_rg.rg_free) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "free data mismatch: %u != %u\n", + count[0], rgd->rd_rg.rg_free); + return; + } + + tmp = rgd->rd_ri.ri_data - + rgd->rd_rg.rg_free - + rgd->rd_rg.rg_dinodes; + if (count[1] + count[2] != tmp) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used data mismatch: %u != %u\n", + count[1], tmp); + return; + } + + if (count[3] != rgd->rd_rg.rg_dinodes) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "used metadata mismatch: %u != %u\n", + count[3], rgd->rd_rg.rg_dinodes); + return; + } + + if (count[2] > count[3]) { + if (gfs2_consist_rgrpd(rgd)) + fs_err(sdp, "unlinked inodes > inodes: %u\n", + count[2]); + return; + } + +} + +static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block) +{ + u64 first = ri->ri_data0; + u64 last = first + ri->ri_data; + return first <= block && block < last; +} + +/** + * gfs2_blk2rgrpd - Find resource group for a given data/meta block number + * @sdp: The GFS2 superblock + * @n: The data block number + * + * Returns: The resource group, or NULL if not found + */ + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + + list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) { + if (rgrp_contains_block(&rgd->rd_ri, blk)) { + list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; + } + } + + spin_unlock(&sdp->sd_rindex_spin); + + return NULL; +} + +/** + * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem + * @sdp: The GFS2 superblock + * + * Returns: The first rgrp in the filesystem + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) +{ + gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list)); + return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list); +} + +/** + * gfs2_rgrpd_get_next - get the next RG + * @rgd: A RG + * + * Returns: The next rgrp + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) +{ + if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list) + return NULL; + return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list); +} + +static void clear_rgrpdi(struct gfs2_sbd *sdp) +{ + struct list_head *head; + struct gfs2_rgrpd *rgd; + struct gfs2_glock *gl; + + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = NULL; + head = &sdp->sd_rindex_recent_list; + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); + list_del(&rgd->rd_recent); + } + spin_unlock(&sdp->sd_rindex_spin); + + head = &sdp->sd_rindex_list; + while (!list_empty(head)) { + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list); + gl = rgd->rd_gl; + + list_del(&rgd->rd_list); + list_del(&rgd->rd_list_mru); + + if (gl) { + gl->gl_object = NULL; + gfs2_glock_put(gl); + } + + kfree(rgd->rd_bits); + kfree(rgd); + } +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_rindex_mutex); + clear_rgrpdi(sdp); + mutex_unlock(&sdp->sd_rindex_mutex); +} + +/** + * gfs2_compute_bitstructs - Compute the bitmap sizes + * @rgd: The resource group descriptor + * + * Calculates bitmap descriptors, one for each block that contains bitmap data + * + * Returns: errno + */ + +static int compute_bitstructs(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi; + u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */ + u32 bytes_left, bytes; + int x; + + if (!length) + return -EINVAL; + + rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS); + if (!rgd->rd_bits) + return -ENOMEM; + + bytes_left = rgd->rd_ri.ri_bitbytes; + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + + /* small rgrp; bitmap stored completely in header block */ + if (length == 1) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* header block */ + } else if (x == 0) { + bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_len = bytes; + /* last block */ + } else if (x + 1 == length) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; + bi->bi_len = bytes; + /* other blocks */ + } else { + bytes = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header); + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left; + bi->bi_len = bytes; + } + + bytes_left -= bytes; + } + + if (bytes_left) { + gfs2_consist_rgrpd(rgd); + return -EIO; + } + bi = rgd->rd_bits + (length - 1); + if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) { + if (gfs2_consist_rgrpd(rgd)) { + gfs2_rindex_print(&rgd->rd_ri); + fs_err(sdp, "start=%u len=%u offset=%u\n", + bi->bi_start, bi->bi_len, bi->bi_offset); + } + return -EIO; + } + + return 0; +} + +/** + * gfs2_ri_update - Pull in a new resource index from the disk + * @gl: The glock covering the rindex inode + * + * Returns: 0 on successful update, error code otherwise + */ + +static int gfs2_ri_update(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + struct gfs2_rgrpd *rgd; + char buf[sizeof(struct gfs2_rindex)]; + struct file_ra_state ra_state; + u64 junk = ip->i_di.di_size; + int error; + + if (do_div(junk, sizeof(struct gfs2_rindex))) { + gfs2_consist_inode(ip); + return -EIO; + } + + clear_rgrpdi(sdp); + + file_ra_state_init(&ra_state, inode->i_mapping); + for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { + loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + error = gfs2_internal_read(ip, &ra_state, buf, &pos, + sizeof(struct gfs2_rindex)); + if (!error) + break; + if (error != sizeof(struct gfs2_rindex)) { + if (error > 0) + error = -EIO; + goto fail; + } + + rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); + error = -ENOMEM; + if (!rgd) + goto fail; + + mutex_init(&rgd->rd_mutex); + lops_init_le(&rgd->rd_le, &gfs2_rg_lops); + rgd->rd_sbd = sdp; + + list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list); + list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list); + + gfs2_rindex_in(&rgd->rd_ri, buf); + error = compute_bitstructs(rgd); + if (error) + goto fail; + + error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr, + &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); + if (error) + goto fail; + + rgd->rd_gl->gl_object = rgd; + rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; + } + + sdp->sd_rindex_vn = ip->i_gl->gl_vn; + return 0; + +fail: + clear_rgrpdi(sdp); + return error; +} + +/** + * gfs2_rindex_hold - Grab a lock on the rindex + * @sdp: The GFS2 superblock + * @ri_gh: the glock holder + * + * We grab a lock on the rindex inode to make sure that it doesn't + * change whilst we are performing an operation. We keep this lock + * for quite long periods of time compared to other locks. This + * doesn't matter, since it is shared and it is very, very rarely + * accessed in the exclusive mode (i.e. only when expanding the filesystem). + * + * This makes sure that we're using the latest copy of the resource index + * special file, which might have been updated if someone expanded the + * filesystem (via gfs2_grow utility), which adds new resource groups. + * + * Returns: 0 on success, error code otherwise + */ + +int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex); + struct gfs2_glock *gl = ip->i_gl; + int error; + + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh); + if (error) + return error; + + /* Read new copy from disk if we don't have the latest */ + if (sdp->sd_rindex_vn != gl->gl_vn) { + mutex_lock(&sdp->sd_rindex_mutex); + if (sdp->sd_rindex_vn != gl->gl_vn) { + error = gfs2_ri_update(ip); + if (error) + gfs2_glock_dq_uninit(ri_gh); + } + mutex_unlock(&sdp->sd_rindex_mutex); + } + + return error; +} + +/** + * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + * Read in all of a Resource Group's header and bitmap blocks. + * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps. + * + * Returns: errno + */ + +int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_glock *gl = rgd->rd_gl; + unsigned int length = rgd->rd_ri.ri_length; + struct gfs2_bitmap *bi; + unsigned int x, y; + int error; + + mutex_lock(&rgd->rd_mutex); + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_bh_count) { + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + mutex_unlock(&rgd->rd_mutex); + return 0; + } + spin_unlock(&sdp->sd_rindex_spin); + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh); + if (error) + goto fail; + } + + for (y = length; y--;) { + bi = rgd->rd_bits + y; + error = gfs2_meta_wait(sdp, bi->bi_bh); + if (error) + goto fail; + if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB : + GFS2_METATYPE_RG)) { + error = -EIO; + goto fail; + } + } + + if (rgd->rd_rg_vn != gl->gl_vn) { + gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data); + rgd->rd_rg_vn = gl->gl_vn; + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_rg.rg_free; + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); + + mutex_unlock(&rgd->rd_mutex); + + return 0; + +fail: + while (x--) { + bi = rgd->rd_bits + x; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + gfs2_assert_warn(sdp, !bi->bi_clone); + } + mutex_unlock(&rgd->rd_mutex); + + return error; +} + +void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + rgd->rd_bh_count++; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @rgd: the struct gfs2_rgrpd describing the RG to read in + * + */ + +void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int x, length = rgd->rd_ri.ri_length; + + spin_lock(&sdp->sd_rindex_spin); + gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count); + if (--rgd->rd_bh_count) { + spin_unlock(&sdp->sd_rindex_spin); + return; + } + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + brelse(bi->bi_bh); + bi->bi_bh = NULL; + } + + spin_unlock(&sdp->sd_rindex_spin); +} + +void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + unsigned int length = rgd->rd_ri.ri_length; + unsigned int x; + + for (x = 0; x < length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + if (!bi->bi_clone) + continue; + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); + } + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone = rgd->rd_rg.rg_free; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode + * @ip: the incore GFS2 inode structure + * + * Returns: the struct gfs2_alloc + */ + +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) +{ + struct gfs2_alloc *al = &ip->i_alloc; + + /* FIXME: Should assert that the correct locks are held here... */ + memset(al, 0, sizeof(*al)); + return al; +} + +/** + * try_rgrp_fit - See if a given reservation will fit in a given RG + * @rgd: the RG data + * @al: the struct gfs2_alloc structure describing the reservation + * + * If there's room for the requested blocks to be allocated from the RG: + * Sets the $al_reserved_data field in @al. + * Sets the $al_reserved_meta field in @al. + * Sets the $al_rgd field in @al. + * + * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) + */ + +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int ret = 0; + + spin_lock(&sdp->sd_rindex_spin); + if (rgd->rd_free_clone >= al->al_requested) { + al->al_rgd = rgd; + ret = 1; + } + spin_unlock(&sdp->sd_rindex_spin); + + return ret; +} + +/** + * recent_rgrp_first - get first RG from "recent" list + * @sdp: The GFS2 superblock + * @rglast: address of the rgrp used last + * + * Returns: The first rgrp in the recent list + */ + +static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, + u64 rglast) +{ + struct gfs2_rgrpd *rgd = NULL; + + spin_lock(&sdp->sd_rindex_spin); + + if (list_empty(&sdp->sd_rindex_recent_list)) + goto out; + + if (!rglast) + goto first; + + list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { + if (rgd->rd_ri.ri_addr == rglast) + goto out; + } + +first: + rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd, + rd_recent); +out: + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +/** + * recent_rgrp_next - get next RG from "recent" list + * @cur_rgd: current rgrp + * @remove: + * + * Returns: The next rgrp in the recent list + */ + +static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd, + int remove) +{ + struct gfs2_sbd *sdp = cur_rgd->rd_sbd; + struct list_head *head; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + + head = &sdp->sd_rindex_recent_list; + + list_for_each_entry(rgd, head, rd_recent) { + if (rgd == cur_rgd) { + if (cur_rgd->rd_recent.next != head) + rgd = list_entry(cur_rgd->rd_recent.next, + struct gfs2_rgrpd, rd_recent); + else + rgd = NULL; + + if (remove) + list_del(&cur_rgd->rd_recent); + + goto out; + } + } + + rgd = NULL; + if (!list_empty(head)) + rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent); + +out: + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +/** + * recent_rgrp_add - add an RG to tail of "recent" list + * @new_rgd: The rgrp to add + * + */ + +static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd) +{ + struct gfs2_sbd *sdp = new_rgd->rd_sbd; + struct gfs2_rgrpd *rgd; + unsigned int count = 0; + unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp); + + spin_lock(&sdp->sd_rindex_spin); + + list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) { + if (rgd == new_rgd) + goto out; + + if (++count >= max) + goto out; + } + list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list); + +out: + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * forward_rgrp_get - get an rgrp to try next from full list + * @sdp: The GFS2 superblock + * + * Returns: The rgrp to try next + */ + +static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd; + unsigned int journals = gfs2_jindex_size(sdp); + unsigned int rg = 0, x; + + spin_lock(&sdp->sd_rindex_spin); + + rgd = sdp->sd_rindex_forward; + if (!rgd) { + if (sdp->sd_rgrps >= journals) + rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals; + + for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg; + x++, rgd = gfs2_rgrpd_get_next(rgd)) + /* Do Nothing */; + + sdp->sd_rindex_forward = rgd; + } + + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; +} + +/** + * forward_rgrp_set - set the forward rgrp pointer + * @sdp: the filesystem + * @rgd: The new forward rgrp + * + */ + +static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) +{ + spin_lock(&sdp->sd_rindex_spin); + sdp->sd_rindex_forward = rgd; + spin_unlock(&sdp->sd_rindex_spin); +} + +/** + * get_local_rgrp - Choose and lock a rgrp for allocation + * @ip: the inode to reserve space for + * @rgp: the chosen and locked rgrp + * + * Try to acquire rgrp in way which avoids contending with others. + * + * Returns: errno + */ + +static int get_local_rgrp(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd, *begin = NULL; + struct gfs2_alloc *al = &ip->i_alloc; + int flags = LM_FLAG_TRY; + int skipped = 0; + int loops = 0; + int error; + + /* Try recently successful rgrps */ + + rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); + + while (rgd) { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_TRY, &al->al_rgd_gh); + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + gfs2_glock_dq_uninit(&al->al_rgd_gh); + rgd = recent_rgrp_next(rgd, 1); + break; + + case GLR_TRYFAILED: + rgd = recent_rgrp_next(rgd, 0); + break; + + default: + return error; + } + } + + /* Go through full list of rgrps */ + + begin = rgd = forward_rgrp_get(sdp); + + for (;;) { + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags, + &al->al_rgd_gh); + switch (error) { + case 0: + if (try_rgrp_fit(rgd, al)) + goto out; + gfs2_glock_dq_uninit(&al->al_rgd_gh); + break; + + case GLR_TRYFAILED: + skipped++; + break; + + default: + return error; + } + + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + + if (rgd == begin) { + if (++loops >= 2 || !skipped) + return -ENOSPC; + flags = 0; + } + } + +out: + ip->i_last_rg_alloc = rgd->rd_ri.ri_addr; + + if (begin) { + recent_rgrp_add(rgd); + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd) + rgd = gfs2_rgrpd_get_first(sdp); + forward_rgrp_set(sdp, rgd); + } + + return 0; +} + +/** + * gfs2_inplace_reserve_i - Reserve space in the filesystem + * @ip: the inode to reserve space for + * + * Returns: errno + */ + +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + int error; + + if (gfs2_assert_warn(sdp, al->al_requested)) + return -EINVAL; + + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + if (error) + return error; + + error = get_local_rgrp(ip); + if (error) { + gfs2_glock_dq_uninit(&al->al_ri_gh); + return error; + } + + al->al_file = file; + al->al_line = line; + + return 0; +} + +/** + * gfs2_inplace_release - release an inplace reservation + * @ip: the inode the reservation was taken out on + * + * Release a reservation made by gfs2_inplace_reserve(). + */ + +void gfs2_inplace_release(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + + if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1) + fs_warn(sdp, "al_alloced = %u, al_requested = %u " + "al_file = %s, al_line = %u\n", + al->al_alloced, al->al_requested, al->al_file, + al->al_line); + + al->al_rgd = NULL; + gfs2_glock_dq_uninit(&al->al_rgd_gh); + gfs2_glock_dq_uninit(&al->al_ri_gh); +} + +/** + * gfs2_get_block_type - Check a block in a RG is of given type + * @rgd: the resource group holding the block + * @block: the block number + * + * Returns: The block type (GFS2_BLKST_*) + */ + +unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +{ + struct gfs2_bitmap *bi = NULL; + u32 length, rgrp_block, buf_block; + unsigned int buf; + unsigned char type; + + length = rgd->rd_ri.ri_length; + rgrp_block = block - rgd->rd_ri.ri_data0; + + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + buf_block = rgrp_block - bi->bi_start * GFS2_NBBY; + + type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, buf_block); + + return type; +} + +/** + * rgblk_search - find a block in @old_state, change allocation + * state to @new_state + * @rgd: the resource group descriptor + * @goal: the goal block within the RG (start here to search for avail block) + * @old_state: GFS2_BLKST_XXX the before-allocation state to find + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * + * Walk rgrp's bitmap to find bits that represent a block in @old_state. + * Add the found bitmap buffer to the transaction. + * Set the found bits to @new_state to change block's allocation state. + * + * This function never fails, because we wouldn't call it unless we + * know (from reservation results, etc.) that a block is available. + * + * Scope of @goal and returned block is just within rgrp, not the whole + * filesystem. + * + * Returns: the block number allocated + */ + +static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, + unsigned char old_state, unsigned char new_state) +{ + struct gfs2_bitmap *bi = NULL; + u32 length = rgd->rd_ri.ri_length; + u32 blk = 0; + unsigned int buf, x; + + /* Find bitmap block that contains bits for goal block */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + + /* Convert scope of "goal" from rgrp-wide to within found bit block */ + goal -= bi->bi_start * GFS2_NBBY; + + /* Search (up to entire) bitmap in this rgrp for allocatable block. + "x <= length", instead of "x < length", because we typically start + the search in the middle of a bit block, but if we can't find an + allocatable block anywhere else, we want to be able wrap around and + search in the first part of our first-searched bit block. */ + for (x = 0; x <= length; x++) { + if (bi->bi_clone) + blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset, + bi->bi_len, goal, old_state); + else + blk = gfs2_bitfit(rgd, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, goal, old_state); + if (blk != BFITNOENT) + break; + + /* Try next bitmap block (wrap back to rgrp header if at end) */ + buf = (buf + 1) % length; + bi = rgd->rd_bits + buf; + goal = 0; + } + + if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length)) + blk = 0; + + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, blk, new_state); + if (bi->bi_clone) + gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, + bi->bi_len, blk, new_state); + + return bi->bi_start * GFS2_NBBY + blk; +} + +/** + * rgblk_free - Change alloc state of given block(s) + * @sdp: the filesystem + * @bstart: the start of a run of blocks to free + * @blen: the length of the block run (all must lie within ONE RG!) + * @new_state: GFS2_BLKST_XXX the after-allocation block state + * + * Returns: Resource group containing the block(s) + */ + +static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, + u32 blen, unsigned char new_state) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_bitmap *bi = NULL; + u32 length, rgrp_blk, buf_blk; + unsigned int buf; + + rgd = gfs2_blk2rgrpd(sdp, bstart); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); + return NULL; + } + + length = rgd->rd_ri.ri_length; + + rgrp_blk = bstart - rgd->rd_ri.ri_data0; + + while (blen--) { + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY) + break; + } + + gfs2_assert(rgd->rd_sbd, buf < length); + + buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY; + rgrp_blk++; + + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len); + } + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len, buf_blk, new_state); + } + + return rgd; +} + +/** + * gfs2_alloc_data - Allocate a data block + * @ip: the inode to allocate the data block for + * + * Returns: the allocated block + */ + +u64 gfs2_alloc_data(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + u32 goal, blk; + u64 block; + + if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data)) + goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0; + else + goal = rgd->rd_last_alloc_data; + + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); + rgd->rd_last_alloc_data = blk; + + block = rgd->rd_ri.ri_data0 + blk; + ip->i_di.di_goal_data = block; + + gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); + rgd->rd_rg.rg_free--; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, 0); + gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + + return block; +} + +/** + * gfs2_alloc_meta - Allocate a metadata block + * @ip: the inode to allocate the metadata block for + * + * Returns: the allocated block + */ + +u64 gfs2_alloc_meta(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al = &ip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + u32 goal, blk; + u64 block; + + if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta)) + goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0; + else + goal = rgd->rd_last_alloc_meta; + + blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); + rgd->rd_last_alloc_meta = blk; + + block = rgd->rd_ri.ri_data0 + blk; + ip->i_di.di_goal_meta = block; + + gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); + rgd->rd_rg.rg_free--; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, 0); + gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid); + gfs2_trans_add_unrevoke(sdp, block); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + + return block; +} + +/** + * gfs2_alloc_di - Allocate a dinode + * @dip: the directory that the inode is going in + * + * Returns: the block allocated + */ + +u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc *al = &dip->i_alloc; + struct gfs2_rgrpd *rgd = al->al_rgd; + u32 blk; + u64 block; + + blk = rgblk_search(rgd, rgd->rd_last_alloc_meta, + GFS2_BLKST_FREE, GFS2_BLKST_DINODE); + + rgd->rd_last_alloc_meta = blk; + + block = rgd->rd_ri.ri_data0 + blk; + + gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); + rgd->rd_rg.rg_free--; + rgd->rd_rg.rg_dinodes++; + *generation = rgd->rd_rg.rg_igeneration++; + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + al->al_alloced++; + + gfs2_statfs_change(sdp, 0, -1, +1); + gfs2_trans_add_unrevoke(sdp, block); + + spin_lock(&sdp->sd_rindex_spin); + rgd->rd_free_clone--; + spin_unlock(&sdp->sd_rindex_spin); + + return block; +} + +/** + * gfs2_free_data - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + + rgd->rd_rg.rg_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(s64)blen, + ip->i_di.di_uid, ip->i_di.di_gid); +} + +/** + * gfs2_free_meta - free a contiguous run of data block(s) + * @ip: the inode these blocks are being freed from + * @bstart: first block of a run of contiguous blocks + * @blen: the length of the block run + * + */ + +void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + + rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE); + if (!rgd) + return; + + rgd->rd_rg.rg_free += blen; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_trans_add_rg(rgd); + + gfs2_statfs_change(sdp, 0, +blen, 0); + gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid); + gfs2_meta_wipe(ip, bstart, blen); +} + +void gfs2_unlink_di(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_rgrpd *rgd; + u64 blkno = ip->i_num.no_addr; + + rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED); + if (!rgd) + return; + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + gfs2_trans_add_rg(rgd); +} + +static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_rgrpd *tmp_rgd; + + tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE); + if (!tmp_rgd) + return; + gfs2_assert_withdraw(sdp, rgd == tmp_rgd); + + if (!rgd->rd_rg.rg_dinodes) + gfs2_consist_rgrpd(rgd); + rgd->rd_rg.rg_dinodes--; + rgd->rd_rg.rg_free++; + + gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); + gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); + + gfs2_statfs_change(sdp, 0, +1, -1); + gfs2_trans_add_rg(rgd); +} + + +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) +{ + gfs2_free_uninit_di(rgd, ip->i_num.no_addr); + gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid); + gfs2_meta_wipe(ip, ip->i_num.no_addr, 1); +} + +/** + * gfs2_rlist_add - add a RG to a list of RGs + * @sdp: the filesystem + * @rlist: the list of resource groups + * @block: the block + * + * Figure out what RG a block belongs to and add that RG to the list + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd **tmp; + unsigned int new_space; + unsigned int x; + + if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) + return; + + rgd = gfs2_blk2rgrpd(sdp, block); + if (!rgd) { + if (gfs2_consist(sdp)) + fs_err(sdp, "block = %llu\n", (unsigned long long)block); + return; + } + + for (x = 0; x < rlist->rl_rgrps; x++) + if (rlist->rl_rgd[x] == rgd) + return; + + if (rlist->rl_rgrps == rlist->rl_space) { + new_space = rlist->rl_space + 10; + + tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS | __GFP_NOFAIL); + + if (rlist->rl_rgd) { + memcpy(tmp, rlist->rl_rgd, + rlist->rl_space * sizeof(struct gfs2_rgrpd *)); + kfree(rlist->rl_rgd); + } + + rlist->rl_space = new_space; + rlist->rl_rgd = tmp; + } + + rlist->rl_rgd[rlist->rl_rgrps++] = rgd; +} + +/** + * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate + * and initialize an array of glock holders for them + * @rlist: the list of resource groups + * @state: the lock state to acquire the RG lock in + * @flags: the modifier flags for the holder structures + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, + int flags) +{ + unsigned int x; + + rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), + GFP_NOFS | __GFP_NOFAIL); + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, + state, flags, + &rlist->rl_ghs[x]); +} + +/** + * gfs2_rlist_free - free a resource group list + * @list: the list of resource groups + * + */ + +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) +{ + unsigned int x; + + kfree(rlist->rl_rgd); + + if (rlist->rl_ghs) { + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_uninit(&rlist->rl_ghs[x]); + kfree(rlist->rl_ghs); + } +} + diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h new file mode 100644 index 000000000000..9eedfd12bfff --- /dev/null +++ b/fs/gfs2/rgrp.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __RGRP_DOT_H__ +#define __RGRP_DOT_H__ + +struct gfs2_rgrpd; +struct gfs2_sbd; +struct gfs2_holder; + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); + +int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); +void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); +void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); + +void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); + +struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +static inline void gfs2_alloc_put(struct gfs2_inode *ip) +{ + return; /* Se we can see where ip->i_alloc is used */ +} + +int gfs2_inplace_reserve_i(struct gfs2_inode *ip, + char *file, unsigned int line); +#define gfs2_inplace_reserve(ip) \ +gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) + +void gfs2_inplace_release(struct gfs2_inode *ip); + +unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); + +u64 gfs2_alloc_data(struct gfs2_inode *ip); +u64 gfs2_alloc_meta(struct gfs2_inode *ip); +u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); + +void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +void gfs2_unlink_di(struct inode *inode); + +struct gfs2_rgrp_list { + unsigned int rl_rgrps; + unsigned int rl_space; + struct gfs2_rgrpd **rl_rgd; + struct gfs2_holder *rl_ghs; +}; + +void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block); +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, + int flags); +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); + +#endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c new file mode 100644 index 000000000000..6a78b1b32e25 --- /dev/null +++ b/fs/gfs2/super.c @@ -0,0 +1,976 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/crc32.h> +#include <linux/gfs2_ondisk.h> +#include <linux/bio.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" + +static const u32 gfs2_old_fs_formats[] = { + 0 +}; + +static const u32 gfs2_old_multihost_formats[] = { + 0 +}; + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_ilimit = 100; + gt->gt_ilimit_tries = 3; + gt->gt_ilimit_min = 1; + gt->gt_demote_secs = 300; + gt->gt_incore_log_blocks = 1024; + gt->gt_log_flush_secs = 60; + gt->gt_jindex_refresh_secs = 60; + gt->gt_scand_secs = 15; + gt->gt_recoverd_secs = 60; + gt->gt_logd_secs = 1; + gt->gt_quotad_secs = 5; + gt->gt_quota_simul_sync = 64; + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_quota_cache_secs = 300; + gt->gt_quota_quantum = 60; + gt->gt_atime_quantum = 3600; + gt->gt_new_files_jdata = 0; + gt->gt_new_files_directio = 0; + gt->gt_max_atomic_write = 4 << 20; + gt->gt_max_readahead = 1 << 18; + gt->gt_lockdump_size = 131072; + gt->gt_stall_secs = 600; + gt->gt_complain_secs = 10; + gt->gt_reclaim_limit = 5000; + gt->gt_entries_per_readdir = 32; + gt->gt_prefetch_secs = 10; + gt->gt_greedy_default = HZ / 10; + gt->gt_greedy_quantum = HZ / 40; + gt->gt_greedy_max = HZ / 4; + gt->gt_statfs_quantum = 30; + gt->gt_statfs_slow = 0; +} + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent) +{ + unsigned int x; + + if (sb->sb_header.mh_magic != GFS2_MAGIC || + sb->sb_header.mh_type != GFS2_METATYPE_SB) { + if (!silent) + printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. */ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + if (sb->sb_fs_format != GFS2_FORMAT_FS) { + for (x = 0; gfs2_old_fs_formats[x]; x++) + if (gfs2_old_fs_formats[x] == sb->sb_fs_format) + break; + + if (!gfs2_old_fs_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { + for (x = 0; gfs2_old_multihost_formats[x]; x++) + if (gfs2_old_multihost_formats[x] == + sb->sb_multihost_format) + break; + + if (!gfs2_old_multihost_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (!sdp->sd_args.ar_upgrade) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_INFO + "GFS2: Use the \"upgrade\" mount option to upgrade " + "the FS\n"); + printk(KERN_INFO "GFS2: See the manual for more details\n"); + return -EINVAL; + } + + return 0; +} + + +static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error) +{ + struct page *page = bio->bi_private; + if (bio->bi_size) + return 1; + + if (!error) + SetPageUptodate(page); + else + printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + unlock_page(page); + return 0; +} + +struct page *gfs2_read_super(struct super_block *sb, sector_t sector) +{ + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_KERNEL); + if (unlikely(!page)) + return NULL; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_KERNEL, 1); + if (unlikely(!bio)) { + __free_page(page); + return NULL; + } + + bio->bi_sector = sector; + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return NULL; + } + return page; +} + +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @gl: the glock for the superblock (assumed to be held) + * @silent: Don't print message if mount fails + * + */ + +int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + struct page *page; + char *sb; + + page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + if (!page) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return -EIO; + } + sb = kmap(page); + gfs2_sb_in(&sdp->sd_sb, sb); + kunmap(page); + __free_page(page); + + error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); + if (error) + return error; + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; +} + +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * This is very similar to the gfs2_rindex_hold() function, except that + * in general we hold the jindex lock for longer periods of time and + * we grab it far less frequently (in general) then the rgrp lock. + * + * Returns: errno + */ + +int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, + GL_LOCAL_EXCL, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + +/** + * gfs2_jindex_free - Clear all the journal index information + * @sdp: The GFS2 superblock + * + */ + +void gfs2_jindex_free(struct gfs2_sbd *sdp) +{ + struct list_head list; + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_add(&list, &sdp->sd_jindex_list); + list_del_init(&sdp->sd_jindex_list); + sdp->sd_journals = 0; + spin_unlock(&sdp->sd_jindex_spin); + + while (!list_empty(&list)) { + jd = list_entry(list.next, struct gfs2_jdesc, jd_list); + list_del(&jd->jd_list); + iput(jd->jd_inode); + kfree(jd); + } +} + +static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) +{ + struct gfs2_jdesc *jd; + int found = 0; + + list_for_each_entry(jd, head, jd_list) { + if (jd->jd_jid == jid) { + found = 1; + break; + } + } + + if (!found) + jd = NULL; + + return jd; +} + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + spin_unlock(&sdp->sd_jindex_spin); + + return jd; +} + +void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + if (jd) + jd->jd_dirty = 1; + spin_unlock(&sdp->sd_jindex_spin); +} + +struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd; + int found = 0; + + spin_lock(&sdp->sd_jindex_spin); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_dirty) { + jd->jd_dirty = 0; + found = 1; + break; + } + } + spin_unlock(&sdp->sd_jindex_spin); + + if (!found) + jd = NULL; + + return jd; +} + +int gfs2_jdesc_check(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + int ar; + int error; + + if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || + (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { + gfs2_consist_inode(ip); + return -EIO; + } + jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + + error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); + if (!error && ar) { + gfs2_consist_inode(ip); + error = -EIO; + } + + return error; +} + +/** + * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_holder t_gh; + struct gfs2_log_header head; + int error; + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + GL_LOCAL_EXCL, &t_gh); + if (error) + return error; + + gfs2_meta_cache_flush(ip); + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head); + if (error) + goto fail; + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + gfs2_consist(sdp); + error = -EIO; + goto fail; + } + + /* Initialize some head of the log stuff */ + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + + error = gfs2_quota_init(sdp); + if (error) + goto fail; + + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + gfs2_glock_dq_uninit(&t_gh); + + return 0; + +fail: + t_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&t_gh); + + return error; +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + gfs2_quota_sync(sdp); + gfs2_statfs_sync(sdp); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, + GL_LOCAL_EXCL | GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +int gfs2_statfs_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master; + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + if (sdp->sd_args.ar_spectator) { + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + } else { + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_m_bh; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_in(l_sc, l_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); + } + +out_m_bh: + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return 0; +} + +void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes) +{ + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + struct buffer_head *l_bh; + int error; + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + return; + + mutex_lock(&sdp->sd_statfs_mutex); + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + mutex_unlock(&sdp->sd_statfs_mutex); + + spin_lock(&sdp->sd_statfs_spin); + l_sc->sc_total += total; + l_sc->sc_free += free; + l_sc->sc_dinodes += dinodes; + gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + brelse(l_bh); +} + +int gfs2_statfs_sync(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + struct gfs2_holder gh; + struct buffer_head *m_bh, *l_bh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) { + spin_unlock(&sdp->sd_statfs_spin); + goto out_bh; + } + spin_unlock(&sdp->sd_statfs_spin); + + error = gfs2_meta_inode_buffer(l_ip, &l_bh); + if (error) + goto out_bh; + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out_bh2; + + mutex_lock(&sdp->sd_statfs_mutex); + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + mutex_unlock(&sdp->sd_statfs_mutex); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + + gfs2_trans_end(sdp); + +out_bh2: + brelse(l_bh); +out_bh: + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc) +{ + struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_ri.ri_data; + sc->sc_free += rgd->rd_rg.rg_free; + sc->sc_dinodes += rgd->rd_rg.rg_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +struct lfcc { + struct list_head list; + struct gfs2_holder gh; +}; + +/** + * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all + * journals are clean + * @sdp: the file system + * @state: the state to put the transaction lock into + * @t_gh: the hold on the transaction lock + * + * Returns: errno + */ + +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, + struct gfs2_holder *t_gh) +{ + struct gfs2_inode *ip; + struct gfs2_holder ji_gh; + struct gfs2_jdesc *jd; + struct lfcc *lfcc; + LIST_HEAD(list); + struct gfs2_log_header lh; + int error; + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) + return error; + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); + if (!lfcc) { + error = -ENOMEM; + goto out; + } + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); + if (error) { + kfree(lfcc); + goto out; + } + list_add(&lfcc->list, &list); + } + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED, + LM_FLAG_PRIORITY | GL_NOCACHE, + t_gh); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + error = gfs2_jdesc_check(jd); + if (error) + break; + error = gfs2_find_jhead(jd, &lh); + if (error) + break; + if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EBUSY; + break; + } + } + + if (error) + gfs2_glock_dq_uninit(t_gh); + +out: + while (!list_empty(&list)) { + lfcc = list_entry(list.next, struct lfcc, list); + list_del(&lfcc->list); + gfs2_glock_dq_uninit(&lfcc->gh); + kfree(lfcc); + } + gfs2_glock_dq_uninit(&ji_gh); + return error; +} + +/** + * gfs2_freeze_fs - freezes the file system + * @sdp: the file system + * + * This function flushes data and meta data for all machines by + * aquiring the transaction log exclusively. All journals are + * ensured to be in a clean state as well. + * + * Returns: errno + */ + +int gfs2_freeze_fs(struct gfs2_sbd *sdp) +{ + int error = 0; + + mutex_lock(&sdp->sd_freeze_lock); + + if (!sdp->sd_freeze_count++) { + error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (error) + sdp->sd_freeze_count--; + } + + mutex_unlock(&sdp->sd_freeze_lock); + + return error; +} + +/** + * gfs2_unfreeze_fs - unfreezes the file system + * @sdp: the file system + * + * This function allows the file system to proceed by unlocking + * the exclusively held transaction lock. Other GFS2 nodes are + * now free to acquire the lock shared and go on with their lives. + * + */ + +void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) +{ + mutex_lock(&sdp->sd_freeze_lock); + + if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + + mutex_unlock(&sdp->sd_freeze_lock); +} + diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h new file mode 100644 index 000000000000..5bb443ae0f59 --- /dev/null +++ b/fs/gfs2/super.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SUPER_DOT_H__ +#define __SUPER_DOT_H__ + +#include "incore.h" + +void gfs2_tune_init(struct gfs2_tune *gt); + +int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent); +int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); +struct page *gfs2_read_super(struct super_block *sb, sector_t sector); + +static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) +{ + unsigned int x; + spin_lock(&sdp->sd_jindex_spin); + x = sdp->sd_journals; + spin_unlock(&sdp->sd_jindex_spin); + return x; +} + +int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh); +void gfs2_jindex_free(struct gfs2_sbd *sdp); + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid); +struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp); +int gfs2_jdesc_check(struct gfs2_jdesc *jd); + +int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +int gfs2_make_fs_ro(struct gfs2_sbd *sdp); + +int gfs2_statfs_init(struct gfs2_sbd *sdp); +void gfs2_statfs_change(struct gfs2_sbd *sdp, + s64 total, s64 free, s64 dinodes); +int gfs2_statfs_sync(struct gfs2_sbd *sdp); +int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc); +int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc); + +int gfs2_freeze_fs(struct gfs2_sbd *sdp); +void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); + +#endif /* __SUPER_DOT_H__ */ + diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c new file mode 100644 index 000000000000..0e0ec988f731 --- /dev/null +++ b/fs/gfs2/sys.c @@ -0,0 +1,583 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> +#include <asm/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "lm.h" +#include "sys.h" +#include "super.h" +#include "glock.h" +#include "quota.h" +#include "util.h" + +char *gfs2_sys_margs; +spinlock_t gfs2_sys_margs_lock; + +static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id); +} + +static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); +} + +static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int count; + + mutex_lock(&sdp->sd_freeze_lock); + count = sdp->sd_freeze_count; + mutex_unlock(&sdp->sd_freeze_lock); + + return snprintf(buf, PAGE_SIZE, "%u\n", count); +} + +static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + ssize_t ret = len; + int error = 0; + int n = simple_strtol(buf, NULL, 0); + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + switch (n) { + case 0: + gfs2_unfreeze_fs(sdp); + break; + case 1: + error = gfs2_freeze_fs(sdp); + break; + default: + ret = -EINVAL; + } + + if (error) + fs_warn(sdp, "freeze %d error %d", n, error); + + return ret; +} + +static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags); + return snprintf(buf, PAGE_SIZE, "%u\n", b); +} + +static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: withdrawing from cluster at user's request\n", + sdp->sd_fsname); + return len; +} + +static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_statfs_sync(sdp); + return len; +} + +static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_gl_hash_clear(sdp, NO_WAIT); + return len; +} + +static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (simple_strtol(buf, NULL, 0) != 1) + return -EINVAL; + + gfs2_quota_sync(sdp); + return len; +} + +static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + gfs2_quota_refresh(sdp, 1, id); + return len; +} + +static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + id = simple_strtoul(buf, NULL, 0); + + gfs2_quota_refresh(sdp, 0, id); + return len; +} + +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +#define GFS2_ATTR(name, mode, show, store) \ +static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) + +GFS2_ATTR(id, 0444, id_show, NULL); +GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); +GFS2_ATTR(shrink, 0200, NULL, shrink_store); +GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); +GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); +GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); +GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); + +static struct attribute *gfs2_attrs[] = { + &gfs2_attr_id.attr, + &gfs2_attr_fsname.attr, + &gfs2_attr_freeze.attr, + &gfs2_attr_shrink.attr, + &gfs2_attr_withdraw.attr, + &gfs2_attr_statfs_sync.attr, + &gfs2_attr_quota_sync.attr, + &gfs2_attr_quota_refresh_user.attr, + &gfs2_attr_quota_refresh_group.attr, + NULL, +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + +static struct kobj_type gfs2_ktype = { + .default_attrs = gfs2_attrs, + .sysfs_ops = &gfs2_attr_ops, +}; + +static struct kset gfs2_kset = { + .subsys = &fs_subsys, + .kobj = {.name = "gfs2"}, + .ktype = &gfs2_ktype, +}; + +/* + * display struct lm_lockstruct fields + */ + +struct lockstruct_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); +}; + +#define LOCKSTRUCT_ATTR(name, fmt) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ +} \ +static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) + +LOCKSTRUCT_ATTR(jid, "%u\n"); +LOCKSTRUCT_ATTR(first, "%u\n"); +LOCKSTRUCT_ATTR(lvb_size, "%u\n"); +LOCKSTRUCT_ATTR(flags, "%d\n"); + +static struct attribute *lockstruct_attrs[] = { + &lockstruct_attr_jid.attr, + &lockstruct_attr_first.attr, + &lockstruct_attr_lvb_size.attr, + &lockstruct_attr_flags.attr, + NULL, +}; + +/* + * display struct gfs2_args fields + */ + +struct args_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); +}; + +#define ARGS_ATTR(name, fmt) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ +} \ +static struct args_attr args_attr_##name = __ATTR_RO(name) + +ARGS_ATTR(lockproto, "%s\n"); +ARGS_ATTR(locktable, "%s\n"); +ARGS_ATTR(hostdata, "%s\n"); +ARGS_ATTR(spectator, "%d\n"); +ARGS_ATTR(ignore_local_fs, "%d\n"); +ARGS_ATTR(localcaching, "%d\n"); +ARGS_ATTR(localflocks, "%d\n"); +ARGS_ATTR(debug, "%d\n"); +ARGS_ATTR(upgrade, "%d\n"); +ARGS_ATTR(num_glockd, "%u\n"); +ARGS_ATTR(posix_acl, "%d\n"); +ARGS_ATTR(quota, "%u\n"); +ARGS_ATTR(suiddir, "%d\n"); +ARGS_ATTR(data, "%d\n"); + +/* one oddball doesn't fit the macro mold */ +static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", + !!test_bit(SDF_NOATIME, &sdp->sd_flags)); +} +static struct args_attr args_attr_noatime = __ATTR_RO(noatime); + +static struct attribute *args_attrs[] = { + &args_attr_lockproto.attr, + &args_attr_locktable.attr, + &args_attr_hostdata.attr, + &args_attr_spectator.attr, + &args_attr_ignore_local_fs.attr, + &args_attr_localcaching.attr, + &args_attr_localflocks.attr, + &args_attr_debug.attr, + &args_attr_upgrade.attr, + &args_attr_num_glockd.attr, + &args_attr_posix_acl.attr, + &args_attr_quota.attr, + &args_attr_suiddir.attr, + &args_attr_data.attr, + &args_attr_noatime.attr, + NULL, +}; + +/* + * display counters from superblock + */ + +struct counters_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); +}; + +#define COUNTERS_ATTR(name, fmt) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, fmt, \ + (unsigned int)atomic_read(&sdp->sd_##name)); \ +} \ +static struct counters_attr counters_attr_##name = __ATTR_RO(name) + +COUNTERS_ATTR(glock_count, "%u\n"); +COUNTERS_ATTR(glock_held_count, "%u\n"); +COUNTERS_ATTR(inode_count, "%u\n"); +COUNTERS_ATTR(reclaimed, "%u\n"); + +static struct attribute *counters_attrs[] = { + &counters_attr_glock_count.attr, + &counters_attr_glock_held_count.attr, + &counters_attr_inode_count.attr, + &counters_attr_reclaimed.attr, + NULL, +}; + +/* + * get and set struct gfs2_tune fields + */ + +static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u %u\n", + sdp->sd_tune.gt_quota_scale_num, + sdp->sd_tune.gt_quota_scale_den); +} + +static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x, y; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (sscanf(buf, "%u %u", &x, &y) != 2 || !y) + return -EINVAL; + + spin_lock(>->gt_spin); + gt->gt_quota_scale_num = x; + gt->gt_quota_scale_den = y; + spin_unlock(>->gt_spin); + return len; +} + +static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, + int check_zero, const char *buf, size_t len) +{ + struct gfs2_tune *gt = &sdp->sd_tune; + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + x = simple_strtoul(buf, NULL, 0); + + if (check_zero && !x) + return -EINVAL; + + spin_lock(>->gt_spin); + *field = x; + spin_unlock(>->gt_spin); + return len; +} + +struct tune_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +#define TUNE_ATTR_3(name, show, store) \ +static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) + +#define TUNE_ATTR_2(name, store) \ +static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ +{ \ + return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \ +} \ +TUNE_ATTR_3(name, name##_show, store) + +#define TUNE_ATTR(name, check_zero) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ +{ \ + return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \ +} \ +TUNE_ATTR_2(name, name##_store) + +#define TUNE_ATTR_DAEMON(name, process) \ +static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ +{ \ + ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \ + wake_up_process(sdp->sd_##process); \ + return r; \ +} \ +TUNE_ATTR_2(name, name##_store) + +TUNE_ATTR(ilimit, 0); +TUNE_ATTR(ilimit_tries, 0); +TUNE_ATTR(ilimit_min, 0); +TUNE_ATTR(demote_secs, 0); +TUNE_ATTR(incore_log_blocks, 0); +TUNE_ATTR(log_flush_secs, 0); +TUNE_ATTR(jindex_refresh_secs, 0); +TUNE_ATTR(quota_warn_period, 0); +TUNE_ATTR(quota_quantum, 0); +TUNE_ATTR(atime_quantum, 0); +TUNE_ATTR(max_readahead, 0); +TUNE_ATTR(complain_secs, 0); +TUNE_ATTR(reclaim_limit, 0); +TUNE_ATTR(prefetch_secs, 0); +TUNE_ATTR(statfs_slow, 0); +TUNE_ATTR(new_files_jdata, 0); +TUNE_ATTR(new_files_directio, 0); +TUNE_ATTR(quota_simul_sync, 1); +TUNE_ATTR(quota_cache_secs, 1); +TUNE_ATTR(max_atomic_write, 1); +TUNE_ATTR(stall_secs, 1); +TUNE_ATTR(entries_per_readdir, 1); +TUNE_ATTR(greedy_default, 1); +TUNE_ATTR(greedy_quantum, 1); +TUNE_ATTR(greedy_max, 1); +TUNE_ATTR(statfs_quantum, 1); +TUNE_ATTR_DAEMON(scand_secs, scand_process); +TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); +TUNE_ATTR_DAEMON(logd_secs, logd_process); +TUNE_ATTR_DAEMON(quotad_secs, quotad_process); +TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); + +static struct attribute *tune_attrs[] = { + &tune_attr_ilimit.attr, + &tune_attr_ilimit_tries.attr, + &tune_attr_ilimit_min.attr, + &tune_attr_demote_secs.attr, + &tune_attr_incore_log_blocks.attr, + &tune_attr_log_flush_secs.attr, + &tune_attr_jindex_refresh_secs.attr, + &tune_attr_quota_warn_period.attr, + &tune_attr_quota_quantum.attr, + &tune_attr_atime_quantum.attr, + &tune_attr_max_readahead.attr, + &tune_attr_complain_secs.attr, + &tune_attr_reclaim_limit.attr, + &tune_attr_prefetch_secs.attr, + &tune_attr_statfs_slow.attr, + &tune_attr_quota_simul_sync.attr, + &tune_attr_quota_cache_secs.attr, + &tune_attr_max_atomic_write.attr, + &tune_attr_stall_secs.attr, + &tune_attr_entries_per_readdir.attr, + &tune_attr_greedy_default.attr, + &tune_attr_greedy_quantum.attr, + &tune_attr_greedy_max.attr, + &tune_attr_statfs_quantum.attr, + &tune_attr_scand_secs.attr, + &tune_attr_recoverd_secs.attr, + &tune_attr_logd_secs.attr, + &tune_attr_quotad_secs.attr, + &tune_attr_quota_scale.attr, + &tune_attr_new_files_jdata.attr, + &tune_attr_new_files_directio.attr, + NULL, +}; + +static struct attribute_group lockstruct_group = { + .name = "lockstruct", + .attrs = lockstruct_attrs, +}; + +static struct attribute_group counters_group = { + .name = "counters", + .attrs = counters_attrs, +}; + +static struct attribute_group args_group = { + .name = "args", + .attrs = args_attrs, +}; + +static struct attribute_group tune_group = { + .name = "tune", + .attrs = tune_attrs, +}; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp) +{ + int error; + + sdp->sd_kobj.kset = &gfs2_kset; + sdp->sd_kobj.ktype = &gfs2_ktype; + + error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name); + if (error) + goto fail; + + error = kobject_register(&sdp->sd_kobj); + if (error) + goto fail; + + error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group); + if (error) + goto fail_reg; + + error = sysfs_create_group(&sdp->sd_kobj, &counters_group); + if (error) + goto fail_lockstruct; + + error = sysfs_create_group(&sdp->sd_kobj, &args_group); + if (error) + goto fail_counters; + + error = sysfs_create_group(&sdp->sd_kobj, &tune_group); + if (error) + goto fail_args; + + return 0; + +fail_args: + sysfs_remove_group(&sdp->sd_kobj, &args_group); +fail_counters: + sysfs_remove_group(&sdp->sd_kobj, &counters_group); +fail_lockstruct: + sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); +fail_reg: + kobject_unregister(&sdp->sd_kobj); +fail: + fs_err(sdp, "error %d adding sysfs files", error); + return error; +} + +void gfs2_sys_fs_del(struct gfs2_sbd *sdp) +{ + sysfs_remove_group(&sdp->sd_kobj, &tune_group); + sysfs_remove_group(&sdp->sd_kobj, &args_group); + sysfs_remove_group(&sdp->sd_kobj, &counters_group); + sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); + kobject_unregister(&sdp->sd_kobj); +} + +int gfs2_sys_init(void) +{ + gfs2_sys_margs = NULL; + spin_lock_init(&gfs2_sys_margs_lock); + return kset_register(&gfs2_kset); +} + +void gfs2_sys_uninit(void) +{ + kfree(gfs2_sys_margs); + kset_unregister(&gfs2_kset); +} + diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h new file mode 100644 index 000000000000..1ca8cdac5304 --- /dev/null +++ b/fs/gfs2/sys.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __SYS_DOT_H__ +#define __SYS_DOT_H__ + +#include <linux/spinlock.h> +struct gfs2_sbd; + +/* Allow args to be passed to GFS2 when using an initial ram disk */ +extern char *gfs2_sys_margs; +extern spinlock_t gfs2_sys_margs_lock; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp); +void gfs2_sys_fs_del(struct gfs2_sbd *sdp); + +int gfs2_sys_init(void); +void gfs2_sys_uninit(void); + +#endif /* __SYS_DOT_H__ */ + diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c new file mode 100644 index 000000000000..f8dabf8446bb --- /dev/null +++ b/fs/gfs2/trans.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/kallsyms.h> +#include <linux/lm_interface.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + + BUG_ON(current->journal_info); + BUG_ON(blocks == 0 && revokes == 0); + + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); + if (!tr) + return -ENOMEM; + + tr->tr_ip = (unsigned long)__builtin_return_address(0); + tr->tr_blocks = blocks; + tr->tr_revokes = revokes; + tr->tr_reserved = 1; + if (blocks) + tr->tr_reserved += 6 + blocks; + if (revokes) + tr->tr_reserved += gfs2_struct2blk(sdp, revokes, + sizeof(u64)); + INIT_LIST_HEAD(&tr->tr_list_buf); + + gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); + + error = gfs2_glock_nq(&tr->tr_t_gh); + if (error) + goto fail_holder_uninit; + + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + tr->tr_t_gh.gh_flags |= GL_NOCACHE; + error = -EROFS; + goto fail_gunlock; + } + + error = gfs2_log_reserve(sdp, tr->tr_reserved); + if (error) + goto fail_gunlock; + + current->journal_info = tr; + + return 0; + +fail_gunlock: + gfs2_glock_dq(&tr->tr_t_gh); + +fail_holder_uninit: + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + + return error; +} + +void gfs2_trans_end(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr = current->journal_info; + + BUG_ON(!tr); + current->journal_info = NULL; + + if (!tr->tr_touched) { + gfs2_log_release(sdp, tr->tr_reserved); + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + return; + } + + if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) { + fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ", + tr->tr_num_buf, tr->tr_blocks); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) { + fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ", + tr->tr_num_revoke, tr->tr_revokes); + print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip); + } + + gfs2_log_commit(sdp, tr); + gfs2_glock_dq(&tr->tr_t_gh); + gfs2_holder_uninit(&tr->tr_t_gh); + kfree(tr); + + if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + gfs2_log_flush(sdp, NULL); +} + +void gfs2_trans_add_gl(struct gfs2_glock *gl) +{ + lops_add(gl->gl_sbd, &gl->gl_le); +} + +/** + * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction + * @gl: the glock the buffer belongs to + * @bh: The buffer to add + * @meta: True in the case of adding metadata + * + */ + +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_bufdata *bd; + + bd = bh->b_private; + if (bd) + gfs2_assert(sdp, bd->bd_gl == gl); + else { + gfs2_attach_bufdata(gl, bh, meta); + bd = bh->b_private; + } + lops_add(sdp, &bd->bd_le); +} + +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno) +{ + struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke), + GFP_NOFS | __GFP_NOFAIL); + lops_init_le(&rv->rv_le, &gfs2_revoke_lops); + rv->rv_blkno = blkno; + lops_add(sdp, &rv->rv_le); +} + +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno) +{ + struct gfs2_revoke *rv; + int found = 0; + + gfs2_log_lock(sdp); + + list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) { + if (rv->rv_blkno == blkno) { + list_del(&rv->rv_le.le_list); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); + sdp->sd_log_num_revoke--; + found = 1; + break; + } + } + + gfs2_log_unlock(sdp); + + if (found) { + struct gfs2_trans *tr = current->journal_info; + kfree(rv); + tr->tr_num_revoke_rm++; + } +} + +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) +{ + lops_add(rgd->rd_sbd, &rgd->rd_le); +} + diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h new file mode 100644 index 000000000000..23d4cbe1de5b --- /dev/null +++ b/fs/gfs2/trans.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __TRANS_DOT_H__ +#define __TRANS_DOT_H__ + +#include <linux/buffer_head.h> +struct gfs2_sbd; +struct gfs2_rgrpd; +struct gfs2_glock; + +#define RES_DINODE 1 +#define RES_INDIRECT 1 +#define RES_JDATA 1 +#define RES_DATA 1 +#define RES_LEAF 1 +#define RES_RG_BIT 2 +#define RES_EATTR 1 +#define RES_STATFS 1 +#define RES_QUOTA 2 + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); + +void gfs2_trans_end(struct gfs2_sbd *sdp); + +void gfs2_trans_add_gl(struct gfs2_glock *gl); +void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno); +void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); +void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); + +#endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c new file mode 100644 index 000000000000..196c604faadc --- /dev/null +++ b/fs/gfs2/util.c @@ -0,0 +1,245 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/crc32.h> +#include <linux/gfs2_ondisk.h> +#include <linux/lm_interface.h> +#include <asm/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "lm.h" +#include "util.h" + +kmem_cache_t *gfs2_glock_cachep __read_mostly; +kmem_cache_t *gfs2_inode_cachep __read_mostly; +kmem_cache_t *gfs2_bufdata_cachep __read_mostly; + +void gfs2_assert_i(struct gfs2_sbd *sdp) +{ + printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", + sdp->sd_fsname); +} + +/** + * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + dump_stack(); + return (me) ? -1 : -2; +} + +/** + * gfs2_assert_warn_i - Print a message to the console if @assertion is false + * Returns: -1 if we printed something + * -2 if we didn't + */ + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line) +{ + if (time_before(jiffies, + sdp->sd_last_warning + + gfs2_tune_get(sdp, gt_complain_secs) * HZ)) + return -2; + + printk(KERN_WARNING + "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + + if (sdp->sd_args.ar_debug) + BUG(); + else + dump_stack(); + + sdp->sd_last_warning = jiffies; + + return -1; +} + +/** + * gfs2_consist_i - Flag a filesystem consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, + char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_inode_i - Flag an inode consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: inode = %llu %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino, + (unsigned long long)ip->i_num.no_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: filesystem consistency error\n" + "GFS2: fsid=%s: RG = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, char *file, + unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (%s)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, + sdp->sd_fsname, function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw + * Returns: -1 if this call withdrew the machine, + * -2 if it was already withdrawn + */ + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, const char *function, + char *file, unsigned int line) +{ + int me; + me = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: invalid metadata block\n" + "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, + sdp->sd_fsname, function, file, line); + return (me) ? -1 : -2; +} + +/** + * gfs2_io_error_i - Flag an I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, + unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, function, file, line); + return rv; +} + +/** + * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw + * Returns: -1 if this call withdrew the machine, + * 0 if it was already withdrawn + */ + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line) +{ + int rv; + rv = gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: fatal: I/O error\n" + "GFS2: fsid=%s: block = %llu\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, + sdp->sd_fsname, (unsigned long long)bh->b_blocknr, + sdp->sd_fsname, function, file, line); + return rv; +} + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value) +{ + unsigned int c, o, b = bit; + int old_value; + + c = b / (8 * PAGE_SIZE); + b %= 8 * PAGE_SIZE; + o = b / 8; + b %= 8; + + old_value = (bitmap[c][o] & (1 << b)); + gfs2_assert_withdraw(sdp, !old_value != !new_value); + + if (new_value) + bitmap[c][o] |= 1 << b; + else + bitmap[c][o] &= ~(1 << b); +} + diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h new file mode 100644 index 000000000000..76a50899fe9e --- /dev/null +++ b/fs/gfs2/util.h @@ -0,0 +1,170 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#ifndef __UTIL_DOT_H__ +#define __UTIL_DOT_H__ + +#include "incore.h" + +#define fs_printk(level, fs, fmt, arg...) \ + printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) + +#define fs_info(fs, fmt, arg...) \ + fs_printk(KERN_INFO , fs , fmt , ## arg) + +#define fs_warn(fs, fmt, arg...) \ + fs_printk(KERN_WARNING , fs , fmt , ## arg) + +#define fs_err(fs, fmt, arg...) \ + fs_printk(KERN_ERR, fs , fmt , ## arg) + + +void gfs2_assert_i(struct gfs2_sbd *sdp); + +#define gfs2_assert(sdp, assertion) \ +do { \ + if (unlikely(!(assertion))) { \ + gfs2_assert_i(sdp); \ + BUG(); \ + } \ +} while (0) + + +int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_withdraw(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \ + __FUNCTION__, __FILE__, __LINE__)) + + +int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, + const char *function, char *file, unsigned int line); + +#define gfs2_assert_warn(sdp, assertion) \ +((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ + __FUNCTION__, __FILE__, __LINE__)) + + +int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist(sdp) \ +gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_inode(ip) \ +gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, + const char *function, char *file, unsigned int line); + +#define gfs2_consist_rgrpd(rgd) \ +gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *type, const char *function, + char *file, unsigned int line); + +static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = mh->mh_magic; + magic = be32_to_cpu(magic); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + return 0; +} + +#define gfs2_meta_check(sdp, bh) \ +gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__) + + +int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, + u16 type, u16 t, + const char *function, + char *file, unsigned int line); + +static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, + struct buffer_head *bh, + u16 type, + const char *function, + char *file, unsigned int line) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + u32 magic = mh->mh_magic; + u16 t = be32_to_cpu(mh->mh_type); + magic = be32_to_cpu(magic); + if (unlikely(magic != GFS2_MAGIC)) + return gfs2_meta_check_ii(sdp, bh, "magic number", function, + file, line); + if (unlikely(t != type)) + return gfs2_metatype_check_ii(sdp, bh, type, t, function, + file, line); + return 0; +} + +#define gfs2_metatype_check(sdp, bh, type) \ +gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__) + +static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, + u16 format) +{ + struct gfs2_meta_header *mh; + mh = (struct gfs2_meta_header *)bh->b_data; + mh->mh_type = cpu_to_be32(type); + mh->mh_format = cpu_to_be32(format); +} + + +int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, + char *file, unsigned int line); + +#define gfs2_io_error(sdp) \ +gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__); + + +int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, + const char *function, char *file, unsigned int line); + +#define gfs2_io_error_bh(sdp, bh) \ +gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__); + + +extern kmem_cache_t *gfs2_glock_cachep; +extern kmem_cache_t *gfs2_inode_cachep; +extern kmem_cache_t *gfs2_bufdata_cachep; + +static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, + unsigned int *p) +{ + unsigned int x; + spin_lock(>->gt_spin); + x = *p; + spin_unlock(>->gt_spin); + return x; +} + +#define gfs2_tune_get(sdp, field) \ +gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) + +void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, + unsigned int bit, int new_value); + +#endif /* __UTIL_DOT_H__ */ + |