diff options
Diffstat (limited to 'fs')
115 files changed, 1433 insertions, 780 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 5c5bc8480070..f8b86e92cd66 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -238,6 +238,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return ERR_PTR(-ENOMEM); } + rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + if (rc) { + __putname(v9ses->aname); + __putname(v9ses->uname); + return ERR_PTR(rc); + } + spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); @@ -301,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return fid; error: + bdi_destroy(&v9ses->bdi); return ERR_PTR(retval); } @@ -326,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) __putname(v9ses->uname); __putname(v9ses->aname); + bdi_destroy(&v9ses->bdi); + spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a0a8d3dd1361..bec4d0bcb458 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -20,6 +20,7 @@ * Boston, MA 02111-1301 USA * */ +#include <linux/backing-dev.h> /** * enum p9_session_flags - option flags for each 9P session @@ -102,6 +103,7 @@ struct v9fs_session_info { u32 uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ + struct backing_dev_info bdi; }; struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 491108bd6e0d..806da5d3b3a0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -77,6 +77,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; sb->s_op = &v9fs_super_ops; + sb->s_bdi = &v9ses->bdi; sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | MS_NOATIME; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index c54dad4e6063..a10f2582844f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -19,6 +19,7 @@ #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/fscache.h> +#include <linux/backing-dev.h> #include "afs.h" #include "afs_vl.h" @@ -313,6 +314,7 @@ struct afs_volume { unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ struct rw_semaphore server_sem; /* lock for accessing current server */ + struct backing_dev_info bdi; }; /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 5e813a816ce4..b3feddc4f7d6 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) { struct afs_super_info *super; struct vfsmount *mnt; - struct page *page = NULL; + struct page *page; size_t size; - char *buf, *devname = NULL, *options = NULL; + char *buf, *devname, *options; int ret; _enter("{%s}", mntpt->d_name.name); @@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) ret = -EINVAL; size = mntpt->d_inode->i_size; if (size > PAGE_SIZE - 1) - goto error; + goto error_no_devname; ret = -ENOMEM; devname = (char *) get_zeroed_page(GFP_KERNEL); if (!devname) - goto error; + goto error_no_devname; options = (char *) get_zeroed_page(GFP_KERNEL); if (!options) - goto error; + goto error_no_options; /* read the contents of the AFS special symlink */ page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); - goto error; + goto error_no_page; } ret = -EIO; @@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) return mnt; error: - if (page) - page_cache_release(page); - if (devname) - free_page((unsigned long) devname); - if (options) - free_page((unsigned long) options); + page_cache_release(page); +error_no_page: + free_page((unsigned long) options); +error_no_options: + free_page((unsigned long) devname); +error_no_devname: _leave(" = %d", ret); return ERR_PTR(ret); } diff --git a/fs/afs/super.c b/fs/afs/super.c index 14f6431598ad..e932e5a3a0c1 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -311,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data) sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; sb->s_fs_info = as; + sb->s_bdi = &as->volume->bdi; /* allocate the root inode and dentry */ fid.vid = as->volume->vid; diff --git a/fs/afs/volume.c b/fs/afs/volume.c index a353e69e2391..401eeb21869f 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; + ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + if (ret) + goto error_bdi; + init_rwsem(&volume->server_sem); /* look up all the applicable server records */ @@ -151,6 +155,8 @@ error: return ERR_PTR(ret); error_discard: + bdi_destroy(&volume->bdi); +error_bdi: up_write(¶ms->cell->vl_sem); for (loop = volume->nservers - 1; loop >= 0; loop--) @@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume) for (loop = volume->nservers - 1; loop >= 0; loop--) afs_put_server(volume->servers[loop]); + bdi_destroy(&volume->bdi); kfree(volume); _leave(" [destroyed]"); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 109a6c606d92..e8e5e63ac950 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -177,8 +177,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) } /* Trigger mount for path component or follow link */ } else if (ino->flags & AUTOFS_INF_PENDING || - autofs4_need_mount(flags) || - current->link_count) { + autofs4_need_mount(flags)) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); @@ -262,7 +261,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) spin_unlock(&dcache_lock); spin_unlock(&sbi->fs_lock); - status = try_to_fill_dentry(dentry, 0); + status = try_to_fill_dentry(dentry, nd->flags); if (status) goto out_error; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 7ab23e006e4c..2c5f9a0e5d72 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1005,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( } } else if (!mm->start_data) { mm->start_data = seg->addr; -#ifndef CONFIG_MMU mm->end_data = seg->addr + phdr->p_memsz; -#endif } - -#ifdef CONFIG_MMU - if (seg->addr + phdr->p_memsz > mm->end_data) - mm->end_data = seg->addr + phdr->p_memsz; -#endif } seg++; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index e0e769bdca59..49566c1687d8 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", - (int) r,(int)(start_brk-start_code),(int)text_len); + (int) r,(int)(start_brk-start_data+text_len),(int)text_len); goto failed; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 2a6d0193f139..6dcee88c2e5d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -406,16 +406,23 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct inode *bd_inode = filp->f_mapping->host; + struct block_device *bdev = I_BDEV(bd_inode); int error; - error = sync_blockdev(bdev); - if (error) - return error; - + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. + */ + mutex_unlock(&bd_inode->i_mutex); + error = blkdev_issue_flush(bdev, NULL); if (error == -EOPNOTSUPP) error = 0; + + mutex_lock(&bd_inode->i_mutex); + return error; } EXPORT_SYMBOL(blkdev_fsync); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7b8f2c89ccb..feca04197d02 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); - /* * end_io_wq structs are used to do processing in task context when an IO is * complete. This is used during reads to verify checksums, and it is used @@ -1375,19 +1373,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; - bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_init(bdi); + err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); if (err) return err; - err = bdi_register(bdi, NULL, "btrfs-%d", - atomic_inc_return(&btrfs_bdi_num)); - if (err) { - bdi_destroy(bdi); - return err; - } - bdi->ra_pages = default_backing_dev_info.ra_pages; bdi->unplug_io_fn = btrfs_unplug_io_fn; bdi->unplug_io_data = info; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e84ef60ffe35..97a97839a867 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1481,12 +1481,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ret = -EBADF; goto out_drop_write; } + src = src_file->f_dentry->d_inode; ret = -EINVAL; if (src == inode) goto out_fput; + /* the src must be open for reading */ + if (!(src_file->f_mode & FMODE_READ)) + goto out_fput; + ret = -EISDIR; if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) goto out_fput; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index f7c255f9c624..a8cd821226da 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -34,6 +34,7 @@ struct cachefiles_object { loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ +#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ atomic_t usage; /* object usage count */ uint8_t type; /* object type */ uint8_t new; /* T if object new */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d5db84a1ee0d..f4a7840bf42c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -93,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, } /* + * mark the owner of a dentry, if there is one, to indicate that that dentry + * has been preemptively deleted + * - the caller must hold the i_mutex on the dentry's parent as required to + * call vfs_unlink(), vfs_rmdir() or vfs_rename() + */ +static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, + struct dentry *dentry) +{ + struct cachefiles_object *object; + struct rb_node *p; + + _enter(",'%*.*s'", + dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); + + write_lock(&cache->active_lock); + + p = cache->active_nodes.rb_node; + while (p) { + object = rb_entry(p, struct cachefiles_object, active_node); + if (object->dentry > dentry) + p = p->rb_left; + else if (object->dentry < dentry) + p = p->rb_right; + else + goto found_dentry; + } + + write_unlock(&cache->active_lock); + _leave(" [no owner]"); + return; + + /* found the dentry for */ +found_dentry: + kdebug("preemptive burial: OBJ%x [%s] %p", + object->fscache.debug_id, + fscache_object_states[object->fscache.state], + dentry); + + if (object->fscache.state < FSCACHE_OBJECT_DYING) { + printk(KERN_ERR "\n"); + printk(KERN_ERR "CacheFiles: Error:" + " Can't preemptively bury live object\n"); + cachefiles_printk_object(object, NULL); + } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + printk(KERN_ERR "CacheFiles: Error:" + " Object already preemptively buried\n"); + } + + write_unlock(&cache->active_lock); + _leave(" [owner marked]"); +} + +/* * record the fact that an object is now active */ static int cachefiles_mark_object_active(struct cachefiles_cache *cache, @@ -219,7 +272,8 @@ requeue: */ static int cachefiles_bury_object(struct cachefiles_cache *cache, struct dentry *dir, - struct dentry *rep) + struct dentry *rep, + bool preemptive) { struct dentry *grave, *trap; char nbuffer[8 + 8 + 1]; @@ -229,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, dir->d_name.len, dir->d_name.len, dir->d_name.name, rep->d_name.len, rep->d_name.len, rep->d_name.name); + _debug("remove %p from %p", rep, dir); + /* non-directories can just be unlinked */ if (!S_ISDIR(rep->d_inode->i_mode)) { _debug("unlink stale object"); ret = vfs_unlink(dir->d_inode, rep); + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + mutex_unlock(&dir->d_inode->i_mutex); if (ret == -EIO) @@ -325,6 +384,9 @@ try_again: if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); + if (preemptive) + cachefiles_mark_object_buried(cache, rep); + unlock_rename(cache->graveyard, dir); dput(grave); _leave(" = 0"); @@ -340,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, struct dentry *dir; int ret; - _enter(",{%p}", object->dentry); + _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); ASSERT(object->dentry); ASSERT(object->dentry->d_inode); @@ -350,15 +412,25 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - /* we need to check that our parent is _still_ our parent - it may have - * been renamed */ - if (dir == object->dentry->d_parent) { - ret = cachefiles_bury_object(cache, dir, object->dentry); - } else { - /* it got moved, presumably by cachefilesd culling it, so it's - * no longer in the key path and we can ignore it */ + if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + /* object allocation for the same key preemptively deleted this + * object's file so that it could create its own file */ + _debug("object preemptively buried"); mutex_unlock(&dir->d_inode->i_mutex); ret = 0; + } else { + /* we need to check that our parent is _still_ our parent - it + * may have been renamed */ + if (dir == object->dentry->d_parent) { + ret = cachefiles_bury_object(cache, dir, + object->dentry, false); + } else { + /* it got moved, presumably by cachefilesd culling it, + * so it's no longer in the key path and we can ignore + * it */ + mutex_unlock(&dir->d_inode->i_mutex); + ret = 0; + } } dput(dir); @@ -381,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, const char *name; int ret, nlen; - _enter("{%p},,%s,", parent->dentry, key); + _enter("OBJ%x{%p},OBJ%x,%s,", + parent->fscache.debug_id, parent->dentry, + object->fscache.debug_id, key); cache = container_of(parent->fscache.cache, struct cachefiles_cache, cache); @@ -509,7 +583,7 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next); + ret = cachefiles_bury_object(cache, dir, next, true); dput(next); next = NULL; @@ -828,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim); + ret = cachefiles_bury_object(cache, dir, victim, false); if (ret < 0) goto error; diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index b5808cdb2232..039b5011d83b 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c @@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, /* * check the security details of the on-disk cache * - must be called with security override in force + * - must return with a security override in force - even in the case of an + * error */ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, struct dentry *root, @@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, * which create files */ ret = set_create_files_as(new, root->d_inode); if (ret < 0) { + abort_creds(new); + cachefiles_begin_secure(cache, _saved_cred); _leave(" = %d [cfa]", ret); return ret; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index aa3cd7cc3e40..a9005d862ed4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -337,16 +337,15 @@ out: /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. - * - * Caller holds i_lock. */ -static struct ceph_snap_context *__get_oldest_context(struct inode *inode, - u64 *snap_size) +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; + spin_lock(&inode->i_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); @@ -357,21 +356,11 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode, break; } } - if (!snapc && ci->i_snap_realm) { - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + if (!snapc && ci->i_head_snapc) { + snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } - return snapc; -} - -static struct ceph_snap_context *get_oldest_context(struct inode *inode, - u64 *snap_size) -{ - struct ceph_snap_context *snapc = NULL; - - spin_lock(&inode->i_lock); - snapc = __get_oldest_context(inode, snap_size); spin_unlock(&inode->i_lock); return snapc; } @@ -392,7 +381,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) int len = PAGE_CACHE_SIZE; loff_t i_size; int err = 0; - struct ceph_snap_context *snapc; + struct ceph_snap_context *snapc, *oldest; u64 snap_size = 0; long writeback_stat; @@ -413,13 +402,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p not dirty?\n", inode, page); goto out; } - if (snapc != get_oldest_context(inode, &snap_size)) { + oldest = get_oldest_context(inode, &snap_size); + if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, (void *)page->private); /* we should only noop if called by kswapd */ WARN_ON((current->flags & PF_MEMALLOC) == 0); + ceph_put_snap_context(oldest); goto out; } + ceph_put_snap_context(oldest); /* is this a partial page at end of file? */ if (snap_size) @@ -458,7 +450,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ClearPagePrivate(page); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); + ceph_put_snap_context(snapc); /* page's reference */ out: return err; } @@ -512,12 +504,11 @@ static void writepages_finish(struct ceph_osd_request *req, int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - struct writeback_control *wbc = req->r_wbc; __s32 rc = -EIO; u64 bytes = 0; struct ceph_client *client = ceph_inode_to_client(inode); long writeback_stat; - unsigned issued = __ceph_caps_issued(ci, NULL); + unsigned issued = ceph_caps_issued(ci); /* parse reply */ replyhead = msg->front.iov_base; @@ -554,13 +545,9 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); - if (i >= wrote) { - dout("inode %p skipping page %p\n", inode, page); - wbc->pages_skipped++; - } + ceph_put_snap_context((void *)page->private); page->private = 0; ClearPagePrivate(page); - ceph_put_snap_context(snapc); dout("unlocking %d %p\n", i, page); end_page_writeback(page); @@ -618,7 +605,7 @@ static int ceph_writepages_start(struct address_space *mapping, int range_whole = 0; int should_loop = 1; pgoff_t max_pages = 0, max_pages_ever = 0; - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct pagevec pvec; int done = 0; int rc = 0; @@ -770,9 +757,10 @@ get_more_pages: } /* only if matching snap context */ - if (snapc != (void *)page->private) { - dout("page snapc %p != oldest %p\n", - (void *)page->private, snapc); + pgsnapc = (void *)page->private; + if (pgsnapc->seq > snapc->seq) { + dout("page snapc %p %lld > oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); unlock_page(page); if (!locked_pages) continue; /* keep looking for snap */ @@ -806,7 +794,6 @@ get_more_pages: alloc_page_vec(client, req); req->r_callback = writepages_finish; req->r_inode = inode; - req->r_wbc = wbc; } /* note position of first page in pvec */ @@ -914,7 +901,10 @@ static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); - return !oldest || snapc->seq <= oldest->seq; + int ret = !oldest || snapc->seq <= oldest->seq; + + ceph_put_snap_context(oldest); + return ret; } /* @@ -936,8 +926,8 @@ static int ceph_update_writeable_page(struct file *file, int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; loff_t i_size; - struct ceph_snap_context *snapc; int r; + struct ceph_snap_context *snapc, *oldest; retry_locked: /* writepages currently holds page lock, but if we change that later, */ @@ -947,23 +937,24 @@ retry_locked: BUG_ON(!ci->i_snap_realm); down_read(&mdsc->snap_rwsem); BUG_ON(!ci->i_snap_realm->cached_context); - if (page->private && - (void *)page->private != ci->i_snap_realm->cached_context) { + snapc = (void *)page->private; + if (snapc && snapc != ci->i_head_snapc) { /* * this page is already dirty in another (older) snap * context! is it writeable now? */ - snapc = get_oldest_context(inode, NULL); + oldest = get_oldest_context(inode, NULL); up_read(&mdsc->snap_rwsem); - if (snapc != (void *)page->private) { + if (snapc->seq > oldest->seq) { + ceph_put_snap_context(oldest); dout(" page %p snapc %p not current or oldest\n", - page, (void *)page->private); + page, snapc); /* * queue for writeback, and wait for snapc to * be writeable or written */ - snapc = ceph_get_snap_context((void *)page->private); + snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); r = wait_event_interruptible(ci->i_cap_wq, @@ -973,6 +964,7 @@ retry_locked: return r; return -EAGAIN; } + ceph_put_snap_context(oldest); /* yay, writeable, do it now (without dropping page lock) */ dout(" page %p snapc %p not current, but oldest\n", diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index f6394b94b866..818afe72e6c7 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/err.h> +#include <linux/slab.h> #include "types.h" #include "auth_none.h" diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h index 56c05533a31c..8164df1a08be 100644 --- a/fs/ceph/auth_none.h +++ b/fs/ceph/auth_none.h @@ -1,6 +1,8 @@ #ifndef _FS_CEPH_AUTH_NONE_H #define _FS_CEPH_AUTH_NONE_H +#include <linux/slab.h> + #include "auth.h" /* diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index d9001a4dc8cc..fee5a08da881 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -12,8 +12,6 @@ #include "auth.h" #include "decode.h" -struct kmem_cache *ceph_x_ticketbuf_cachep; - #define TEMP_TICKET_BUF_LEN 256 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); @@ -131,13 +129,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, char *ticket_buf; u8 struct_v; - dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC); + dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!dbuf) return -ENOMEM; ret = -ENOMEM; - ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, - GFP_NOFS | GFP_ATOMIC); + ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!ticket_buf) goto out_dbuf; @@ -251,9 +248,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ret = 0; out: - kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf); + kfree(ticket_buf); out_dbuf: - kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf); + kfree(dbuf); return ret; bad: @@ -605,8 +602,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } - kmem_cache_destroy(ceph_x_ticketbuf_cachep); - kfree(ac->private); ac->private = NULL; } @@ -641,26 +636,20 @@ int ceph_x_init(struct ceph_auth_client *ac) int ret; dout("ceph_x_init %p\n", ac); + ret = -ENOMEM; xi = kzalloc(sizeof(*xi), GFP_NOFS); if (!xi) - return -ENOMEM; + goto out; - ret = -ENOMEM; - ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf", - TEMP_TICKET_BUF_LEN, 8, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - NULL); - if (!ceph_x_ticketbuf_cachep) - goto done_nomem; ret = -EINVAL; if (!ac->secret) { pr_err("no secret set (for auth_x protocol)\n"); - goto done_nomem; + goto out_nomem; } ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); if (ret) - goto done_nomem; + goto out_nomem; xi->starting = true; xi->ticket_handlers = RB_ROOT; @@ -670,10 +659,9 @@ int ceph_x_init(struct ceph_auth_client *ac) ac->ops = &ceph_x_ops; return 0; -done_nomem: +out_nomem: kfree(xi); - if (ceph_x_ticketbuf_cachep) - kmem_cache_destroy(ceph_x_ticketbuf_cachep); +out: return ret; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3710e077a857..d9400534b279 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -858,6 +858,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) } /* + * Remove a cap. Take steps to deal with a racing iterate_session_caps. + * * caller should hold i_lock. * caller will not hold session s_mutex if called from destroy_inode. */ @@ -866,15 +868,10 @@ void __ceph_remove_cap(struct ceph_cap *cap) struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); - /* remove from inode list */ - rb_erase(&cap->ci_node, &ci->i_caps); - cap->ci = NULL; - if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; - /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { @@ -885,10 +882,18 @@ void __ceph_remove_cap(struct ceph_cap *cap) list_del_init(&cap->session_caps); session->s_nr_caps--; cap->session = NULL; + removed = 1; } + /* protect backpointer with s_cap_lock: see iterate_session_caps */ + cap->ci = NULL; spin_unlock(&session->s_cap_lock); - if (cap->session == NULL) + /* remove from inode list */ + rb_erase(&cap->ci_node, &ci->i_caps); + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + if (removed) ceph_put_cap(cap); if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { @@ -1205,6 +1210,12 @@ retry: if (capsnap->dirty_pages || capsnap->writing) continue; + /* + * if cap writeback already occurred, we should have dropped + * the capsnap in ceph_put_wrbuffer_cap_refs. + */ + BUG_ON(capsnap->dirty == 0); + /* pick mds, take s_mutex */ mds = __ceph_get_cap_mds(ci, &mseq); if (session && session->s_mds != mds) { @@ -1855,8 +1866,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); - spin_unlock(&inode->i_lock); } + spin_unlock(&inode->i_lock); } } @@ -2118,8 +2129,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) } spin_unlock(&inode->i_lock); - dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had), - last ? "last" : ""); + dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), + last ? " last" : "", put ? " put" : ""); if (last && !flushsnaps) ceph_check_caps(ci, 0, NULL); @@ -2143,7 +2154,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, { struct inode *inode = &ci->vfs_inode; int last = 0; - int last_snap = 0; + int complete_capsnap = 0; + int drop_capsnap = 0; int found = 0; struct ceph_cap_snap *capsnap = NULL; @@ -2166,19 +2178,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->context == snapc) { found = 1; - capsnap->dirty_pages -= nr; - last_snap = !capsnap->dirty_pages; break; } } BUG_ON(!found); + capsnap->dirty_pages -= nr; + if (capsnap->dirty_pages == 0) { + complete_capsnap = 1; + if (capsnap->dirty == 0) + /* cap writeback completed before we created + * the cap_snap; no FLUSHSNAP is needed */ + drop_capsnap = 1; + } dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s\n", + " snap %lld %d/%d -> %d/%d %s%s%s\n", inode, capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", - last_snap ? " (capsnap last)" : ""); + complete_capsnap ? " (complete capsnap)" : "", + drop_capsnap ? " (drop capsnap)" : ""); + if (drop_capsnap) { + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + } } spin_unlock(&inode->i_lock); @@ -2186,10 +2211,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); iput(inode); - } else if (last_snap) { + } else if (complete_capsnap) { ceph_flush_snaps(ci); wake_up(&ci->i_cap_wq); } + if (drop_capsnap) + iput(inode); } /* @@ -2465,8 +2492,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, break; } WARN_ON(capsnap->dirty_pages || capsnap->writing); - dout(" removing cap_snap %p follows %lld\n", - capsnap, follows); + dout(" removing %p cap_snap %p follows %lld\n", + inode, capsnap, follows); ceph_put_snap_context(capsnap->context); list_del(&capsnap->ci_item); list_del(&capsnap->flushing_item); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 7261dc6c2ead..650d2db5ed26 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -171,11 +171,11 @@ more: spin_lock(&inode->i_lock); spin_lock(&dcache_lock); + last = dentry; + if (err < 0) goto out_unlock; - last = dentry; - p = p->prev; filp->f_pos++; @@ -312,7 +312,7 @@ more: req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_num_caps = max_entries; + req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -489,6 +489,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct inode *inode = ceph_get_snapdir(parent); dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", dentry, dentry->d_name.len, dentry->d_name.name, inode); + BUG_ON(!d_unhashed(dentry)); d_add(dentry, inode); err = 0; } @@ -879,7 +880,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * do_request, above). If there is no trace, we need * to do it here. */ + + /* d_move screws up d_subdirs order */ + ceph_i_clear(new_dir, CEPH_I_COMPLETE); + d_move(old_dentry, new_dentry); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + new_dentry->d_time = jiffies; + ceph_dentry(new_dentry)->lease_shared_gen = 0; } ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4add3d5da2c1..ed6f19721d6e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -665,7 +665,8 @@ more: * throw out any page cache pages in this range. this * may block. */ - truncate_inode_pages_range(inode->i_mapping, pos, pos+len); + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE-1)); } else { pages = alloc_page_vector(num_pages); if (IS_ERR(pages)) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index aca82d55cc53..85b4d2ffdeba 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -733,6 +733,10 @@ no_change: __ceph_get_fmode(ci, cap_fmode); spin_unlock(&inode->i_lock); } + } else if (cap_fmode >= 0) { + pr_warning("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_get_fmode(ci, cap_fmode); } /* update delegation info? */ @@ -886,6 +890,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *in = NULL; struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; + struct ceph_client *client = ceph_sb_to_client(sb); int i = 0; int err = 0; @@ -949,7 +954,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, return err; } - if (rinfo->head->is_dentry && !req->r_aborted) { + /* + * ignore null lease/binding on snapdir ENOENT, or else we + * will have trouble splicing in the virtual snapdir later + */ + if (rinfo->head->is_dentry && !req->r_aborted && + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, + client->mount_args->snapdir_name, + req->r_dentry->d_name.len))) { /* * lookup link rename : null -> possibly existing inode * mknod symlink mkdir : null -> new inode @@ -989,6 +1001,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dn, dn->d_name.len, dn->d_name.name); dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); + + /* d_move screws up d_subdirs order */ + ceph_i_clear(dir, CEPH_I_COMPLETE); + d_move(req->r_old_dentry, dn); dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 60a9a4ae47be..24561a557e01 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -736,9 +736,10 @@ static void cleanup_cap_releases(struct ceph_mds_session *session) } /* - * Helper to safely iterate over all caps associated with a session. + * Helper to safely iterate over all caps associated with a session, with + * special care taken to handle a racing __ceph_remove_cap(). * - * caller must hold session s_mutex + * Caller must hold session s_mutex. */ static int iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, struct ceph_cap *, @@ -2136,7 +2137,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) struct ceph_mds_session *session = NULL; struct ceph_msg *reply; struct rb_node *p; - int err; + int err = -ENOMEM; struct ceph_pagelist *pagelist; pr_info("reconnect to recovering mds%d\n", mds); @@ -2185,7 +2186,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) goto fail; err = iterate_session_caps(session, encode_caps_cb, pagelist); if (err < 0) - goto out; + goto fail; /* * snaprealms. we provide mds with the ino, seq (version), and @@ -2213,28 +2214,31 @@ send: reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); - if (session) { - session->s_state = CEPH_MDS_SESSION_OPEN; - __wake_requests(mdsc, &session->s_waiting); - } + session->s_state = CEPH_MDS_SESSION_OPEN; + mutex_unlock(&session->s_mutex); + + mutex_lock(&mdsc->mutex); + __wake_requests(mdsc, &session->s_waiting); + mutex_unlock(&mdsc->mutex); + + ceph_put_mds_session(session); -out: up_read(&mdsc->snap_rwsem); - if (session) { - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - } mutex_lock(&mdsc->mutex); return; fail: ceph_msg_put(reply); + up_read(&mdsc->snap_rwsem); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); fail_nomsg: ceph_pagelist_release(pagelist); kfree(pagelist); fail_nopagelist: - pr_err("ENOMEM preparing reconnect for mds%d\n", mds); - goto out; + pr_err("error %d preparing reconnect for mds%d\n", err, mds); + mutex_lock(&mdsc->mutex); + return; } diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 8f1715ffbe4b..cd4fadb6491a 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -30,6 +30,10 @@ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +#ifdef CONFIG_LOCKDEP +static struct lock_class_key socket_class; +#endif + static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); @@ -228,6 +232,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) con->sock = sock; sock->sk->sk_allocation = GFP_NOFS; +#ifdef CONFIG_LOCKDEP + lockdep_set_class(&sock->sk->sk_lock, &socket_class); +#endif + set_sock_callbacks(sock, con); dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); @@ -333,6 +341,7 @@ static void reset_connection(struct ceph_connection *con) con->out_msg = NULL; } con->in_seq = 0; + con->in_seq_acked = 0; } /* @@ -483,7 +492,14 @@ static void prepare_write_message(struct ceph_connection *con) list_move_tail(&m->list_head, &con->out_sent); } - m->hdr.seq = cpu_to_le64(++con->out_seq); + /* + * only assign outgoing seq # if we haven't sent this message + * yet. if it is requeued, resend with it's original seq. + */ + if (m->needs_out_seq) { + m->hdr.seq = cpu_to_le64(++con->out_seq); + m->needs_out_seq = false; + } dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", m, con->out_seq, le16_to_cpu(m->hdr.type), @@ -1325,6 +1341,7 @@ static int read_partial_message(struct ceph_connection *con) unsigned front_len, middle_len, data_len, data_off; int datacrc = con->msgr->nocrc; int skip; + u64 seq; dout("read_partial_message con %p msg %p\n", con, m); @@ -1359,6 +1376,25 @@ static int read_partial_message(struct ceph_connection *con) return -EIO; data_off = le16_to_cpu(con->in_hdr.data_off); + /* verify seq# */ + seq = le64_to_cpu(con->in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld, expected %lld\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + seq, con->in_seq + 1); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 0; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADMSG; + } + /* allocate message? */ if (!con->in_msg) { dout("got hdr type %d front %d data %d\n", con->in_hdr.type, @@ -1370,6 +1406,7 @@ static int read_partial_message(struct ceph_connection *con) con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; return 0; } if (IS_ERR(con->in_msg)) { @@ -1956,6 +1993,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); + msg->needs_out_seq = true; + /* queue */ mutex_lock(&con->mutex); BUG_ON(!list_empty(&msg->list_head)); @@ -2021,6 +2060,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(con->in_msg); con->in_msg = NULL; con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; } else { dout("con_revoke_pages %p msg %p pages %p no-op\n", con, con->in_msg, msg); @@ -2054,15 +2094,19 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, kref_init(&m->kref); INIT_LIST_HEAD(&m->list_head); + m->hdr.tid = 0; m->hdr.type = cpu_to_le16(type); + m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); m->hdr.middle_len = 0; m->hdr.data_len = cpu_to_le32(page_len); m->hdr.data_off = cpu_to_le16(page_off); - m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->hdr.reserved = 0; m->footer.front_crc = 0; m->footer.middle_crc = 0; m->footer.data_crc = 0; + m->footer.flags = 0; m->front_max = front_len; m->front_is_vmalloc = false; m->more_to_follow = false; diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index a343dae73cdc..a5caf91cc971 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -86,6 +86,7 @@ struct ceph_msg { struct kref kref; bool front_is_vmalloc; bool more_to_follow; + bool needs_out_seq; int front_max; struct ceph_msgpool *pool; diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index c7b4dedaace6..3514f71ff85f 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -565,7 +565,8 @@ static int __map_osds(struct ceph_osd_client *osdc, { struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; struct ceph_pg pgid; - int o = -1; + int acting[CEPH_PG_MAX_SIZE]; + int o = -1, num = 0; int err; dout("map_osds %p tid %lld\n", req, req->r_tid); @@ -576,10 +577,16 @@ static int __map_osds(struct ceph_osd_client *osdc, pgid = reqhead->layout.ol_pgid; req->r_pgid = pgid; - o = ceph_calc_pg_primary(osdc->osdmap, pgid); + err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); + if (err > 0) { + o = acting[0]; + num = err; + } if ((req->r_osd && req->r_osd->o_osd == o && - req->r_sent >= req->r_osd->o_incarnation) || + req->r_sent >= req->r_osd->o_incarnation && + req->r_num_pg_osds == num && + memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || (req->r_osd == NULL && o == -1)) return 0; /* no change */ @@ -587,6 +594,10 @@ static int __map_osds(struct ceph_osd_client *osdc, req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, req->r_osd ? req->r_osd->o_osd : -1); + /* record full pg acting set */ + memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); + req->r_num_pg_osds = num; + if (req->r_osd) { __cancel_request(req); list_del_init(&req->r_osd_item); @@ -612,7 +623,7 @@ static int __map_osds(struct ceph_osd_client *osdc, __remove_osd_from_lru(req->r_osd); list_add(&req->r_osd_item, &req->r_osd->o_requests); } - err = 1; /* osd changed */ + err = 1; /* osd or pg changed */ out: return err; @@ -779,16 +790,18 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, struct ceph_osd_request *req; u64 tid; int numops, object_len, flags; + s32 result; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*rhead)) goto bad; numops = le32_to_cpu(rhead->num_ops); object_len = le32_to_cpu(rhead->object_len); + result = le32_to_cpu(rhead->result); if (msg->front.iov_len != sizeof(*rhead) + object_len + numops * sizeof(struct ceph_osd_op)) goto bad; - dout("handle_reply %p tid %llu\n", msg, tid); + dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); /* lookup */ mutex_lock(&osdc->request_mutex); @@ -834,7 +847,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, dout("handle_reply tid %llu flags %d\n", tid, flags); /* either this is a read, or we got the safe response */ - if ((flags & CEPH_OSD_FLAG_ONDISK) || + if (result < 0 || + (flags & CEPH_OSD_FLAG_ONDISK) || ((flags & CEPH_OSD_FLAG_WRITE) == 0)) __unregister_request(osdc, req); diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h index b0759911e7c3..ce776989ef6a 100644 --- a/fs/ceph/osd_client.h +++ b/fs/ceph/osd_client.h @@ -48,6 +48,8 @@ struct ceph_osd_request { struct list_head r_osd_item; struct ceph_osd *r_osd; struct ceph_pg r_pgid; + int r_pg_osds[CEPH_PG_MAX_SIZE]; + int r_num_pg_osds; struct ceph_connection *r_con_filling_msg; @@ -66,7 +68,6 @@ struct ceph_osd_request { struct list_head r_unsafe_item; struct inode *r_inode; /* for use by callbacks */ - struct writeback_control *r_wbc; /* ditto */ char r_oid[40]; /* object name */ int r_oid_len; diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 21c6623c4b07..cfdd8f4388b7 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -314,71 +314,6 @@ bad: return ERR_PTR(err); } - -/* - * osd map - */ -void ceph_osdmap_destroy(struct ceph_osdmap *map) -{ - dout("osdmap_destroy %p\n", map); - if (map->crush) - crush_destroy(map->crush); - while (!RB_EMPTY_ROOT(&map->pg_temp)) { - struct ceph_pg_mapping *pg = - rb_entry(rb_first(&map->pg_temp), - struct ceph_pg_mapping, node); - rb_erase(&pg->node, &map->pg_temp); - kfree(pg); - } - while (!RB_EMPTY_ROOT(&map->pg_pools)) { - struct ceph_pg_pool_info *pi = - rb_entry(rb_first(&map->pg_pools), - struct ceph_pg_pool_info, node); - rb_erase(&pi->node, &map->pg_pools); - kfree(pi); - } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map); -} - -/* - * adjust max osd value. reallocate arrays. - */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) -{ - u8 *state; - struct ceph_entity_addr *addr; - u32 *weight; - - state = kcalloc(max, sizeof(*state), GFP_NOFS); - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); - if (state == NULL || addr == NULL || weight == NULL) { - kfree(state); - kfree(addr); - kfree(weight); - return -ENOMEM; - } - - /* copy old? */ - if (map->osd_state) { - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); - kfree(map->osd_state); - kfree(map->osd_addr); - kfree(map->osd_weight); - } - - map->osd_state = state; - map->osd_weight = weight; - map->osd_addr = addr; - map->max_osd = max; - return 0; -} - /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid * to a set of osds) @@ -482,6 +417,13 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) return NULL; } +static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) +{ + rb_erase(&pi->node, root); + kfree(pi->name); + kfree(pi); +} + void __decode_pool(void **p, struct ceph_pg_pool_info *pi) { ceph_decode_copy(p, &pi->v, sizeof(pi->v)); @@ -490,6 +432,98 @@ void __decode_pool(void **p, struct ceph_pg_pool_info *pi) *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; } +static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +{ + struct ceph_pg_pool_info *pi; + u32 num, len, pool; + + ceph_decode_32_safe(p, end, num, bad); + dout(" %d pool names\n", num); + while (num--) { + ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_32_safe(p, end, len, bad); + dout(" pool %d len %d\n", pool, len); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + kfree(pi->name); + pi->name = kmalloc(len + 1, GFP_NOFS); + if (pi->name) { + memcpy(pi->name, *p, len); + pi->name[len] = '\0'; + dout(" name is %s\n", pi->name); + } + } + *p += len; + } + return 0; + +bad: + return -EINVAL; +} + +/* + * osd map + */ +void ceph_osdmap_destroy(struct ceph_osdmap *map) +{ + dout("osdmap_destroy %p\n", map); + if (map->crush) + crush_destroy(map->crush); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_temp); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + __remove_pg_pool(&map->pg_pools, pi); + } + kfree(map->osd_state); + kfree(map->osd_weight); + kfree(map->osd_addr); + kfree(map); +} + +/* + * adjust max osd value. reallocate arrays. + */ +static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +{ + u8 *state; + struct ceph_entity_addr *addr; + u32 *weight; + + state = kcalloc(max, sizeof(*state), GFP_NOFS); + addr = kcalloc(max, sizeof(*addr), GFP_NOFS); + weight = kcalloc(max, sizeof(*weight), GFP_NOFS); + if (state == NULL || addr == NULL || weight == NULL) { + kfree(state); + kfree(addr); + kfree(weight); + return -ENOMEM; + } + + /* copy old? */ + if (map->osd_state) { + memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); + memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); + memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); + kfree(map->osd_state); + kfree(map->osd_addr); + kfree(map->osd_weight); + } + + map->osd_state = state; + map->osd_weight = weight; + map->osd_addr = addr; + map->max_osd = max; + return 0; +} + /* * decode a full map. */ @@ -526,7 +560,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, max, bad); while (max--) { ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); - pi = kmalloc(sizeof(*pi), GFP_NOFS); + pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) goto bad; pi->id = ceph_decode_32(p); @@ -539,6 +573,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) __decode_pool(p, pi); __insert_pg_pool(&map->pg_pools, pi); } + + if (version >= 5 && __decode_pool_names(p, end, map) < 0) + goto bad; + ceph_decode_32_safe(p, end, map->pool_max, bad); ceph_decode_32_safe(p, end, map->flags, bad); @@ -712,7 +750,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } pi = __lookup_pg_pool(&map->pg_pools, pool); if (!pi) { - pi = kmalloc(sizeof(*pi), GFP_NOFS); + pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) { err = -ENOMEM; goto bad; @@ -722,6 +760,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } __decode_pool(p, pi); } + if (version >= 5 && __decode_pool_names(p, end, map) < 0) + goto bad; /* old_pool */ ceph_decode_32_safe(p, end, len, bad); @@ -730,10 +770,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_32_safe(p, end, pool, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); - if (pi) { - rb_erase(&pi->node, &map->pg_pools); - kfree(pi); - } + if (pi) + __remove_pg_pool(&map->pg_pools, pi); } /* new_up */ @@ -1003,12 +1041,33 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } /* + * Return acting set for given pgid. + */ +int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *acting) +{ + int rawosds[CEPH_PG_MAX_SIZE], *osds; + int i, o, num = CEPH_PG_MAX_SIZE; + + osds = calc_pg_raw(osdmap, pgid, rawosds, &num); + if (!osds) + return -1; + + /* primary is first up osd */ + o = 0; + for (i = 0; i < num; i++) + if (ceph_osd_is_up(osdmap, osds[i])) + acting[o++] = osds[i]; + return o; +} + +/* * Return primary osd for given pgid, or -1 if none. */ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) { - int rawosds[10], *osds; - int i, num = ARRAY_SIZE(rawosds); + int rawosds[CEPH_PG_MAX_SIZE], *osds; + int i, num = CEPH_PG_MAX_SIZE; osds = calc_pg_raw(osdmap, pgid, rawosds, &num); if (!osds) @@ -1016,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) /* primary is first up osd */ for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) { + if (ceph_osd_is_up(osdmap, osds[i])) return osds[i]; - break; - } return -1; } diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h index 1fb55afb2642..970b547e510d 100644 --- a/fs/ceph/osdmap.h +++ b/fs/ceph/osdmap.h @@ -23,6 +23,7 @@ struct ceph_pg_pool_info { int id; struct ceph_pg_pool v; int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; + char *name; }; struct ceph_pg_mapping { @@ -119,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol, const char *oid, struct ceph_file_layout *fl, struct ceph_osdmap *osdmap); +extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *acting); extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 26ac8b89a676..fd56451a871f 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -11,8 +11,10 @@ /* * osdmap encoding versions */ -#define CEPH_OSDMAP_INC_VERSION 4 -#define CEPH_OSDMAP_VERSION 4 +#define CEPH_OSDMAP_INC_VERSION 5 +#define CEPH_OSDMAP_INC_VERSION_EXT 5 +#define CEPH_OSDMAP_VERSION 5 +#define CEPH_OSDMAP_VERSION_EXT 5 /* * fs id @@ -56,6 +58,7 @@ struct ceph_timespec { #define CEPH_PG_LAYOUT_LINEAR 2 #define CEPH_PG_LAYOUT_HYBRID 3 +#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ /* * placement group. diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index e6f9bc57d472..d5114db70453 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -431,8 +431,7 @@ static int dup_array(u64 **dst, __le64 *src, int num) * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ -void ceph_queue_cap_snap(struct ceph_inode_info *ci, - struct ceph_snap_context *snapc) +void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; @@ -451,10 +450,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p snapc %p seq %llu used %d" - " already pending\n", inode, snapc, snapc->seq, used); + dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + struct ceph_snap_context *snapc = ci->i_head_snapc; + igrab(inode); atomic_set(&capsnap->nref, 1); @@ -463,7 +463,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, INIT_LIST_HEAD(&capsnap->flushing_item); capsnap->follows = snapc->seq - 1; - capsnap->context = ceph_get_snap_context(snapc); capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = __ceph_caps_dirty(ci); @@ -480,7 +479,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, snapshot. */ capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; - ceph_put_snap_context(ci->i_head_snapc); + capsnap->context = snapc; ci->i_head_snapc = NULL; list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); @@ -522,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->ctime = inode->i_ctime; capsnap->time_warp_seq = ci->i_time_warp_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " "still has %d dirty pages\n", inode, capsnap, capsnap->context, capsnap->context->seq, - capsnap->size, capsnap->dirty_pages); + ceph_cap_string(capsnap->dirty), capsnap->size, + capsnap->dirty_pages); return 0; } - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", inode, capsnap, capsnap->context, - capsnap->context->seq, capsnap->size); + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); spin_lock(&mdsc->snap_flush_lock); list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); @@ -602,7 +603,7 @@ more: if (lastinode) iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci, realm->cached_context); + ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); @@ -824,8 +825,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, spin_unlock(&realm->inodes_with_caps_lock); spin_unlock(&inode->i_lock); - ceph_queue_cap_snap(ci, - ci->i_snap_realm->cached_context); + ceph_queue_cap_snap(ci); iput(inode); continue; @@ -869,16 +869,20 @@ skip_inode: continue; ci = ceph_inode(inode); spin_lock(&inode->i_lock); - if (!ci->i_snap_realm) - goto split_skip_inode; - ceph_put_snap_realm(mdsc, ci->i_snap_realm); - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_get_snap_realm(mdsc, realm); -split_skip_inode: + if (list_empty(&ci->i_snap_realm_item)) { + struct ceph_snap_realm *oldrealm = + ci->i_snap_realm; + + dout(" moving %p to split realm %llx %p\n", + inode, realm->ino, realm); + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + ci->i_snap_realm = realm; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); + } spin_unlock(&inode->i_lock); iput(inode); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 75d02eaa1279..110857ba9269 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -47,10 +47,20 @@ const char *ceph_file_part(const char *s, int len) */ static void ceph_put_super(struct super_block *s) { - struct ceph_client *cl = ceph_client(s); + struct ceph_client *client = ceph_sb_to_client(s); dout("put_super\n"); - ceph_mdsc_close_sessions(&cl->mdsc); + ceph_mdsc_close_sessions(&client->mdsc); + + /* + * ensure we release the bdi before put_anon_super releases + * the device name. + */ + if (s->s_bdi == &client->backing_dev_info) { + bdi_unregister(&client->backing_dev_info); + s->s_bdi = NULL; + } + return; } @@ -636,6 +646,8 @@ static void ceph_destroy_client(struct ceph_client *client) destroy_workqueue(client->pg_inv_wq); destroy_workqueue(client->trunc_wq); + bdi_destroy(&client->backing_dev_info); + if (client->msgr) ceph_messenger_destroy(client->msgr); mempool_destroy(client->wb_pagevec_pool); @@ -876,14 +888,14 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) { int err; - sb->s_bdi = &client->backing_dev_info; - /* set ra_pages based on rsize mount option? */ if (client->mount_args->rsize >= PAGE_CACHE_SIZE) client->backing_dev_info.ra_pages = (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); + if (!err) + sb->s_bdi = &client->backing_dev_info; return err; } @@ -957,9 +969,6 @@ static void ceph_kill_sb(struct super_block *s) dout("kill_sb %p\n", s); ceph_mdsc_pre_umount(&client->mdsc); kill_anon_super(s); /* will call put_super after sb is r/o */ - if (s->s_bdi == &client->backing_dev_info) - bdi_unregister(&client->backing_dev_info); - bdi_destroy(&client->backing_dev_info); ceph_destroy_client(client); } @@ -996,9 +1005,10 @@ static int __init init_ceph(void) if (ret) goto out_icache; - pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", - CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, - CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); + pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, + CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, + CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); return 0; out_icache: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ca702c67bc66..13513b80d87f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/mempool.h> #include <linux/pagemap.h> +#include <linux/slab.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> @@ -715,8 +716,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m, extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); -extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, - struct ceph_snap_context *snapc); +extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 4797787c6a44..246a167cb913 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -18,6 +18,8 @@ #ifndef _CIFS_FS_SB_H #define _CIFS_FS_SB_H +#include <linux/backing-dev.h> + #define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ #define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ #define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ @@ -50,5 +52,6 @@ struct cifs_sb_info { #ifdef CONFIG_CIFS_DFS_UPCALL char *mountdata; /* mount options received at mount time */ #endif + struct backing_dev_info bdi; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ded66be6597c..ad235d604a0b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -103,6 +103,12 @@ cifs_read_super(struct super_block *sb, void *data, if (cifs_sb == NULL) return -ENOMEM; + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + if (rc) { + kfree(cifs_sb); + return rc; + } + #ifdef CONFIG_CIFS_DFS_UPCALL /* copy mount params to sb for use in submounts */ /* BB: should we move this after the mount so we @@ -115,6 +121,7 @@ cifs_read_super(struct super_block *sb, void *data, int len = strlen(data); cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { + bdi_destroy(&cifs_sb->bdi); kfree(sb->s_fs_info); sb->s_fs_info = NULL; return -ENOMEM; @@ -135,6 +142,7 @@ cifs_read_super(struct super_block *sb, void *data, sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; + sb->s_bdi = &cifs_sb->bdi; /* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ @@ -183,6 +191,7 @@ out_mount_failed: } #endif unload_nls(cifs_sb->local_nls); + bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb); } return rc; @@ -214,6 +223,7 @@ cifs_put_super(struct super_block *sb) #endif unload_nls(cifs_sb->local_nls); + bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb); unlock_kernel(); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ecf0ffbe2b64..0c2fd17439c8 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -502,6 +502,7 @@ struct dfs_info3_param { #define CIFS_FATTR_DFS_REFERRAL 0x1 #define CIFS_FATTR_DELETE_PENDING 0x2 #define CIFS_FATTR_NEED_REVAL 0x4 +#define CIFS_FATTR_INO_COLLISION 0x8 struct cifs_fattr { u32 cf_flags; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 35ec11716213..29b9ea244c81 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -715,6 +715,16 @@ cifs_find_inode(struct inode *inode, void *opaque) if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; + /* + * uh oh -- it's a directory. We can't use it since hardlinked dirs are + * verboten. Disable serverino and return it as if it were found, the + * caller can discard it, generate a uniqueid and retry the find + */ + if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { + fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; + cifs_autodisable_serverino(CIFS_SB(inode->i_sb)); + } + return 1; } @@ -734,15 +744,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) unsigned long hash; struct inode *inode; +retry_iget5_locked: cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid)); /* hash down to 32-bits on 32-bit arch */ hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); - - /* we have fattrs in hand, update the inode */ if (inode) { + /* was there a problematic inode number collision? */ + if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { + iput(inode); + fattr->cf_uniqueid = iunique(sb, ROOT_I); + fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; + goto retry_iget5_locked; + } + cifs_fattr_to_inode(inode, fattr); if (sb->s_flags & MS_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index a1695dcadd99..d97f9935a028 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -167,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) return -EBUSY; } + error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + if (error) + goto bdi_err; + vc->vc_sb = sb; sb->s_fs_info = vc; @@ -175,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; + sb->s_bdi = &vc->bdi; /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); @@ -200,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) return 0; error: + bdi_destroy(&vc->bdi); + bdi_err: if (root) iput(root); if (vc) @@ -210,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) static void coda_put_super(struct super_block *sb) { + bdi_destroy(&coda_vcp(sb)->bdi); coda_vcp(sb)->vc_sb = NULL; sb->s_fs_info = NULL; diff --git a/fs/compat.c b/fs/compat.c index 4b6ed03cc478..05448730f840 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1531,8 +1531,6 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c32a1b6a856b..641640dc7ae5 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -102,7 +102,6 @@ #include <linux/nbd.h> #include <linux/random.h> #include <linux/filter.h> -#include <linux/pktcdvd.h> #include <linux/hiddev.h> @@ -1126,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE) COMPATIBLE_IOCTL(PPGETPHASE) COMPATIBLE_IOCTL(PPGETFLAGS) COMPATIBLE_IOCTL(PPSETFLAGS) -/* pktcdvd */ -COMPATIBLE_IOCTL(PACKET_CTRL_CMD) /* Big A */ /* sparc only */ /* Big Q for sound/OSS */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8e48b52205aa..0b502f80c691 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group) configfs_detach_group(sd->s_element); child->d_inode->i_flags |= S_DEAD; + dont_mount(child); mutex_unlock(&child->d_inode->i_mutex); @@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item, mutex_lock(&dentry->d_inode->i_mutex); configfs_remove_dir(item); dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); mutex_unlock(&dentry->d_inode->i_mutex); d_delete(dentry); } @@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item, if (ret) { configfs_detach_item(item); dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); mutex_unlock(&dentry->d_inode->i_mutex); @@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_unlock(&configfs_symlink_mutex); configfs_detach_group(&group->cg_item); dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); mutex_unlock(&dentry->d_inode->i_mutex); d_delete(dentry); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index efb2b9400391..1cc087635a5e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -382,8 +382,8 @@ out: static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, struct ecryptfs_crypt_stat *crypt_stat) { - (*offset) = (crypt_stat->num_header_bytes_at_front - + (crypt_stat->extent_size * extent_num)); + (*offset) = ecryptfs_lower_header_size(crypt_stat) + + (crypt_stat->extent_size * extent_num); } /** @@ -835,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) set_extent_mask_and_shift(crypt_stat); crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - crypt_stat->num_header_bytes_at_front = 0; + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; else { if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) - crypt_stat->num_header_bytes_at_front = + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; else - crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE; + crypt_stat->metadata_size = PAGE_CACHE_SIZE; } } @@ -1108,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written) (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; } -static void -write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, - size_t *written) +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) { u32 flags = 0; int i; @@ -1238,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt, header_extent_size = (u32)crypt_stat->extent_size; num_header_extents_at_front = - (u16)(crypt_stat->num_header_bytes_at_front - / crypt_stat->extent_size); + (u16)(crypt_stat->metadata_size / crypt_stat->extent_size); put_unaligned_be32(header_extent_size, virt); virt += 4; put_unaligned_be16(num_header_extents_at_front, virt); @@ -1292,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, offset = ECRYPTFS_FILE_SIZE_BYTES; write_ecryptfs_marker((page_virt + offset), &written); offset += written; - write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); + ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat, + &written); offset += written; ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, &written); @@ -1382,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = -EINVAL; goto out; } - virt_len = crypt_stat->num_header_bytes_at_front; + virt_len = crypt_stat->metadata_size; order = get_order(virt_len); /* Released in this function */ virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); @@ -1428,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, header_extent_size = get_unaligned_be32(virt); virt += sizeof(__be32); num_header_extents_at_front = get_unaligned_be16(virt); - crypt_stat->num_header_bytes_at_front = - (((size_t)num_header_extents_at_front - * (size_t)header_extent_size)); + crypt_stat->metadata_size = (((size_t)num_header_extents_at_front + * (size_t)header_extent_size)); (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) - && (crypt_stat->num_header_bytes_at_front + && (crypt_stat->metadata_size < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { rc = -EINVAL; printk(KERN_WARNING "Invalid header size: [%zd]\n", - crypt_stat->num_header_bytes_at_front); + crypt_stat->metadata_size); } return rc; } @@ -1452,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, */ static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) { - crypt_stat->num_header_bytes_at_front = - ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; } /** @@ -1607,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ecryptfs_dentry, ECRYPTFS_VALIDATE_HEADER_SIZE); if (rc) { + memset(page_virt, 0, PAGE_CACHE_SIZE); rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 542f625312f3..bfc2e0f78f00 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -35,6 +35,7 @@ #include <linux/scatterlist.h> #include <linux/hash.h> #include <linux/nsproxy.h> +#include <linux/backing-dev.h> /* Version verification for shared data structures w/ userspace */ #define ECRYPTFS_VERSION_MAJOR 0x00 @@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat { u32 flags; unsigned int file_version; size_t iv_bytes; - size_t num_header_bytes_at_front; + size_t metadata_size; size_t extent_size; /* Data extent size; default is 4096 */ size_t key_size; size_t extent_shift; @@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat { struct ecryptfs_sb_info { struct super_block *wsi_sb; struct ecryptfs_mount_crypt_stat mount_crypt_stat; + struct backing_dev_info bdi; }; /* file private data. */ @@ -464,6 +466,14 @@ struct ecryptfs_daemon { extern struct mutex ecryptfs_daemon_hash_mux; +static inline size_t +ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return 0; + return crypt_stat->metadata_size; +} + static inline struct ecryptfs_file_info * ecryptfs_file_to_private(struct file *file) { @@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page); int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); int ecryptfs_read_and_validate_header_region(char *data, struct inode *ecryptfs_inode); int ecryptfs_read_and_validate_xattr_region(char *page_virt, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d3362faf3852..e2d4418affac 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -324,6 +324,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, rc = ecryptfs_read_and_validate_header_region(page_virt, ecryptfs_dentry->d_inode); if (rc) { + memset(page_virt, 0, PAGE_CACHE_SIZE); rc = ecryptfs_read_and_validate_xattr_region(page_virt, ecryptfs_dentry); if (rc) { @@ -336,7 +337,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, ecryptfs_dentry->d_sb)->mount_crypt_stat; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - file_size = (crypt_stat->num_header_bytes_at_front + file_size = (crypt_stat->metadata_size + i_size_read(lower_dentry->d_inode)); else file_size = i_size_read(lower_dentry->d_inode); @@ -388,9 +389,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " - "lower_dentry = [%s]\n", __func__, rc, - ecryptfs_dentry->d_name.name); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); goto out_d_drop; } if (lower_dentry->d_inode) @@ -417,9 +418,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " - "lower_dentry = [%s]\n", __func__, rc, - encrypted_and_encoded_name); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); goto out_d_drop; } lookup_and_interpose: @@ -456,8 +457,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); old_dentry->d_inode->i_nlink = ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; i_size_write(new_dentry->d_inode, file_size_save); @@ -648,38 +649,17 @@ out_lock: return rc; } -static int -ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, + size_t *bufsiz) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); char *lower_buf; - size_t lower_bufsiz; - struct dentry *lower_dentry; - struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - char *plaintext_name; - size_t plaintext_name_size; + size_t lower_bufsiz = PATH_MAX; mm_segment_t old_fs; int rc; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->readlink) { - rc = -EINVAL; - goto out; - } - mount_crypt_stat = &ecryptfs_superblock_to_private( - dentry->d_sb)->mount_crypt_stat; - /* - * If the lower filename is encrypted, it will result in a significantly - * longer name. If needed, truncate the name after decode and decrypt. - */ - if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) - lower_bufsiz = PATH_MAX; - else - lower_bufsiz = bufsiz; - /* Released in this function */ lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); - if (lower_buf == NULL) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, lower_bufsiz); + if (!lower_buf) { rc = -ENOMEM; goto out; } @@ -689,29 +669,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) (char __user *)lower_buf, lower_bufsiz); set_fs(old_fs); - if (rc >= 0) { - rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, - &plaintext_name_size, - dentry, lower_buf, - rc); - if (rc) { - printk(KERN_ERR "%s: Error attempting to decode and " - "decrypt filename; rc = [%d]\n", __func__, - rc); - goto out_free_lower_buf; - } - /* Check for bufsiz <= 0 done in sys_readlinkat() */ - rc = copy_to_user(buf, plaintext_name, - min((size_t) bufsiz, plaintext_name_size)); - if (rc) - rc = -EFAULT; - else - rc = plaintext_name_size; - kfree(plaintext_name); - fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); - } -out_free_lower_buf: + if (rc < 0) + goto out; + lower_bufsiz = rc; + rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, + lower_buf, lower_bufsiz); +out: kfree(lower_buf); + return rc; +} + +static int +ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + char *kbuf; + size_t kbufsiz, copied; + int rc; + + rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz); + if (rc) + goto out; + copied = min_t(size_t, bufsiz, kbufsiz); + rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied; + kfree(kbuf); + fsstack_copy_attr_atime(dentry->d_inode, + ecryptfs_dentry_to_lower(dentry)->d_inode); out: return rc; } @@ -769,7 +751,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, { loff_t lower_size; - lower_size = crypt_stat->num_header_bytes_at_front; + lower_size = ecryptfs_lower_header_size(crypt_stat); if (upper_size != 0) { loff_t num_extents; @@ -1016,6 +998,28 @@ out: return rc; } +int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + int rc = 0; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + generic_fillattr(dentry->d_inode, stat); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + char *target; + size_t targetsiz; + + rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); + if (!rc) { + kfree(target); + stat->size = targetsiz; + } + } + return rc; +} + int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -1040,7 +1044,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, lower_dentry = ecryptfs_dentry_to_lower(dentry); if (!lower_dentry->d_inode->i_op->setxattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); @@ -1058,7 +1062,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, int rc = 0; if (!lower_dentry->d_inode->i_op->getxattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); @@ -1085,7 +1089,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) lower_dentry = ecryptfs_dentry_to_lower(dentry); if (!lower_dentry->d_inode->i_op->listxattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); @@ -1102,7 +1106,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) lower_dentry = ecryptfs_dentry_to_lower(dentry); if (!lower_dentry->d_inode->i_op->removexattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); @@ -1133,6 +1137,7 @@ const struct inode_operations ecryptfs_symlink_iops = { .put_link = ecryptfs_put_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr_link, .setxattr = ecryptfs_setxattr, .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index af1a8f01ebac..760983d0f25e 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -497,17 +497,25 @@ struct kmem_cache *ecryptfs_sb_info_cache; static int ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) { + struct ecryptfs_sb_info *esi; int rc = 0; /* Released in ecryptfs_put_super() */ ecryptfs_set_superblock_private(sb, kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL)); - if (!ecryptfs_superblock_to_private(sb)) { + esi = ecryptfs_superblock_to_private(sb); + if (!esi) { ecryptfs_printk(KERN_WARNING, "Out of memory\n"); rc = -ENOMEM; goto out; } + + rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + if (rc) + goto out; + + sb->s_bdi = &esi->bdi; sb->s_op = &ecryptfs_sops; /* Released through deactivate_super(sb) from get_sb_nodev */ sb->s_root = d_alloc(NULL, &(const struct qstr) { diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index d491237c98e7..2ee9a3a7b68c 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -83,6 +83,19 @@ out: return rc; } +static void strip_xattr_flag(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + size_t written; + + crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR; + ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat, + &written); + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } +} + /** * Header Extent: * Octets 0-7: Unencrypted file size (big-endian) @@ -98,19 +111,6 @@ out: * (big-endian) * Octet 26: Begin RFC 2440 authentication token packet set */ -static void set_header_info(char *page_virt, - struct ecryptfs_crypt_stat *crypt_stat) -{ - size_t written; - size_t save_num_header_bytes_at_front = - crypt_stat->num_header_bytes_at_front; - - crypt_stat->num_header_bytes_at_front = - ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; - ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written); - crypt_stat->num_header_bytes_at_front = - save_num_header_bytes_at_front; -} /** * ecryptfs_copy_up_encrypted_with_header @@ -136,8 +136,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, * num_extents_per_page) + extent_num_in_page); size_t num_header_extents_at_front = - (crypt_stat->num_header_bytes_at_front - / crypt_stat->extent_size); + (crypt_stat->metadata_size / crypt_stat->extent_size); if (view_extent_num < num_header_extents_at_front) { /* This is a header extent */ @@ -147,9 +146,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, memset(page_virt, 0, PAGE_CACHE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { + size_t written; + rc = ecryptfs_read_xattr_region( page_virt, page->mapping->host); - set_header_info(page_virt, crypt_stat); + strip_xattr_flag(page_virt + 16, crypt_stat); + ecryptfs_write_header_metadata(page_virt + 20, + crypt_stat, + &written); } kunmap_atomic(page_virt, KM_USER0); flush_dcache_page(page); @@ -162,7 +166,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is an encrypted data extent */ loff_t lower_offset = ((view_extent_num * crypt_stat->extent_size) - - crypt_stat->num_header_bytes_at_front); + - crypt_stat->metadata_size); rc = ecryptfs_read_lower_page_segment( page, (lower_offset >> PAGE_CACHE_SHIFT), diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index fcef41c1d2cf..0c0ae491d231 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -86,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode) if (lower_dentry->d_inode) { fput(inode_info->lower_file); inode_info->lower_file = NULL; - d_drop(lower_dentry); } } ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); @@ -123,6 +122,7 @@ static void ecryptfs_put_super(struct super_block *sb) lock_kernel(); ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); ecryptfs_set_superblock_private(sb, NULL); diff --git a/fs/exec.c b/fs/exec.c index 49cdaa19e5b9..e6e94c626c2c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1387,8 +1387,6 @@ int do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 8442e353309f..22721b2fd890 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -35,6 +35,7 @@ #include <linux/fs.h> #include <linux/time.h> +#include <linux/backing-dev.h> #include "common.h" /* FIXME: Remove once pnfs hits mainline @@ -84,6 +85,7 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ + struct backing_dev_info bdi; /* register our bdi with VFS */ struct pnfs_osd_data_map data_map; /* Default raid to use * FIXME: Needed ? diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 18e57ea1e5b4..03149b9a5178 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -302,6 +302,7 @@ static void exofs_put_super(struct super_block *sb) _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], sbi->layout.s_pid); + bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; } @@ -546,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + if (ret) + goto free_bdi; + /* use mount options to fill superblock */ od = osduld_path_lookup(opts->dev_name); if (IS_ERR(od)) { @@ -612,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) } /* set up operation vectors */ + sb->s_bdi = &sbi->bdi; sb->s_fs_info = sbi; sb->s_op = &exofs_sops; sb->s_export_op = &exofs_export_ops; @@ -643,6 +649,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) return 0; free_sbi: + bdi_destroy(&sbi->bdi); +free_bdi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", opts->dev_name, sbi->layout.s_pid, ret); exofs_free_sbi(sbi); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 94c8ee81f5e1..236b834b4ca8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3879,6 +3879,7 @@ static int ext4_xattr_fiemap(struct inode *inode, physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; flags |= FIEMAP_EXTENT_DATA_INLINE; + brelse(iloc.bh); } else { /* external block */ physical = EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5381802d6052..81d605412844 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5375,7 +5375,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) } else { struct ext4_iloc iloc; - err = ext4_get_inode_loc(inode, &iloc); + err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; if (wbc->sync_mode == WB_SYNC_ALL) @@ -5386,6 +5386,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) (unsigned long long)iloc.bh->b_blocknr); err = -EIO; } + brelse(iloc.bh); } return err; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bde9d0b170c2..b423a364dca3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2535,6 +2535,17 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); + if (test_opt(sb, DISCARD)) { + ext4_fsblk_t discard_block; + + discard_block = entry->start_blk + + ext4_group_first_block_no(sb, entry->group); + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); + sb_issue_discard(sb, discard_block, entry->count); + } + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); @@ -2556,16 +2567,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); - if (test_opt(sb, DISCARD)) { - ext4_fsblk_t discard_block; - - discard_block = entry->start_blk + - ext4_group_first_block_no(sb, entry->group); - trace_ext4_discard_blocks(sb, - (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); - } kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); } diff --git a/fs/ioctl.c b/fs/ioctl.c index 6c751106c2e5..7faefb4da939 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) #ifdef CONFIG_BLOCK -#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) -#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} /** * __generic_block_fiemap - FIEMAP for block based inodes (no locking) - * @inode - the inode to map - * @arg - the pointer to userspace where we copy everything to - * @get_block - the fs's get_block function + * @inode: the inode to map + * @fieinfo: the fiemap info struct that will be passed back to userspace + * @start: where to start mapping in the inode + * @len: how much space to map + * @get_block: the fs's get_block function * * This does FIEMAP for block based inodes. Basically it will just loop * through get_block until we hit the number of extents we want to map, or we @@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) */ int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) + struct fiemap_extent_info *fieinfo, loff_t start, + loff_t len, get_block_t *get_block) { - struct buffer_head tmp; - unsigned long long start_blk; - long long length = 0, map_len = 0; + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; - int ret = 0, past_eof = 0, whole_file = 0; + bool past_eof = false, whole_file = false; + int ret = 0; - if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) return ret; - start_blk = logical_to_blk(inode, start); - - length = (long long)min_t(u64, len, i_size_read(inode)); - if (length < len) - whole_file = 1; + /* + * Either the i_mutex or other appropriate locking needs to be held + * since we expect isize to not change at all through the duration of + * this call. + */ + if (len >= isize) { + whole_file = true; + len = isize; + } - map_len = length; + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = map_len; + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; - ret = get_block(inode, start_blk, &tmp, 0); + ret = get_block(inode, start_blk, &map_bh, 0); if (ret) break; /* HOLE */ - if (!buffer_mapped(&tmp)) { - length -= blk_to_logical(inode, 1); + if (!buffer_mapped(&map_bh)) { start_blk++; /* - * we want to handle the case where there is an + * We want to handle the case where there is an * allocated block at the front of the file, and then * nothing but holes up to the end of the file properly, * to make sure that extent at the front gets properly * marked with FIEMAP_EXTENT_LAST */ if (!past_eof && - blk_to_logical(inode, start_blk) >= - blk_to_logical(inode, 0)+i_size_read(inode)) + blk_to_logical(inode, start_blk) >= isize) past_eof = 1; /* - * first hole after going past the EOF, this is our + * First hole after going past the EOF, this is our * last extent */ if (past_eof && size) { @@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode, ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); - break; + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; } /* if we have holes up to/past EOF then we're done */ - if (length <= 0 || past_eof) + if (start_blk > last_blk || past_eof || ret) break; } else { /* - * we have gone over the length of what we wanted to + * We have gone over the length of what we wanted to * map, and it wasn't the entire file, so add the extent * we got last time and exit. * @@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode, * are good to go, just add the extent to the fieinfo * and break */ - if (length <= 0 && !whole_file) { + if (start_blk > last_blk && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); @@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode, } logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, tmp.b_blocknr); - size = tmp.b_size; + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; flags = FIEMAP_EXTENT_MERGED; - length -= tmp.b_size; start_blk += logical_to_blk(inode, size); /* @@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode, * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ - if (!past_eof && - logical+size >= - blk_to_logical(inode, 0)+i_size_read(inode)) - past_eof = 1; + if (!past_eof && logical + size >= isize) + past_eof = true; } cond_resched(); } while (1); - /* if ret is 1 then we just hit the end of the extent array */ + /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9dd126276c9f..ed9ba6fe04f5 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -61,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &jfs_aops; } else { - inode->i_op = &jfs_symlink_inode_operations; + inode->i_op = &jfs_fast_symlink_inode_operations; /* * The inline data should be null-terminated, but * don't let on-disk corruption crash the kernel diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 6c4dfcbf3f55..9e2f6a721668 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -196,7 +196,7 @@ int dbMount(struct inode *ipbmap) bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); - bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); + bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); @@ -288,7 +288,7 @@ int dbSync(struct inode *ipbmap) dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); - dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); + dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight); dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); @@ -1441,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * tree index of this allocation group within the control page. */ agperlev = - (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; + (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); /* dmap control page trees fan-out by 4 and a single allocation @@ -1460,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * the subtree to find the leftmost leaf that describes this * free space. */ - for (k = bmp->db_agheigth; k > 0; k--) { + for (k = bmp->db_agheight; k > 0; k--) { for (n = 0, m = (ti << 2) + 1; n < 4; n++) { if (l2nb <= dcp->stree[m + n]) { ti = m + n; @@ -3607,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap) } /* - * compute db_aglevel, db_agheigth, db_width, db_agstart: + * compute db_aglevel, db_agheight, db_width, db_agstart: * an ag is covered in aglevel dmapctl summary tree, * at agheight level height (from leaf) with agwidth number of nodes * each, which starts at agstart index node of the smmary tree node @@ -3616,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap) bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); l2nl = bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); - bmp->db_agheigth = l2nl >> 1; - bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); - for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; + bmp->db_agheight = l2nl >> 1; + bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1)); + for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0; i--) { bmp->db_agstart += n; n <<= 2; diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 1a6eb41569bc..6dcb906c55d8 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h @@ -210,7 +210,7 @@ struct dbmap_disk { __le32 dn_maxag; /* 4: max active alloc group number */ __le32 dn_agpref; /* 4: preferred alloc group (hint) */ __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ - __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ + __le32 dn_agheight; /* 4: height in dmapctl of the AG */ __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ __le32 dn_agstart; /* 4: start tree index at AG height */ __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ @@ -229,7 +229,7 @@ struct dbmap { int dn_maxag; /* max active alloc group number */ int dn_agpref; /* preferred alloc group (hint) */ int dn_aglevel; /* dmapctl level holding the AG */ - int dn_agheigth; /* height in dmapctl of the AG */ + int dn_agheight; /* height in dmapctl of the AG */ int dn_agwidth; /* width in dmapctl of the AG */ int dn_agstart; /* start tree index at AG height */ int dn_agl2size; /* l2 num of blks per alloc group */ @@ -255,7 +255,7 @@ struct bmap { #define db_agsize db_bmap.dn_agsize #define db_agl2size db_bmap.dn_agl2size #define db_agwidth db_bmap.dn_agwidth -#define db_agheigth db_bmap.dn_agheigth +#define db_agheight db_bmap.dn_agheight #define db_agstart db_bmap.dn_agstart #define db_numag db_bmap.dn_numag #define db_maxlevel db_bmap.dn_maxlevel diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 79e2c79661df..9e6bda30a6e8 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -48,5 +48,6 @@ extern const struct file_operations jfs_dir_operations; extern const struct inode_operations jfs_file_inode_operations; extern const struct file_operations jfs_file_operations; extern const struct inode_operations jfs_symlink_inode_operations; +extern const struct inode_operations jfs_fast_symlink_inode_operations; extern const struct dentry_operations jfs_ci_dentry_operations; #endif /* _H_JFS_INODE */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 4a3e9f39c21d..a9cf8e8675be 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -956,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, */ if (ssize <= IDATASIZE) { - ip->i_op = &jfs_symlink_inode_operations; + ip->i_op = &jfs_fast_symlink_inode_operations; i_fastsymlink = JFS_IP(ip)->i_inline; memcpy(i_fastsymlink, name, ssize); @@ -978,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, else { jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); - ip->i_op = &page_symlink_inode_operations; + ip->i_op = &jfs_symlink_inode_operations; ip->i_mapping->a_ops = &jfs_aops; /* diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 7f24a0bb08ca..1aba0039f1c9 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) struct inode *iplist[1]; struct jfs_superblock *j_sb, *j_sb2; uint old_agsize; + int agsizechanged = 0; struct buffer_head *bh, *bh2; /* If the volume hasn't grown, get out now */ @@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) */ if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) goto error_out; + + agsizechanged |= (bmp->db_agsize != old_agsize); + /* * the map now has extended to cover additional nblocks: * dn_mapsize = oldMapsize + nblocks; @@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * will correctly identify the new ag); */ /* if new AG size the same as old AG size, done! */ - if (bmp->db_agsize != old_agsize) { + if (agsizechanged) { if ((rc = diExtendFS(ipimap, ipbmap))) goto error_out; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 157382fa6256..b66832ac33ac 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -446,10 +446,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) /* initialize the mount flag and determine the default error handler */ flag = JFS_ERR_REMOUNT_RO; - if (!parse_options((char *) data, sb, &newLVSize, &flag)) { - kfree(sbi); - return -EINVAL; - } + if (!parse_options((char *) data, sb, &newLVSize, &flag)) + goto out_kfree; sbi->flag = flag; #ifdef CONFIG_JFS_POSIX_ACL @@ -458,7 +456,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (newLVSize) { printk(KERN_ERR "resize option for remount only\n"); - return -EINVAL; + goto out_kfree; } /* @@ -478,7 +476,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) inode = new_inode(sb); if (inode == NULL) { ret = -ENOMEM; - goto out_kfree; + goto out_unload; } inode->i_ino = 0; inode->i_nlink = 1; @@ -550,9 +548,10 @@ out_mount_failed: make_bad_inode(sbi->direct_inode); iput(sbi->direct_inode); sbi->direct_inode = NULL; -out_kfree: +out_unload: if (sbi->nls_tab) unload_nls(sbi->nls_tab); +out_kfree: kfree(sbi); return ret; } diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 4af1a05aad0a..205b946d8e0d 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -const struct inode_operations jfs_symlink_inode_operations = { +const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jfs_follow_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + +const struct inode_operations jfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c index 84e36f52fe95..76c242fbe1b0 100644 --- a/fs/logfs/gc.c +++ b/fs/logfs/gc.c @@ -459,6 +459,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target) struct logfs_block *block; int round, progress, last_progress = 0; + /* + * Doing too many changes to the segfile at once would result + * in a large number of aliases. Write the journal before + * things get out of hand. + */ + if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) + logfs_write_anchor(sb); + if (no_free_segments(sb) >= target && super->s_no_object_aliases < MAX_OBJ_ALIASES) return; diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 33bd260b8309..fb0a613f885b 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -389,7 +389,10 @@ static void journal_get_erase_count(struct logfs_area *area) static int journal_erase_segment(struct logfs_area *area) { struct super_block *sb = area->a_sb; - struct logfs_segment_header sh; + union { + struct logfs_segment_header sh; + unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)]; + } u; u64 ofs; int err; @@ -397,20 +400,21 @@ static int journal_erase_segment(struct logfs_area *area) if (err) return err; - sh.pad = 0; - sh.type = SEG_JOURNAL; - sh.level = 0; - sh.segno = cpu_to_be32(area->a_segno); - sh.ec = cpu_to_be32(area->a_erase_count); - sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); - sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + memset(&u, 0, sizeof(u)); + u.sh.pad = 0; + u.sh.type = SEG_JOURNAL; + u.sh.level = 0; + u.sh.segno = cpu_to_be32(area->a_segno); + u.sh.ec = cpu_to_be32(area->a_erase_count); + u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4); /* This causes a bug in segment.c. Not yet. */ //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); ofs = dev_ofs(sb, area->a_segno, 0); - area->a_used_bytes = ALIGN(sizeof(sh), 16); - logfs_buf_write(area, ofs, &sh, sizeof(sh)); + area->a_used_bytes = sizeof(u); + logfs_buf_write(area, ofs, &u, sizeof(u)); return 0; } @@ -494,6 +498,8 @@ static void account_shadows(struct super_block *sb) btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + btree_grim_visitor32(&tree->segment_map, 0, NULL); + tree->no_shadowed_segments = 0; if (li->li_block) { /* @@ -607,9 +613,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, if (len == 0) return logfs_write_header(super, header, 0, type); + BUG_ON(len > sb->s_blocksize); compr_len = logfs_compress(buf, data, len, sb->s_blocksize); if (compr_len < 0 || type == JE_ANCHOR) { - BUG_ON(len > sb->s_blocksize); memcpy(data, buf, len); compr_len = len; compr = COMPR_NONE; @@ -661,6 +667,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, if (ofs < 0) return ofs; logfs_buf_write(area, ofs, super->s_compressed_je, len); + BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES); super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); return 0; } diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index b84b0eec6024..0a3df1a0c936 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -257,10 +257,14 @@ struct logfs_shadow { * struct shadow_tree * @new: shadows where old_ofs==0, indexed by new_ofs * @old: shadows where old_ofs!=0, indexed by old_ofs + * @segment_map: bitfield of segments containing shadows + * @no_shadowed_segment: number of segments containing shadows */ struct shadow_tree { struct btree_head64 new; struct btree_head64 old; + struct btree_head32 segment_map; + int no_shadowed_segments; }; struct object_alias_item { @@ -305,13 +309,14 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, level_t level, int child_no, __be64 val); struct logfs_block_ops { void (*write_block)(struct logfs_block *block); - gc_level_t (*block_level)(struct logfs_block *block); void (*free_block)(struct super_block *sb, struct logfs_block*block); int (*write_alias)(struct super_block *sb, struct logfs_block *block, write_alias_t *write_one_alias); }; +#define MAX_JOURNAL_ENTRIES 256 + struct logfs_super { struct mtd_info *s_mtd; /* underlying device */ struct block_device *s_bdev; /* underlying device */ @@ -378,7 +383,7 @@ struct logfs_super { u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ u64 s_last_version; struct logfs_area *s_journal_area; /* open journal segment */ - __be64 s_je_array[64]; + __be64 s_je_array[MAX_JOURNAL_ENTRIES]; int s_no_je; int s_sum_index; /* for the 12 summaries */ @@ -722,4 +727,10 @@ static inline struct logfs_area *get_area(struct super_block *sb, return logfs_super(sb)->s_area[(__force u8)gc_level]; } +static inline void logfs_mempool_destroy(mempool_t *pool) +{ + if (pool) + mempool_destroy(pool); +} + #endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index bff40253dfb2..3159db6958e5 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -430,25 +430,6 @@ static void inode_write_block(struct logfs_block *block) } } -static gc_level_t inode_block_level(struct logfs_block *block) -{ - BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER); - return GC_LEVEL(LOGFS_MAX_LEVELS); -} - -static gc_level_t indirect_block_level(struct logfs_block *block) -{ - struct page *page; - struct inode *inode; - u64 bix; - level_t level; - - page = block->page; - inode = page->mapping->host; - logfs_unpack_index(page->index, &bix, &level); - return expand_level(inode->i_ino, level); -} - /* * This silences a false, yet annoying gcc warning. I hate it when my editor * jumps into bitops.h each time I recompile this file. @@ -587,14 +568,12 @@ static void indirect_free_block(struct super_block *sb, static struct logfs_block_ops inode_block_ops = { .write_block = inode_write_block, - .block_level = inode_block_level, .free_block = inode_free_block, .write_alias = inode_write_alias, }; struct logfs_block_ops indirect_block_ops = { .write_block = indirect_write_block, - .block_level = indirect_block_level, .free_block = indirect_free_block, .write_alias = indirect_write_alias, }; @@ -1241,6 +1220,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) mempool_free(shadow, super->s_shadow_pool); } +static void mark_segment(struct shadow_tree *tree, u32 segno) +{ + int err; + + if (!btree_lookup32(&tree->segment_map, segno)) { + err = btree_insert32(&tree->segment_map, segno, (void *)1, + GFP_NOFS); + BUG_ON(err); + tree->no_shadowed_segments++; + } +} + /** * fill_shadow_tree - Propagate shadow tree changes due to a write * @inode: Inode owning the page @@ -1288,6 +1279,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page, super->s_dirty_used_bytes += shadow->new_len; super->s_dirty_free_bytes += shadow->old_len; + mark_segment(tree, shadow->old_ofs >> super->s_segshift); + mark_segment(tree, shadow->new_ofs >> super->s_segshift); } } @@ -1845,19 +1838,37 @@ static int __logfs_truncate(struct inode *inode, u64 size) return logfs_truncate_direct(inode, size); } -int logfs_truncate(struct inode *inode, u64 size) +/* + * Truncate, by changing the segment file, can consume a fair amount + * of resources. So back off from time to time and do some GC. + * 8 or 2048 blocks should be well within safety limits even if + * every single block resided in a different segment. + */ +#define TRUNCATE_STEP (8 * 1024 * 1024) +int logfs_truncate(struct inode *inode, u64 target) { struct super_block *sb = inode->i_sb; - int err; + u64 size = i_size_read(inode); + int err = 0; - logfs_get_wblocks(sb, NULL, 1); - err = __logfs_truncate(inode, size); - if (!err) - err = __logfs_write_inode(inode, 0); - logfs_put_wblocks(sb, NULL, 1); + size = ALIGN(size, TRUNCATE_STEP); + while (size > target) { + if (size > TRUNCATE_STEP) + size -= TRUNCATE_STEP; + else + size = 0; + if (size < target) + size = target; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, target); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + } if (!err) - err = vmtruncate(inode, size); + err = vmtruncate(inode, target); /* I don't trust error recovery yet. */ WARN_ON(err); @@ -2251,8 +2262,6 @@ void logfs_cleanup_rw(struct super_block *sb) struct logfs_super *super = logfs_super(sb); destroy_meta_inode(super->s_segfile_inode); - if (super->s_block_pool) - mempool_destroy(super->s_block_pool); - if (super->s_shadow_pool) - mempool_destroy(super->s_shadow_pool); + logfs_mempool_destroy(super->s_block_pool); + logfs_mempool_destroy(super->s_shadow_pool); } diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 801a3a141625..f77ce2b470ba 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -183,14 +183,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block, return 0; } -static gc_level_t btree_block_level(struct logfs_block *block) -{ - return expand_level(block->ino, block->level); -} - static struct logfs_block_ops btree_block_ops = { .write_block = btree_write_block, - .block_level = btree_block_level, .free_block = __free_block, .write_alias = btree_write_alias, }; @@ -919,7 +913,7 @@ err: for (i--; i >= 0; i--) free_area(super->s_area[i]); free_area(super->s_journal_area); - mempool_destroy(super->s_alias_pool); + logfs_mempool_destroy(super->s_alias_pool); return -ENOMEM; } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index b60bfac3263c..d7c23ed8349a 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -12,6 +12,7 @@ #include "logfs.h" #include <linux/bio.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <linux/mtd/mtd.h> #include <linux/statfs.h> #include <linux/buffer_head.h> @@ -137,6 +138,10 @@ static int logfs_sb_set(struct super_block *sb, void *_super) sb->s_fs_info = super; sb->s_mtd = super->s_mtd; sb->s_bdev = super->s_bdev; + if (sb->s_bdev) + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; + if (sb->s_mtd) + sb->s_bdi = sb->s_mtd->backing_dev_info; return 0; } @@ -328,27 +333,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) goto fail; sb->s_root = d_alloc_root(rootdir); - if (!sb->s_root) - goto fail2; + if (!sb->s_root) { + iput(rootdir); + goto fail; + } super->s_erase_page = alloc_pages(GFP_KERNEL, 0); if (!super->s_erase_page) - goto fail2; + goto fail; memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); /* FIXME: check for read-only mounts */ err = logfs_make_writeable(sb); if (err) - goto fail3; + goto fail1; log_super("LogFS: Finished mounting\n"); simple_set_mnt(mnt, sb); return 0; -fail3: +fail1: __free_page(super->s_erase_page); -fail2: - iput(rootdir); fail: iput(logfs_super(sb)->s_master_inode); return -EIO; @@ -452,6 +457,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only) btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + btree_init_mempool32(&super->s_shadow_tree.segment_map, + super->s_btree_pool); ret = logfs_init_mapping(sb); if (ret) @@ -516,8 +523,8 @@ static void logfs_kill_sb(struct super_block *sb) if (super->s_erase_page) __free_page(super->s_erase_page); super->s_devops->put_device(sb); - mempool_destroy(super->s_btree_pool); - mempool_destroy(super->s_alias_pool); + logfs_mempool_destroy(super->s_btree_pool); + logfs_mempool_destroy(super->s_alias_pool); kfree(super); log_super("LogFS: Finished unmounting\n"); } diff --git a/fs/namei.c b/fs/namei.c index a7dce91a7e42..b86b96fe1dc3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1641,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (nd->last.name[nd->last.len]) { if (open_flag & O_CREAT) goto exit; - nd->flags |= LOOKUP_DIRECTORY; + nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW; } /* just plain open? */ @@ -1830,6 +1830,8 @@ reval: } if (open_flag & O_DIRECTORY) nd.flags |= LOOKUP_DIRECTORY; + if (!(open_flag & O_NOFOLLOW)) + nd.flags |= LOOKUP_FOLLOW; filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path holder; @@ -1837,7 +1839,7 @@ reval: void *cookie; error = -ELOOP; /* S_ISDIR part is a temporary automount kludge */ - if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode)) + if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode)) goto exit_dput; if (count++ == 32) goto exit_dput; @@ -2174,8 +2176,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) error = security_inode_rmdir(dir, dentry); if (!error) { error = dir->i_op->rmdir(dir, dentry); - if (!error) + if (!error) { dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); + } } } mutex_unlock(&dentry->d_inode->i_mutex); @@ -2259,7 +2263,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!error) { error = dir->i_op->unlink(dir, dentry); if (!error) - dentry->d_inode->i_flags |= S_DEAD; + dont_mount(dentry); } } mutex_unlock(&dentry->d_inode->i_mutex); @@ -2570,17 +2574,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, return error; target = new_dentry->d_inode; - if (target) { + if (target) mutex_lock(&target->i_mutex); - dentry_unhash(new_dentry); - } if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) error = -EBUSY; - else + else { + if (target) + dentry_unhash(new_dentry); error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + } if (target) { - if (!error) + if (!error) { target->i_flags |= S_DEAD; + dont_mount(new_dentry); + } mutex_unlock(&target->i_mutex); if (d_unhashed(new_dentry)) d_rehash(new_dentry); @@ -2612,7 +2619,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); if (!error) { if (target) - target->i_flags |= S_DEAD; + dont_mount(new_dentry); if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry, new_dentry); } diff --git a/fs/namespace.c b/fs/namespace.c index 8174c8ab5c70..f20cb57d1067 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1432,7 +1432,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) err = -ENOENT; mutex_lock(&path->dentry->d_inode->i_mutex); - if (IS_DEADDIR(path->dentry->d_inode)) + if (cant_mount(path->dentry)) goto out_unlock; err = security_sb_check_sb(mnt, path); @@ -1623,7 +1623,7 @@ static int do_move_mount(struct path *path, char *old_name) err = -ENOENT; mutex_lock(&path->dentry->d_inode->i_mutex); - if (IS_DEADDIR(path->dentry->d_inode)) + if (cant_mount(path->dentry)) goto out1; if (d_unlinked(path->dentry)) @@ -2234,7 +2234,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!check_mnt(root.mnt)) goto out2; error = -ENOENT; - if (IS_DEADDIR(new.dentry->d_inode)) + if (cant_mount(old.dentry)) goto out2; if (d_unlinked(new.dentry)) goto out2; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index cf98da1be23e..fa3385154023 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_blocksize_bits = 10; sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; + sb->s_bdi = &server->bdi; server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); + error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + if (error) + goto out_bdi; + server->ncp_filp = ncp_filp; server->ncp_sock = sock; @@ -719,6 +724,8 @@ out_fput2: if (server->info_filp) fput(server->info_filp); out_fput: + bdi_destroy(&server->bdi); +out_bdi: /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: * * The previously used put_filp(ncp_filp); was bogous, since @@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb) kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); + bdi_destroy(&server->bdi); kfree(server->priv.data); kfree(server->auth.object_name); vfree(server->rxbuf); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2a3d352c0bff..acc9c4943b84 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -966,6 +966,8 @@ out_error: static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; + target->rsize = source->rsize; + target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; @@ -1294,7 +1296,8 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| + NFS_CAP_POSIX_LOCK; server->options = data->options; /* Get a client record */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 15671245c6ee..ea61d26e7871 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -24,6 +24,8 @@ static void nfs_do_free_delegation(struct nfs_delegation *delegation) { + if (delegation->cred) + put_rpccred(delegation->cred); kfree(delegation); } @@ -36,13 +38,7 @@ static void nfs_free_delegation_callback(struct rcu_head *head) static void nfs_free_delegation(struct nfs_delegation *delegation) { - struct rpc_cred *cred; - - cred = rcu_dereference(delegation->cred); - rcu_assign_pointer(delegation->cred, NULL); call_rcu(&delegation->rcu, nfs_free_delegation_callback); - if (cred) - put_rpccred(cred); } void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) @@ -129,21 +125,35 @@ again: */ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) { - struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct rpc_cred *oldcred; + struct nfs_delegation *delegation; + struct rpc_cred *oldcred = NULL; - if (delegation == NULL) - return; - memcpy(delegation->stateid.data, res->delegation.data, - sizeof(delegation->stateid.data)); - delegation->type = res->delegation_type; - delegation->maxsize = res->maxsize; - oldcred = delegation->cred; - delegation->cred = get_rpccred(cred); - clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); - NFS_I(inode)->delegation_state = delegation->type; - smp_wmb(); - put_rpccred(oldcred); + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { + memcpy(delegation->stateid.data, res->delegation.data, + sizeof(delegation->stateid.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; + delegation->cred = get_rpccred(cred); + clear_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags); + NFS_I(inode)->delegation_state = delegation->type; + spin_unlock(&delegation->lock); + put_rpccred(oldcred); + rcu_read_unlock(); + } else { + /* We appear to have raced with a delegation return. */ + spin_unlock(&delegation->lock); + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); + } + } else { + rcu_read_unlock(); + } } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -166,9 +176,13 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation return inode; } -static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) +static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, + const nfs4_stateid *stateid, + struct nfs_client *clp) { - struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); + struct nfs_delegation *delegation = + rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); if (delegation == NULL) goto nomatch; @@ -195,7 +209,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_delegation *delegation; + struct nfs_delegation *delegation, *old_delegation; struct nfs_delegation *freeme = NULL; int status = 0; @@ -213,10 +227,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct spin_lock_init(&delegation->lock); spin_lock(&clp->cl_lock); - if (rcu_dereference(nfsi->delegation) != NULL) { - if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, - sizeof(delegation->stateid)) == 0 && - delegation->type == nfsi->delegation->type) { + old_delegation = rcu_dereference_protected(nfsi->delegation, + lockdep_is_held(&clp->cl_lock)); + if (old_delegation != NULL) { + if (memcmp(&delegation->stateid, &old_delegation->stateid, + sizeof(old_delegation->stateid)) == 0 && + delegation->type == old_delegation->type) { goto out; } /* @@ -226,12 +242,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct dfprintk(FILE, "%s: server %s handed out " "a duplicate delegation!\n", __func__, clp->cl_hostname); - if (delegation->type <= nfsi->delegation->type) { + if (delegation->type <= old_delegation->type) { freeme = delegation; delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, NULL); + freeme = nfs_detach_delegation_locked(nfsi, NULL, clp); } list_add_rcu(&delegation->super_list, &clp->cl_delegations); nfsi->delegation_state = delegation->type; @@ -301,7 +317,7 @@ restart: if (inode == NULL) continue; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); + delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); spin_unlock(&clp->cl_lock); rcu_read_unlock(); if (delegation != NULL) { @@ -330,9 +346,9 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - if (rcu_dereference(nfsi->delegation) != NULL) { + if (rcu_access_pointer(nfsi->delegation) != NULL) { spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL); + delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); spin_unlock(&clp->cl_lock); if (delegation != NULL) nfs_do_return_delegation(inode, delegation, 0); @@ -346,9 +362,9 @@ int nfs_inode_return_delegation(struct inode *inode) struct nfs_delegation *delegation; int err = 0; - if (rcu_dereference(nfsi->delegation) != NULL) { + if (rcu_access_pointer(nfsi->delegation) != NULL) { spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, NULL); + delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); spin_unlock(&clp->cl_lock); if (delegation != NULL) { nfs_msync_inode(inode); @@ -526,7 +542,7 @@ restart: if (inode == NULL) continue; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); + delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp); spin_unlock(&clp->cl_lock); rcu_read_unlock(); if (delegation != NULL) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c6f2750648f4..a7bb5c694aa3 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -837,6 +837,8 @@ out_zap_parent: /* If we have submounts, don't unhash ! */ if (have_submounts(dentry)) goto out_valid; + if (dentry->d_flags & DCACHE_DISCONNECTED) + goto out_valid; shrink_dcache_parent(dentry); } d_drop(dentry); @@ -1025,12 +1027,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ + case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: goto out; @@ -1050,7 +1052,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *dir; int openflags, ret = 0; - if (!is_atomic_open(nd)) + if (!is_atomic_open(nd) || d_mountpoint(dentry)) goto no_open; parent = dget_parent(dentry); dir = parent->d_inode; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 737128f777f3..50a56edca0b5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -623,10 +623,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c list_for_each_entry(pos, &nfsi->open_files, list) { if (cred != NULL && pos->cred != cred) continue; - if ((pos->mode & mode) == mode) { - ctx = get_nfs_open_context(pos); - break; - } + if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) + continue; + ctx = get_nfs_open_context(pos); + break; } spin_unlock(&inode->i_lock); return ctx; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fe0cd9eb1d4d..071fcedd517c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1523,6 +1523,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_post_op_update_inode(dir, o_res->dir_attr); } else nfs_refresh_inode(dir, o_res->dir_attr); + if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) + server->caps &= ~NFS_CAP_POSIX_LOCK; if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(data); if (status != 0) @@ -1664,7 +1666,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in status = PTR_ERR(state); if (IS_ERR(state)) goto err_opendata_put; - if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) + if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); @@ -5216,9 +5218,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) + if (IS_ERR(task)) { status = PTR_ERR(task); + goto out; + } rpc_put_task(task); + return 0; out: dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e01637240eeb..b4148fc00f9f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2187,6 +2187,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (data->version == 4) { error = nfs4_try_mount(flags, dev_name, data, mnt); kfree(data->client_address); + kfree(data->nfs_server.export_path); goto out; } #endif /* CONFIG_NFS_V4 */ @@ -2657,7 +2658,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt) devname = nfs_path(path->mnt->mnt_devname, path->mnt->mnt_root, path->dentry, page, PAGE_SIZE); - if (devname == NULL) + if (IS_ERR(devname)) goto out_freepage; tmp = kstrdup(devname, GFP_KERNEL); if (tmp == NULL) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53ff70e23993..3aea3ca98ab7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page) struct inode *inode = page->mapping->host; struct nfs_server *nfss = NFS_SERVER(inode); + page_cache_get(page); if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) { set_bdi_congested(&nfss->backing_dev_info, @@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page) struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); + page_cache_release(page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } @@ -421,6 +423,7 @@ static void nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); + __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -660,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, req = nfs_setup_write_request(ctx, page, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + nfs_mark_request_dirty(req); /* Update file length */ nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); nfs_clear_page_tag_locked(req); return 0; } @@ -739,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); - else - __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); @@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page, static void nfs_writepage_release(struct nfs_page *req) { + struct page *page = req->wb_page; - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { - nfs_end_page_writeback(req->wb_page); + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) nfs_inode_remove_request(req); - } else - nfs_end_page_writeback(req->wb_page); nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); } static int flush_task_priority(int how) @@ -779,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req, int how) { struct inode *inode = req->wb_context->path.dentry->d_inode; - int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { @@ -794,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = flags, + .flags = RPC_TASK_ASYNC, .priority = priority, }; + int ret = 0; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -835,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req, (unsigned long long)data->args.offset); task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } rpc_put_task(task); - return 0; +out: + return ret; } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -847,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req, */ static void nfs_redirty_request(struct nfs_page *req) { + struct page *page = req->wb_page; + nfs_mark_request_dirty(req); - nfs_end_page_writeback(req->wb_page); nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); } /* @@ -1084,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata) if (nfs_write_need_commit(data)) { memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); nfs_mark_request_commit(req); - nfs_end_page_writeback(page); dprintk(" marked for commit\n"); goto next; } dprintk(" OK\n"); remove_request: - nfs_end_page_writeback(page); nfs_inode_remove_request(req); next: nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); } nfs_writedata_release(calldata); } @@ -1190,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +{ + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) + return 1; + if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, nfs_wait_bit_killable, + TASK_KILLABLE)) + return 1; + return 0; +} + +static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +{ + clear_bit(NFS_INO_COMMIT, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); +} + + static void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1207,7 +1237,6 @@ static int nfs_commit_rpcsetup(struct list_head *head, { struct nfs_page *first = nfs_list_entry(head->next); struct inode *inode = first->wb_context->path.dentry->d_inode; - int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { @@ -1222,7 +1251,7 @@ static int nfs_commit_rpcsetup(struct list_head *head, .callback_ops = &nfs_commit_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = flags, + .flags = RPC_TASK_ASYNC, .priority = priority, }; @@ -1282,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) BDI_RECLAIMABLE); nfs_clear_page_tag_locked(req); } + nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1337,6 +1367,7 @@ static void nfs_commit_release(void *calldata) next: nfs_clear_page_tag_locked(req); } + nfs_commit_clear_lock(NFS_I(data->inode)); nfs_commitdata_release(calldata); } @@ -1351,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = { static int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); - int res; + int may_wait = how & FLUSH_SYNC; + int res = 0; + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); @@ -1360,7 +1394,13 @@ static int nfs_commit_inode(struct inode *inode, int how) int error = nfs_commit_list(inode, &head, how); if (error < 0) return error; - } + if (may_wait) + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + } else + nfs_commit_clear_lock(NFS_I(inode)); +out: return res; } @@ -1432,6 +1472,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) BUG_ON(!PageLocked(page)); for (;;) { + wait_on_page_writeback(page); req = nfs_page_find_request(page); if (req == NULL) break; @@ -1466,30 +1507,18 @@ int nfs_wb_page(struct inode *inode, struct page *page) .range_start = range_start, .range_end = range_end, }; - struct nfs_page *req; - int need_commit; int ret; while(PagePrivate(page)) { + wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; } - req = nfs_find_and_lock_request(page); - if (!req) - break; - if (IS_ERR(req)) { - ret = PTR_ERR(req); + ret = sync_inode(inode, &wbc); + if (ret < 0) goto out_error; - } - need_commit = test_bit(PG_CLEAN, &req->wb_flags); - nfs_clear_page_tag_locked(req); - if (need_commit) { - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret < 0) - goto out_error; - } } return 0; out_error: diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e1703175ee28..34ccf815ea8a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -161,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = p + (argp->pagelen>>2); + argp->end = argp->p + (argp->pagelen>>2); argp->pagelen = 0; } else { - argp->end = p + (PAGE_SIZE>>2); + argp->end = argp->p + (PAGE_SIZE>>2); argp->pagelen -= PAGE_SIZE; } memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); @@ -1426,10 +1426,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = p + (argp->pagelen>>2); + argp->end = argp->p + (argp->pagelen>>2); argp->pagelen = 0; } else { - argp->end = p + (PAGE_SIZE>>2); + argp->end = argp->p + (PAGE_SIZE>>2); argp->pagelen -= PAGE_SIZE; } } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 0cdbc5e7655a..48145f505a6a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -749,6 +749,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; sb->s_time_gran = 1; + sb->s_bdi = nilfs->ns_bdi; err = load_nilfs(nilfs, sbi); if (err) diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 3e56dbffe729..b3a159b21cfd 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -15,6 +15,7 @@ config INOTIFY config INOTIFY_USER bool "Inotify support for userspace" + select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 1afb0a10229f..e27960cd76ab 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -28,6 +28,7 @@ #include <linux/path.h> /* struct path */ #include <linux/slab.h> /* kmem_* */ #include <linux/types.h> +#include <linux/sched.h> #include "inotify.h" @@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group) idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_remove_all(&group->inotify_data.idr); idr_destroy(&group->inotify_data.idr); + free_uid(group->inotify_data.user); } void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 472cdf29ef82..e46ca685b9be 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -546,21 +546,24 @@ retry: if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) goto out_err; + /* we are putting the mark on the idr, take a reference */ + fsnotify_get_mark(&tmp_ientry->fsn_entry); + spin_lock(&group->inotify_data.idr_lock); ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, group->inotify_data.last_wd+1, &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { + /* we didn't get on the idr, drop the idr reference */ + fsnotify_put_mark(&tmp_ientry->fsn_entry); + /* idr was out of memory allocate and try again */ if (ret == -EAGAIN) goto retry; goto out_err; } - /* we put the mark on the idr, take a reference */ - fsnotify_get_mark(&tmp_ientry->fsn_entry); - /* we are on the idr, now get on the inode */ ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); if (ret) { @@ -578,16 +581,13 @@ retry: /* return the watch descriptor for this new entry */ ret = tmp_ientry->wd; - /* match the ref from fsnotify_init_markentry() */ - fsnotify_put_mark(&tmp_ientry->fsn_entry); - /* if this mark added a new event update the group mask */ if (mask & ~group->mask) fsnotify_recalc_group_mask(group); out_err: - if (ret < 0) - kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); + /* match the ref from fsnotify_init_markentry() */ + fsnotify_put_mark(&tmp_ientry->fsn_entry); return ret; } diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index ecebb2276790..f9d5d3ffc75a 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -406,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh) { int ret = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; mlog_entry_void(); @@ -425,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); submit_bh(WRITE, bh); wait_on_buffer(bh); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index a795eb91f4ea..12d5eb78a11a 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -184,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, BUG_ON(!lksb); /* only updates if this node masters the lockres */ + spin_lock(&res->spinlock); if (res->owner == dlm->node_num) { - - spin_lock(&res->spinlock); /* check the lksb flags for the direction */ if (lksb->flags & DLM_LKSB_GET_LVB) { mlog(0, "getting lvb from lockres for %s node\n", @@ -201,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * here. In the future we might want to clear it at the time * the put is actually done. */ - spin_unlock(&res->spinlock); } + spin_unlock(&res->spinlock); /* reset any lvb flags on the lksb */ lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 1b0de157a08c..b83d6107a1f5 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -112,20 +112,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); * O_RDONLY -> PRMODE level * O_WRONLY -> EXMODE level * - * O_NONBLOCK -> LKM_NOQUEUE + * O_NONBLOCK -> NOQUEUE */ static int dlmfs_decode_open_flags(int open_flags, int *level, int *flags) { if (open_flags & (O_WRONLY|O_RDWR)) - *level = LKM_EXMODE; + *level = DLM_LOCK_EX; else - *level = LKM_PRMODE; + *level = DLM_LOCK_PR; *flags = 0; if (open_flags & O_NONBLOCK) - *flags |= LKM_NOQUEUE; + *flags |= DLM_LKF_NOQUEUE; return 0; } @@ -166,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode, * to be able userspace to be able to distinguish a * valid lock request from one that simply couldn't be * granted. */ - if (flags & LKM_NOQUEUE && status == -EAGAIN) + if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN) status = -ETXTBSY; kfree(fp); goto bail; @@ -193,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode, status = 0; if (fp) { level = fp->fp_lock_level; - if (level != LKM_IVMODE) + if (level != DLM_LOCK_IV) user_dlm_cluster_unlock(&ip->ip_lockres, level); kfree(fp); @@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp, if ((count + *ppos) > i_size_read(inode)) readlen = i_size_read(inode) - *ppos; else - readlen = count - *ppos; + readlen = count; lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 17947dc8341e..a5fbd9cea968 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -684,6 +684,7 @@ restarted_transaction: if (why == RESTART_META) { mlog(0, "restarting function.\n"); restart_func = 1; + status = 0; } else { BUG_ON(why != RESTART_TRANS); @@ -1981,18 +1982,18 @@ relock: /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - if (direct_io) { - ret = generic_segment_checks(iov, &nr_segs, &ocount, - VERIFY_READ); - if (ret) - goto out_dio; + ret = generic_segment_checks(iov, &nr_segs, &ocount, + VERIFY_READ); + if (ret) + goto out_dio; - count = ocount; - ret = generic_write_checks(file, ppos, &count, - S_ISBLK(inode->i_mode)); - if (ret) - goto out_dio; + count = ocount; + ret = generic_write_checks(file, ppos, &count, + S_ISBLK(inode->i_mode)); + if (ret) + goto out_dio; + if (direct_io) { written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { @@ -2007,7 +2008,10 @@ relock: goto out_dio; } } else { - written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); + current->backing_dev_info = file->f_mapping->backing_dev_info; + written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, + ppos, count, 0); + current->backing_dev_info = NULL; } out_dio: @@ -2021,9 +2025,9 @@ out_dio: if (ret < 0) written = ret; - if (!ret && (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters || - has_refcount)) { + if (!ret && ((old_size != i_size_read(inode)) || + (old_clusters != OCFS2_I(inode)->ip_clusters) || + has_refcount)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 07cc8bb68b6d..af189887201c 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -558,6 +558,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); + handle = NULL; mlog_errno(status); goto out; } @@ -639,11 +640,13 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_unlock; } - status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); - if (status < 0) { - mlog_errno(status); - goto bail_commit; + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } } /* set the inodes dtime */ @@ -722,38 +725,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, static int ocfs2_wipe_inode(struct inode *inode, struct buffer_head *di_bh) { - int status, orphaned_slot; + int status, orphaned_slot = -1; struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *di; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - di = (struct ocfs2_dinode *) di_bh->b_data; - orphaned_slot = le16_to_cpu(di->i_orphaned_slot); + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); - status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); - if (status) - return status; + status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); + if (status) + return status; - orphan_dir_inode = ocfs2_get_system_file_inode(osb, - ORPHAN_DIR_SYSTEM_INODE, - orphaned_slot); - if (!orphan_dir_inode) { - status = -EEXIST; - mlog_errno(status); - goto bail; - } + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } - /* Lock the orphan dir. The lock will be held for the entire - * delete_inode operation. We do this now to avoid races with - * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); - if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); - mlog_errno(status); - goto bail; + mlog_errno(status); + goto bail; + } } /* we do this while holding the orphan dir lock because we @@ -794,6 +798,9 @@ static int ocfs2_wipe_inode(struct inode *inode, mlog_errno(status); bail_unlock_dir: + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) + return status; + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); brelse(orphan_dir_bh); @@ -889,7 +896,8 @@ static int ocfs2_query_inode_wipe(struct inode *inode, /* Do some basic inode verification... */ di = (struct ocfs2_dinode *) di_bh->b_data; - if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && + !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { /* * Inodes in the orphan dir must have ORPHANED_FL. The only * inodes that come back out of the orphan dir are reflink diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ba4fe07b293c..0b28e1921a39 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -100,6 +100,8 @@ struct ocfs2_inode_info #define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 /* Does someone have the file open O_DIRECT */ #define OCFS2_INODE_OPEN_DIRECT 0x00000040 +/* Tell the inode wipe code it's not in orphan dir */ +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b1eb50ae4097..4cbb18f26c5f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -408,23 +408,28 @@ static int ocfs2_mknod(struct inode *dir, } } - status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_fe_bh, - &lookup); - if (status < 0) { + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { mlog_errno(status); goto leave; } + dentry->d_op = &ocfs2_dentry_ops; - status = ocfs2_dentry_attach_lock(dentry, inode, - OCFS2_I(dir)->ip_blkno); - if (status) { + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + &lookup); + if (status < 0) { mlog_errno(status); goto leave; } insert_inode_hash(inode); - dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); status = 0; leave: @@ -445,11 +450,6 @@ leave: ocfs2_free_dir_lookup_result(&lookup); - if ((status < 0) && inode) { - clear_nlink(inode); - iput(inode); - } - if (inode_ac) ocfs2_free_alloc_context(inode_ac); @@ -459,6 +459,17 @@ leave: if (meta_ac) ocfs2_free_alloc_context(meta_ac); + /* + * We should call iput after the i_mutex of the bitmap been + * unlocked in ocfs2_free_alloc_context, or the + * ocfs2_delete_inode will mutex_lock again. + */ + if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + mlog_exit(status); return status; @@ -1771,22 +1782,27 @@ static int ocfs2_symlink(struct inode *dir, } } - status = ocfs2_add_entry(handle, dentry, inode, - le64_to_cpu(fe->i_blkno), parent_fe_bh, - &lookup); - if (status < 0) { + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (status) { mlog_errno(status); goto bail; } + dentry->d_op = &ocfs2_dentry_ops; - status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); - if (status) { + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + &lookup); + if (status < 0) { mlog_errno(status); goto bail; } insert_inode_hash(inode); - dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); bail: if (status < 0 && did_quota) @@ -1811,6 +1827,7 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); } @@ -1976,6 +1993,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, } le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; /* Record which orphan dir our inode now resides * in. delete_inode will use this to determine which orphan diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index bd96f6c7877e..5cbcd0f008fc 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4083,6 +4083,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode, di->i_attr = s_di->i_attr; if (preserve) { + t_inode->i_uid = s_inode->i_uid; + t_inode->i_gid = s_inode->i_gid; + t_inode->i_mode = s_inode->i_mode; di->i_uid = s_di->i_uid; di->i_gid = s_di->i_gid; di->i_mode = s_di->i_mode; diff --git a/fs/proc/array.c b/fs/proc/array.c index e51f2ec2c5e5..885ab5513ac5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,7 +81,6 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/tracehook.h> -#include <linux/swapops.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -495,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted && mm) ? task->stack_start : 0, + (permitted && mm) ? mm->start_stack : 0, esp, eip, /* The signal information here is obsolete. diff --git a/fs/proc/base.c b/fs/proc/base.c index 7621db800a74..8418fcc0a6ab 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2909,7 +2909,7 @@ out_no_task: */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), - DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 070553427dd5..47f5b145f56e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -247,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) } else if (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack) { name = "[stack]"; - } else { - unsigned long stack_start; - struct proc_maps_private *pmp; - - pmp = m->private; - stack_start = pmp->task->stack_start; - - if (vma->vm_start <= stack_start && - vma->vm_end >= stack_start) { - pad_len_spaces(m, len); - seq_printf(m, - "[threadstack:%08lx]", -#ifdef CONFIG_STACK_GROWSUP - vma->vm_end - stack_start -#else - stack_start - vma->vm_start -#endif - ); - } } } else { name = "[vdso]"; diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index dad7fb247ddc..3e21b1e2ad3a 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING Note that this behavior is currently deprecated and may go away in future. Please use notification via netlink socket instead. +config QUOTA_DEBUG + bool "Additional quota sanity checks" + depends on QUOTA + default n + help + If you say Y here, quota subsystem will perform some additional + sanity checks of quota internal structures. If unsure, say N. + # Generic support for tree structured quota files. Selected when needed. config QUOTA_TREE tristate diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a0a9405b202a..788b5802a7ce 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -80,8 +80,6 @@ #include <asm/uaccess.h> -#define __DQUOT_PARANOIA - /* * There are three quota SMP locks. dq_list_lock protects all lists with quotas * and quota formats, dqstats structure containing statistics about the lists @@ -695,7 +693,7 @@ void dqput(struct dquot *dquot) if (!dquot) return; -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { printk("VFS: dqput: trying to free free dquot\n"); printk("VFS: device %s, dquot of %s %d\n", @@ -748,7 +746,7 @@ we_slept: goto we_slept; } atomic_dec(&dquot->dq_count); -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG /* sanity check */ BUG_ON(!list_empty(&dquot->dq_free)); #endif @@ -845,7 +843,7 @@ we_slept: dquot = NULL; goto out; } -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ #endif out: @@ -874,7 +872,7 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif @@ -882,7 +880,7 @@ static void add_dquot_ref(struct super_block *sb, int type) list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif @@ -907,7 +905,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened before quota" " was turned on thus quota information is probably " @@ -940,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type, inode->i_dquot[type] = NULL; if (dquot) { if (dqput_blocks(dquot)) { -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (atomic_read(&dquot->dq_count) != 1) printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); #endif diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index f8a6075abf50..07930449a958 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -46,8 +46,6 @@ static inline bool is_privroot_deh(struct dentry *dir, struct reiserfs_de_head *deh) { struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - if (reiserfs_expose_privroot(dir->d_sb)) - return 0; return (dir == dir->d_parent && privroot->d_inode && deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4f9586bb7631..e7cc00e636dc 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -554,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (!err && new_size < i_size_read(dentry->d_inode)) { struct iattr newattrs = { .ia_ctime = current_fs_time(inode->i_sb), - .ia_size = buffer_size, + .ia_size = new_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; @@ -973,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask, NULL); } -/* This will catch lookups from the fs root to .reiserfs_priv */ -static int -xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) +static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; - if (container_of(q1, struct dentry, d_name) == priv_root) - return -ENOENT; - if (q1->len == name->len && - !memcmp(q1->name, name->name, name->len)) - return 0; - return 1; + return -EPERM; } static const struct dentry_operations xattr_lookup_poison_ops = { - .d_compare = xattr_lookup_poison, + .d_revalidate = xattr_hide_revalidate, }; int reiserfs_lookup_privroot(struct super_block *s) @@ -1001,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s) strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; - if (!reiserfs_expose_privroot(s)) - s->s_root->d_op = &xattr_lookup_poison_ops; + dentry->d_op = &xattr_lookup_poison_ops; if (dentry->d_inode) dentry->d_inode->i_flags |= S_PRIVATE; } else diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1c4c8f089970..dfa1d67f8fca 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb) if (server->conn_pid) kill_pid(server->conn_pid, SIGTERM, 1); + bdi_destroy(&server->bdi); kfree(server->ops); smb_unload_nls(server); sb->s_fs_info = NULL; @@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) if (!server) goto out_no_server; sb->s_fs_info = server; + + if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY)) + goto out_bdi; + + sb->s_bdi = &server->bdi; server->super_block = sb; server->mnt = NULL; @@ -624,6 +630,8 @@ out_no_smbiod: out_bad_option: kfree(mem); out_no_mem: + bdi_destroy(&server->bdi); +out_bdi: if (!server->mnt) printk(KERN_ERR "smb_fill_super: allocation failure\n"); sb->s_fs_info = NULL; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 1cb0d81b164b..653c030eb840 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, u64 cur_index = index >> msblk->devblksize_log2; int bytes, compressed, b = 0, k = 0, page = 0, avail; - - bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1, - sizeof(*bh), GFP_KERNEL); + bh = kcalloc(((srclength + msblk->devblksize - 1) + >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); if (bh == NULL) return -ENOMEM; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 3550aec2f655..48b6f4a385a6 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -275,7 +275,8 @@ allocate_root: err = squashfs_read_inode(root, root_inode); if (err) { - iget_failed(root); + make_bad_inode(root); + iput(root); goto failed_mount; } insert_inode_hash(root); @@ -353,6 +354,7 @@ static void squashfs_put_super(struct super_block *sb) kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); + kfree(sbi->inode_lookup_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 15a03d0fb9f3..7a603874e483 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -128,8 +128,9 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto release_mutex; } + length = stream->total_out; mutex_unlock(&msblk->read_data_mutex); - return stream->total_out; + return length; release_mutex: mutex_unlock(&msblk->read_data_mutex); diff --git a/fs/super.c b/fs/super.c index f35ac6022109..1527e6a0ee35 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,6 +37,7 @@ #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/backing-dev.h> #include <asm/uaccess.h> #include "internal.h" @@ -693,6 +694,7 @@ int set_anon_super(struct super_block *s, void *data) return -EMFILE; } s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_bdi = &noop_backing_dev_info; return 0; } @@ -954,10 +956,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (error < 0) goto out_free_secdata; BUG_ON(!mnt->mnt_sb); + WARN_ON(!mnt->mnt_sb->s_bdi); - error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); - if (error) - goto out_sb; + error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); + if (error) + goto out_sb; /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE diff --git a/fs/sync.c b/fs/sync.c index fc5c3d75cf3c..92b228176f7c 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> +#include <linux/backing-dev.h> #include "internal.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ @@ -32,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) * This should be safe, as we require bdi backing to actually * write out data in the first place */ - if (!sb->s_bdi) + if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) return 0; if (sb->s_qcop && sb->s_qcop->quota_sync) diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 4e50286a4cc3..1dabed286b4c 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } + dir_put_page(page); } - dir_put_page(page); if (++n >= npages) n = 0; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 52e06b487ced..29f1edca76de 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1209,6 +1209,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); + xfs_inode_shrinker_unregister(mp); xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); xfs_dmops_put(mp); @@ -1622,6 +1623,8 @@ xfs_fs_fill_super( if (error) goto fail_vnrele; + xfs_inode_shrinker_register(mp); + kfree(mtpt); return 0; @@ -1867,6 +1870,7 @@ init_xfs_fs(void) goto out_cleanup_procfs; vfs_initquota(); + xfs_inode_shrinker_init(); error = register_filesystem(&xfs_fs_type); if (error) @@ -1894,6 +1898,7 @@ exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); + xfs_inode_shrinker_destroy(); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 05cd85317f6f..a427c638d909 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -95,7 +95,8 @@ xfs_inode_ag_walk( struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int *nr_to_scan) { uint32_t first_index; int last_error = 0; @@ -134,7 +135,7 @@ restart: if (error == EFSCORRUPTED) break; - } while (1); + } while ((*nr_to_scan)--); if (skipped) { delay(1); @@ -150,12 +151,15 @@ xfs_inode_ag_iterator( struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int *nr_to_scan) { int error = 0; int last_error = 0; xfs_agnumber_t ag; + int nr; + nr = nr_to_scan ? *nr_to_scan : INT_MAX; for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { struct xfs_perag *pag; @@ -165,14 +169,18 @@ xfs_inode_ag_iterator( continue; } error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, - exclusive); + exclusive, &nr); xfs_perag_put(pag); if (error) { last_error = error; if (error == EFSCORRUPTED) break; } + if (nr <= 0) + break; } + if (nr_to_scan) + *nr_to_scan = nr; return XFS_ERROR(last_error); } @@ -291,7 +299,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, NULL); if (error) return XFS_ERROR(error); @@ -310,7 +318,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, NULL); } STATIC int @@ -673,6 +681,7 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + pag->pag_ici_reclaimable++; } /* @@ -705,6 +714,7 @@ __xfs_inode_clear_reclaim_tag( { radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + pag->pag_ici_reclaimable--; } /* @@ -820,10 +830,10 @@ xfs_reclaim_inode( * call into reclaim to find it in a clean state instead of waiting for * it now. We also don't return errors here - if the error is transient * then the next reclaim pass will flush the inode, and if the error - * is permanent then the next sync reclaim will relcaim the inode and + * is permanent then the next sync reclaim will reclaim the inode and * pass on the error. */ - if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_fs_cmn_err(CE_WARN, ip->i_mount, "inode 0x%llx background reclaim flush failed with %d", (long long)ip->i_ino, error); @@ -854,5 +864,93 @@ xfs_reclaim_inodes( int mode) { return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, - XFS_ICI_RECLAIM_TAG, 1); + XFS_ICI_RECLAIM_TAG, 1, NULL); +} + +/* + * Shrinker infrastructure. + * + * This is all far more complex than it needs to be. It adds a global list of + * mounts because the shrinkers can only call a global context. We need to make + * the shrinkers pass a context to avoid the need for global state. + */ +static LIST_HEAD(xfs_mount_list); +static struct rw_semaphore xfs_mount_list_lock; + +static int +xfs_reclaim_inode_shrink( + int nr_to_scan, + gfp_t gfp_mask) +{ + struct xfs_mount *mp; + struct xfs_perag *pag; + xfs_agnumber_t ag; + int reclaimable = 0; + + if (nr_to_scan) { + if (!(gfp_mask & __GFP_FS)) + return -1; + + down_read(&xfs_mount_list_lock); + list_for_each_entry(mp, &xfs_mount_list, m_mplist) { + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); + if (nr_to_scan <= 0) + break; + } + up_read(&xfs_mount_list_lock); + } + + down_read(&xfs_mount_list_lock); + list_for_each_entry(mp, &xfs_mount_list, m_mplist) { + for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { + + pag = xfs_perag_get(mp, ag); + if (!pag->pag_ici_init) { + xfs_perag_put(pag); + continue; + } + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + } + up_read(&xfs_mount_list_lock); + return reclaimable; +} + +static struct shrinker xfs_inode_shrinker = { + .shrink = xfs_reclaim_inode_shrink, + .seeks = DEFAULT_SEEKS, +}; + +void __init +xfs_inode_shrinker_init(void) +{ + init_rwsem(&xfs_mount_list_lock); + register_shrinker(&xfs_inode_shrinker); +} + +void +xfs_inode_shrinker_destroy(void) +{ + ASSERT(list_empty(&xfs_mount_list)); + unregister_shrinker(&xfs_inode_shrinker); +} + +void +xfs_inode_shrinker_register( + struct xfs_mount *mp) +{ + down_write(&xfs_mount_list_lock); + list_add_tail(&mp->m_mplist, &xfs_mount_list); + up_write(&xfs_mount_list_lock); +} + +void +xfs_inode_shrinker_unregister( + struct xfs_mount *mp) +{ + down_write(&xfs_mount_list_lock); + list_del(&mp->m_mplist); + up_write(&xfs_mount_list_lock); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index d480c346cabb..cdcbaaca9880 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag, int write_lock); + int flags, int tag, int write_lock, int *nr_to_scan); + +void xfs_inode_shrinker_init(void); +void xfs_inode_shrinker_destroy(void); +void xfs_inode_shrinker_register(struct xfs_mount *mp); +void xfs_inode_shrinker_unregister(struct xfs_mount *mp); #endif diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 5d0ee8d492db..50bee07d6b0e 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -891,7 +891,8 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, + XFS_ICI_NO_TAG, 0, NULL); } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index b1a5a1ff88ea..abb8222b88c9 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -223,6 +223,7 @@ typedef struct xfs_perag { int pag_ici_init; /* incore inode cache initialised */ rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index cd27c9d6c71f..5bba29a07812 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -177,16 +177,26 @@ xfs_swap_extents_check_format( XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) return EINVAL; - /* Check root block of temp in btree form to max in target */ + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && - XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + ((XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) return EINVAL; - /* Check root block of target in btree form to max in temp */ + /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + ((XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) return EINVAL; return 0; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e8fba92d7cd9..2be019136287 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -745,9 +745,16 @@ xfs_log_move_tail(xfs_mount_t *mp, /* * Determine if we have a transaction that has gone to disk - * that needs to be covered. Log activity needs to be idle (no AIL and - * nothing in the iclogs). And, we need to be in the right state indicating - * something has gone out. + * that needs to be covered. To begin the transition to the idle state + * firstly the log needs to be idle (no AIL and nothing in the iclogs). + * If we are then in a state where covering is needed, the caller is informed + * that dummy transactions are required to move the log into the idle state. + * + * Because this is called as part of the sync process, we should also indicate + * that dummy transactions should be issued in anything but the covered or + * idle states. This ensures that the log tail is accurately reflected in + * the log at the end of the sync, hence if a crash occurrs avoids replay + * of transactions where the metadata is already on disk. */ int xfs_log_need_covered(xfs_mount_t *mp) @@ -759,17 +766,24 @@ xfs_log_need_covered(xfs_mount_t *mp) return 0; spin_lock(&log->l_icloglock); - if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || - (log->l_covered_state == XLOG_STATE_COVER_NEED2)) - && !xfs_trans_ail_tail(log->l_ailp) - && xlog_iclogs_empty(log)) { - if (log->l_covered_state == XLOG_STATE_COVER_NEED) - log->l_covered_state = XLOG_STATE_COVER_DONE; - else { - ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); - log->l_covered_state = XLOG_STATE_COVER_DONE2; + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (!xfs_trans_ail_tail(log->l_ailp) && + xlog_iclogs_empty(log)) { + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; } + /* FALLTHRU */ + default: needed = 1; + break; } spin_unlock(&log->l_icloglock); return needed; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4fa0bc7b983e..9ff48a16a7ee 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -259,6 +259,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ + struct list_head m_mplist; /* inode shrinker mount list */ } xfs_mount_t; /* |