diff options
Diffstat (limited to 'fs')
65 files changed, 933 insertions, 403 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index d4bf8caad8d0..2126078a38ed 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -135,8 +135,8 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ - (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN + depends on X86 || IA64 || PPC_BOOK3S_64 || SPARC64 || (S390 && 64BIT) || \ + SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -325,8 +325,16 @@ static void bio_fs_destructor(struct bio *bio) * @gfp_mask: allocation mask to use * @nr_iovecs: number of iovecs * - * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask - * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * bio_alloc will allocate a bio and associated bio_vec array that can hold + * at least @nr_iovecs entries. Allocations will be done from the + * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc. + * + * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate + * a bio. This is due to the mempool guarantees. To make this work, callers + * must never allocate more than 1 bio at a time from this pool. Callers + * that need to allocate more than 1 bio must always submit the previously + * allocated bio for IO before attempting to allocate a new one. Failure to + * do so can cause livelocks under memory pressure. * * RETURNS: * Pointer to new bio on success, NULL on failure. @@ -350,21 +358,13 @@ static void bio_kmalloc_destructor(struct bio *bio) } /** - * bio_alloc - allocate a bio for I/O + * bio_kmalloc - allocate a bio for I/O using kmalloc() * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate * * Description: - * bio_alloc will allocate a bio and associated bio_vec array that can hold - * at least @nr_iovecs entries. Allocations will be done from the - * fs_bio_set. Also see @bio_alloc_bioset. - * - * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate - * a bio. This is due to the mempool guarantees. To make this work, callers - * must never allocate more than 1 bio at a time from this pool. Callers - * that need to allocate more than 1 bio must always submit the previously - * allocated bio for IO before attempting to allocate a new one. Failure to - * do so can cause livelocks under memory pressure. + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains + * %__GFP_WAIT, the allocation is guaranteed to succeed. * **/ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) @@ -407,7 +407,7 @@ EXPORT_SYMBOL(zero_fill_bio); * * Description: * Put a reference to a &struct bio, either one you have gotten with - * bio_alloc or bio_get. The last put of a bio will free it. + * bio_alloc, bio_get or bio_clone. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { diff --git a/fs/block_dev.c b/fs/block_dev.c index dde91e7e1c3a..73d6a735b8f3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1258,8 +1258,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } } else { - put_disk(disk); module_put(disk->fops->owner); + put_disk(disk); disk = NULL; if (bdev->bd_contains == bdev) { if (bdev->bd_disk->fops->open) { diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 69b355ae7f49..361604244271 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_BTRFS_POSIX_ACL */ +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_BTRFS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 282ca085c2fb..c0861e781cdb 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -64,6 +64,51 @@ struct btrfs_worker_thread { }; /* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* * helper function to move a thread onto the idle list after it * has finished some requests. */ @@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) goto out; workers->atomic_start_pending = 0; - if (workers->num_workers >= workers->max_workers) + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) goto out; + workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - btrfs_start_workers(workers, 1); + start_new_worker(workers); return; out: @@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers) /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) { workers->num_workers = 0; + workers->num_workers_starting = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); @@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) workers->name = name; workers->ordered = 0; workers->atomic_start_pending = 0; - workers->atomic_worker_start = 0; + workers->atomic_worker_start = async_helper; } /* * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) { struct btrfs_worker_thread *worker; int ret = 0; @@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); } return 0; @@ -452,6 +504,14 @@ fail: return ret; } +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + /* * run through the list and find a worker thread that doesn't have a lot * to do right now. This can return null if we aren't yet at the thread @@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - int enforce_min = workers->num_workers < workers->max_workers; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; /* * if we find an idle thread, don't move it to the end of the @@ -509,15 +572,17 @@ again: worker = next_worker(workers); if (!worker) { - if (workers->num_workers >= workers->max_workers) { + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { goto fallback; } else if (workers->atomic_worker_start) { workers->atomic_start_pending = 1; goto fallback; } else { + workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - btrfs_start_workers(workers, 1); + __btrfs_start_workers(workers, 1); goto again; } } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fc089b95ec14..5077746cf85e 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ struct btrfs_workers { /* current number of running workers */ int num_workers; + int num_workers_starting; + /* max number of workers allowed. changed by btrfs_start_workers */ int max_workers; @@ -78,9 +80,10 @@ struct btrfs_workers { /* * are we allowed to sleep while starting workers or are we required - * to start them at a later time? + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. */ - int atomic_worker_start; + struct btrfs_workers *atomic_worker_start; /* list with all the work threads. The workers on the idle thread * may be actively servicing jobs, but they haven't yet hit the @@ -109,7 +112,8 @@ struct btrfs_workers { int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); int btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a54d354cefcb..f6783a42f010 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,6 +86,12 @@ struct btrfs_inode { * transid of the trans_handle that last modified this inode */ u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + /* * transid that last logged this inode */ @@ -128,12 +134,14 @@ struct btrfs_inode { u64 last_unlink_trans; /* - * These two counters are for delalloc metadata reservations. We keep - * track of how many extents we've accounted for vs how many extents we - * have. + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. */ - int delalloc_reserved_extents; - int delalloc_extents; + spinlock_t accounting_lock; + int reserved_extents; + int outstanding_extents; /* * ordered_data_close is set by truncate when a file that used diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dd8ced9814c4..444b3e9b92a4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -691,14 +691,17 @@ struct btrfs_space_info { struct list_head list; + /* for controlling how we free up space for allocations */ + wait_queue_head_t allocate_wait; + wait_queue_head_t flush_wait; + int allocating_chunk; + int flushing; + /* for block groups in our same type */ struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; - - int allocating_chunk; - wait_queue_head_t wait; }; /* @@ -907,6 +910,7 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ + struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; struct btrfs_workers endio_workers; @@ -914,6 +918,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; + struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -1004,7 +1009,10 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; unsigned long log_transid; + unsigned long last_log_commit; unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -1145,6 +1153,7 @@ struct btrfs_root { #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2323,7 +2332,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); -extern struct dentry_operations btrfs_dentry_operations; +extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2366,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index af0435f79fa6..02b6afbd7450 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); root->log_batch = 0; root->log_transid = 0; + root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1087,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->last_log_commit = 0; return 0; } @@ -1746,21 +1748,25 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -EINVAL; goto fail_iput; } -printk("thread pool is %d\n", fs_info->thread_pool_size); - /* - * we need to start all the end_io workers up front because the - * queue work function gets called at interrupt time, and so it - * cannot dynamically grow. - */ + + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size)); + fs_info->thread_pool_size), + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->enospc_workers, "enospc", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1774,15 +1780,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size); + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1794,12 +1805,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; - fs_info->endio_workers.atomic_worker_start = 1; - fs_info->endio_meta_workers.atomic_worker_start = 1; - fs_info->endio_write_workers.atomic_worker_start = 1; - fs_info->endio_meta_write_workers.atomic_worker_start = 1; - btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->delalloc_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); @@ -1807,6 +1814,7 @@ printk("thread pool is %d\n", fs_info->thread_pool_size); btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -2012,6 +2020,7 @@ fail_chunk_root: free_extent_buffer(chunk_root->node); free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2020,6 +2029,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2437,6 +2447,7 @@ int close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2445,6 +2456,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 359a754c782c..e238a0cdac67 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1568,23 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, DISCARD_FL_BARRIER); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); @@ -1604,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2824,14 +2821,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, num_items); spin_lock(&meta_sinfo->lock); - if (BTRFS_I(inode)->delalloc_reserved_extents <= - BTRFS_I(inode)->delalloc_extents) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); spin_unlock(&meta_sinfo->lock); return 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->delalloc_reserved_extents--; - BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0); + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); if (meta_sinfo->bytes_delalloc < num_bytes) { bug = true; @@ -2864,6 +2864,107 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) meta_sinfo->force_delalloc = 0; } +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; + +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; + + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct async_flush *async; + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_on_flush(info); + return; + } + + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + static int maybe_allocate_chunk(struct btrfs_root *root, struct btrfs_space_info *info) { @@ -2894,7 +2995,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root, if (!info->allocating_chunk) { info->force_alloc = 1; info->allocating_chunk = 1; - init_waitqueue_head(&info->wait); + init_waitqueue_head(&info->allocate_wait); } else { wait = true; } @@ -2902,7 +3003,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root, spin_unlock(&info->lock); if (wait) { - wait_event(info->wait, + wait_event(info->allocate_wait, !info->allocating_chunk); return 1; } @@ -2923,7 +3024,7 @@ out: spin_lock(&info->lock); info->allocating_chunk = 0; spin_unlock(&info->lock); - wake_up(&info->wait); + wake_up(&info->allocate_wait); if (ret) return 0; @@ -2981,21 +3082,20 @@ again: filemap_flush(inode->i_mapping); goto again; } else if (flushed == 3) { - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + flush_delalloc(root, meta_sinfo); goto again; } spin_lock(&meta_sinfo->lock); meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->delalloc_extents, - BTRFS_I(inode)->delalloc_reserved_extents); + BTRFS_I(inode)->outstanding_extents, + BTRFS_I(inode)->reserved_extents); dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } - BTRFS_I(inode)->delalloc_reserved_extents++; + BTRFS_I(inode)->reserved_extents++; check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); @@ -3094,8 +3194,7 @@ again: } if (retries == 2) { - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + flush_delalloc(root, meta_sinfo); goto again; } spin_lock(&meta_sinfo->lock); @@ -3588,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, if (is_data) goto pinit; + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (!buf) goto pinit; @@ -4029,6 +4136,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int loop = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; + bool failed_alloc = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -4233,14 +4341,23 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset && (cached || (!cached && - loop == LOOP_CACHING_NOWAIT))) { - goto loop; - } else if (!offset && (!cached && - loop > LOOP_CACHING_NOWAIT)) { + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { wait_block_group_cache_progress(block_group, - num_bytes + empty_size); + num_bytes + empty_size); + failed_alloc = true; goto have_block_group; + } else if (!offset) { + goto loop; } checks: search_start = stripe_align(root, offset); @@ -4288,6 +4405,7 @@ checks: break; loop: failed_cluster_refill = false; + failed_alloc = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); @@ -4799,6 +4917,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 refs; + u64 flags; u64 last = 0; u32 nritems; u32 blocksize; @@ -4836,15 +4955,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, generation <= root->root_key.offset) continue; + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + if (wc->stage == DROP_REFERENCE) { - ret = btrfs_lookup_extent_info(trans, root, - bytenr, blocksize, - &refs, NULL); - BUG_ON(ret); - BUG_ON(refs == 0); if (refs == 1) goto reada; + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; if (!wc->update_ref || generation <= root->root_key.offset) continue; @@ -4853,6 +4976,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, &wc->update_progress); if (ret < 0) continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; } reada: ret = readahead_tree_block(root, bytenr, blocksize, @@ -4876,7 +5003,7 @@ reada: static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int lookup_info) { int level = wc->level; struct extent_buffer *eb = path->nodes[level]; @@ -4891,8 +5018,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -4953,7 +5081,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int *lookup_info) { u64 bytenr; u64 generation; @@ -4973,8 +5101,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, * for the subtree */ if (wc->stage == UPDATE_BACKREF && - generation <= root->root_key.offset) + generation <= root->root_key.offset) { + *lookup_info = 1; return 1; + } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = btrfs_level_size(root, level - 1); @@ -4987,14 +5117,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - if (wc->stage == DROP_REFERENCE) { - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &wc->refs[level - 1], - &wc->flags[level - 1]); - BUG_ON(ret); - BUG_ON(wc->refs[level - 1] == 0); + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + if (!wc->update_ref || generation <= root->root_key.offset) goto skip; @@ -5008,12 +5143,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, wc->stage = UPDATE_BACKREF; wc->shared_level = level - 1; } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; } if (!btrfs_buffer_uptodate(next, generation)) { btrfs_tree_unlock(next); free_extent_buffer(next); next = NULL; + *lookup_info = 1; } if (!next) { @@ -5036,21 +5176,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, skip: wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { - parent = path->nodes[level]->start; - } else { - BUG_ON(root->root_key.objectid != - btrfs_header_owner(path->nodes[level])); - parent = 0; + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); } - - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); - BUG_ON(ret); - btrfs_tree_unlock(next); free_extent_buffer(next); + *lookup_info = 1; return 1; } @@ -5164,6 +5305,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { int level = wc->level; + int lookup_info = 1; int ret; while (level >= 0) { @@ -5171,14 +5313,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, btrfs_header_nritems(path->nodes[level])) break; - ret = walk_down_proc(trans, root, path, wc); + ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; if (level == 0) break; - ret = do_walk_down(trans, root, path, wc); + ret = do_walk_down(trans, root, path, wc, &lookup_info); if (ret > 0) { path->slots[level]++; continue; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index de1793ba004a..96577e8bf9fd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -460,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int bits, int wake, int delete) { - int ret = state->state & bits; + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int ret = state->state & bits_to_clear; if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -468,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - state->state &= ~bits; + state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); if (delete || state->state == 0) { @@ -956,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } @@ -1401,12 +1403,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_pages, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback, - int set_private2) + unsigned long op) { int ret; struct page *pages[16]; @@ -1416,17 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, int i; int clear_bits = 0; - if (clear_unlock) + if (op & EXTENT_CLEAR_UNLOCK) clear_bits |= EXTENT_LOCKED; - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_bits |= EXTENT_DIRTY; - if (clear_delalloc) + if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; + if (op & EXTENT_CLEAR_ACCOUNTING) + clear_bits |= EXTENT_DO_ACCOUNTING; + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || - set_private2)) + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | + EXTENT_SET_PRIVATE2))) return 0; while (nr_pages > 0) { @@ -1435,20 +1436,20 @@ int extent_clear_unlock_delalloc(struct inode *inode, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { - if (set_private2) + if (op & EXTENT_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (set_writeback) + if (op & EXTENT_SET_WRITEBACK) set_page_writeback(pages[i]); - if (end_writeback) + if (op & EXTENT_END_WRITEBACK) end_page_writeback(pages[i]); - if (unlock_pages) + if (op & EXTENT_CLEAR_UNLOCK_PAGE) unlock_page(pages[i]); page_cache_release(pages[i]); } @@ -2714,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree, lock_extent(tree, start, end, GFP_NOFS); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4794ec891fed..36de250a7b2b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -15,6 +15,7 @@ #define EXTENT_BUFFER_FILLED (1 << 8) #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) /* flags for bio submission */ @@ -25,6 +26,16 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -288,10 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_page, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback, - int set_private2); + unsigned long op); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f19e1259a971..06550affbd27 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -878,7 +878,8 @@ again: btrfs_put_ordered_extent(ordered); clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, GFP_NOFS); unlock_extent(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, GFP_NOFS); @@ -1085,8 +1086,10 @@ out_nolock: btrfs_end_transaction(trans, root); else btrfs_commit_transaction(trans, root); - } else { + } else if (ret != BTRFS_NO_LOG_SYNC) { btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); } } if (file->f_flags & O_DIRECT) { @@ -1136,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + /* * check the transaction that last modified this inode * and see if its already been committed @@ -1143,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (!BTRFS_I(inode)->last_trans) goto out; + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ mutex_lock(&root->fs_info->trans_mutex); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { @@ -1152,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->log_batch++; - filemap_fdatawrite(inode->i_mapping); - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; - - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1187,14 +1195,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) */ mutex_unlock(&dentry->d_inode->i_mutex); - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 112e5aa85892..dae12dc7e159 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -424,9 +424,12 @@ again: * and free up our temp pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 0, - 0, 1, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; } @@ -637,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode, * clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, 1, 1, 0, 1, 1, 0, 0); + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, @@ -712,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode, start, end, 0, NULL); if (ret == 0) { extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 1, - 1, 1, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -738,6 +750,8 @@ static noinline int cow_file_range(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { + unsigned long op; + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, @@ -789,10 +803,13 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, start + ram_size - 1, - locked_page, unlock, 1, - 1, 0, 0, 0, 1); + locked_page, op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -864,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | - EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); async_cow->inode = inode; @@ -1006,6 +1023,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; + extent_type = 0; goto out_check; } @@ -1112,8 +1130,10 @@ out_check: BUG_ON(ret); extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, 1, 1, 1, 0, 0, 0, 1); + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); cur_offset = extent_end; if (cur_offset > end) break; @@ -1178,15 +1198,17 @@ static int btrfs_split_extent_hook(struct inode *inode, root->fs_info->max_extent); /* - * if we break a large extent up then leave delalloc_extents be, - * since we've already accounted for the large extent. + * if we break a large extent up then leave oustanding_extents + * be, since we've already accounted for the large extent. */ if (div64_u64(new_size + root->fs_info->max_extent - 1, root->fs_info->max_extent) < num_extents) return 0; } - BTRFS_I(inode)->delalloc_extents++; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1217,7 +1239,9 @@ static int btrfs_merge_extent_hook(struct inode *inode, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= root->fs_info->max_extent) { - BTRFS_I(inode)->delalloc_extents--; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1231,7 +1255,9 @@ static int btrfs_merge_extent_hook(struct inode *inode, root->fs_info->max_extent) > num_extents) return 0; - BTRFS_I(inode)->delalloc_extents--; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); return 0; } @@ -1253,7 +1279,9 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - BTRFS_I(inode)->delalloc_extents++; + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; @@ -1281,8 +1309,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - BTRFS_I(inode)->delalloc_extents--; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + if (bits & EXTENT_DO_ACCOUNTING) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + } spin_lock(&root->fs_info->delalloc_lock); if (state->end - state->start + 1 > @@ -3000,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) + goto out; ret = -ENOMEM; again: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); goto out; + } page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -3038,6 +3080,10 @@ again: goto again; } + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { unlock_extent(io_tree, page_start, page_end, GFP_NOFS); @@ -3056,6 +3102,9 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + if (ret) + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); unlock_page(page); page_cache_release(page); out: @@ -3079,7 +3128,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + err = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (err) + return err; while (1) { struct btrfs_ordered_extent *ordered; @@ -3448,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->generation = 0; bi->sequence = 0; bi->last_trans = 0; + bi->last_sub_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; bi->reserved_bytes = 0; @@ -3598,12 +3650,14 @@ static int btrfs_dentry_delete(struct dentry *dentry) { struct btrfs_root *root; - if (!dentry->d_inode) - return 0; + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; - root = BTRFS_I(dentry->d_inode)->root; - if (btrfs_root_refs(&root->root_item) == 0) - return 1; + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } return 0; } @@ -4808,7 +4862,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + NULL, GFP_NOFS); /* * whoever cleared the private bit is responsible * for the finish_ordered_io @@ -4821,8 +4876,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) lock_extent(tree, page_start, page_end, GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, NULL, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -4917,7 +4972,8 @@ again: * prepare_pages in the normal write path. */ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS); + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); ret = btrfs_set_extent_delalloc(inode, page_start, page_end); if (ret) { @@ -4944,7 +5000,9 @@ again: set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: @@ -4969,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); @@ -5064,9 +5124,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->last_sub_trans = 0; ei->logged_trans = 0; - ei->delalloc_extents = 0; - ei->delalloc_reserved_extents = 0; + ei->outstanding_extents = 0; + ei->reserved_extents = 0; + spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->ordered_operations); @@ -5805,6 +5867,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .removexattr = btrfs_removexattr, }; -struct dentry_operations btrfs_dentry_operations = { +const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a780c8d0ac8..cdbb054102b9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -830,6 +830,7 @@ out_up_write: out_unlock: mutex_unlock(&inode->i_mutex); if (!err) { + shrink_dcache_sb(root->fs_info->sb); btrfs_invalidate_inodes(dest); d_delete(dentry); } @@ -1122,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datao += off - key.offset; datal -= off - key.offset; } - if (key.offset + datao + datal > off + len) - datal = off + len - key.offset - datao; + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + /* disko == 0 means it's a hole */ if (!disko) datao = 0; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 897fba835f89..5799bc46a309 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, + inode, 1); + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 361ad323faac..cfcc93c93a7b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) BUG_ON(!rc->block_group); btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); + fs_info->thread_pool_size, NULL); rc->extent_root = extent_root; btrfs_prepare_block_group_relocation(extent_root, rc->block_group); @@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) mapping_tree_init(&rc->reloc_root_tree); INIT_LIST_HEAD(&rc->reloc_roots); btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); + root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9de9b2236419..752a5463bf53 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -88,6 +89,7 @@ static match_table_t tokens = { {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, {Opt_err, NULL}, }; @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->metadata_ratio); } break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; default: break; } @@ -344,7 +349,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0b8f36d4400a..bca82a4ca8e6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -344,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * those extents are sent to disk but does not wait on them */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) { int ret; int err = 0; @@ -394,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, page_cache_release(page); } } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + while (1) { ret = find_first_extent_bit(dirty_pages, 0, &start, &end, EXTENT_DIRTY); @@ -424,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return werr; } +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages); + ret2 = btrfs_wait_marked_extents(root, dirty_pages); + return ret || ret2; +} + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 663c67404918..d4e3e7a6938c 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7827841b55cb..741666a7676a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + u64 log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { unsigned long batch = root->log_batch; - mutex_unlock(&root->log_mutex); - schedule_timeout_uninterruptible(1); - mutex_lock(&root->log_mutex); - + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } wait_for_writer(trans, root); if (batch == root->log_batch) break; @@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; + log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; + root->log_start_pid = 0; smp_mb(); /* * log tree has been flushed to disk, new modifications of @@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - write_ctree_super(trans, root->fs_info->tree_root, 2); + write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); @@ -2852,6 +2876,21 @@ out: return ret; } +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -2891,6 +2930,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + start_log_trans(trans, root); ret = btrfs_log_inode(trans, root, inode, inode_only); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index d09c7609e16b..0776eacb5083 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b0fc93f95fd0..b6dd5967c48a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 43003e0bef18..b09098079916 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1577,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info) out_err: if (tcp_ses) { - kfree(tcp_ses->hostname); + if (!IS_ERR(tcp_ses->hostname)) + kfree(tcp_ses->hostname); if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); kfree(tcp_ses); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 240cef14fe58..70736eb4b516 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + /* with sctp there's no connecting without sending */ + if (dlm_config.ci_protocol != 0) + return 0; + if (nodeid == dlm_our_nodeid()) return 0; @@ -455,9 +459,9 @@ static void process_sctp_notification(struct connection *con, int prim_len, ret; int addr_len; struct connection *new_con; - struct file *file; sctp_peeloff_arg_t parg; int parglen = sizeof(parg); + int err; /* * We get this before any data for an association. @@ -512,19 +516,22 @@ static void process_sctp_notification(struct connection *con, ret = kernel_getsockopt(con->sock, IPPROTO_SCTP, SCTP_SOCKOPT_PEELOFF, (void *)&parg, &parglen); - if (ret) { + if (ret < 0) { log_print("Can't peel off a socket for " - "connection %d to node %d: err=%d\n", + "connection %d to node %d: err=%d", parg.associd, nodeid, ret); + return; + } + new_con->sock = sockfd_lookup(parg.sd, &err); + if (!new_con->sock) { + log_print("sockfd_lookup error %d", err); + return; } - file = fget(parg.sd); - new_con->sock = SOCKET_I(file->f_dentry->d_inode); add_sock(new_con->sock, new_con); - fput(file); - put_unused_fd(parg.sd); + sockfd_put(new_con->sock); - log_print("got new/restarted association %d nodeid %d", - (int)sn->sn_assoc_change.sac_assoc_id, nodeid); + log_print("connecting to %d sctp association %d", + nodeid, (int)sn->sn_assoc_change.sac_assoc_id); /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); @@ -837,8 +844,6 @@ static void sctp_init_assoc(struct connection *con) if (con->retries++ > MAX_CONNECT_RETRIES) return; - log_print("Initiating association with node %d", con->nodeid); - if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) { log_print("no address for nodeid %d", con->nodeid); return; @@ -855,11 +860,14 @@ static void sctp_init_assoc(struct connection *con) outmessage.msg_flags = MSG_EOR; spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - BUG_ON((struct list_head *) e == &con->writequeue); + if (list_empty(&con->writequeue)) { + spin_unlock(&con->writequeue_lock); + log_print("writequeue empty for nodeid %d", con->nodeid); + return; + } + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; spin_unlock(&con->writequeue_lock); diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 8aadb99b7634..1cd6d9d3e29a 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -1,8 +1,9 @@ config ECRYPT_FS tristate "eCrypt filesystem layer support (EXPERIMENTAL)" - depends on EXPERIMENTAL && KEYS && NET + depends on EXPERIMENTAL && KEYS && CRYPTO select CRYPTO_ECB select CRYPTO_CBC + select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See <file:Documentation/filesystems/ecryptfs.txt> to learn more about diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 101fe4c7b1ee..c6ac85d6c701 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -35,6 +35,7 @@ #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> +#include <linux/ima.h> #include "ecryptfs_kernel.h" /** @@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) const struct cred *cred = current_cred(); struct ecryptfs_inode_info *inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + int opened_lower_file = 0; int rc = 0; mutex_lock(&inode_info->lower_file_mutex); @@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", lower_dentry, lower_mnt, rc); inode_info->lower_file = NULL; - } + } else + opened_lower_file = 1; } mutex_unlock(&inode_info->lower_file_mutex); + if (opened_lower_file) + ima_counts_get(inode_info->lower_file); return rc; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 72743d360509..7a520a862f49 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2321,7 +2321,18 @@ static int ext3_commit_super(struct super_block *sb, if (!sbh) return error; - es->s_wtime = cpu_to_le32(get_seconds()); + /* + * If the file system is mounted read-only, don't update the + * superblock write time. This avoids updating the superblock + * write time when we are mounting the root file system + * read/only but we need to replay the journal; at that point, + * for people who are east of GMT and who make their clock + * tick in localtime for Windows bug-for-bug compatibility, + * the clock is set in the future, and this will cause e2fsck + * to complain and force a full file system check. + */ + if (!(sb->s_flags & MS_RDONLY)) + es->s_wtime = cpu_to_le32(get_seconds()); es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); diff --git a/fs/file.c b/fs/file.c index f313314f996f..87e129030ab1 100644 --- a/fs/file.c +++ b/fs/file.c @@ -10,6 +10,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/time.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/file.h> diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 9b9d6395bad3..052f214ea6f0 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke } unlock_new_inode(tree->inode); + if (!HFS_I(tree->inode)->first_blocks) { + printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n"); + goto free_inode; + } + mapping = tree->inode->i_mapping; page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 175d08eacc86..bed78ac8f6d1 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb) if (hfsplus_get_last_session(sb, &part_start, &part_size)) return -EINVAL; + if ((u64)part_start + part_size > 0x100000000ULL) { + pr_err("hfs: volumes larger than 2TB are not supported yet\n"); + return -EINVAL; + } while (1) { bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); if (!bh) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 63976c0ccc25..99ea196f071f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp, 1, flags & NFS_MOUNT_NORESVPORT); if (error < 0) goto error; - memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); error = nfs_idmap_new(clp); if (error < 0) { diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32062c33c859..7cb298525eef 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1536,6 +1536,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) old_dentry->d_parent->d_name.name, old_dentry->d_name.name, dentry->d_parent->d_name.name, dentry->d_name.name); + nfs_inode_return_delegation(inode); + d_drop(dentry); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6c3210099d51..e1d415e97849 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -457,6 +457,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) }; struct rpc_task_setup task_setup_data = { .rpc_client = NFS_CLIENT(inode), + .rpc_message = &msg, .callback_ops = &nfs_write_direct_ops, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2636c26d56fa..fa3408f20112 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) - return mnt; + return ERR_CAST(mnt_path); mountdata->mnt_path = mnt_path; maxbuflen = mnt_path - 1 - page2; @@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (buf->len <= 0 || buf->len >= maxbuflen) continue; - mountdata->addr = (struct sockaddr *)&addr; - if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - mountdata->addrlen = nfs_parse_server_name(buf->data, - buf->len, - mountdata->addr, mountdata->addrlen); + + mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len, + (struct sockaddr *)&addr, sizeof(addr)); if (mountdata->addrlen == 0) continue; + + mountdata->addr = (struct sockaddr *)&addr; rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed7c269e2514..ff37454fa783 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -72,12 +72,17 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, /* Prevent leaks of NFSv4 errors into userland */ static int nfs4_map_errors(int err) { - if (err < -1000) { + if (err >= -1000) + return err; + switch (err) { + case -NFS4ERR_RESOURCE: + return -EREMOTEIO; + default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); - return -EIO; + break; } - return err; + return -EIO; } /* @@ -3060,9 +3065,6 @@ static void nfs4_renew_done(struct rpc_task *task, void *data) if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); - dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__, - task->tk_msg.rpc_cred); - put_rpccred(task->tk_msg.rpc_cred); } static const struct rpc_call_ops nfs4_renew_ops = { @@ -4877,7 +4879,6 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data) nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp); dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); - put_rpccred(task->tk_msg.rpc_cred); kfree(task->tk_msg.rpc_argp); kfree(task->tk_msg.rpc_resp); diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index e27c6cef18f2..0156c01c212c 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp) } void -nfs4_renewd_prepare_shutdown(struct nfs_server *server) -{ - cancel_delayed_work(&server->nfs_client->cl_renewd); -} - -void nfs4_kill_renewd(struct nfs_client *clp) { cancel_delayed_work_sync(&clp->cl_renewd); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 83ad47cbdd8a..20b4e30e6c82 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5681,7 +5681,6 @@ static struct { { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, { NFS4ERR_BADTYPE, -EBADTYPE }, { NFS4ERR_LOCKED, -EAGAIN }, - { NFS4ERR_RESOURCE, -EREMOTEIO }, { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29786d3b9326..90be551b80c1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb) unlock_kernel(); } -static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags) +static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) { struct nfs_parsed_mount_data *data; data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->flags = flags; data->rsize = NFS_MAX_FILE_IO_SIZE; data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; data->acdirmax = NFS_DEF_ACDIRMAX; + data->mount_server.port = NFS_UNSPEC_PORT; data->nfs_server.port = NFS_UNSPEC_PORT; + data->nfs_server.protocol = XPRT_TRANSPORT_TCP; data->auth_flavors[0] = RPC_AUTH_UNIX; data->auth_flavor_len = 1; + data->version = version; data->minorversion = 0; } return data; @@ -776,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr) * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ -static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, +static void nfs_set_port(struct sockaddr *sap, int *port, const unsigned short default_port) { - unsigned short port = default_port; + if (*port == NFS_UNSPEC_PORT) + *port = default_port; - if (parsed_port != NFS_UNSPEC_PORT) - port = parsed_port; - - rpc_set_port(sap, port); + rpc_set_port(sap, *port); } /* @@ -1253,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw, default: dfprintk(MOUNT, "NFS: unrecognized " "transport protocol\n"); + kfree(string); return 0; } break; @@ -1475,7 +1476,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - nfs_set_default_port(request.sap, args->mount_server.port, 0); + nfs_set_port(request.sap, &args->mount_server.port, 0); /* * Now ask the mount server to map our export path @@ -1711,8 +1712,6 @@ static int nfs_validate_mount_data(void *options, if (!(data->flags & NFS_MOUNT_TCP)) args->nfs_server.protocol = XPRT_TRANSPORT_UDP; - else - args->nfs_server.protocol = XPRT_TRANSPORT_TCP; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); args->namlen = data->namlen; @@ -1767,7 +1766,7 @@ static int nfs_validate_mount_data(void *options, goto out_v4_not_compiled; #endif - nfs_set_default_port(sap, args->nfs_server.port, 0); + nfs_set_port(sap, &args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -1848,9 +1847,10 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->acdirmin != nfss->acdirmin / HZ || data->acdirmax != nfss->acdirmax / HZ || data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || + data->nfs_server.port != nfss->port || data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || - memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, - data->nfs_server.addrlen) != 0) + !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address, + (struct sockaddr *)&nfss->nfs_client->cl_addr)) return -EINVAL; return 0; @@ -1893,6 +1893,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->acdirmin = nfss->acdirmin / HZ; data->acdirmax = nfss->acdirmax / HZ; data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; + data->nfs_server.port = nfss->port; data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, data->nfs_server.addrlen); @@ -2106,7 +2107,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP); + data = nfs_alloc_parsed_mount_data(3); mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2331,7 +2332,7 @@ static int nfs4_validate_text_mount_data(void *options, { struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; - nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + nfs_set_port(sap, &args->nfs_server.port, NFS_PORT); nfs_validate_transport_protocol(args); @@ -2376,7 +2377,6 @@ static int nfs4_validate_mount_data(void *options, if (data == NULL) goto out_no_data; - args->version = 4; switch (data->version) { case 1: if (data->host_addrlen > sizeof(args->nfs_server.address)) @@ -2660,7 +2660,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, struct nfs_parsed_mount_data *data; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(0); + data = nfs_alloc_parsed_mount_data(4); if (data == NULL) goto out_free_data; @@ -2690,7 +2690,6 @@ static void nfs4_kill_super(struct super_block *sb) dprintk("--> %s\n", __func__); nfs_super_return_all_delegations(sb); kill_anon_super(sb); - nfs4_renewd_prepare_shutdown(server); nfs_fscache_release_super_cookie(sb); nfs_free_server(server); dprintk("<-- %s\n", __func__); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 828a889be909..7e54e52964dd 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct dnotify_struct *dn; struct dnotify_struct **prev; struct fown_struct *fown; + __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD; to_tell = event->to_tell; @@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, spin_lock(&entry->lock); prev = &dnentry->dn; while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event->mask) == 0) { + if ((dn->dn_mask & test_mask) == 0) { prev = &dn->dn_next; continue; } diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index c8a07c65482b..3165d85aada2 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, spin_lock(&group->mark_lock); spin_lock(&inode->i_lock); - entry->group = group; - entry->inode = inode; - lentry = fsnotify_find_mark_entry(group, inode); if (!lentry) { + entry->group = group; + entry->inode = inode; + hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); list_add(&entry->g_list, &group->mark_entries); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 3816d5750dd5..b8bf53b4c108 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new /* remember, after old was put on the wait_q we aren't * allowed to look at the inode any more, only thing * left to check was if the file_name is the same */ - if (old->name_len && + if (!old->name_len || !strcmp(old->file_name, new->file_name)) return true; break; diff --git a/fs/pipe.c b/fs/pipe.c index 52c415114838..ae17d026aaa3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp) static int pipe_read_open(struct inode *inode, struct file *filp) { - /* We could have perhaps used atomic_t, but this and friends - below are the only places. So it doesn't seem worthwhile. */ + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->readers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->readers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_write_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int pipe_rdwr_open(struct inode *inode, struct file *filp) { + int ret = -ENOENT; + mutex_lock(&inode->i_mutex); - if (filp->f_mode & FMODE_READ) - inode->i_pipe->readers++; - if (filp->f_mode & FMODE_WRITE) - inode->i_pipe->writers++; + + if (inode->i_pipe) { + ret = 0; + if (filp->f_mode & FMODE_READ) + inode->i_pipe->readers++; + if (filp->f_mode & FMODE_WRITE) + inode->i_pipe->writers++; + } + mutex_unlock(&inode->i_mutex); - return 0; + return ret; } /* diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 56013371f9f3..a44a7897fd4d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,7 +23,6 @@ #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> -#include <linux/mm.h> #include <linux/memory.h> #include <asm/sections.h> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c7bff4f603ff..a65239cfd97e 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -99,7 +99,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "VmallocUsed: %8lu kB\n" "VmallocChunk: %8lu kB\n" #ifdef CONFIG_MEMORY_FAILURE - "HardwareCorrupted: %8lu kB\n" + "HardwareCorrupted: %5lu kB\n" #endif , K(i.totalram), diff --git a/fs/proc/page.c b/fs/proc/page.c index 2281c2cbfe2b..5033ce0d254b 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_COMPOUND_TAIL 16 #define KPF_HUGE 17 #define KPF_UNEVICTABLE 18 +#define KPF_HWPOISON 19 #define KPF_NOPAGE 20 #define KPF_KSM 21 @@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#ifdef CONFIG_MEMORY_FAILURE + u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); +#endif + #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); #endif diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index b3208adf8e71..71e2b4d50a0a 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb, #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) - return romfs_mtd_strnlen(sb, pos, limit); + return romfs_mtd_strnlen(sb, pos, maxlen); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) - return romfs_blk_strnlen(sb, pos, limit); + return romfs_blk_strnlen(sb, pos, maxlen); #endif return -EIO; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0050fc40e8c9..5fad489ce5bc 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -894,7 +894,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) mutex_lock(&sysfs_rename_mutex); BUG_ON(!sd->s_parent); - new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; + new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ? + new_parent_kobj->sd : &sysfs_root; error = 0; if (sd->s_parent == new_parent_sd) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 561a9c050cef..f5ea4680f15f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, struct sysfs_open_dirent *od, *new_od = NULL; retry: - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irq(&sysfs_open_dirent_lock); if (!sd->s_attr.open && new_od) { sd->s_attr.open = new_od; @@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd, list_add_tail(&buffer->list, &od->buffers); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irq(&sysfs_open_dirent_lock); if (od) { kfree(new_od); @@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, struct sysfs_buffer *buffer) { struct sysfs_open_dirent *od = sd->s_attr.open; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); list_del(&buffer->list); if (atomic_dec_and_test(&od->refcnt)) @@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd, else od = NULL; - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); kfree(od); } @@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) void sysfs_notify_dirent(struct sysfs_dirent *sd) { struct sysfs_open_dirent *od; + unsigned long flags; - spin_lock(&sysfs_open_dirent_lock); + spin_lock_irqsave(&sysfs_open_dirent_lock, flags); od = sd->s_attr.open; if (od) { @@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd) wake_up_interruptible(&od->poll); } - spin_unlock(&sysfs_open_dirent_lock); + spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags); } EXPORT_SYMBOL_GPL(sysfs_notify_dirent); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 381854461b28..c2e30eea74dc 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -186,19 +186,37 @@ xfs_destroy_ioend( } /* + * If the end of the current ioend is beyond the current EOF, + * return the new EOF value, otherwise zero. + */ +STATIC xfs_fsize_t +xfs_ioend_new_eof( + xfs_ioend_t *ioend) +{ + xfs_inode_t *ip = XFS_I(ioend->io_inode); + xfs_fsize_t isize; + xfs_fsize_t bsize; + + bsize = ioend->io_offset + ioend->io_size; + isize = MAX(ip->i_size, ip->i_new_size); + isize = MIN(isize, bsize); + return isize > ip->i_d.di_size ? isize : 0; +} + +/* * Update on-disk file size now that data has been written to disk. * The current in-memory file size is i_size. If a write is beyond * eof i_new_size will be the intended file size until i_size is * updated. If this write does not extend all the way to the valid * file size then restrict this update to the end of the write. */ + STATIC void xfs_setfilesize( xfs_ioend_t *ioend) { xfs_inode_t *ip = XFS_I(ioend->io_inode); xfs_fsize_t isize; - xfs_fsize_t bsize; ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(ioend->io_type != IOMAP_READ); @@ -206,16 +224,10 @@ xfs_setfilesize( if (unlikely(ioend->io_error)) return; - bsize = ioend->io_offset + ioend->io_size; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - isize = MAX(ip->i_size, ip->i_new_size); - isize = MIN(isize, bsize); - - if (ip->i_d.di_size < isize) { + isize = xfs_ioend_new_eof(ioend); + if (isize) { ip->i_d.di_size = isize; - ip->i_update_core = 1; xfs_mark_inode_dirty_sync(ip); } @@ -404,10 +416,16 @@ xfs_submit_ioend_bio( struct bio *bio) { atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; bio->bi_end_io = xfs_end_bio; + /* + * If the I/O is beyond EOF we mark the inode dirty immediately + * but don't update the inode size until I/O completion. + */ + if (xfs_ioend_new_eof(ioend)) + xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); + submit_bio(WRITE, bio); ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); bio_put(bio); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 629370974e57..eff61e2732af 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -176,14 +176,7 @@ xfs_file_fsync( struct dentry *dentry, int datasync) { - struct inode *inode = dentry->d_inode; - struct xfs_inode *ip = XFS_I(inode); - int error; - - /* capture size updates in I/O completion before writing the inode. */ - error = filemap_fdatawait(inode->i_mapping); - if (error) - return error; + struct xfs_inode *ip = XFS_I(dentry->d_inode); xfs_iflags_clear(ip, XFS_ITRUNCATED); return -xfs_fsync(ip); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index da0159d99f82..cd42ef78f6b5 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -57,19 +57,22 @@ #include <linux/fiemap.h> /* - * Bring the atime in the XFS inode uptodate. - * Used before logging the inode to disk or when the Linux inode goes away. + * Bring the timestamps in the XFS inode uptodate. + * + * Used before writing the inode to disk. */ void -xfs_synchronize_atime( +xfs_synchronize_times( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); - if (!(inode->i_state & I_CLEAR)) { - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - } + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec; + ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec; } /* @@ -106,32 +109,20 @@ xfs_ichgtime( if ((flags & XFS_ICHGTIME_MOD) && !timespec_equal(&inode->i_mtime, &tv)) { inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } if ((flags & XFS_ICHGTIME_CHG) && !timespec_equal(&inode->i_ctime, &tv)) { inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; sync_it = 1; } /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. + * Update complete - now make sure everyone knows that the inode + * is dirty. */ - if (sync_it) { - SYNCHRONIZE(); - ip->i_update_core = 1; + if (sync_it) xfs_mark_inode_dirty_sync(ip); - } } /* @@ -506,10 +497,8 @@ xfs_vn_getattr( stat->gid = ip->i_d.di_gid; stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; - stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec; - stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 49e4a6aea73c..072050f8d346 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -667,7 +667,7 @@ start: xip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index 9e41f91aa269..3d4a0c84d634 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -80,7 +80,7 @@ xfs_fs_set_xstate( if (sb->s_flags & MS_RDONLY) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bdd41c8c342f..18a4b8e11df2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -977,6 +977,28 @@ xfs_fs_inode_init_once( } /* + * Dirty the XFS inode when mark_inode_dirty_sync() is called so that + * we catch unlogged VFS level updates to the inode. Care must be taken + * here - the transaction code calls mark_inode_dirty_sync() to mark the + * VFS inode dirty in a transaction and clears the i_update_core field; + * it must clear the field after calling mark_inode_dirty_sync() to + * correctly indicate that the dirty state has been propagated into the + * inode log item. + * + * We need the barrier() to maintain correct ordering between unlogged + * updates and the transaction commit code that clears the i_update_core + * field. This requires all updates to be completed before marking the + * inode dirty. + */ +STATIC void +xfs_fs_dirty_inode( + struct inode *inode) +{ + barrier(); + XFS_I(inode)->i_update_core = 1; +} + +/* * Attempt to flush the inode, this will actually fail * if the inode is pinned, but we dirty the inode again * at the point when it is unpinned after a log write, @@ -1126,7 +1148,7 @@ xfs_fs_put_super( } STATIC int -xfs_fs_sync_super( +xfs_fs_sync_fs( struct super_block *sb, int wait) { @@ -1134,23 +1156,23 @@ xfs_fs_sync_super( int error; /* - * Treat a sync operation like a freeze. This is to work - * around a race in sync_inodes() which works in two phases - * - an asynchronous flush, which can write out an inode - * without waiting for file size updates to complete, and a - * synchronous flush, which wont do anything because the - * async flush removed the inode's dirty flag. Also - * sync_inodes() will not see any files that just have - * outstanding transactions to be flushed because we don't - * dirty the Linux inode until after the transaction I/O - * completes. + * Not much we can do for the first async pass. Writing out the + * superblock would be counter-productive as we are going to redirty + * when writing out other data and metadata (and writing out a single + * block is quite fast anyway). + * + * Try to asynchronously kick off quota syncing at least. */ - if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - error = xfs_quiesce_data(mp); - else - error = xfs_sync_fsdata(mp, 0); + if (!wait) { + xfs_qm_sync(mp, SYNC_TRYLOCK); + return 0; + } + + error = xfs_quiesce_data(mp); + if (error) + return -error; - if (unlikely(laptop_mode)) { + if (laptop_mode) { int prev_sync_seq = mp->m_sync_seq; /* @@ -1169,7 +1191,7 @@ xfs_fs_sync_super( mp->m_sync_seq != prev_sync_seq); } - return -error; + return 0; } STATIC int @@ -1539,10 +1561,11 @@ xfs_fs_get_sb( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, - .sync_fs = xfs_fs_sync_super, + .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, .remount_fs = xfs_fs_remount, diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 320be6aea492..961df0a22c78 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -309,11 +309,15 @@ xfs_sync_attr( STATIC int xfs_commit_dummy_trans( struct xfs_mount *mp, - uint log_flags) + uint flags) { struct xfs_inode *ip = mp->m_rootip; struct xfs_trans *tp; int error; + int log_flags = XFS_LOG_FORCE; + + if (flags & SYNC_WAIT) + log_flags |= XFS_LOG_SYNC; /* * Put a dummy transaction in the log to tell recovery @@ -331,13 +335,12 @@ xfs_commit_dummy_trans( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* XXX(hch): ignoring the error here.. */ error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* the log force ensures this transaction is pushed to disk */ xfs_log_force(mp, 0, log_flags); - return 0; + return error; } int @@ -385,7 +388,20 @@ xfs_sync_fsdata( else XFS_BUF_ASYNC(bp); - return xfs_bwrite(mp, bp); + error = xfs_bwrite(mp, bp); + if (error) + return error; + + /* + * If this is a data integrity sync make sure all pending buffers + * are flushed out for the log coverage check below. + */ + if (flags & SYNC_WAIT) + xfs_flush_buftarg(mp->m_ddev_targp, 1); + + if (xfs_log_need_covered(mp)) + error = xfs_commit_dummy_trans(mp, flags); + return error; out_brelse: xfs_buf_relse(bp); @@ -419,14 +435,16 @@ xfs_quiesce_data( /* push non-blocking */ xfs_sync_data(mp, 0); xfs_qm_sync(mp, SYNC_TRYLOCK); - xfs_filestream_flush(mp); - /* push and block */ + /* push and block till complete */ xfs_sync_data(mp, SYNC_WAIT); xfs_qm_sync(mp, SYNC_WAIT); + /* drop inode references pinned by filestreams */ + xfs_filestream_flush(mp); + /* write superblock and hoover up shutdown errors */ - error = xfs_sync_fsdata(mp, 0); + error = xfs_sync_fsdata(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -570,8 +588,6 @@ xfs_sync_worker( /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 4e4276b956e8..5d1a3b98a6e6 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -876,7 +876,6 @@ xfs_dqrele_inode( ip->i_gdquot = NULL; } xfs_iput(ip, XFS_ILOCK_EXCL); - IRELE(ip); return 0; } diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 7465f9ee125f..ab89a7e94a0f 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -206,10 +206,10 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || - (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || - (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || - (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = XFS_ERROR(EBUSY); goto out_unlock; } diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index fa913e459442..41ad537c49e9 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents( */ ra_want = howmany(bufsize + mp->m_dirblksize, mp->m_sb.sb_blocksize) - 1; + ASSERT(ra_want >= 0); /* * If we don't have as many as we want, and we haven't @@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents( */ ptr += length; curoff += length; - bufsize -= length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; } /* diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ab64f3efb43b..0785797db828 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -880,6 +880,7 @@ nextag: * Not in range - save last search * location and allocate a new inode */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c1dc7ef5a1d8..b92a4fa2a0a1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3068,9 +3068,9 @@ xfs_iflush_int( SYNCHRONIZE(); /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); + xfs_synchronize_times(ip); if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 0b38b9a869ec..41555de1d1db 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -void xfs_synchronize_atime(xfs_inode_t *); +void xfs_synchronize_times(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); #if defined(XFS_INODE_TRACE) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 47d5b663c37e..9794b876d6ff 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -232,6 +232,15 @@ xfs_inode_item_format( nvecs = 1; /* + * Make sure the linux inode is dirty. We do this before + * clearing i_update_core as the VFS will call back into + * XFS here and set i_update_core, so we need to dirty the + * inode first so that the ordering of i_update_core and + * unlogged modifications still works as described below. + */ + xfs_mark_inode_dirty_sync(ip); + + /* * Clear i_update_core if the timestamps (or any other * non-transactional modification) need flushing/logging * and we're about to log them with the rest of the core. @@ -263,14 +272,9 @@ xfs_inode_item_format( } /* - * Make sure to get the latest atime from the Linux inode. + * Make sure to get the latest timestamps from the Linux inode. */ - xfs_synchronize_atime(ip); - - /* - * make sure the linux inode is dirty - */ - xfs_mark_inode_dirty_sync(ip); + xfs_synchronize_times(ip); vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(struct xfs_icdinode); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b68f9107e26c..62efab2f3839 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -59,6 +59,7 @@ xfs_bulkstat_one_iget( { xfs_icdinode_t *dic; /* dinode core info pointer */ xfs_inode_t *ip; /* incore inode pointer */ + struct inode *inode; int error; error = xfs_iget(mp, NULL, ino, @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( ASSERT(ip->i_imap.im_blkno != 0); dic = &ip->i_d; + inode = VFS_I(ip); /* xfs_iget returns the following without needing * further change. @@ -83,16 +85,19 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; + /* - * We are reading the atime from the Linux inode because the - * dinode might not be uptodate. + * We need to read the timestamps from the Linux inode because + * the VFS keeps writing directly into the inode structure instead + * of telling us about the updates. */ - buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; - buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; - buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; - buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; - buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; + buf->bs_atime.tv_sec = inode->i_atime.tv_sec; + buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; + buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; + buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; buf->bs_extents = dic->di_nextents; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index a434f287962d..b572f7e840e0 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2476,12 +2476,6 @@ xfs_reclaim( ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); /* - * Make sure the atime in the XFS inode is correct before freeing the - * Linux inode. - */ - xfs_synchronize_atime(ip); - - /* * If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode and the * linux inode and clean up the xfs inode later. This avoids flushing |