summaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c13
-rw-r--r--fs/ext4/balloc.c28
-rw-r--r--fs/ext4/block_validity.c244
-rw-r--r--fs/ext4/dir.c3
-rw-r--r--fs/ext4/ext4.h391
-rw-r--r--fs/ext4/ext4_extents.h4
-rw-r--r--fs/ext4/ext4_i.h140
-rw-r--r--fs/ext4/ext4_sb.h161
-rw-r--r--fs/ext4/extents.c89
-rw-r--r--fs/ext4/file.c36
-rw-r--r--fs/ext4/fsync.c8
-rw-r--r--fs/ext4/group.h29
-rw-r--r--fs/ext4/ialloc.c119
-rw-r--r--fs/ext4/inode.c848
-rw-r--r--fs/ext4/ioctl.c36
-rw-r--r--fs/ext4/mballoc.c251
-rw-r--r--fs/ext4/mballoc.h2
-rw-r--r--fs/ext4/migrate.c8
-rw-r--r--fs/ext4/move_extent.c1320
-rw-r--r--fs/ext4/namei.c37
-rw-r--r--fs/ext4/namei.h8
-rw-r--r--fs/ext4/resize.c36
-rw-r--r--fs/ext4/super.c869
24 files changed, 3203 insertions, 1481 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a8ff003a00f7..8867b2a1e5fe 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,8 +5,8 @@
obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
- ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o
+ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 647e0d65a284..605aeed96d68 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -129,12 +129,15 @@ fail:
static inline struct posix_acl *
ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
{
- struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
+ struct posix_acl *acl = ACCESS_ONCE(*i_acl);
- spin_lock(&inode->i_lock);
- if (*i_acl != EXT4_ACL_NOT_CACHED)
- acl = posix_acl_dup(*i_acl);
- spin_unlock(&inode->i_lock);
+ if (acl) {
+ spin_lock(&inode->i_lock);
+ acl = *i_acl;
+ if (acl != EXT4_ACL_NOT_CACHED)
+ acl = posix_acl_dup(acl);
+ spin_unlock(&inode->i_lock);
+ }
return acl;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad85877..e2126d70dff5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -19,7 +19,6 @@
#include <linux/buffer_head.h>
#include "ext4.h"
#include "ext4_jbd2.h"
-#include "group.h"
#include "mballoc.h"
/*
@@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
ext4_group_t block_group, struct ext4_group_desc *gdp)
{
int bit, bit_max;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
unsigned free_blocks, group_blocks;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
bit_max += ext4_bg_num_gdb(sb, block_group);
}
- if (block_group == sbi->s_groups_count - 1) {
+ if (block_group == ngroups - 1) {
/*
* Even though mke2fs always initialize first and last group
* if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
group_blocks = ext4_blocks_count(sbi->s_es) -
le32_to_cpu(sbi->s_es->s_first_data_block) -
- (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+ (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
} else {
group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
}
@@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
{
unsigned int group_desc;
unsigned int offset;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (block_group >= sbi->s_groups_count) {
+ if (block_group >= ngroups) {
ext4_error(sb, "ext4_get_group_desc",
"block_group >= groups_count - "
"block_group = %u, groups_count = %u",
- block_group, sbi->s_groups_count);
+ block_group, ngroups);
return NULL;
}
- smp_rmb();
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
unlock_buffer(bh);
return bh;
}
- spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
- spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
return bh;
}
- spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
/*
* if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
down_write(&grp->alloc_sem);
for (i = 0, blocks_freed = 0; i < count; i++) {
BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
bit + i, bitmap_bh->b_data)) {
ext4_error(sb, __func__,
"bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
blocks_freed++;
}
}
- spin_lock(sb_bgl_lock(sbi, block_group));
+ ext4_lock_group(sb, block_group);
blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
ext4_free_blks_set(sb, desc, blk_free_count);
desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- spin_unlock(sb_bgl_lock(sbi, block_group));
+ ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
if (sbi->s_log_groups_per_flex) {
@@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
ext4_fsblk_t desc_count;
struct ext4_group_desc *gdp;
ext4_group_t i;
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
@@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
bitmap_count = 0;
gdp = NULL;
- smp_rmb();
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
@@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
return bitmap_count;
#else
desc_count = 0;
- smp_rmb();
for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 000000000000..50784ef07563
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,244 @@
+/*
+ * linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4.h"
+
+/*
+ * One contiguous run of blocks stored as a node in the per-superblock
+ * system_blks red-black tree.
+ */
+struct ext4_system_zone {
+ struct rb_node node; /* linkage into sbi->system_blks */
+ ext4_fsblk_t start_blk; /* first block of the range */
+ unsigned int count; /* number of blocks in the range */
+};
+
+static struct kmem_cache *ext4_system_zone_cachep;
+
+/*
+ * Create the slab cache from which struct ext4_system_zone entries are
+ * allocated.
+ *
+ * Returns 0 on success, or -ENOMEM if the cache could not be created.
+ */
+int __init init_ext4_system_zone(void)
+{
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
+					     SLAB_RECLAIM_ACCOUNT);
+
+	return ext4_system_zone_cachep ? 0 : -ENOMEM;
+}
+
+/*
+ * Destroy the slab cache used for struct ext4_system_zone entries.
+ */
+void exit_ext4_system_zone(void)
+{
+	kmem_cache_destroy(ext4_system_zone_cachep);
+}
+
+/*
+ * Return 1 when entry2 begins exactly at the block following the end of
+ * entry1 (i.e. the two ranges are adjacent), 0 otherwise.
+ */
+static inline int can_merge(struct ext4_system_zone *entry1,
+			    struct ext4_system_zone *entry2)
+{
+	ext4_fsblk_t end_of_first = entry1->start_blk + entry1->count;
+
+	return end_of_first == entry2->start_blk;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ *
+ * The range [start_blk, start_blk + count) is recorded in the
+ * sbi->system_blks red-black tree. If start_blk falls inside an
+ * existing entry, that entry is extended instead of allocating a new
+ * one; the resulting entry is then merged with its immediate
+ * neighbours when they are exactly adjacent.
+ *
+ * Returns 0 on success, or -ENOMEM if a new entry cannot be allocated.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+ ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ struct ext4_system_zone *new_entry = NULL, *entry;
+ struct rb_node **n = &sbi->system_blks.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node = NULL;
+
+ /* Descend the tree looking for an entry that contains start_blk */
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_system_zone, node);
+ if (start_blk < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (start_blk >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ /*
+ * start_blk lies inside this entry; grow the entry
+ * if the new range reaches past its current end.
+ */
+ if (start_blk + count > (entry->start_blk +
+ entry->count))
+ entry->count = (start_blk + count -
+ entry->start_blk);
+ new_node = *n;
+ new_entry = rb_entry(new_node, struct ext4_system_zone,
+ node);
+ break;
+ }
+ }
+
+ /* No containing entry found: insert a new one at the leaf slot */
+ if (!new_entry) {
+ new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+ GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+ new_entry->start_blk = start_blk;
+ new_entry->count = count;
+ new_node = &new_entry->node;
+
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &sbi->system_blks);
+ }
+
+ /* Can we merge to the left? */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_system_zone, node);
+ if (can_merge(entry, new_entry)) {
+ /* Absorb the predecessor, then drop it from the tree */
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &sbi->system_blks);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ }
+ }
+
+ /* Can we merge to the right? */
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_system_zone, node);
+ if (can_merge(new_entry, entry)) {
+ /* Absorb the successor, then drop it from the tree */
+ new_entry->count += entry->count;
+ rb_erase(node, &sbi->system_blks);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ }
+ }
+ return 0;
+}
+
+/*
+ * Print every range in the system zone tree, in ascending order, as a
+ * single comma-separated "start-end" list (end is inclusive).
+ */
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+	struct ext4_system_zone *zone;
+	struct rb_node *cur;
+	const char *sep = "";
+
+	printk(KERN_INFO "System zones: ");
+	for (cur = rb_first(&sbi->system_blks); cur; cur = rb_next(cur)) {
+		zone = rb_entry(cur, struct ext4_system_zone, node);
+		printk("%s%llu-%llu", sep,
+		       zone->start_blk, zone->start_blk + zone->count - 1);
+		/* every range after the first is preceded by a separator */
+		sep = ", ";
+	}
+	printk("\n");
+}
+
+/*
+ * Build the system zone tree for @sb from the per-group metadata
+ * locations: the superblock/group-descriptor area of groups that carry
+ * a superblock backup, each group's block bitmap, inode bitmap and
+ * inode table.
+ *
+ * If the BLOCK_VALIDITY mount option is off, any existing tree is
+ * released and nothing is built. If a tree already exists it is kept
+ * as is.
+ *
+ * Returns 0 on success or a negative errno from add_system_zone(). On
+ * failure the partially built tree is released, so that a later call
+ * does not mistake it for a complete one.
+ */
+int ext4_setup_system_zone(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	int flex_size = ext4_flex_bg_size(sbi);
+	int ret;
+
+	if (!test_opt(sb, BLOCK_VALIDITY)) {
+		if (sbi->system_blks.rb_node)
+			ext4_release_system_zone(sb);
+		return 0;
+	}
+	if (sbi->system_blks.rb_node)
+		return 0;
+
+	for (i = 0; i < ngroups; i++) {
+		if (ext4_bg_has_super(sb, i) &&
+		    ((i < 5) || ((i % flex_size) == 0))) {
+			/* superblock plus the group descriptor blocks */
+			ret = add_system_zone(sbi,
+					ext4_group_first_block_no(sb, i),
+					sbi->s_gdb_count + 1);
+			if (ret)
+				goto err;
+		}
+		/*
+		 * NOTE(review): ext4_get_group_desc() can return NULL;
+		 * the result is used unchecked here, as in the original.
+		 */
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+		if (ret)
+			goto err;
+		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+		if (ret)
+			goto err;
+		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+				sbi->s_itb_per_group);
+		if (ret)
+			goto err;
+	}
+
+	if (test_opt(sb, DEBUG))
+		debug_print_tree(sbi);
+	return 0;
+
+err:
+	/* Do not leave a half-populated tree behind */
+	ext4_release_system_zone(sb);
+	return ret;
+}
+
+/*
+ * Called when the filesystem is unmounted
+ *
+ * Frees every entry of the system zone tree and leaves the tree root
+ * empty. The tree is torn down leaf-first without rebalancing: a node
+ * is only freed once both of its child links are NULL.
+ */
+void ext4_release_system_zone(struct super_block *sb)
+{
+ struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
+ struct rb_node *parent;
+ struct ext4_system_zone *entry;
+
+ while (n) {
+ /* Do the node's children first */
+ if (n->rb_left) {
+ n = n->rb_left;
+ continue;
+ }
+ if (n->rb_right) {
+ n = n->rb_right;
+ continue;
+ }
+ /*
+ * The node has no children; free it, and then zero
+ * out parent's link to it. Finally go to the
+ * beginning of the loop and try to free the parent
+ * node.
+ */
+ parent = rb_parent(n);
+ entry = rb_entry(n, struct ext4_system_zone, node);
+ kmem_cache_free(ext4_system_zone_cachep, entry);
+ /* n is only compared by address below, never dereferenced */
+ if (!parent)
+ EXT4_SB(sb)->system_blks.rb_node = NULL;
+ else if (parent->rb_left == n)
+ parent->rb_left = NULL;
+ else if (parent->rb_right == n)
+ parent->rb_right = NULL;
+ n = parent;
+ }
+ EXT4_SB(sb)->system_blks.rb_node = NULL;
+}
+
+/*
+ * Returns 1 if the passed-in block region [start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ *
+ * A region is also rejected when it starts at or before the first data
+ * block, extends past the end of the filesystem, or when
+ * start_blk + count wraps around.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+			  unsigned int count)
+{
+	struct ext4_system_zone *entry;
+	struct rb_node *n = sbi->system_blks.rb_node;
+
+	/*
+	 * The wrap check must come before the upper-bound test: a wrapped
+	 * sum is small and would otherwise pass as "inside the filesystem".
+	 */
+	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count < start_blk) ||
+	    (start_blk + count > ext4_blocks_count(sbi->s_es)))
+		return 0;
+	while (n) {
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		if (start_blk + count - 1 < entry->start_blk)
+			n = n->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = n->rb_right;
+		else
+			return 0;	/* region intersects a system zone */
+	}
+	return 1;
+}
+
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b64789929a65..9dc93168e262 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
- 0, 0, 0);
+ err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d0f15ef56de1..17b9998680e3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -21,7 +21,14 @@
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
-#include "ext4_i.h"
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
/*
* The fourth extended filesystem constants/structures
@@ -46,6 +53,19 @@
#define ext4_debug(f, a...) do {} while (0)
#endif
+/* data type for block offset of block group */
+typedef int ext4_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long long ext4_fsblk_t;
+
+/* data type for file logical block number */
+typedef __u32 ext4_lblk_t;
+
+/* data type for block group number */
+typedef unsigned int ext4_group_t;
+
+
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 1
/* blocks already reserved */
@@ -179,9 +199,6 @@ struct flex_groups {
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
-#ifdef __KERNEL__
-#include "ext4_sb.h"
-#endif
/*
* Macro-instructions used to manage group descriptors
*/
@@ -297,10 +314,23 @@ struct ext4_new_group_data {
};
/*
- * Following is used by preallocation code to tell get_blocks() that we
- * want uninitialzed extents.
+ * Flags used by ext4_get_blocks()
*/
-#define EXT4_CREATE_UNINITIALIZED_EXT 2
+ /* Allocate any needed blocks and/or convert an uninitialized
+ extent to be an initialized ext4 */
+#define EXT4_GET_BLOCKS_CREATE 0x0001
+ /* Request the creation of an uninitialized extent */
+#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
+ EXT4_GET_BLOCKS_CREATE)
+ /* Caller is from the delayed allocation writeout path,
+ so set the magic i_delalloc_reserved_flag after taking the
+ inode allocation semaphore */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
+ /* Call ext4_da_update_reserve_space() after successfully
+ allocating the blocks */
+#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
+
/*
* ioctl commands
@@ -322,6 +352,7 @@ struct ext4_new_group_data {
/* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
+#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
/*
* ioctl commands in 32 bit emulation
@@ -417,6 +448,15 @@ struct ext4_inode {
__le32 i_version_hi; /* high 32 bits for 64-bit version */
};
+/*
+ * Argument structure of the EXT4_IOC_MOVE_EXT ioctl.
+ */
+struct move_extent {
+ __u32 reserved; /* should be zero */
+ __u32 donor_fd; /* donor file descriptor */
+ __u64 orig_start; /* logical start offset in block for orig */
+ __u64 donor_start; /* logical start offset in block for donor */
+ __u64 len; /* block length to be moved */
+ __u64 moved_len; /* moved block length */
+};
+#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
#define EXT4_EPOCH_BITS 2
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -516,6 +556,110 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
/*
+ * storage for cached extent
+ */
+struct ext4_ext_cache {
+ ext4_fsblk_t ec_start; /* filesystem-wide block number */
+ ext4_lblk_t ec_block; /* file logical block number */
+ __u32 ec_len; /* must be 32bit to return holes */
+ __u32 ec_type; /* NOTE(review): kind of cached entry; values not visible here */
+};
+
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+ __le32 i_data[15]; /* unconverted */
+ __u32 i_flags;
+ ext4_fsblk_t i_file_acl;
+ __u32 i_dtime;
+
+ /*
+ * i_block_group is the number of the block group which contains
+ * this file's inode. Constant across the lifetime of the inode,
+ * it is used for making block allocation decisions - we try to
+ * place a file's data blocks near its inode block, and new inodes
+ * near to their parent directory's inode.
+ */
+ ext4_group_t i_block_group;
+ __u32 i_state; /* Dynamic state flags for ext4 */
+
+ ext4_lblk_t i_dir_start_lookup;
+#ifdef CONFIG_EXT4_FS_XATTR
+ /*
+ * Extended attributes can be read independently of the main file
+ * data. Taking i_mutex even when reading would cause contention
+ * between readers of EAs and writers of regular file data, so
+ * instead we synchronize on xattr_sem when reading or changing
+ * EAs.
+ */
+ struct rw_semaphore xattr_sem;
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *i_acl;
+ struct posix_acl *i_default_acl;
+#endif
+
+ struct list_head i_orphan; /* unlinked but open inodes */
+
+ /*
+ * i_disksize keeps track of what the inode size is ON DISK, not
+ * in memory. During truncate, i_size is set to the new size by
+ * the VFS prior to calling ext4_truncate(), but the filesystem won't
+ * set i_disksize to 0 until the truncate is actually under way.
+ *
+ * The intent is that i_disksize always represents the blocks which
+ * are used by this file. This allows recovery to restart truncate
+ * on orphans if we crash during truncate. We actually write i_disksize
+ * into the on-disk inode when writing inodes out, instead of i_size.
+ *
+ * The only time when i_disksize and i_size may be different is when
+ * a truncate is in progress. The only things which change i_disksize
+ * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+ */
+ loff_t i_disksize;
+
+ /*
+ * i_data_sem is for serialising ext4_truncate() against
+ * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
+ * data tree are chopped off during truncate. We can't do that in
+ * ext4 because whenever we perform intermediate commits during
+ * truncate, the inode and all the metadata blocks *must* be in a
+ * consistent state which allows truncation of the orphans to restart
+ * during recovery. Hence we must fix the get_block-vs-truncate race
+ * by other means, so we have i_data_sem.
+ */
+ struct rw_semaphore i_data_sem;
+ struct inode vfs_inode;
+ struct jbd2_inode jinode;
+
+ struct ext4_ext_cache i_cached_extent;
+ /*
+ * File creation time. Its function is same as that of
+ * struct timespec i_{a,c,m}time in the generic inode.
+ */
+ struct timespec i_crtime;
+
+ /* mballoc */
+ struct list_head i_prealloc_list;
+ spinlock_t i_prealloc_lock;
+
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
+ /* allocation reservation info for delalloc */
+ unsigned int i_reserved_data_blocks;
+ unsigned int i_reserved_meta_blocks;
+ unsigned int i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+
+ spinlock_t i_block_reservation_lock;
+};
+
+/*
* File system states
*/
#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */
@@ -540,7 +684,6 @@ do { \
#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
-#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -560,18 +703,12 @@ do { \
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
-/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
-#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \
EXT4_MOUNT_##opt)
-#else
-#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD
-#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT
-#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS
-#endif
#define ext4_set_bit ext2_set_bit
#define ext4_set_bit_atomic ext2_set_bit_atomic
@@ -689,6 +826,146 @@ struct ext4_super_block {
};
#ifdef __KERNEL__
+
+/*
+ * run-time mount flags
+ */
+#define EXT4_MF_MNTDIR_SAMPLED 0x0001
+#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
+
+/*
+ * fourth extended-fs super-block data in memory
+ */
+struct ext4_sb_info {
+ unsigned long s_desc_size; /* Size of a group descriptor in bytes */
+ unsigned long s_inodes_per_block;/* Number of inodes per block */
+ unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ unsigned long s_itb_per_group; /* Number of inode table blocks per group */
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ ext4_group_t s_groups_count; /* Number of groups in the fs */
+ unsigned long s_overhead_last; /* Last calculated overhead */
+ unsigned long s_blocks_last; /* Last seen block count */
+ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
+ struct buffer_head * s_sbh; /* Buffer containing the super block */
+ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head **s_group_desc;
+ unsigned int s_mount_opt;
+ unsigned int s_mount_flags;
+ ext4_fsblk_t s_sb_block;
+ uid_t s_resuid;
+ gid_t s_resgid;
+ unsigned short s_mount_state;
+ unsigned short s_pad;
+ int s_addr_per_block_bits;
+ int s_desc_per_block_bits;
+ int s_inode_size;
+ int s_first_ino;
+ unsigned int s_inode_readahead_blks;
+ unsigned int s_inode_goal;
+ spinlock_t s_next_gen_lock;
+ u32 s_next_generation;
+ u32 s_hash_seed[4];
+ int s_def_hash_version;
+ int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
+ struct percpu_counter s_freeblocks_counter;
+ struct percpu_counter s_freeinodes_counter;
+ struct percpu_counter s_dirs_counter;
+ struct percpu_counter s_dirtyblocks_counter;
+ struct blockgroup_lock *s_blockgroup_lock;
+ struct proc_dir_entry *s_proc;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
+
+ /* Journaling */
+ struct inode *s_journal_inode;
+ struct journal_s *s_journal;
+ struct list_head s_orphan;
+ struct mutex s_orphan_lock;
+ struct mutex s_resize_lock;
+ unsigned long s_commit_interval;
+ u32 s_max_batch_time;
+ u32 s_min_batch_time;
+ struct block_device *journal_bdev;
+#ifdef CONFIG_JBD2_DEBUG
+ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
+ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
+#endif
+#ifdef CONFIG_QUOTA
+ char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
+ unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+ /* rb-tree of metadata block ranges, maintained by block_validity.c */
+ struct rb_root system_blks;
+
+#ifdef EXTENTS_STATS
+ /* ext4 extents stats */
+ unsigned long s_ext_min;
+ unsigned long s_ext_max;
+ unsigned long s_depth_max;
+ spinlock_t s_ext_stats_lock;
+ unsigned long s_ext_blocks;
+ unsigned long s_ext_extents;
+#endif
+
+ /* for buddy allocator */
+ struct ext4_group_info ***s_group_info;
+ struct inode *s_buddy_cache;
+ long s_blocks_reserved;
+ spinlock_t s_reserve_lock;
+ spinlock_t s_md_lock;
+ tid_t s_last_transaction;
+ unsigned short *s_mb_offsets;
+ unsigned int *s_mb_maxs;
+
+ /* tunables */
+ unsigned long s_stripe;
+ unsigned int s_mb_stream_request;
+ unsigned int s_mb_max_to_scan;
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
+ unsigned int s_mb_group_prealloc;
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+ unsigned long s_mb_last_start;
+
+ /* history to debug policy */
+ struct ext4_mb_history *s_mb_history;
+ int s_mb_history_cur;
+ int s_mb_history_max;
+ int s_mb_history_num;
+ spinlock_t s_mb_history_lock;
+ int s_mb_history_filter;
+
+ /* stats for buddy allocator */
+ spinlock_t s_mb_pa_lock;
+ atomic_t s_bal_reqs; /* number of reqs with len > 1 */
+ atomic_t s_bal_success; /* we found long enough chunks */
+ atomic_t s_bal_allocated; /* in blocks */
+ atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_breaks; /* too long searches */
+ atomic_t s_bal_2orders; /* 2^order hits */
+ spinlock_t s_bal_lock;
+ unsigned long s_mb_buddies_generated;
+ unsigned long long s_mb_generation_time;
+ atomic_t s_mb_lost_chunks;
+ atomic_t s_mb_preallocated;
+ atomic_t s_mb_discarded;
+
+ /* locality groups */
+ struct ext4_locality_group *s_locality_groups;
+
+ /* for write statistics */
+ unsigned long s_sectors_written_start;
+ u64 s_kbytes_written;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
+};
+
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -704,7 +981,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
}
-
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
return ino == EXT4_ROOT_INO ||
@@ -1014,6 +1290,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+ ext4_group_t block_group);
+extern unsigned ext4_init_block_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t group,
+ struct ext4_group_desc *desc);
+#define ext4_free_blocks_after_init(sb, group, desc) \
+ ext4_init_block_bitmap(sb, NULL, group, desc)
/* dir.c */
extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1032,12 +1316,18 @@ extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern struct inode *ext4_new_inode(handle_t *, struct inode *, int,
+ const struct qstr *qstr, __u32 goal);
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t group,
+ struct ext4_group_desc *desc);
+extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
/* mballoc.c */
extern long ext4_mb_stats;
@@ -1051,7 +1341,7 @@ extern void ext4_discard_preallocations(struct inode *);
extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
- unsigned long, unsigned long, int, unsigned long *);
+ ext4_fsblk_t, unsigned long, int, unsigned long *);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
@@ -1123,6 +1413,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
extern void ext4_warning(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
+extern void ext4_msg(struct super_block *, const char *, const char *, ...)
+ __attribute__ ((format (printf, 3, 4)));
extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
const char *, const char *, ...)
__attribute__ ((format (printf, 4, 5)));
@@ -1161,6 +1453,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count);
+extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
+ struct ext4_group_desc *gdp);
+extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+ struct ext4_group_desc *gdp);
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
@@ -1228,6 +1524,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
return grp_info[indexv][indexh];
}
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards. See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ *
+ * This helper performs both steps -- the read and the read barrier --
+ * and returns the group count that was read.
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+
+ /* the barrier must stay after the load of s_groups_count above */
+ smp_rmb();
+ return ngroups;
+}
static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
ext4_group_t block_group)
@@ -1283,33 +1591,25 @@ struct ext4_group_info {
};
#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
-#define EXT4_GROUP_INFO_LOCKED_BIT 1
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+ ext4_group_t group)
{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+ return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
}
-static inline void ext4_unlock_group(struct super_block *sb,
- ext4_group_t group)
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+ spin_lock(ext4_group_lock_ptr(sb, group));
}
-static inline int ext4_is_group_locked(struct super_block *sb,
+static inline void ext4_unlock_group(struct super_block *sb,
ext4_group_t group)
{
- struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
- return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
- &(grinfo->bb_state));
+ spin_unlock(ext4_group_lock_ptr(sb, group));
}
/*
@@ -1326,11 +1626,21 @@ extern const struct file_operations ext4_file_operations;
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init init_ext4_system_zone(void);
+extern void exit_ext4_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+ ext4_fsblk_t start_blk,
+ unsigned int count);
+
/* extents.c */
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
@@ -1338,19 +1648,22 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
int chunk);
extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, unsigned int max_blocks,
- struct buffer_head *bh_result,
- int create, int extend_disksize);
+ struct buffer_head *bh_result, int flags);
extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
-extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
- sector_t block, unsigned int max_blocks,
- struct buffer_head *bh, int create,
- int extend_disksize, int flag);
+extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
+ sector_t block, unsigned int max_blocks,
+ struct buffer_head *bh, int flags);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+/* move_extent.c */
+extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
+ __u64 start_orig, __u64 start_donor,
+ __u64 len, __u64 *moved_len);
+
/*
* Add new method to test wether block and inode bitmaps are properly
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index f0c3ec85bd48..20a84105a10b 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -221,12 +221,16 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
}
extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644
index 4ce2187123aa..000000000000
--- a/fs/ext4/ext4_i.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * ext4_i.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/include/linux/minix_fs_i.h
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-#ifndef _EXT4_I
-#define _EXT4_I
-
-#include <linux/rwsem.h>
-#include <linux/rbtree.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-
-/* data type for block offset of block group */
-typedef int ext4_grpblk_t;
-
-/* data type for filesystem-wide blocks number */
-typedef unsigned long long ext4_fsblk_t;
-
-/* data type for file logical block number */
-typedef __u32 ext4_lblk_t;
-
-/* data type for block group number */
-typedef unsigned int ext4_group_t;
-
-/*
- * storage for cached extent
- */
-struct ext4_ext_cache {
- ext4_fsblk_t ec_start;
- ext4_lblk_t ec_block;
- __u32 ec_len; /* must be 32bit to return holes */
- __u32 ec_type;
-};
-
-/*
- * fourth extended file system inode data in memory
- */
-struct ext4_inode_info {
- __le32 i_data[15]; /* unconverted */
- __u32 i_flags;
- ext4_fsblk_t i_file_acl;
- __u32 i_dtime;
-
- /*
- * i_block_group is the number of the block group which contains
- * this file's inode. Constant across the lifetime of the inode,
- * it is ued for making block allocation decisions - we try to
- * place a file's data blocks near its inode block, and new inodes
- * near to their parent directory's inode.
- */
- ext4_group_t i_block_group;
- __u32 i_state; /* Dynamic state flags for ext4 */
-
- ext4_lblk_t i_dir_start_lookup;
-#ifdef CONFIG_EXT4_FS_XATTR
- /*
- * Extended attributes can be read independently of the main file
- * data. Taking i_mutex even when reading would cause contention
- * between readers of EAs and writers of regular file data, so
- * instead we synchronize on xattr_sem when reading or changing
- * EAs.
- */
- struct rw_semaphore xattr_sem;
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *i_acl;
- struct posix_acl *i_default_acl;
-#endif
-
- struct list_head i_orphan; /* unlinked but open inodes */
-
- /*
- * i_disksize keeps track of what the inode size is ON DISK, not
- * in memory. During truncate, i_size is set to the new size by
- * the VFS prior to calling ext4_truncate(), but the filesystem won't
- * set i_disksize to 0 until the truncate is actually under way.
- *
- * The intent is that i_disksize always represents the blocks which
- * are used by this file. This allows recovery to restart truncate
- * on orphans if we crash during truncate. We actually write i_disksize
- * into the on-disk inode when writing inodes out, instead of i_size.
- *
- * The only time when i_disksize and i_size may be different is when
- * a truncate is in progress. The only things which change i_disksize
- * are ext4_get_block (growth) and ext4_truncate (shrinkth).
- */
- loff_t i_disksize;
-
- /*
- * i_data_sem is for serialising ext4_truncate() against
- * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
- * data tree are chopped off during truncate. We can't do that in
- * ext4 because whenever we perform intermediate commits during
- * truncate, the inode and all the metadata blocks *must* be in a
- * consistent state which allows truncation of the orphans to restart
- * during recovery. Hence we must fix the get_block-vs-truncate race
- * by other means, so we have i_data_sem.
- */
- struct rw_semaphore i_data_sem;
- struct inode vfs_inode;
- struct jbd2_inode jinode;
-
- struct ext4_ext_cache i_cached_extent;
- /*
- * File creation time. Its function is same as that of
- * struct timespec i_{a,c,m}time in the generic inode.
- */
- struct timespec i_crtime;
-
- /* mballoc */
- struct list_head i_prealloc_list;
- spinlock_t i_prealloc_lock;
-
- /* ialloc */
- ext4_group_t i_last_alloc_group;
-
- /* allocation reservation info for delalloc */
- unsigned int i_reserved_data_blocks;
- unsigned int i_reserved_meta_blocks;
- unsigned int i_allocated_meta_blocks;
- unsigned short i_delalloc_reserved_flag;
-
- /* on-disk additional length */
- __u16 i_extra_isize;
-
- spinlock_t i_block_reservation_lock;
-};
-
-#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644
index 57b71fefbccf..000000000000
--- a/fs/ext4/ext4_sb.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * ext4_sb.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/include/linux/minix_fs_sb.h
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-#ifndef _EXT4_SB
-#define _EXT4_SB
-
-#ifdef __KERNEL__
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
-#endif
-#include <linux/rbtree.h>
-
-/*
- * fourth extended-fs super-block data in memory
- */
-struct ext4_sb_info {
- unsigned long s_desc_size; /* Size of a group descriptor in bytes */
- unsigned long s_inodes_per_block;/* Number of inodes per block */
- unsigned long s_blocks_per_group;/* Number of blocks in a group */
- unsigned long s_inodes_per_group;/* Number of inodes in a group */
- unsigned long s_itb_per_group; /* Number of inode table blocks per group */
- unsigned long s_gdb_count; /* Number of group descriptor blocks */
- unsigned long s_desc_per_block; /* Number of group descriptors per block */
- ext4_group_t s_groups_count; /* Number of groups in the fs */
- unsigned long s_overhead_last; /* Last calculated overhead */
- unsigned long s_blocks_last; /* Last seen block count */
- loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
- struct buffer_head * s_sbh; /* Buffer containing the super block */
- struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
- struct buffer_head **s_group_desc;
- unsigned long s_mount_opt;
- ext4_fsblk_t s_sb_block;
- uid_t s_resuid;
- gid_t s_resgid;
- unsigned short s_mount_state;
- unsigned short s_pad;
- int s_addr_per_block_bits;
- int s_desc_per_block_bits;
- int s_inode_size;
- int s_first_ino;
- unsigned int s_inode_readahead_blks;
- spinlock_t s_next_gen_lock;
- u32 s_next_generation;
- u32 s_hash_seed[4];
- int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
- struct percpu_counter s_freeblocks_counter;
- struct percpu_counter s_freeinodes_counter;
- struct percpu_counter s_dirs_counter;
- struct percpu_counter s_dirtyblocks_counter;
- struct blockgroup_lock *s_blockgroup_lock;
- struct proc_dir_entry *s_proc;
- struct kobject s_kobj;
- struct completion s_kobj_unregister;
-
- /* Journaling */
- struct inode *s_journal_inode;
- struct journal_s *s_journal;
- struct list_head s_orphan;
- unsigned long s_commit_interval;
- u32 s_max_batch_time;
- u32 s_min_batch_time;
- struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
- struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
- wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
-#endif
-#ifdef CONFIG_QUOTA
- char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
- int s_jquota_fmt; /* Format of quota to use */
-#endif
- unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
-
-#ifdef EXTENTS_STATS
- /* ext4 extents stats */
- unsigned long s_ext_min;
- unsigned long s_ext_max;
- unsigned long s_depth_max;
- spinlock_t s_ext_stats_lock;
- unsigned long s_ext_blocks;
- unsigned long s_ext_extents;
-#endif
-
- /* for buddy allocator */
- struct ext4_group_info ***s_group_info;
- struct inode *s_buddy_cache;
- long s_blocks_reserved;
- spinlock_t s_reserve_lock;
- spinlock_t s_md_lock;
- tid_t s_last_transaction;
- unsigned short *s_mb_offsets;
- unsigned int *s_mb_maxs;
-
- /* tunables */
- unsigned long s_stripe;
- unsigned int s_mb_stream_request;
- unsigned int s_mb_max_to_scan;
- unsigned int s_mb_min_to_scan;
- unsigned int s_mb_stats;
- unsigned int s_mb_order2_reqs;
- unsigned int s_mb_group_prealloc;
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
-
- /* history to debug policy */
- struct ext4_mb_history *s_mb_history;
- int s_mb_history_cur;
- int s_mb_history_max;
- int s_mb_history_num;
- spinlock_t s_mb_history_lock;
- int s_mb_history_filter;
-
- /* stats for buddy allocator */
- spinlock_t s_mb_pa_lock;
- atomic_t s_bal_reqs; /* number of reqs with len > 1 */
- atomic_t s_bal_success; /* we found long enough chunks */
- atomic_t s_bal_allocated; /* in blocks */
- atomic_t s_bal_ex_scanned; /* total extents scanned */
- atomic_t s_bal_goals; /* goal hits */
- atomic_t s_bal_breaks; /* too long searches */
- atomic_t s_bal_2orders; /* 2^order hits */
- spinlock_t s_bal_lock;
- unsigned long s_mb_buddies_generated;
- unsigned long long s_mb_generation_time;
- atomic_t s_mb_lost_chunks;
- atomic_t s_mb_preallocated;
- atomic_t s_mb_discarded;
-
- /* locality groups */
- struct ext4_locality_group *s_locality_groups;
-
- /* for write statistics */
- unsigned long s_sectors_written_start;
- u64 s_kbytes_written;
-
- unsigned int s_log_groups_per_flex;
- struct flex_groups *s_flex_groups;
-};
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
-{
- return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
-
-#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e3a55eb8b26a..50322a09bd01 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -49,7 +49,7 @@
* ext_pblock:
* combine low and high parts of physical block number into ext4_fsblk_t
*/
-static ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
{
ext4_fsblk_t block;
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
- ext4_fsblk_t block = ext_pblock(ext), valid_block;
+ ext4_fsblk_t block = ext_pblock(ext);
int len = ext4_ext_get_actual_len(ext);
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
- valid_block = le32_to_cpu(es->s_first_data_block) +
- EXT4_SB(inode->i_sb)->s_gdb_count;
- if (unlikely(block <= valid_block ||
- ((block + len) > ext4_blocks_count(es))))
- return 0;
- else
- return 1;
+ return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
static int ext4_valid_extent_idx(struct inode *inode,
struct ext4_extent_idx *ext_idx)
{
- ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ ext4_fsblk_t block = idx_pblock(ext_idx);
- valid_block = le32_to_cpu(es->s_first_data_block) +
- EXT4_SB(inode->i_sb)->s_gdb_count;
- if (unlikely(block <= valid_block ||
- (block >= ext4_blocks_count(es))))
- return 0;
- else
- return 1;
+ return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
}
static int ext4_valid_extent_entries(struct inode *inode,
@@ -1431,7 +1417,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
return err;
}
-static int
+int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
@@ -2097,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex = EXT_LAST_EXTENT(eh);
ex_ee_block = le32_to_cpu(ex->ee_block);
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
ex_ee_len = ext4_ext_get_actual_len(ex);
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
+
+ if (ext4_ext_is_uninitialized(ex))
+ uninitialized = 1;
+ else
+ uninitialized = 0;
+
ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
path[depth].p_ext = ex;
@@ -2784,7 +2774,7 @@ fix_extent_len:
int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned int max_blocks, struct buffer_head *bh_result,
- int create, int extend_disksize)
+ int flags)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent_header *eh;
@@ -2793,7 +2783,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
- loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2803,7 +2792,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
cache_type = ext4_ext_in_cache(inode, iblock, &newex);
if (cache_type) {
if (cache_type == EXT4_EXT_CACHE_GAP) {
- if (!create) {
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/*
* block isn't allocated yet and
* user doesn't want to allocate it
@@ -2869,9 +2858,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_EXT_CACHE_EXTENT);
goto out;
}
- if (create == EXT4_CREATE_UNINITIALIZED_EXT)
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
goto out;
- if (!create) {
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ if (allocated > max_blocks)
+ allocated = max_blocks;
/*
* We have blocks reserved already. We
* return allocated blocks so that delalloc
@@ -2879,8 +2870,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* the buffer head will be unmapped so that
* a read from the block returns 0s.
*/
- if (allocated > max_blocks)
- allocated = max_blocks;
set_buffer_unwritten(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
@@ -2903,7 +2892,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* requested block isn't allocated yet;
* we couldn't try to create block if create flag is zero
*/
- if (!create) {
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/*
* put just found gap into cache to speed up
* subsequent requests
@@ -2932,10 +2921,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* EXT_UNINIT_MAX_LEN.
*/
if (max_blocks > EXT_INIT_MAX_LEN &&
- create != EXT4_CREATE_UNINITIALIZED_EXT)
+ !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
max_blocks = EXT_INIT_MAX_LEN;
else if (max_blocks > EXT_UNINIT_MAX_LEN &&
- create == EXT4_CREATE_UNINITIALIZED_EXT)
+ (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
max_blocks = EXT_UNINIT_MAX_LEN;
/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2955,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
- if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
ext4_ext_mark_uninitialized(&newex);
err = ext4_ext_insert_extent(handle, inode, path, &newex);
if (err) {
@@ -2983,18 +2972,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- if (extend_disksize) {
- disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
- if (disksize > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = disksize;
- }
-
set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
- if (create != EXT4_CREATE_UNINITIALIZED_EXT)
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
EXT4_EXT_CACHE_EXTENT);
out:
@@ -3150,9 +3131,10 @@ retry:
ret = PTR_ERR(handle);
break;
}
- ret = ext4_get_blocks_wrap(handle, inode, block,
- max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
+ map_bh.b_state = 0;
+ ret = ext4_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
@@ -3195,7 +3177,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
void *data)
{
struct fiemap_extent_info *fieinfo = data;
- unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
__u64 logical;
__u64 physical;
__u64 length;
@@ -3242,9 +3224,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
*
* XXX this might miss a single-block extent at EXT_MAX_BLOCK
*/
- if (logical + length - 1 == EXT_MAX_BLOCK ||
- ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+ if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
+ newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
+ loff_t size = i_size_read(inode);
+ loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
+
flags |= FIEMAP_EXTENT_LAST;
+ if ((flags & FIEMAP_EXTENT_DELALLOC) &&
+ logical+length > size)
+ length = (size - logical + bs - 1) & ~(bs-1);
+ }
error = fiemap_fill_next_extent(fieinfo, logical, physical,
length, flags);
@@ -3318,10 +3307,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Walk the extent tree gathering extent information.
* ext4_ext_fiemap_cb will push extents back to user.
*/
- down_write(&EXT4_I(inode)->i_data_sem);
+ down_read(&EXT4_I(inode)->i_data_sem);
error = ext4_ext_walk_space(inode, start_blk, len_blks,
ext4_ext_fiemap_cb, fieinfo);
- up_write(&EXT4_I(inode)->i_data_sem);
+ up_read(&EXT4_I(inode)->i_data_sem);
}
return error;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 588af8c77246..3f1873fef1c6 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -21,6 +21,8 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
+#include <linux/mount.h>
+#include <linux/path.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -145,6 +147,38 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+/*
+ * On the first open of a file on a read-write mount, record where the fs is
+ * mounted in s_es->s_last_mounted (a sysadmin aid), then open as usual.
+ */
+static int ext4_file_open(struct inode * inode, struct file * filp)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct vfsmount *mnt = filp->f_path.mnt;
+	struct path path;
+	char buf[64], *cp;
+
+	if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
+	    !(sb->s_flags & MS_RDONLY))) {
+		sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+		memset(buf, 0, sizeof(buf));
+		path.mnt = mnt->mnt_parent;
+		path.dentry = mnt->mnt_mountpoint;
+		path_get(&path);
+		cp = d_path(&path, buf, sizeof(buf));
+		path_put(&path);
+		/* d_path() returns a pointer inside buf, so copy only the
+		 * NUL-terminated string; a fixed 64 bytes would overrun buf. */
+		if (!IS_ERR(cp)) {
+			strlcpy(sbi->s_es->s_last_mounted, cp,
+				sizeof(sbi->s_es->s_last_mounted));
+			sb->s_dirt = 1;
+		}
+	}
+	return generic_file_open(inode, filp);
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -156,7 +190,7 @@ const struct file_operations ext4_file_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
- .open = generic_file_open,
+ .open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5afe4370840b..83cf6415f599 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -28,10 +28,12 @@
#include <linux/writeback.h>
#include <linux/jbd2.h>
#include <linux/blkdev.h>
-#include <linux/marker.h>
+
#include "ext4.h"
#include "ext4_jbd2.h"
+#include <trace/events/ext4.h>
+
/*
* akpm: A new design for ext4_sync_file().
*
@@ -52,9 +54,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
J_ASSERT(ext4_journal_current_handle() == NULL);
- trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
- inode->i_sb->s_id, datasync, inode->i_ino,
- dentry->d_parent->d_inode->i_ino);
+ trace_ext4_sync_file(file, dentry, datasync);
/*
* data=writeback:
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index c2c0a8d06d0e..000000000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * linux/fs/ext4/group.h
- *
- * Copyright (C) 2007 Cluster File Systems, Inc
- *
- * Author: Andreas Dilger <adilger@clusterfs.com>
- */
-
-#ifndef _LINUX_EXT4_GROUP_H
-#define _LINUX_EXT4_GROUP_H
-
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
- struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
- struct ext4_group_desc *gdp);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
- ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc) \
- ext4_init_block_bitmap(sb, NULL, group, desc)
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a08a6b5..2f645732e3b7 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -23,11 +23,13 @@
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <asm/byteorder.h>
+
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "group.h"
+
+#include <trace/events/ext4.h>
/*
* ialloc.c contains the inodes allocation and deallocation routines
@@ -123,16 +125,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
unlock_buffer(bh);
return bh;
}
- spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
- spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
return bh;
}
- spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ ext4_unlock_group(sb, block_group);
if (buffer_uptodate(bh)) {
/*
* if not uninit if bh is uptodate,
@@ -209,11 +211,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
ino = inode->i_ino;
ext4_debug("freeing inode %lu\n", ino);
- trace_mark(ext4_free_inode,
- "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
- sb->s_id, inode->i_ino, inode->i_mode,
- (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
- (unsigned long long) inode->i_blocks);
+ trace_ext4_free_inode(inode);
/*
* Note: we must free any quota before locking the superblock,
@@ -247,9 +245,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
goto error_return;
/* Ok, now we can actually update the inode bitmaps.. */
- spin_lock(sb_bgl_lock(sbi, block_group));
- cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
- spin_unlock(sb_bgl_lock(sbi, block_group));
+ cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
+ bit, bitmap_bh->b_data);
if (!cleared)
ext4_error(sb, "ext4_free_inode",
"bit already cleared for inode %lu", ino);
@@ -261,7 +258,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (fatal) goto error_return;
if (gdp) {
- spin_lock(sb_bgl_lock(sbi, block_group));
+ ext4_lock_group(sb, block_group);
count = ext4_free_inodes_count(sb, gdp) + 1;
ext4_free_inodes_set(sb, gdp, count);
if (is_directory) {
@@ -277,7 +274,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
}
gdp->bg_checksum = ext4_group_desc_csum(sbi,
block_group, gdp);
- spin_unlock(sb_bgl_lock(sbi, block_group));
+ ext4_unlock_group(sb, block_group);
percpu_counter_inc(&sbi->s_freeinodes_counter);
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
@@ -316,7 +313,7 @@ error_return:
static int find_group_dir(struct super_block *sb, struct inode *parent,
ext4_group_t *best_group)
{
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
unsigned int freei, avefreei;
struct ext4_group_desc *desc, *best_desc = NULL;
ext4_group_t group;
@@ -349,11 +346,10 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *desc;
- struct buffer_head *bh;
struct flex_groups *flex_group = sbi->s_flex_groups;
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
- ext4_group_t ngroups = sbi->s_groups_count;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
int flex_size = ext4_flex_bg_size(sbi);
ext4_group_t best_flex = parent_fbg_group;
int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +358,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
ext4_group_t n_fbg_groups;
ext4_group_t i;
- n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ n_fbg_groups = (ngroups + flex_size - 1) >>
sbi->s_log_groups_per_flex;
find_close_to_parent:
@@ -404,7 +400,7 @@ find_close_to_parent:
found_flexbg:
for (i = best_flex * flex_size; i < ngroups &&
i < (best_flex + 1) * flex_size; i++) {
- desc = ext4_get_group_desc(sb, i, &bh);
+ desc = ext4_get_group_desc(sb, i, NULL);
if (ext4_free_inodes_count(sb, desc)) {
*best_group = i;
goto out;
@@ -474,24 +470,27 @@ void get_orlov_stats(struct super_block *sb, ext4_group_t g,
*/
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group, int mode)
+ ext4_group_t *group, int mode,
+ const struct qstr *qstr)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_group_t ngroups = sbi->s_groups_count;
+ ext4_group_t real_ngroups = ext4_get_groups_count(sb);
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
unsigned int ndirs;
int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
- ext4_group_t i, grp, g;
+ ext4_group_t i, grp, g, ngroups;
struct ext4_group_desc *desc;
struct orlov_stats stats;
int flex_size = ext4_flex_bg_size(sbi);
+ struct dx_hash_info hinfo;
+ ngroups = real_ngroups;
if (flex_size > 1) {
- ngroups = (ngroups + flex_size - 1) >>
+ ngroups = (real_ngroups + flex_size - 1) >>
sbi->s_log_groups_per_flex;
parent_group >>= sbi->s_log_groups_per_flex;
}
@@ -509,7 +508,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
int best_ndir = inodes_per_group;
int ret = -1;
- get_random_bytes(&grp, sizeof(grp));
+ if (qstr) {
+ hinfo.hash_version = DX_HASH_HALF_MD4;
+ hinfo.seed = sbi->s_hash_seed;
+ ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
+ grp = hinfo.hash;
+ } else
+ get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
@@ -543,7 +548,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
*/
grp *= flex_size;
for (i = 0; i < flex_size; i++) {
- if (grp+i >= sbi->s_groups_count)
+ if (grp+i >= real_ngroups)
break;
desc = ext4_get_group_desc(sb, grp+i, NULL);
if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +588,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
}
fallback:
- ngroups = sbi->s_groups_count;
+ ngroups = real_ngroups;
avefreei = freei / ngroups;
fallback_retry:
parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +618,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
- ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+ ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
- ext4_group_t i, last;
int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
/*
@@ -653,7 +657,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group = parent_group + flex_size;
if (*group > ngroups)
*group = 0;
- return find_group_orlov(sb, parent, group, mode);
+ return find_group_orlov(sb, parent, group, mode, 0);
}
/*
@@ -708,10 +712,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
/*
* claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
+ * is uninit we need to take the group's ext4_group_lock
* and clear the uninit flag. The inode bitmap update
* and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * after holding ext4_group_lock so that ext4_read_inode_bitmap
* doesn't race with the ext4_claim_inode
*/
static int ext4_claim_inode(struct super_block *sb,
@@ -722,7 +726,7 @@ static int ext4_claim_inode(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
- spin_lock(sb_bgl_lock(sbi, group));
+ ext4_lock_group(sb, group);
if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
/* not a free inode */
retval = 1;
@@ -731,7 +735,7 @@ static int ext4_claim_inode(struct super_block *sb,
ino++;
if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
ino > EXT4_INODES_PER_GROUP(sb)) {
- spin_unlock(sb_bgl_lock(sbi, group));
+ ext4_unlock_group(sb, group);
ext4_error(sb, __func__,
"reserved inode or inode > inodes count - "
"block_group = %u, inode=%lu", group,
@@ -780,7 +784,7 @@ static int ext4_claim_inode(struct super_block *sb,
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
err_ret:
- spin_unlock(sb_bgl_lock(sbi, group));
+ ext4_unlock_group(sb, group);
return retval;
}
@@ -794,16 +798,16 @@ err_ret:
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode,
+ const struct qstr *qstr, __u32 goal)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
struct buffer_head *group_desc_bh;
- ext4_group_t group = 0;
+ ext4_group_t ngroups, group = 0;
unsigned long ino = 0;
struct inode *inode;
struct ext4_group_desc *gdp = NULL;
- struct ext4_super_block *es;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
int ret2, err = 0;
@@ -818,15 +822,23 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
return ERR_PTR(-EPERM);
sb = dir->i_sb;
- trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
- dir->i_ino, mode);
+ ngroups = ext4_get_groups_count(sb);
+ trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
ei = EXT4_I(inode);
-
sbi = EXT4_SB(sb);
- es = sbi->s_es;
+
+ if (!goal)
+ goal = sbi->s_inode_goal;
+
+ if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) {
+ group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
+ ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+ ret2 = 0;
+ goto got_group;
+ }
if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
@@ -846,7 +858,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
- ret2 = find_group_orlov(sb, dir, &group, mode);
+ ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
} else
ret2 = find_group_other(sb, dir, &group, mode);
@@ -856,7 +868,7 @@ got_group:
if (ret2 == -1)
goto out;
- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++, ino = 0) {
err = -EIO;
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -868,8 +880,6 @@ got_group:
if (!inode_bitmap_bh)
goto fail;
- ino = 0;
-
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
inode_bitmap_bh->b_data,
@@ -917,7 +927,7 @@ repeat_in_this_group:
* group descriptor metadata has not yet been updated.
* So we just go onto the next blockgroup.
*/
- if (++group == sbi->s_groups_count)
+ if (++group == ngroups)
group = 0;
}
err = -ENOSPC;
@@ -938,7 +948,7 @@ got:
}
free = 0;
- spin_lock(sb_bgl_lock(sbi, group));
+ ext4_lock_group(sb, group);
/* recheck and clear flag under lock if we still need to */
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -947,7 +957,7 @@ got:
gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
gdp);
}
- spin_unlock(sb_bgl_lock(sbi, group));
+ ext4_unlock_group(sb, group);
/* Don't need to dirty bitmap block if we didn't change it */
if (free) {
@@ -1052,8 +1062,7 @@ got:
}
ext4_debug("allocating inode %lu\n", inode->i_ino);
- trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
- sb->s_id, inode->i_ino, dir->i_ino, mode);
+ trace_ext4_allocate_inode(inode, dir, mode);
goto really_out;
fail:
ext4_std_error(sb, err);
@@ -1158,7 +1167,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
{
unsigned long desc_count;
struct ext4_group_desc *gdp;
- ext4_group_t i;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
unsigned long bitmap_count, x;
@@ -1168,7 +1177,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
desc_count = 0;
bitmap_count = 0;
gdp = NULL;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
@@ -1190,7 +1199,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
return desc_count;
#else
desc_count = 0;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
@@ -1205,9 +1214,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
unsigned long ext4_count_dirs(struct super_block * sb)
{
unsigned long count = 0;
- ext4_group_t i;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd528dd1..7c17ae275af4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,11 +37,14 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "ext4_extents.h"
+#include <trace/events/ext4.h>
+
#define MPAGE_DA_EXTENT_TAIL 0x01
static inline int ext4_begin_ordered_truncate(struct inode *inode,
@@ -78,7 +81,7 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
* If the handle isn't valid we're not journaling so there's nothing to do.
*/
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr)
+ struct buffer_head *bh, ext4_fsblk_t blocknr)
{
int err;
@@ -90,7 +93,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
BUFFER_TRACE(bh, "enter");
jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %lx\n",
+ "data mode %x\n",
bh, is_metadata, inode->i_mode,
test_opt(inode->i_sb, DATA_FLAGS));
@@ -329,8 +332,8 @@ static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
*/
static int ext4_block_to_path(struct inode *inode,
- ext4_lblk_t i_block,
- ext4_lblk_t offsets[4], int *boundary)
+ ext4_lblk_t i_block,
+ ext4_lblk_t offsets[4], int *boundary)
{
int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
@@ -362,9 +365,9 @@ static int ext4_block_to_path(struct inode *inode,
final = ptrs;
} else {
ext4_warning(inode->i_sb, "ext4_block_to_path",
- "block %lu > max in inode %lu",
- i_block + direct_blocks +
- indirect_blocks + double_blocks, inode->i_ino);
+ "block %lu > max in inode %lu",
+ i_block + direct_blocks +
+ indirect_blocks + double_blocks, inode->i_ino);
}
if (boundary)
*boundary = final - 1 - (i_block & (ptrs - 1));
@@ -372,31 +375,32 @@ static int ext4_block_to_path(struct inode *inode,
}
static int __ext4_check_blockref(const char *function, struct inode *inode,
- __le32 *p, unsigned int max) {
-
- unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+ __le32 *p, unsigned int max)
+{
__le32 *bref = p;
+ unsigned int blk;
+
while (bref < p+max) {
- if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
+ blk = le32_to_cpu(*bref++);
+ if (blk &&
+ unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ blk, 1))) {
ext4_error(inode->i_sb, function,
- "block reference %u >= max (%u) "
- "in inode #%lu, offset=%d",
- le32_to_cpu(*bref), maxblocks,
- inode->i_ino, (int)(bref-p));
- return -EIO;
- }
- bref++;
- }
- return 0;
+ "invalid block reference %u "
+ "in inode #%lu", blk, inode->i_ino);
+ return -EIO;
+ }
+ }
+ return 0;
}
#define ext4_check_indirect_blockref(inode, bh) \
- __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
+ __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
EXT4_ADDR_PER_BLOCK((inode)->i_sb))
#define ext4_check_inode_blockref(inode) \
- __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
+ __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
EXT4_NDIR_BLOCKS)
/**
@@ -446,7 +450,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
bh = sb_getblk(sb, le32_to_cpu(p->key));
if (unlikely(!bh))
goto failure;
-
+
if (!bh_uptodate_or_lock(bh)) {
if (bh_submit_read(bh) < 0) {
put_bh(bh);
@@ -458,7 +462,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
goto failure;
}
}
-
+
add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
/* Reader: end */
if (!p->key)
@@ -551,7 +555,7 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
* returns it.
*/
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
- Indirect *partial)
+ Indirect *partial)
{
/*
* XXX need to get goal block from mballoc's data structures
@@ -573,7 +577,7 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
* direct and indirect blocks.
*/
static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
- int blocks_to_boundary)
+ int blocks_to_boundary)
{
unsigned int count = 0;
@@ -609,9 +613,9 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, ext4_fsblk_t goal,
- int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
struct ext4_allocation_request ar;
int target, i;
@@ -682,10 +686,10 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
}
if (!*err) {
if (target == blks) {
- /*
- * save the new block number
- * for the first direct block
- */
+ /*
+ * save the new block number
+ * for the first direct block
+ */
new_blocks[index] = current_block;
}
blk_allocated += ar.len;
@@ -727,9 +731,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, int indirect_blks,
- int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -776,7 +780,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
* the chain to point to the new allocated
* data blocks numbers
*/
- for (i=1; i < num; i++)
+ for (i = 1; i < num; i++)
*(branch[n].p + i) = cpu_to_le32(++current_block);
}
BUFFER_TRACE(bh, "marking uptodate");
@@ -819,7 +823,8 @@ failed:
* chain to new block and return 0.
*/
static int ext4_splice_branch(handle_t *handle, struct inode *inode,
- ext4_lblk_t block, Indirect *where, int num, int blks)
+ ext4_lblk_t block, Indirect *where, int num,
+ int blks)
{
int i;
int err = 0;
@@ -851,10 +856,6 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
}
/* We are done with atomic stuff, now do the rest of housekeeping */
-
- inode->i_ctime = ext4_current_time(inode);
- ext4_mark_inode_dirty(handle, inode);
-
/* had we spliced it onto indirect block? */
if (where->bh) {
/*
@@ -873,8 +874,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
} else {
/*
* OK, we spliced it into the inode itself on a direct block.
- * Inode was dirtied above.
*/
+ ext4_mark_inode_dirty(handle, inode);
jbd_debug(5, "splicing direct\n");
}
return err;
@@ -892,6 +893,10 @@ err_out:
}
/*
+ * The ext4_ind_get_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_get_blocks().
+ *
* Allocation strategy is simple: if we have to allocate something, we will
* have to go the whole way to leaf. So let's do it before attaching anything
* to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,15 +914,16 @@ err_out:
* return = 0, if plain lookup failed.
* return < 0, error case.
*
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * The ext4_ind_get_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
*/
-static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned int maxblocks,
- struct buffer_head *bh_result,
- int create, int extend_disksize)
+static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int maxblocks,
+ struct buffer_head *bh_result,
+ int flags)
{
int err = -EIO;
ext4_lblk_t offsets[4];
@@ -927,16 +933,13 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
- struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
- loff_t disksize;
-
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
- J_ASSERT(handle != NULL || create == 0);
+ J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
depth = ext4_block_to_path(inode, iblock, offsets,
- &blocks_to_boundary);
+ &blocks_to_boundary);
if (depth == 0)
goto out;
@@ -963,7 +966,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
}
/* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO)
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
goto cleanup;
/*
@@ -984,8 +987,8 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* Block out ext4_truncate while we alter the tree
*/
err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
- &count, goal,
- offsets + (partial - chain), partial);
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -996,20 +999,8 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
*/
if (!err)
err = ext4_splice_branch(handle, inode, iblock,
- partial, indirect_blks, count);
- /*
- * i_disksize growing is protected by i_data_sem. Don't forget to
- * protect it if you're about to implement concurrent
- * ext4_get_block() -bzzz
- */
- if (!err && extend_disksize) {
- disksize = ((loff_t) iblock + count) << inode->i_blkbits;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
- if (disksize > ei->i_disksize)
- ei->i_disksize = disksize;
- }
- if (err)
+ partial, indirect_blks, count);
+ else
goto cleanup;
set_buffer_new(bh_result);
@@ -1120,8 +1111,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
ext4_discard_preallocations(inode);
}
+static int check_block_validity(struct inode *inode, sector_t logical,
+ sector_t phys, int len)
+{
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+ ext4_error(inode->i_sb, "check_block_validity",
+ "inode #%lu logical block %llu mapped to %llu "
+ "(size %d)", inode->i_ino,
+ (unsigned long long) logical,
+ (unsigned long long) phys, len);
+ WARN_ON(1);
+ return -EIO;
+ }
+ return 0;
+}
+
/*
- * The ext4_get_blocks_wrap() function try to look up the requested blocks,
+ * The ext4_get_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
* Otherwise it takes the write lock of the i_data_sem and allocate blocks
@@ -1129,7 +1135,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
* mapped.
*
* If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
+ * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
* based files
*
* On success, it returns the number of blocks being mapped or allocate.
@@ -1142,9 +1148,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
*
* It returns the error in case of allocation failure.
*/
-int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
- unsigned int max_blocks, struct buffer_head *bh,
- int create, int extend_disksize, int flag)
+int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
+ unsigned int max_blocks, struct buffer_head *bh,
+ int flags)
{
int retval;
@@ -1152,21 +1158,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
clear_buffer_unwritten(bh);
/*
- * Try to see if we can get the block without requesting
- * for new file system block.
+ * Try to see if we can get the block without requesting a new
+ * file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, 0, 0);
+ bh, 0);
} else {
- retval = ext4_get_blocks_handle(handle,
- inode, block, max_blocks, bh, 0, 0);
+ retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
+ bh, 0);
}
up_read((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0 && buffer_mapped(bh)) {
+ int ret = check_block_validity(inode, block,
+ bh->b_blocknr, retval);
+ if (ret != 0)
+ return ret;
+ }
+
/* If it is only a block(s) look up */
- if (!create)
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
return retval;
/*
@@ -1205,7 +1218,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* let the underlying get_block() function know to
* avoid double accounting
*/
- if (flag)
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
@@ -1213,10 +1226,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
*/
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, create, extend_disksize);
+ bh, flags);
} else {
- retval = ext4_get_blocks_handle(handle, inode, block,
- max_blocks, bh, create, extend_disksize);
+ retval = ext4_ind_get_blocks(handle, inode, block,
+ max_blocks, bh, flags);
if (retval > 0 && buffer_new(bh)) {
/*
@@ -1229,18 +1242,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
}
}
- if (flag) {
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
EXT4_I(inode)->i_delalloc_reserved_flag = 0;
- /*
- * Update reserved blocks/metadata blocks
- * after successful block allocation
- * which were deferred till now
- */
- if ((retval > 0) && buffer_delay(bh))
- ext4_da_update_reserve_space(inode, retval);
- }
+
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now.
+ */
+ if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
+ ext4_da_update_reserve_space(inode, retval);
up_write((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0 && buffer_mapped(bh)) {
+ int ret = check_block_validity(inode, block,
+ bh->b_blocknr, retval);
+ if (ret != 0)
+ return ret;
+ }
return retval;
}
@@ -1268,8 +1286,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
started = 1;
}
- ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0, 0);
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1288,17 +1306,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
{
struct buffer_head dummy;
int fatal = 0, err;
+ int flags = 0;
J_ASSERT(handle != NULL || create == 0);
dummy.b_state = 0;
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
- err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1, 0);
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
/*
- * ext4_get_blocks_handle() returns number of blocks
- * mapped. 0 in case of a HOLE.
+ * ext4_get_blocks() returns number of blocks mapped. 0 in
+ * case of a HOLE.
*/
if (err > 0) {
if (err > 1)
@@ -1385,8 +1405,7 @@ static int walk_page_buffers(handle_t *handle,
for (bh = head, block_start = 0;
ret == 0 && (bh != head || !block_start);
- block_start = block_end, bh = next)
- {
+ block_start = block_end, bh = next) {
next = bh->b_this_page;
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
@@ -1427,7 +1446,7 @@ static int walk_page_buffers(handle_t *handle,
* write.
*/
static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+ struct buffer_head *bh)
{
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
@@ -1435,22 +1454,24 @@ static int do_journal_get_write_access(handle_t *handle,
}
static int ext4_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+ int ret, needed_blocks;
handle_t *handle;
int retries = 0;
struct page *page;
- pgoff_t index;
+ pgoff_t index;
unsigned from, to;
- trace_mark(ext4_write_begin,
- "dev %s ino %lu pos %llu len %u flags %u",
- inode->i_sb->s_id, inode->i_ino,
- (unsigned long long) pos, len, flags);
- index = pos >> PAGE_CACHE_SHIFT;
+ trace_ext4_write_begin(inode, pos, len, flags);
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason
+ */
+ needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1483,15 +1504,30 @@ retry:
if (ret) {
unlock_page(page);
- ext4_journal_stop(handle);
page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_mutex.
+ *
+ * Add inode to orphan list in case we crash before
+ * truncate finishes
*/
if (pos + len > inode->i_size)
+ ext4_orphan_add(handle, inode);
+
+ ext4_journal_stop(handle);
+ if (pos + len > inode->i_size) {
vmtruncate(inode, inode->i_size);
+ /*
+ * If vmtruncate failed early the inode might
+ * still be on the orphan list; we need to
+ * make sure the inode is removed from the
+ * orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1509,6 +1545,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
+static int ext4_generic_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int i_size_changed = 0;
+ struct inode *inode = mapping->host;
+ handle_t *handle = ext4_journal_current_handle();
+
+ copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos + copied > inode->i_size) {
+ i_size_write(inode, pos + copied);
+ i_size_changed = 1;
+ }
+
+ if (pos + copied > EXT4_I(inode)->i_disksize) {
+ /* We need to mark inode dirty even if
+ * pos + copied is less than inode->i_size
+ * but greater than i_disksize (hint: delalloc).
+ */
+ ext4_update_i_disksize(inode, (pos + copied));
+ i_size_changed = 1;
+ }
+ unlock_page(page);
+ page_cache_release(page);
+
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ ext4_mark_inode_dirty(handle, inode);
+
+ return copied;
+}
+
/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
@@ -1517,36 +1599,27 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
* buffers are managed internally.
*/
static int ext4_ordered_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
int ret = 0, ret2;
- trace_mark(ext4_ordered_write_end,
- "dev %s ino %lu pos %llu len %u copied %u",
- inode->i_sb->s_id, inode->i_ino,
- (unsigned long long) pos, len, copied);
+ trace_ext4_ordered_write_end(inode, pos, len, copied);
ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
- loff_t new_i_size;
-
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
-
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+ ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
+ if (pos + len > inode->i_size)
+ /* If we allocated more blocks than we copied
+ * data into, blocks now exist beyond
+ * inode->i_size; orphan the inode so they get truncated.
+ */
+ ext4_orphan_add(handle, inode);
if (ret2 < 0)
ret = ret2;
}
@@ -1554,36 +1627,41 @@ static int ext4_ordered_write_end(struct file *file,
if (!ret)
ret = ret2;
+ if (pos + len > inode->i_size) {
+ vmtruncate(inode, inode->i_size);
+ /*
+ * If vmtruncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+
return ret ? ret : copied;
}
static int ext4_writeback_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
int ret = 0, ret2;
- loff_t new_i_size;
-
- trace_mark(ext4_writeback_write_end,
- "dev %s ino %lu pos %llu len %u copied %u",
- inode->i_sb->s_id, inode->i_ino,
- (unsigned long long) pos, len, copied);
- new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, new_i_size);
- /* We need to mark inode dirty even if
- * new_i_size is less that inode->i_size
- * bu greater than i_disksize.(hint delalloc)
- */
- ext4_mark_inode_dirty(handle, inode);
- }
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+ trace_ext4_writeback_write_end(inode, pos, len, copied);
+ ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
+ if (pos + len > inode->i_size)
+ /* If we allocated more blocks than we copied
+ * data into, blocks now exist beyond
+ * inode->i_size; orphan the inode so they get truncated.
+ */
+ ext4_orphan_add(handle, inode);
+
if (ret2 < 0)
ret = ret2;
@@ -1591,13 +1669,24 @@ static int ext4_writeback_write_end(struct file *file,
if (!ret)
ret = ret2;
+ if (pos + len > inode->i_size) {
+ vmtruncate(inode, inode->i_size);
+ /*
+ * If vmtruncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
return ret ? ret : copied;
}
static int ext4_journalled_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
@@ -1606,10 +1695,7 @@ static int ext4_journalled_write_end(struct file *file,
unsigned from, to;
loff_t new_i_size;
- trace_mark(ext4_journalled_write_end,
- "dev %s ino %lu pos %llu len %u copied %u",
- inode->i_sb->s_id, inode->i_ino,
- (unsigned long long) pos, len, copied);
+ trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1635,10 +1721,27 @@ static int ext4_journalled_write_end(struct file *file,
}
unlock_page(page);
+ page_cache_release(page);
+ if (pos + len > inode->i_size)
+ /* If we allocated more blocks than we copied
+ * data into, blocks now exist beyond
+ * inode->i_size; orphan the inode so they get truncated.
+ */
+ ext4_orphan_add(handle, inode);
+
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- page_cache_release(page);
+ if (pos + len > inode->i_size) {
+ vmtruncate(inode, inode->i_size);
+ /*
+ * If vmtruncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+ * is removed from the orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
return ret ? ret : copied;
}
@@ -1738,7 +1841,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
}
static void ext4_da_page_release_reservation(struct page *page,
- unsigned long offset)
+ unsigned long offset)
{
int to_release = 0;
struct buffer_head *head, *bh;
@@ -1852,7 +1955,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
* @logical - first logical block to start assignment with
*
* the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay
+ * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
*/
static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
struct buffer_head *exbh)
@@ -1902,16 +2005,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
do {
if (cur_logical >= logical + blocks)
break;
- if (buffer_delay(bh)) {
- bh->b_blocknr = pblock;
- clear_buffer_delay(bh);
- bh->b_bdev = inode->i_sb->s_bdev;
- } else if (buffer_unwritten(bh)) {
- bh->b_blocknr = pblock;
- clear_buffer_unwritten(bh);
- set_buffer_mapped(bh);
- set_buffer_new(bh);
- bh->b_bdev = inode->i_sb->s_bdev;
+
+ if (buffer_delay(bh) ||
+ buffer_unwritten(bh)) {
+
+ BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
+
+ if (buffer_delay(bh)) {
+ clear_buffer_delay(bh);
+ bh->b_blocknr = pblock;
+ } else {
+ /*
+ * unwritten already should have
+ * blocknr assigned. Verify that
+ */
+ clear_buffer_unwritten(bh);
+ BUG_ON(bh->b_blocknr != pblock);
+ }
+
} else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
@@ -1990,51 +2101,6 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
-#define EXT4_DELALLOC_RSVED 1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- loff_t disksize = EXT4_I(inode)->i_disksize;
- handle_t *handle = NULL;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
- ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
- bh_result, create, 0, EXT4_DELALLOC_RSVED);
- if (ret <= 0)
- return ret;
-
- bh_result->b_size = (ret << inode->i_blkbits);
-
- if (ext4_should_order_data(inode)) {
- int retval;
- retval = ext4_jbd2_file_inode(handle, inode);
- if (retval)
- /*
- * Failed to add inode for ordered mode. Don't
- * update file size
- */
- return retval;
- }
-
- /*
- * Update on-disk size along with block allocation we don't
- * use 'extend_disksize' as size may change within already
- * allocated block -bzzz
- */
- disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
- if (disksize > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, disksize);
- ret = ext4_mark_inode_dirty(handle, inode);
- return ret;
- }
- return 0;
-}
-
/*
* mpage_da_map_blocks - go through given space
*
@@ -2045,29 +2111,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
*/
static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
- int err = 0;
+ int err, blks, get_blocks_flags;
struct buffer_head new;
- sector_t next;
+ sector_t next = mpd->b_blocknr;
+ unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
+ loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
+ handle_t *handle = NULL;
/*
* We consider only non-mapped and non-allocated blocks
*/
if ((mpd->b_state & (1 << BH_Mapped)) &&
- !(mpd->b_state & (1 << BH_Delay)))
+ !(mpd->b_state & (1 << BH_Delay)) &&
+ !(mpd->b_state & (1 << BH_Unwritten)))
return 0;
- new.b_state = mpd->b_state;
- new.b_blocknr = 0;
- new.b_size = mpd->b_size;
- next = mpd->b_blocknr;
+
/*
- * If we didn't accumulate anything
- * to write simply return
+ * If we didn't accumulate anything to write simply return
*/
- if (!new.b_size)
+ if (!mpd->b_size)
return 0;
- err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
- if (err) {
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+
+ /*
+ * Call ext4_get_blocks() to allocate any delayed allocation
+ * blocks, or to convert an uninitialized extent to be
+ * initialized (in the case where we have written into
+ * one or more preallocated blocks).
+ *
+ * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
+ * indicate that we are on the delayed allocation path. This
+ * affects functions in many different parts of the allocation
+ * call path. This flag exists primarily because we don't
+ * want to change *many* call functions, so ext4_get_blocks()
+ * will set the magic i_delalloc_reserved_flag once the
+ * inode's allocation semaphore is taken.
+ *
+ * If the blocks in question were delalloc blocks, set
+ * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
+ * variables are updated after the blocks have been allocated.
+ */
+ new.b_state = 0;
+ get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+ EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ if (mpd->b_state & (1 << BH_Delay))
+ get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+ blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+ &new, get_blocks_flags);
+ if (blks < 0) {
+ err = blks;
/*
* If get block returns with error we simply
* return. Later writepage will redirty the page and
@@ -2100,12 +2194,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
if (err == -ENOSPC) {
ext4_print_free_blocks(mpd->inode);
}
- /* invlaidate all the pages */
+ /* invalidate all the pages */
ext4_da_block_invalidatepages(mpd, next,
mpd->b_size >> mpd->inode->i_blkbits);
return err;
}
- BUG_ON(new.b_size == 0);
+ BUG_ON(blks == 0);
+
+ new.b_size = (blks << mpd->inode->i_blkbits);
if (buffer_new(&new))
__unmap_underlying_blocks(mpd->inode, &new);
@@ -2118,6 +2214,23 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
(mpd->b_state & (1 << BH_Unwritten)))
mpage_put_bnr_to_bhs(mpd, next, &new);
+ if (ext4_should_order_data(mpd->inode)) {
+ err = ext4_jbd2_file_inode(handle, mpd->inode);
+ if (err)
+ return err;
+ }
+
+ /*
+ * Update on-disk size along with block allocation.
+ */
+ disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
+ if (disksize > i_size_read(mpd->inode))
+ disksize = i_size_read(mpd->inode);
+ if (disksize > EXT4_I(mpd->inode)->i_disksize) {
+ ext4_update_i_disksize(mpd->inode, disksize);
+ return ext4_mark_inode_dirty(handle, mpd->inode);
+ }
+
return 0;
}
@@ -2192,6 +2305,17 @@ flush_it:
return;
}
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * Nonzero for a dirty buffer that is unmapped (possible for holes),
+ * delayed (possible with delayed allocation), or unwritten, which we
+ * also need to consider as unmapped. @handle is not used here.
+ */
+ return (!buffer_mapped(bh) || buffer_delay(bh) ||
+ buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
/*
* __mpage_da_writepage - finds extent of pages and blocks
*
@@ -2276,8 +2400,7 @@ static int __mpage_da_writepage(struct page *page,
* Otherwise we won't make progress
* with the page in ext4_da_writepage
*/
- if (buffer_dirty(bh) &&
- (!buffer_mapped(bh) || buffer_delay(bh))) {
+ if (ext4_bh_unmapped_or_delay(NULL, bh)) {
mpage_add_bh_to_extent(mpd, logical,
bh->b_size,
bh->b_state);
@@ -2303,8 +2426,16 @@ static int __mpage_da_writepage(struct page *page,
}
/*
- * this is a special callback for ->write_begin() only
- * it's intention is to return mapped block or reserve space
+ * This is a special get_blocks_t callback which is used by
+ * ext4_da_write_begin(). It will either return mapped block or
+ * reserve space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
*/
static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -2323,7 +2454,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+ ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0);
if ((ret == 0) && !buffer_delay(bh_result)) {
/* the block isn't (pre)allocated yet, let's reserve space */
/*
@@ -2340,40 +2471,53 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
set_buffer_delay(bh_result);
} else if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
- /*
- * With sub-block writes into unwritten extents
- * we also need to mark the buffer as new so that
- * the unwritten parts of the buffer gets correctly zeroed.
- */
- if (buffer_unwritten(bh_result))
+ if (buffer_unwritten(bh_result)) {
+ /* A delayed write to unwritten bh should
+ * be marked new and mapped. Mapped ensures
+ * that we don't do get_block multiple times
+ * when we write to the same offset and new
+ * ensures that we do proper zero out for
+ * partial write.
+ */
set_buffer_new(bh_result);
+ set_buffer_mapped(bh_result);
+ }
ret = 0;
}
return ret;
}
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
- /*
- * unmapped buffer is possible for holes.
- * delay buffer is possible with delayed allocation
- */
- return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
-}
-
-static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+/*
+ * This function is used as a standard get_block_t callback function
+ * when there is no desire to allocate any blocks. It is used as a
+ * callback function for block_prepare_write(), nobh_writepage(), and
+ * block_write_full_page(). These functions should only try to map a
+ * single block at a time.
+ *
+ * Since this function doesn't do block allocations even if the caller
+ * requests it by passing in create=1, it is critically important that
+ * any caller checks to make sure that any buffer heads returned
+ * by this function are either all already mapped or marked for
+ * delayed allocation before calling nobh_writepage() or
+ * block_write_full_page(). Otherwise, b_blocknr could be left
+ * uninitialized, and the page write functions will be taken by
+ * surprise.
+ */
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int ret = 0;
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
/*
* we don't want to do block allocation in writepage
* so call get_block_wrap with create = 0
*/
- ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
- bh_result, 0, 0, 0);
+ ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
+ BUG_ON(create && ret == 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -2382,10 +2526,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
}
/*
- * get called vi ext4_da_writepages after taking page lock (have journal handle)
- * get called via journal_submit_inode_data_buffers (no journal handle)
- * get called via shrink_page_list via pdflush (no journal handle)
- * or grab_page_cache when doing write_begin (have journal handle)
+ * This function can get called via...
+ * - ext4_da_writepages after taking page lock (have journal handle)
+ * - journal_submit_inode_data_buffers (no journal handle)
+ * - shrink_page_list via pdflush (no journal handle)
+ * - grab_page_cache when doing write_begin (have journal handle)
*/
static int ext4_da_writepage(struct page *page,
struct writeback_control *wbc)
@@ -2396,9 +2541,7 @@ static int ext4_da_writepage(struct page *page,
struct buffer_head *page_bufs;
struct inode *inode = page->mapping->host;
- trace_mark(ext4_da_writepage,
- "dev %s ino %lu page_index %lu",
- inode->i_sb->s_id, inode->i_ino, page->index);
+ trace_ext4_da_writepage(inode, page);
size = i_size_read(inode);
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
@@ -2436,7 +2579,7 @@ static int ext4_da_writepage(struct page *page,
* do block allocation here.
*/
ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_normal_get_block_write);
+ noalloc_get_block_write);
if (!ret) {
page_bufs = page_buffers(page);
/* check whether all are mapped and non delay */
@@ -2461,11 +2604,10 @@ static int ext4_da_writepage(struct page *page,
}
if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ ret = nobh_writepage(page, noalloc_get_block_write, wbc);
else
- ret = block_write_full_page(page,
- ext4_normal_get_block_write,
- wbc);
+ ret = block_write_full_page(page, noalloc_get_block_write,
+ wbc);
return ret;
}
@@ -2510,19 +2652,7 @@ static int ext4_da_writepages(struct address_space *mapping,
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
- trace_mark(ext4_da_writepages,
- "dev %s ino %lu nr_t_write %ld "
- "pages_skipped %ld range_start %llu "
- "range_end %llu nonblocking %d "
- "for_kupdate %d for_reclaim %d "
- "for_writepages %d range_cyclic %d",
- inode->i_sb->s_id, inode->i_ino,
- wbc->nr_to_write, wbc->pages_skipped,
- (unsigned long long) wbc->range_start,
- (unsigned long long) wbc->range_end,
- wbc->nonblocking, wbc->for_kupdate,
- wbc->for_reclaim, wbc->for_writepages,
- wbc->range_cyclic);
+ trace_ext4_da_writepages(inode, wbc);
/*
* No pages to write? This is mainly a kludge to avoid starting
@@ -2536,13 +2666,13 @@ static int ext4_da_writepages(struct address_space *mapping,
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
- * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+ * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
* the latter could be true if the filesystem is mounted
* read-only, and in that case, ext4_da_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
/*
@@ -2688,14 +2818,7 @@ out_writepages:
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
- trace_mark(ext4_da_writepage_result,
- "dev %s ino %lu ret %d pages_written %d "
- "pages_skipped %ld congestion %d "
- "more_io %d no_nrwrite_index_update %d",
- inode->i_sb->s_id, inode->i_ino, ret,
- pages_written, wbc->pages_skipped,
- wbc->encountered_congestion, wbc->more_io,
- wbc->no_nrwrite_index_update);
+ trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
return ret;
}
@@ -2727,8 +2850,8 @@ static int ext4_nonda_switch(struct super_block *sb)
}
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
int ret, retries = 0;
struct page *page;
@@ -2747,11 +2870,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
len, flags, pagep, fsdata);
}
*fsdata = (void *)0;
-
- trace_mark(ext4_da_write_begin,
- "dev %s ino %lu pos %llu len %u flags %u",
- inode->i_sb->s_id, inode->i_ino,
- (unsigned long long) pos, len, flags);
+ trace_ext4_da_write_begin(inode, pos, len, flags);
retry:
/*
* With delayed allocation, we don't log the i_disksize update
@@ -2777,7 +2896,7 @@ retry:
*pagep = page;
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext4_da_get_block_prep);
+ ext4_da_get_block_prep);
if (ret < 0) {
unlock_page(page);
ext4_journal_stop(handle);
@@ -2802,7 +2921,7 @@ out:
* when write to the end of file but not require block allocation
*/
static int ext4_da_should_update_i_disksize(struct page *page,
- unsigned long offset)
+ unsigned long offset)
{
struct buffer_head *bh;
struct inode *inode = page->mapping->host;
@@ -2815,15 +2934,15 @@ static int ext4_da_should_update_i_disksize(struct page *page,
for (i = 0; i < idx; i++)
bh = bh->b_this_page;
- if (!buffer_mapped(bh) || (buffer_delay(bh)))
+ if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
return 0;
return 1;
}
static int ext4_da_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
int ret = 0, ret2;
@@ -2844,10 +2963,7 @@ static int ext4_da_write_end(struct file *file,
}
}
- trace_mark(ext4_da_write_end,
- "dev %s ino %lu pos %llu len %u copied %u",
- inode->i_sb->s_id, inode->i_ino,
- (unsigned long long) pos, len, copied);
+ trace_ext4_da_write_end(inode, pos, len, copied);
start = pos & (PAGE_CACHE_SIZE - 1);
end = start + copied - 1;
@@ -2924,7 +3040,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* not strictly speaking necessary (and for users of
* laptop_mode, not even desirable). However, to do otherwise
* would require replicating code paths in:
- *
+ *
* ext4_da_writepages() ->
* write_cache_pages() ---> (via passed in callback function)
* __mpage_da_writepage() -->
@@ -2944,7 +3060,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
* write out the pages, but rather only collect contiguous
* logical block extents, call the multi-block allocator, and
* then update the buffer heads with the block allocations.
- *
+ *
* For now, though, we'll cheat by calling filemap_flush(),
* which will map the blocks, and start the I/O, but not
* actually wait for the I/O to complete.
@@ -3080,29 +3196,25 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
*
*/
static int __ext4_normal_writepage(struct page *page,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
if (test_opt(inode->i_sb, NOBH))
- return nobh_writepage(page,
- ext4_normal_get_block_write, wbc);
+ return nobh_writepage(page, noalloc_get_block_write, wbc);
else
- return block_write_full_page(page,
- ext4_normal_get_block_write,
- wbc);
+ return block_write_full_page(page, noalloc_get_block_write,
+ wbc);
}
static int ext4_normal_writepage(struct page *page,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
loff_t size = i_size_read(inode);
loff_t len;
- trace_mark(ext4_normal_writepage,
- "dev %s ino %lu page_index %lu",
- inode->i_sb->s_id, inode->i_ino, page->index);
+ trace_ext4_normal_writepage(inode, page);
J_ASSERT(PageLocked(page));
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
@@ -3132,7 +3244,7 @@ static int ext4_normal_writepage(struct page *page,
}
static int __ext4_journalled_writepage(struct page *page,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
@@ -3142,7 +3254,7 @@ static int __ext4_journalled_writepage(struct page *page,
int err;
ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_normal_get_block_write);
+ noalloc_get_block_write);
if (ret != 0)
goto out_unlock;
@@ -3182,15 +3294,13 @@ out:
}
static int ext4_journalled_writepage(struct page *page,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
loff_t size = i_size_read(inode);
loff_t len;
- trace_mark(ext4_journalled_writepage,
- "dev %s ino %lu page_index %lu",
- inode->i_sb->s_id, inode->i_ino, page->index);
+ trace_ext4_journalled_writepage(inode, page);
J_ASSERT(PageLocked(page));
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
@@ -3227,9 +3337,8 @@ static int ext4_journalled_writepage(struct page *page,
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- return block_write_full_page(page,
- ext4_normal_get_block_write,
- wbc);
+ return block_write_full_page(page, noalloc_get_block_write,
+ wbc);
}
no_write:
redirty_page_for_writepage(wbc, page);
@@ -3288,8 +3397,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -3609,7 +3718,8 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
* (no partially truncated stuff there). */
static Indirect *ext4_find_shared(struct inode *inode, int depth,
- ext4_lblk_t offsets[4], Indirect chain[4], __le32 *top)
+ ext4_lblk_t offsets[4], Indirect chain[4],
+ __le32 *top)
{
Indirect *partial, *p;
int k, err;
@@ -3665,8 +3775,10 @@ no_top:
* than `count' because there can be holes in there.
*/
static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block_to_free,
- unsigned long count, __le32 *first, __le32 *last)
+ struct buffer_head *bh,
+ ext4_fsblk_t block_to_free,
+ unsigned long count, __le32 *first,
+ __le32 *last)
{
__le32 *p;
if (try_to_extend_transaction(handle, inode)) {
@@ -3683,10 +3795,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
}
/*
- * Any buffers which are on the journal will be in memory. We find
- * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
- * on them. We've already detached each block from the file, so
- * bforget() in jbd2_journal_forget() should be safe.
+ * Any buffers which are on the journal will be in memory. We
+ * find them on the hash table so jbd2_journal_revoke() will
+ * run jbd2_journal_forget() on them. We've already detached
+ * each block from the file, so bforget() in
+ * jbd2_journal_forget() should be safe.
*
* AKPM: turn on bforget in jbd2_journal_forget()!!!
*/
@@ -3973,7 +4086,8 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ if (ei->i_disksize && inode->i_size == 0 &&
+ !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4057,7 +4171,7 @@ void ext4_truncate(struct inode *inode)
(__le32*)partial->bh->b_data+addr_per_block,
(chain+n-1) - partial);
BUFFER_TRACE(partial->bh, "call brelse");
- brelse (partial->bh);
+ brelse(partial->bh);
partial--;
}
do_indirects:
@@ -4298,8 +4412,9 @@ void ext4_get_inode_flags(struct ext4_inode_info *ei)
if (flags & S_DIRSYNC)
ei->i_flags |= EXT4_DIRSYNC_FL;
}
+
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
- struct ext4_inode_info *ei)
+ struct ext4_inode_info *ei)
{
blkcnt_t i_blocks ;
struct inode *inode = &(ei->vfs_inode);
@@ -4414,7 +4529,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ei->i_state |= EXT4_STATE_XATTR;
+ ei->i_state |= EXT4_STATE_XATTR;
}
} else
ei->i_extra_isize = 0;
@@ -4433,7 +4548,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
- ((ei->i_file_acl <
+ ((ei->i_file_acl <
(le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
EXT4_SB(sb)->s_gdb_count)) ||
(ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
@@ -4448,15 +4563,15 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
!ext4_inode_is_fast_symlink(inode)))
/* Validate extent which is part of inode */
ret = ext4_ext_check_inode(inode);
- } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
(S_ISLNK(inode->i_mode) &&
!ext4_inode_is_fast_symlink(inode))) {
- /* Validate block references which are part of inode */
+ /* Validate block references which are part of inode */
ret = ext4_check_inode_blockref(inode);
}
if (ret) {
- brelse(bh);
- goto bad_inode;
+ brelse(bh);
+ goto bad_inode;
}
if (S_ISREG(inode->i_mode)) {
@@ -4487,7 +4602,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
} else {
brelse(bh);
ret = -EIO;
- ext4_error(inode->i_sb, __func__,
+ ext4_error(inode->i_sb, __func__,
"bogus i_mode (%o) for inode=%lu",
inode->i_mode, inode->i_ino);
goto bad_inode;
@@ -4640,8 +4755,9 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else for (block = 0; block < EXT4_N_BLOCKS; block++)
- raw_inode->i_block[block] = ei->i_data[block];
+ } else
+ for (block = 0; block < EXT4_N_BLOCKS; block++)
+ raw_inode->i_block[block] = ei->i_data[block];
raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
if (ei->i_extra_isize) {
@@ -4715,25 +4831,6 @@ int ext4_write_inode(struct inode *inode, int wait)
return ext4_force_commit(inode->i_sb);
}
-int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
-{
- int err = 0;
-
- mark_buffer_dirty(bh);
- if (inode && inode_needs_sync(inode)) {
- sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh)) {
- ext4_error(inode->i_sb, __func__,
- "IO error syncing inode, "
- "inode=%lu, block=%llu",
- inode->i_ino,
- (unsigned long long)bh->b_blocknr);
- err = -EIO;
- }
- }
- return err;
-}
-
/*
* ext4_setattr()
*
@@ -4930,7 +5027,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
*/
int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
- int groups, gdpblocks;
+ ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+ int gdpblocks;
int idxblocks;
int ret = 0;
@@ -4957,8 +5055,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
groups += nrblocks;
gdpblocks = groups;
- if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
- groups = EXT4_SB(inode->i_sb)->s_groups_count;
+ if (groups > ngroups)
+ groups = ngroups;
if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
@@ -4998,7 +5096,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
* Calculate the journal credits for a chunk of data modification.
*
* This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
*
* journal buffers for data blocks are not included here, as DIO
* and fallocate do no need to journal data buffers.
@@ -5013,7 +5111,7 @@ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
* Give this, we know that the caller already has write access to iloc->bh.
*/
int ext4_mark_iloc_dirty(handle_t *handle,
- struct inode *inode, struct ext4_iloc *iloc)
+ struct inode *inode, struct ext4_iloc *iloc)
{
int err = 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 91e75f7a9e73..bb415408fdb6 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/compat.h>
#include <linux/smp_lock.h>
#include <linux/mount.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
@@ -213,6 +214,41 @@ setversion_out:
return err;
}
+
+ case EXT4_IOC_MOVE_EXT: {
+ struct move_extent me;
+ struct file *donor_filp;
+ int err;
+ /* Copy the request in from the userspace pointer passed in arg. */
+ if (copy_from_user(&me,
+ (struct move_extent __user *)arg, sizeof(me)))
+ return -EFAULT;
+ /* Take a reference on the donor file; every exit below fput()s it. */
+ donor_filp = fget(me.donor_fd);
+ if (!donor_filp)
+ return -EBADF;
+ /* No CAP_DAC_OVERRIDE: caller must own inode, both owner-readable. */
+ if (!capable(CAP_DAC_OVERRIDE)) {
+ if ((current->real_cred->fsuid != inode->i_uid) ||
+ !(inode->i_mode & S_IRUSR) ||
+ !(donor_filp->f_dentry->d_inode->i_mode &
+ S_IRUSR)) {
+ fput(donor_filp);
+ return -EACCES;
+ }
+ }
+
+ err = ext4_move_extents(filp, donor_filp, me.orig_start,
+ me.donor_start, me.len, &me.moved_len);
+ fput(donor_filp);
+ /* On success, copy the request (including moved_len) back out. */
+ if (!err)
+ if (copy_to_user((struct move_extent __user *)arg,
+ &me, sizeof(me)))
+ return -EFAULT;
+ return err;
+ }
+
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
struct super_block *sb = inode->i_sb;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677a7984..519a0a686d94 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,8 @@
*/
#include "mballoc.h"
+#include <trace/events/ext4.h>
+
/*
* MUSTDO:
* - test ext4_ext_search_left() and ext4_ext_search_right()
@@ -340,8 +342,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
-
-
static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
#if BITS_PER_LONG == 64
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
ext4_set_bit(bit, addr);
}
-static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
- addr = mb_correct_addr_and_bit(&bit, addr);
- ext4_set_bit_atomic(lock, bit, addr);
-}
-
static inline void mb_clear_bit(int bit, void *addr)
{
addr = mb_correct_addr_and_bit(&bit, addr);
ext4_clear_bit(bit, addr);
}
-static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
- addr = mb_correct_addr_and_bit(&bit, addr);
- ext4_clear_bit_atomic(lock, bit, addr);
-}
-
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
int fix = 0, ret, tmpmax;
@@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
if (unlikely(e4b->bd_info->bb_bitmap == NULL))
return;
- BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
for (i = 0; i < count; i++) {
if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
ext4_fsblk_t blocknr;
@@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
if (unlikely(e4b->bd_info->bb_bitmap == NULL))
return;
- BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
for (i = 0; i < count; i++) {
BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
@@ -739,6 +727,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
static int ext4_mb_init_cache(struct page *page, char *incore)
{
+ ext4_group_t ngroups;
int blocksize;
int blocks_per_page;
int groups_per_page;
@@ -757,6 +746,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
inode = page->mapping->host;
sb = inode->i_sb;
+ ngroups = ext4_get_groups_count(sb);
blocksize = 1 << inode->i_blkbits;
blocks_per_page = PAGE_CACHE_SIZE / blocksize;
@@ -780,7 +770,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
for (i = 0; i < groups_per_page; i++) {
struct ext4_group_desc *desc;
- if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+ if (first_group + i >= ngroups)
break;
err = -EIO;
@@ -801,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
unlock_buffer(bh[i]);
continue;
}
- spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+ ext4_lock_group(sb, first_group + i);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
set_bitmap_uptodate(bh[i]);
set_buffer_uptodate(bh[i]);
- spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+ ext4_unlock_group(sb, first_group + i);
unlock_buffer(bh[i]);
continue;
}
- spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+ ext4_unlock_group(sb, first_group + i);
if (buffer_uptodate(bh[i])) {
/*
* if not uninit if bh is uptodate,
@@ -852,7 +842,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
struct ext4_group_info *grinfo;
group = (first_block + i) >> 1;
- if (group >= EXT4_SB(sb)->s_groups_count)
+ if (group >= ngroups)
break;
/*
@@ -1078,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
return 0;
}
-static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_clear_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1091,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- if (lock)
- mb_clear_bit_atomic(lock, cur, bm);
- else
- mb_clear_bit(cur, bm);
+ mb_clear_bit(cur, bm);
cur++;
}
}
-static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1112,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
cur += 32;
continue;
}
- if (lock)
- mb_set_bit_atomic(lock, cur, bm);
- else
- mb_set_bit(cur, bm);
+ mb_set_bit(cur, bm);
cur++;
}
}
@@ -1131,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
struct super_block *sb = e4b->bd_sb;
BUG_ON(first + count > (sb->s_blocksize << 3));
- BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
@@ -1212,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
int ord;
void *buddy;
- BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
buddy = mb_find_buddy(e4b, order, &max);
@@ -1276,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
BUG_ON(e4b->bd_group != ex->fe_group);
- BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+ assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
mb_check_buddy(e4b);
mb_mark_used_double(e4b, start, len);
@@ -1330,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
e4b->bd_info->bb_counters[ord]++;
}
- mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
- EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -1726,7 +1709,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
unsigned free, fragments;
unsigned i, bits;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
- struct ext4_group_desc *desc;
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
@@ -1742,10 +1724,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
switch (cr) {
case 0:
BUG_ON(ac->ac_2order == 0);
- /* If this group is uninitialized, skip it initially */
- desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
- return 0;
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -1788,6 +1766,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
int block, pnum;
int blocks_per_page;
int groups_per_page;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t first_group;
struct ext4_group_info *grp;
@@ -1807,7 +1786,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
/* read all groups the page covers into the cache */
for (i = 0; i < groups_per_page; i++) {
- if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+ if ((first_group + i) >= ngroups)
break;
grp = ext4_get_group_info(sb, first_group + i);
/* take all groups write allocation
@@ -1945,8 +1924,7 @@ err:
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t group;
- ext4_group_t i;
+ ext4_group_t ngroups, group, i;
int cr;
int err = 0;
int bsbits;
@@ -1957,6 +1935,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
+ ngroups = ext4_get_groups_count(sb);
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
/* first, try the goal */
@@ -2017,11 +1996,11 @@ repeat:
*/
group = ac->ac_g_ex.fe_group;
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+ for (i = 0; i < ngroups; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
- if (group == EXT4_SB(sb)->s_groups_count)
+ if (group == ngroups)
group = 0;
/* quick check to skip empty groups */
@@ -2064,9 +2043,7 @@ repeat:
ac->ac_groups_scanned++;
desc = ext4_get_group_desc(sb, group, NULL);
- if (cr == 0 || (desc->bg_flags &
- cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
- ac->ac_2order != 0))
+ if (cr == 0)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 &&
ac->ac_g_ex.fe_len == sbi->s_stripe)
@@ -2315,12 +2292,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = seq->private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
- if (*pos < 0 || *pos >= sbi->s_groups_count)
+ if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
return NULL;
-
group = *pos + 1;
return (void *) ((unsigned long) group);
}
@@ -2328,11 +2303,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct super_block *sb = seq->private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
++*pos;
- if (*pos < 0 || *pos >= sbi->s_groups_count)
+ if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
return NULL;
group = *pos + 1;
return (void *) ((unsigned long) group);
@@ -2420,7 +2394,8 @@ static void ext4_mb_history_release(struct super_block *sb)
if (sbi->s_proc != NULL) {
remove_proc_entry("mb_groups", sbi->s_proc);
- remove_proc_entry("mb_history", sbi->s_proc);
+ if (sbi->s_mb_history_max)
+ remove_proc_entry("mb_history", sbi->s_proc);
}
kfree(sbi->s_mb_history);
}
@@ -2431,17 +2406,17 @@ static void ext4_mb_history_init(struct super_block *sb)
int i;
if (sbi->s_proc != NULL) {
- proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_history_fops, sb);
+ if (sbi->s_mb_history_max)
+ proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_history_fops, sb);
proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
}
- sbi->s_mb_history_max = 1000;
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+ sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
/* if we can't allocate history, then we simple won't use it */
}
@@ -2451,7 +2426,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_mb_history h;
- if (unlikely(sbi->s_mb_history == NULL))
+ if (sbi->s_mb_history == NULL)
return;
if (!(ac->ac_op & sbi->s_mb_history_filter))
@@ -2587,6 +2562,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
static int ext4_mb_init_backend(struct super_block *sb)
{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2574,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
struct ext4_group_desc *desc;
/* This is the number of blocks used by GDT */
- num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
/*
@@ -2644,7 +2620,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
for (i = 0; i < num_meta_group_infos; i++) {
if ((i + 1) == num_meta_group_infos)
metalen = sizeof(*meta_group_info) *
- (sbi->s_groups_count -
+ (ngroups -
(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
meta_group_info = kmalloc(metalen, GFP_KERNEL);
if (meta_group_info == NULL) {
@@ -2655,7 +2631,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
@@ -2761,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
return 0;
}
-/* need to called with ext4 group lock (ext4_lock_group) */
+/* needs to be called with the ext4 group lock held */
static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
{
struct ext4_prealloc_space *pa;
@@ -2781,13 +2757,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
int ext4_mb_release(struct super_block *sb)
{
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
int num_meta_group_infos;
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_group_info) {
- for (i = 0; i < sbi->s_groups_count; i++) {
+ for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
#ifdef DOUBLE_CHECK
kfree(grinfo->bb_bitmap);
@@ -2797,7 +2774,7 @@ int ext4_mb_release(struct super_block *sb)
ext4_unlock_group(sb, i);
kfree(grinfo);
}
- num_meta_group_infos = (sbi->s_groups_count +
+ num_meta_group_infos = (ngroups +
EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
for (i = 0; i < num_meta_group_infos; i++)
@@ -2882,9 +2859,8 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+ entry->start_blk
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
- sb->s_id, (unsigned long long) discard_block,
- entry->count);
+ trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
+ entry->count);
sb_issue_discard(sb, discard_block, entry->count);
kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2984,27 +2960,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+ le32_to_cpu(es->s_first_data_block);
len = ac->ac_b_ex.fe_len;
- if (in_range(ext4_block_bitmap(sb, gdp), block, len) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
- in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
- in_range(block + len - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
+ if (!ext4_data_block_valid(sbi, block, len)) {
ext4_error(sb, __func__,
- "Allocating block %llu in system zone of %d group\n",
- block, ac->ac_b_ex.fe_group);
+ "Allocating blocks %llu-%llu which overlap "
+ "fs metadata\n", block, block+len);
/* File system mounted not to panic on error
* Fix the bitmap and repeat the block allocation
* We leak some of the blocks here.
*/
- mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
- bitmap_bh->b_data, ac->ac_b_ex.fe_start,
- ac->ac_b_ex.fe_len);
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
if (!err)
err = -EAGAIN;
goto out_err;
}
+
+ ext4_lock_group(sb, ac->ac_b_ex.fe_group);
#ifdef AGGRESSIVE_CHECK
{
int i;
@@ -3014,9 +2988,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- mb_set_bits(NULL, bitmap_bh->b_data,
- ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_blks_set(sb, gdp,
@@ -3026,7 +2998,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
ext4_free_blks_set(sb, gdp, len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
- spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+
+ ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
/*
* Now reduce the dirty block count also. Should not go negative
@@ -3459,7 +3432,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
* the function goes through all block freed in the group
* but not yet committed and marks them used in in-core bitmap.
* buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with the ext4 group lock held
*/
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group)
@@ -3473,9 +3446,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
while (n) {
entry = rb_entry(n, struct ext4_free_data, node);
- mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
- bitmap, entry->start_blk,
- entry->count);
+ mb_set_bits(bitmap, entry->start_blk, entry->count);
n = rb_next(n);
}
return;
@@ -3484,7 +3455,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
/*
* the function goes through all preallocation in this group and marks them
* used in in-core bitmap. buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with ext4 group lock held
*/
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group)
@@ -3516,8 +3487,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
- bitmap, start, len);
+ mb_set_bits(bitmap, start, len);
preallocated += len;
count++;
}
@@ -3658,10 +3628,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
- trace_mark(ext4_mb_new_inode_pa,
- "dev %s ino %lu pstart %llu len %u lstart %u",
- sb->s_id, ac->ac_inode->i_ino,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_ext4_mb_new_inode_pa(ac, pa);
ext4_mb_use_inode_pa(ac, pa);
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3720,9 +3687,8 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_type = MB_GROUP_PA;
mb_debug("new group pa %p: %llu/%u for %u\n", pa,
- pa->pa_pstart, pa->pa_len, pa->pa_lstart);
- trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
- sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+ trace_ext4_mb_new_group_pa(ac, pa);
ext4_mb_use_group_pa(ac, pa);
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3812,10 +3778,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
ext4_mb_store_history(ac);
}
- trace_mark(ext4_mb_release_inode_pa,
- "dev %s ino %lu block %llu count %u",
- sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
- next - bit);
+ trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
+ next - bit);
mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
bit = next + 1;
}
@@ -3849,8 +3813,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
if (ac)
ac->ac_op = EXT4_MB_HISTORY_DISCARD;
- trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
- sb->s_id, pa->pa_pstart, pa->pa_len);
+ trace_ext4_mb_release_group_pa(ac, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3918,6 +3881,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
INIT_LIST_HEAD(&list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ if (ac)
+ ac->ac_sb = sb;
repeat:
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
@@ -4016,12 +3981,15 @@ void ext4_discard_preallocations(struct inode *inode)
}
mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
- trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
- inode->i_ino);
+ trace_ext4_discard_preallocations(inode);
INIT_LIST_HEAD(&list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ if (ac) {
+ ac->ac_sb = sb;
+ ac->ac_inode = inode;
+ }
repeat:
/* first, collect all pa's in the inode */
spin_lock(&ei->i_prealloc_lock);
@@ -4121,7 +4089,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- ext4_group_t i;
+ ext4_group_t ngroups, i;
printk(KERN_ERR "EXT4-fs: Can't allocate:"
" Allocation context details:\n");
@@ -4145,7 +4113,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
ac->ac_found);
printk(KERN_ERR "EXT4-fs: groups: \n");
- for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+ ngroups = ext4_get_groups_count(sb);
+ for (i = 0; i < ngroups; i++) {
struct ext4_group_info *grp = ext4_get_group_info(sb, i);
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
@@ -4304,6 +4273,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
INIT_LIST_HEAD(&discard_list);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+ if (ac)
+ ac->ac_sb = sb;
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4469,13 +4440,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
{
- ext4_group_t i;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int ret;
int freed = 0;
- trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
- sb->s_id, needed);
- for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+ trace_ext4_mb_discard_preallocations(sb, needed);
+ for (i = 0; i < ngroups && needed > 0; i++) {
ret = ext4_mb_discard_group_preallocations(sb, i, needed);
freed += ret;
needed -= ret;
@@ -4503,17 +4473,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
- trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
- "lblk %llu goal %llu lleft %llu lright %llu "
- "pleft %llu pright %llu ",
- sb->s_id, ar->flags, ar->len,
- ar->inode ? ar->inode->i_ino : 0,
- (unsigned long long) ar->logical,
- (unsigned long long) ar->goal,
- (unsigned long long) ar->lleft,
- (unsigned long long) ar->lright,
- (unsigned long long) ar->pleft,
- (unsigned long long) ar->pright);
+ trace_ext4_request_blocks(ar);
/*
* For delayed allocation, we could skip the ENOSPC and
@@ -4549,7 +4509,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
- if (!ac) {
+ if (ac) {
+ ac->ac_sb = sb;
+ ac->ac_inode = ar->inode;
+ } else {
ar->len = 0;
*errp = -ENOMEM;
goto out1;
@@ -4622,18 +4585,7 @@ out3:
reserv_blks);
}
- trace_mark(ext4_allocate_blocks,
- "dev %s block %llu flags %u len %u ino %lu "
- "logical %llu goal %llu lleft %llu lright %llu "
- "pleft %llu pright %llu ",
- sb->s_id, (unsigned long long) block,
- ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
- (unsigned long long) ar->logical,
- (unsigned long long) ar->goal,
- (unsigned long long) ar->lleft,
- (unsigned long long) ar->lright,
- (unsigned long long) ar->pleft,
- (unsigned long long) ar->pright);
+ trace_ext4_allocate_blocks(ar, (unsigned long long)block);
return block;
}
@@ -4737,7 +4689,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
* Main entry point into mballoc to free blocks
*/
void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
- unsigned long block, unsigned long count,
+ ext4_fsblk_t block, unsigned long count,
int metadata, unsigned long *freed)
{
struct buffer_head *bitmap_bh = NULL;
@@ -4763,15 +4715,12 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
block + count > ext4_blocks_count(es)) {
ext4_error(sb, __func__,
"Freeing blocks not in datazone - "
- "block = %lu, count = %lu", block, count);
+ "block = %llu, count = %lu", block, count);
goto error_return;
}
- ext4_debug("freeing block %lu\n", block);
- trace_mark(ext4_free_blocks,
- "dev %s block %llu count %lu metadata %d ino %lu",
- sb->s_id, (unsigned long long) block, count, metadata,
- inode ? inode->i_ino : 0);
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, metadata);
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (ac) {
@@ -4812,7 +4761,7 @@ do_more:
ext4_error(sb, __func__,
"Freeing blocks in system zone - "
- "Block = %lu, count = %lu", block, count);
+ "Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
goto error_return;
}
@@ -4859,29 +4808,25 @@ do_more:
new_entry->group = block_group;
new_entry->count = count;
new_entry->t_tid = handle->h_transaction->t_tid;
+
ext4_lock_group(sb, block_group);
- mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
- bit, count);
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
ext4_mb_free_metadata(handle, &e4b, new_entry);
- ext4_unlock_group(sb, block_group);
} else {
- ext4_lock_group(sb, block_group);
/* need to update group_info->bb_free and bitmap
* with group lock held. generate_buddy look at
* them with group lock_held
*/
- mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
- bit, count);
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
- ext4_unlock_group(sb, block_group);
}
- spin_lock(sb_bgl_lock(sbi, block_group));
ret = ext4_free_blks_count(sb, gdp) + count;
ext4_free_blks_set(sb, gdp, ret);
gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
- spin_unlock(sb_bgl_lock(sbi, block_group));
+ ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeblocks_counter, count);
if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd9e6cd5f6cf..c96bb19f58f9 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -19,11 +19,9 @@
#include <linux/seq_file.h>
#include <linux/version.h>
#include <linux/blkdev.h>
-#include <linux/marker.h>
#include <linux/mutex.h>
#include "ext4_jbd2.h"
#include "ext4.h"
-#include "group.h"
/*
* with AGGRESSIVE_CHECK allocator runs consistency checks over
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index fe64d9f79852..313a50b39741 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -458,6 +458,7 @@ int ext4_ext_migrate(struct inode *inode)
struct inode *tmp_inode = NULL;
struct list_blocks_struct lb;
unsigned long max_entries;
+ __u32 goal;
/*
* If the filesystem does not support extents, or the inode
@@ -483,9 +484,10 @@ int ext4_ext_migrate(struct inode *inode)
retval = PTR_ERR(handle);
return retval;
}
- tmp_inode = ext4_new_inode(handle,
- inode->i_sb->s_root->d_inode,
- S_IFREG);
+ goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
+ EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
+ tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
+ S_IFREG, 0, goal);
if (IS_ERR(tmp_inode)) {
retval = -ENOMEM;
ext4_journal_stop(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
new file mode 100644
index 000000000000..bbf2dd9404dc
--- /dev/null
+++ b/fs/ext4/move_extent.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
+ * Written by Takashi Sato <t-sato@yk.jp.nec.com>
+ * Akira Fujita <a-fujita@rs.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/fs.h>
+#include <linux/quotaops.h>
+#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "ext4.h"
+
+#define get_ext_path(path, inode, block, ret) \
+ do { /* look up @block in @inode's extent tree, reusing @path as the lookup buffer */ \
+ path = ext4_ext_find_extent(inode, block, path); \
+ if (IS_ERR(path)) { \
+ ret = PTR_ERR(path); /* @ret is only written on failure */ \
+ path = NULL; /* callers detect failure by testing path == NULL */ \
+ } \
+ } while (0)
+
+/**
+ * copy_extent_status - Propagate the initialization state of one extent
+ *
+ * @src:	extent whose state is inspected (it is not modified)
+ * @dest:	extent that receives the state
+ *
+ * When @src is uninitialized, @dest is marked uninitialized too.  Otherwise
+ * @dest->ee_len is rewritten with @dest's own actual length.
+ */
+static void
+copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
+{
+	if (!ext4_ext_is_uninitialized(src)) {
+		/* Initialized source: store the plain length back into dest. */
+		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
+		return;
+	}
+
+	ext4_ext_mark_uninitialized(dest);
+}
+
+/**
+ * mext_next_extent - Advance @path to the next extent and return it
+ *
+ * @inode: inode which is searched; its super block is used to read blocks
+ * @path: extent path, indexed by tree level up to path->p_depth (the leaf);
+ * it is updated in place to describe the next extent
+ * @extent: output, set to the next extent on success
+ *
+ * If the current leaf still has an extent after path[leaf].p_ext, simply
+ * step to it. Otherwise walk from the leaf's parent towards level 0 looking
+ * for a level whose index can be advanced, step that index, and re-read
+ * every block below it (taking the first index at each intermediate level)
+ * until the leaf is reached; the first extent of that leaf is returned.
+ * Buffer heads of the levels that are re-read are released before being
+ * replaced.
+ *
+ * Return 0 on success (with *@extent and path[leaf].p_ext pointing at the
+ * next extent), 1 if @path already refers to the last extent (in which case
+ * *@extent is not written), or -EIO if a block could not be read. On -EIO
+ * the p_bh of the level that failed is left NULL.
+ */
+static int
+mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent)
+{
+ int ppos, leaf_ppos = path->p_depth;
+
+ ppos = leaf_ppos;
+ if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+ /* leaf block: another extent follows in the same leaf */
+ *extent = ++path[ppos].p_ext;
+ return 0;
+ }
+
+ while (--ppos >= 0) { /* climb from the leaf's parent towards level 0 */
+ if (EXT_LAST_INDEX(path[ppos].p_hdr) >
+ path[ppos].p_idx) {
+ int cur_ppos = ppos;
+
+ /* index block: step to the next index and load its child */
+ path[ppos].p_idx++;
+ path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+ if (path[ppos+1].p_bh)
+ brelse(path[ppos+1].p_bh);
+ path[ppos+1].p_bh =
+ sb_bread(inode->i_sb, path[ppos].p_block);
+ if (!path[ppos+1].p_bh)
+ return -EIO;
+ path[ppos+1].p_hdr =
+ ext_block_hdr(path[ppos+1].p_bh);
+
+ /* Halfway index block: descend via the first index of each level */
+ while (++cur_ppos < leaf_ppos) {
+ path[cur_ppos].p_idx =
+ EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
+ path[cur_ppos].p_block =
+ idx_pblock(path[cur_ppos].p_idx);
+ if (path[cur_ppos+1].p_bh)
+ brelse(path[cur_ppos+1].p_bh);
+ path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
+ path[cur_ppos].p_block);
+ if (!path[cur_ppos+1].p_bh)
+ return -EIO;
+ path[cur_ppos+1].p_hdr =
+ ext_block_hdr(path[cur_ppos+1].p_bh);
+ }
+
+ /* leaf block: the next extent is the first one of the new leaf */
+ path[leaf_ppos].p_ext = *extent =
+ EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ return 0;
+ }
+ }
+ /* We found the last extent: no level had a further index to advance to */
+ return 1;
+}
+
+/**
+ * mext_double_down_read - Take i_data_sem of two inodes for reading
+ *
+ * @orig_inode:		original inode structure
+ * @donor_inode:	donor inode structure
+ *
+ * Both semaphores are acquired in ascending i_ino order so that every
+ * caller locks a given pair in the same sequence.  Neither argument may
+ * be NULL.
+ */
+static void
+mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
+{
+	struct inode *lower, *higher;
+
+	BUG_ON(!orig_inode || !donor_inode);
+
+	/*
+	 * Order by inode number rather than by address: C does not
+	 * guarantee a meaningful comparison between pointers that do not
+	 * belong to the same array.
+	 */
+	if (donor_inode->i_ino < orig_inode->i_ino) {
+		lower = donor_inode;
+		higher = orig_inode;
+	} else {
+		lower = orig_inode;
+		higher = donor_inode;
+	}
+
+	down_read(&EXT4_I(lower)->i_data_sem);
+	down_read(&EXT4_I(higher)->i_data_sem);
+}
+
+/**
+ * mext_double_down_write - Take i_data_sem of two inodes for writing
+ *
+ * @orig_inode:		original inode structure
+ * @donor_inode:	donor inode structure
+ *
+ * Both semaphores are acquired in ascending i_ino order so that every
+ * caller locks a given pair in the same sequence.  Neither argument may
+ * be NULL.
+ */
+static void
+mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+{
+	struct inode *lower, *higher;
+
+	BUG_ON(!orig_inode || !donor_inode);
+
+	/*
+	 * Order by inode number rather than by address: C does not
+	 * guarantee a meaningful comparison between pointers that do not
+	 * belong to the same array.
+	 */
+	if (donor_inode->i_ino < orig_inode->i_ino) {
+		lower = donor_inode;
+		higher = orig_inode;
+	} else {
+		lower = orig_inode;
+		higher = donor_inode;
+	}
+
+	down_write(&EXT4_I(lower)->i_data_sem);
+	down_write(&EXT4_I(higher)->i_data_sem);
+}
+
+/**
+ * mext_double_up_read - Drop the read hold on i_data_sem of two inodes
+ *
+ * @orig_inode:		original inode; its semaphore is released first
+ * @donor_inode:	donor inode; its semaphore is released second
+ *
+ * Neither argument may be NULL.
+ */
+static void
+mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+{
+	BUG_ON(!orig_inode || !donor_inode);
+
+	/* Fixed release order: orig, then donor. */
+	up_read(&EXT4_I(orig_inode)->i_data_sem);
+	up_read(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * mext_double_up_write - Drop the write hold on i_data_sem of two inodes
+ *
+ * @orig_inode:		original inode; its semaphore is released first
+ * @donor_inode:	donor inode; its semaphore is released second
+ *
+ * Neither argument may be NULL.
+ */
+static void
+mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+{
+	BUG_ON(!orig_inode || !donor_inode);
+
+	/* Fixed release order: orig, then donor. */
+	up_write(&EXT4_I(orig_inode)->i_data_sem);
+	up_write(&EXT4_I(donor_inode)->i_data_sem);
+}
+
+/**
+ * mext_insert_across_blocks - Insert extents that do not fit in the leaf block
+ *
+ * @handle:	journal handle
+ * @orig_inode:	original inode
+ * @o_start:	first original extent to be changed
+ * @o_end:	last original extent to be changed
+ * @start_ext:	first new extent to be inserted
+ * @new_ext:	middle of new extent to be inserted
+ * @end_ext:	last new extent to be inserted
+ *
+ * A zero ee_len in @start_ext or @end_ext means that piece is absent.
+ * The original extents @o_start/@o_end are first patched in place so that
+ * they describe the head and/or tail piece; @new_ext is then added through
+ * ext4_ext_insert_extent().  When head, middle and tail all fall inside a
+ * single original extent, @end_ext is inserted the same way afterwards.
+ *
+ * Return 0 on success, -EIO for a combination of pieces this function does
+ * not handle, or the error reported by the extent path lookup or by
+ * ext4_ext_insert_extent().
+ */
+static int
+mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
+		struct ext4_extent *o_start, struct ext4_extent *o_end,
+		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
+		struct ext4_extent *end_ext)
+{
+	struct ext4_ext_path *orig_path = NULL;
+	ext4_lblk_t eblock = 0;
+	int new_flag = 0;
+	int end_flag = 0;
+	int err = 0;
+
+	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
+		if (o_start == o_end) {
+
+			/*       start_ext   new_ext    end_ext
+			 * donor |---------|-----------|--------|
+			 * orig  |------------------------------|
+			 */
+			end_flag = 1;
+		} else {
+
+			/*       start_ext   new_ext   end_ext
+			 * donor |---------|----------|---------|
+			 * orig  |---------------|--------------|
+			 */
+			o_end->ee_block = end_ext->ee_block;
+			o_end->ee_len = end_ext->ee_len;
+			ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+		}
+
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (start_ext->ee_len && new_ext->ee_len &&
+		   !end_ext->ee_len && o_start == o_end) {
+
+		/*	 start_ext	new_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_start->ee_len = start_ext->ee_len;
+		new_flag = 1;
+
+	} else if (!start_ext->ee_len && new_ext->ee_len &&
+		   end_ext->ee_len && o_start == o_end) {
+
+		/*	  new_ext	end_ext
+		 * donor |--------------|---------------|
+		 * orig  |------------------------------|
+		 */
+		o_end->ee_block = end_ext->ee_block;
+		o_end->ee_len = end_ext->ee_len;
+		ext4_ext_store_pblock(o_end, ext_pblock(end_ext));
+
+		/*
+		 * Set 0 to the extent block if new_ext was
+		 * the first block.
+		 */
+		if (new_ext->ee_block)
+			eblock = le32_to_cpu(new_ext->ee_block);
+
+		new_flag = 1;
+	} else {
+		ext4_debug("ext4 move extent: Unexpected insert case\n");
+		return -EIO;
+	}
+
+	if (new_flag) {
+		/* On lookup failure the macro stores the error in err. */
+		get_ext_path(orig_path, orig_inode, eblock, err);
+		if (orig_path == NULL)
+			goto out;
+
+		/*
+		 * Keep the insertion error: previously it was only tested,
+		 * so a failed insert made this function return 0.
+		 */
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					orig_path, new_ext);
+		if (err)
+			goto out;
+	}
+
+	if (end_flag) {
+		/* Look up the block just before the tail piece. */
+		get_ext_path(orig_path, orig_inode,
+				      le32_to_cpu(end_ext->ee_block) - 1, err);
+		if (orig_path == NULL)
+			goto out;
+
+		err = ext4_ext_insert_extent(handle, orig_inode,
+					   orig_path, end_ext);
+		if (err)
+			goto out;
+	}
+out:
+	if (orig_path) {
+		ext4_ext_drop_refs(orig_path);
+		kfree(orig_path);
+	}
+
+	return err;
+}
+
+/**
+ * mext_insert_inside_block - Insert new extent to the extent block
+ *
+ * @o_start: first original extent to be moved
+ * @o_end: last original extent to be moved
+ * @start_ext: first new extent to be inserted (skipped when ee_len is 0)
+ * @new_ext: middle of new extent to be inserted (skipped when ee_len is 0)
+ * @end_ext: last new extent to be inserted (skipped when ee_len is 0)
+ * @eh: extent header of target leaf block
+ * @range_to_move: number of slots by which the extents following @o_end are
+ * shifted; it is also added to eh->eh_entries, so a negative
+ * value shrinks the block
+ *
+ * Insert extents into the leaf block. The slots starting at @o_start are
+ * overwritten by the inserted extents, after the extents behind @o_end have
+ * been shifted by @range_to_move slots. The caller is responsible for making
+ * sure the leaf has room; no capacity check is done here.
+ */
+static void
+mext_insert_inside_block(struct ext4_extent *o_start,
+ struct ext4_extent *o_end,
+ struct ext4_extent *start_ext,
+ struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext,
+ struct ext4_extent_header *eh,
+ int range_to_move)
+{
+ int i = 0;
+ unsigned long len;
+
+ /* Move the existing extents that follow o_end, if there are any */
+ if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
+ len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
+ (unsigned long)(o_end + 1); /* bytes from o_end + 1 through the last extent */
+ memmove(o_end + 1 + range_to_move, o_end + 1, len);
+ }
+
+ /* Insert start entry: only the length of the existing slot is replaced */
+ if (start_ext->ee_len)
+ o_start[i++].ee_len = start_ext->ee_len;
+
+ /* Insert new entry: whole-struct copy, then the physical block is stored again */
+ if (new_ext->ee_len) {
+ o_start[i] = *new_ext;
+ ext4_ext_store_pblock(&o_start[i++], ext_pblock(new_ext));
+ }
+
+ /* Insert end entry */
+ if (end_ext->ee_len)
+ o_start[i] = *end_ext;
+
+ /* Adjust the total entries counter on the extent block by the shift */
+ le16_add_cpu(&eh->eh_entries, range_to_move);
+}
+
+/**
+ * mext_insert_extents - Insert new extent
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @orig_path: path indicates first extent to be changed
+ * @o_start: first original extent to be changed
+ * @o_end: last original extent to be changed
+ * @start_ext: first new extent to be inserted
+ * @new_ext: middle of new extent to be inserted
+ * @end_ext: last new extent to be inserted
+ *
+ * Call the function to insert extents. If the leaf block has too few free
+ * slots for the extra entries, we call mext_insert_across_blocks().
+ * Otherwise call mext_insert_inside_block(). When the tree has depth > 0 the
+ * leaf buffer is registered with the journal before the change and marked
+ * dirty afterwards; at depth 0 the inode itself is marked dirty instead.
+ * Return 0 on success, or the error value of the failing helper.
+ */
+static int
+mext_insert_extents(handle_t *handle, struct inode *orig_inode,
+ struct ext4_ext_path *orig_path,
+ struct ext4_extent *o_start,
+ struct ext4_extent *o_end,
+ struct ext4_extent *start_ext,
+ struct ext4_extent *new_ext,
+ struct ext4_extent *end_ext)
+{
+ struct ext4_extent_header *eh;
+ unsigned long need_slots, slots_range;
+ int range_to_move, depth, ret;
+
+ /*
+ * The extents need to be inserted
+ * start_extent + new_extent + end_extent.
+ * Each one counts only if its ee_len is non-zero.
+ */
+ need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
+ (new_ext->ee_len ? 1 : 0);
+
+ /*
+ * The number of slots between start and end, both included; the extra
+ * "+ 1" byte is discarded by the integer division.
+ */
+ slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
+ / sizeof(struct ext4_extent);
+
+ /* Range to move the end of extent; negative when fewer slots are needed */
+ range_to_move = need_slots - slots_range;
+ depth = orig_path->p_depth;
+ orig_path += depth; /* from here on orig_path refers to the leaf level */
+ eh = orig_path->p_hdr;
+
+ if (depth) {
+ /* Register to journal */
+ ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
+ if (ret)
+ return ret;
+ }
+
+ /* Expansion: more extra slots are needed than the leaf has free */
+ if (range_to_move > 0 &&
+ (range_to_move > le16_to_cpu(eh->eh_max)
+ - le16_to_cpu(eh->eh_entries))) {
+
+ ret = mext_insert_across_blocks(handle, orig_inode, o_start,
+ o_end, start_ext, new_ext, end_ext);
+ if (ret < 0)
+ return ret;
+ } else
+ mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
+ end_ext, eh, range_to_move);
+
+ if (depth) {
+ ret = ext4_handle_dirty_metadata(handle, orig_inode,
+ orig_path->p_bh);
+ if (ret)
+ return ret;
+ } else {
+ /* NOTE(review): presumably a depth-0 tree is stored in the inode itself - confirm */
+ ret = ext4_mark_inode_dirty(handle, orig_inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * mext_leaf_block - Move one leaf extent block into the inode.
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @orig_path: path indicates first extent to be changed
+ * @dext: donor extent
+ * @from: start offset on the target file
+ *
+ * In order to insert extents into the leaf block, we must divide the extent
+ * in the leaf block into three extents. One covers the range where the new
+ * extent is inserted (new_ext), and the others are the parts of the original
+ * extent before it (start_ext) and after it (end_ext).
+ *
+ * Therefore, this function creates structures to save extents of the leaf
+ * block, and inserts extents by calling mext_insert_extents() with
+ * created extents. Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_leaf_block(handle_t *handle, struct inode *orig_inode,
+ struct ext4_ext_path *orig_path, struct ext4_extent *dext,
+ ext4_lblk_t *from)
+{
+ struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+ struct ext4_extent new_ext, start_ext, end_ext;
+ ext4_lblk_t new_ext_end;
+ ext4_fsblk_t new_phys_end;
+ int oext_alen, new_ext_alen, end_ext_alen;
+ int depth = ext_depth(orig_inode);
+ int ret;
+
+ o_start = o_end = oext = orig_path[depth].p_ext;
+ oext_alen = ext4_ext_get_actual_len(oext);
+ /* A zero ee_len marks start_ext/end_ext as "not needed" */
+ start_ext.ee_len = end_ext.ee_len = 0;
+
+ /* new_ext: logical start *from, physical start and length of dext */
+ new_ext.ee_block = cpu_to_le32(*from);
+ ext4_ext_store_pblock(&new_ext, ext_pblock(dext));
+ new_ext.ee_len = dext->ee_len;
+ new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+ new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+ /* new_phys_end is computed but not referenced again in this function */
+ new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
+
+ /*
+ * Case: original extent is first
+ * oext |--------|
+ * new_ext |--|
+ * start_ext |--|
+ *
+ * new_ext starts strictly inside oext, so the head of oext survives
+ * as start_ext. Only ee_len and the extent status are filled in for
+ * start_ext in this function.
+ */
+ if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
+ le32_to_cpu(new_ext.ee_block) <
+ le32_to_cpu(oext->ee_block) + oext_alen) {
+ start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+ le32_to_cpu(oext->ee_block));
+ copy_extent_status(oext, &start_ext);
+ } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+ /* oext is not the first entry in this leaf, so oext - 1 exists */
+ prev_ext = oext - 1;
+ /*
+ * We can merge new_ext into previous extent,
+ * if these are contiguous and same extent type.
+ * The merged extent is carried in start_ext, the replaced
+ * range grows to begin at prev_ext, and new_ext is dropped.
+ */
+ if (ext4_can_extents_be_merged(orig_inode, prev_ext,
+ &new_ext)) {
+ o_start = prev_ext;
+ start_ext.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(prev_ext) +
+ new_ext_alen);
+ copy_extent_status(prev_ext, &start_ext);
+ new_ext.ee_len = 0;
+ }
+ }
+
+ /*
+ * Case: new_ext_end must not be beyond the end of oext
+ * oext |-----------|
+ * new_ext |-------|
+ */
+ BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+
+ /*
+ * Case: new_ext is smaller than original extent
+ * oext |---------------|
+ * new_ext |-----------|
+ * end_ext |---|
+ *
+ * The tail of oext after new_ext_end survives as end_ext; its
+ * logical and physical start are the last end_ext_alen blocks
+ * of the original extent.
+ */
+ if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
+ new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
+ end_ext.ee_len =
+ cpu_to_le16(le32_to_cpu(oext->ee_block) +
+ oext_alen - 1 - new_ext_end);
+ copy_extent_status(oext, &end_ext);
+ end_ext_alen = ext4_ext_get_actual_len(&end_ext);
+ ext4_ext_store_pblock(&end_ext,
+ (ext_pblock(o_end) + oext_alen - end_ext_alen));
+ end_ext.ee_block =
+ cpu_to_le32(le32_to_cpu(o_end->ee_block) +
+ oext_alen - end_ext_alen);
+ }
+
+ ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
+ o_end, &start_ext, &new_ext, &end_ext);
+ return ret;
+}
+
+/**
+ * mext_calc_swap_extents - Calculate extents for extent swapping.
+ *
+ * @tmp_dext: the extent that will belong to the original inode
+ * @tmp_oext: the extent that will belong to the donor inode
+ * @orig_off: block offset of original inode
+ * @donor_off: block offset of donor inode
+ * @max_count: the maximum length of extents
+ *
+ * Both extents are trimmed in place: @tmp_dext is advanced to start at
+ * @donor_off, the physical start of @tmp_oext is advanced by the distance
+ * from its logical start to @orig_off, and both lengths are limited to the
+ * shortest of @max_count, the remaining donor length and the remaining
+ * original length. Finally the extent status of each is taken from the
+ * other's original value.
+ */
+static void
+mext_calc_swap_extents(struct ext4_extent *tmp_dext,
+ struct ext4_extent *tmp_oext,
+ ext4_lblk_t orig_off, ext4_lblk_t donor_off,
+ ext4_lblk_t max_count)
+{
+ ext4_lblk_t diff, orig_diff;
+ struct ext4_extent dext_old, oext_old;
+
+ /* Keep unmodified copies; they are the status sources at the end */
+ dext_old = *tmp_dext;
+ oext_old = *tmp_oext;
+
+ /* When tmp_dext is too large, pick up the target range. */
+ diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
+
+ ext4_ext_store_pblock(tmp_dext, ext_pblock(tmp_dext) + diff);
+ tmp_dext->ee_block =
+ cpu_to_le32(le32_to_cpu(tmp_dext->ee_block) + diff);
+ /*
+ * NOTE(review): this subtracts from the raw ee_len, not from
+ * ext4_ext_get_actual_len() -- confirm that is intended.
+ */
+ tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_dext->ee_len) - diff);
+
+ if (max_count < ext4_ext_get_actual_len(tmp_dext))
+ tmp_dext->ee_len = cpu_to_le16(max_count);
+
+ /* Only the physical start of tmp_oext is advanced; ee_block is kept */
+ orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
+ ext4_ext_store_pblock(tmp_oext, ext_pblock(tmp_oext) + orig_diff);
+
+ /*
+ * Adjust extent length if donor extent is larger than orig.
+ * NOTE(review): the comparison uses actual lengths but the assignment
+ * uses the raw ee_len of tmp_oext -- confirm.
+ */
+ if (ext4_ext_get_actual_len(tmp_dext) >
+ ext4_ext_get_actual_len(tmp_oext) - orig_diff)
+ tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
+ orig_diff);
+
+ /* Both extents now describe the same number of blocks */
+ tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
+
+ /* Cross-copy: each extent takes the status of the other's original */
+ copy_extent_status(&oext_old, tmp_dext);
+ copy_extent_status(&dext_old, tmp_oext);
+}
+
+/**
+ * mext_replace_branches - Replace original extents with new extents
+ *
+ * @handle: journal handle
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
+ * @from: block offset of orig_inode
+ * @count: block count to be replaced
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+ * 1. Save the block information of original and donor inodes into
+ * dummy extents.
+ * 2. Change the block information of original inode to point at the
+ * donor inode blocks.
+ * 3. Change the block information of donor inode to point at the saved
+ * original inode blocks in the dummy extents.
+ *
+ * The same block offset (@from) is used for both inodes. The whole
+ * operation runs between mext_double_down_write() and
+ * mext_double_up_write() on the two inodes.
+ *
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+ struct inode *donor_inode, ext4_lblk_t from,
+ ext4_lblk_t count)
+{
+ struct ext4_ext_path *orig_path = NULL;
+ struct ext4_ext_path *donor_path = NULL;
+ struct ext4_extent *oext, *dext;
+ struct ext4_extent tmp_dext, tmp_oext;
+ ext4_lblk_t orig_off = from, donor_off = from;
+ int err = 0;
+ int depth;
+ int replaced_count = 0;
+ int dext_alen;
+
+ /* Released by mext_double_up_write() at "out" on every exit path */
+ mext_double_down_write(orig_inode, donor_inode);
+
+ /*
+ * Get the original extent for the block "orig_off".
+ * NOTE(review): get_ext_path() is handed orig_path and err by name,
+ * so it is presumably a macro that assigns both -- confirm.
+ */
+ get_ext_path(orig_path, orig_inode, orig_off, err);
+ if (orig_path == NULL)
+ goto out;
+
+ /* Get the donor extent for the head */
+ get_ext_path(donor_path, donor_inode, donor_off, err);
+ if (donor_path == NULL)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ /* mext_calc_swap_extents() edits these copies, not the tree entries */
+ tmp_oext = *oext;
+
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ tmp_dext = *dext;
+
+ mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count);
+
+ /* Loop for the donor extents */
+ while (1) {
+ /* The extent for donor must be found. */
+ BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+
+ /* Set donor extent to orig extent */
+ err = mext_leaf_block(handle, orig_inode,
+ orig_path, &tmp_dext, &orig_off);
+ if (err < 0)
+ goto out;
+
+ /* Set orig extent to donor extent */
+ err = mext_leaf_block(handle, donor_inode,
+ donor_path, &tmp_oext, &donor_off);
+ if (err < 0)
+ goto out;
+
+ /* Advance both offsets by the number of blocks just swapped */
+ dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+ replaced_count += dext_alen;
+ donor_off += dext_alen;
+ orig_off += dext_alen;
+
+ /* Already moved the expected blocks */
+ if (replaced_count >= count)
+ break;
+
+ /* Look up the original extent again at the new offset */
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ get_ext_path(orig_path, orig_inode, orig_off, err);
+ if (orig_path == NULL)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+ /* The extent found ends at or before orig_off: stop, no error */
+ if (le32_to_cpu(oext->ee_block) +
+ ext4_ext_get_actual_len(oext) <= orig_off) {
+ err = 0;
+ goto out;
+ }
+ tmp_oext = *oext;
+
+ /* Same again for the donor side */
+ if (donor_path)
+ ext4_ext_drop_refs(donor_path);
+ get_ext_path(donor_path, donor_inode,
+ donor_off, err);
+ if (donor_path == NULL)
+ goto out;
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+ if (le32_to_cpu(dext->ee_block) +
+ ext4_ext_get_actual_len(dext) <= donor_off) {
+ err = 0;
+ goto out;
+ }
+ tmp_dext = *dext;
+
+ mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off,
+ count - replaced_count);
+ }
+
+out:
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (donor_path) {
+ ext4_ext_drop_refs(donor_path);
+ kfree(donor_path);
+ }
+
+ mext_double_up_write(orig_inode, donor_inode);
+ return err;
+}
+
+/**
+ * move_extent_par_page - Move extent data per page
+ *
+ * @o_filp: file structure of original file
+ * @donor_inode: donor inode
+ * @orig_page_offset: page index on original file
+ * @data_offset_in_page: block index where data swapping starts
+ * @block_len_in_page: the number of blocks to be swapped
+ * @uninit: orig extent is uninitialized or not
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+ * Finally, write out the saved data in new original inode blocks. Return 0
+ * on success, or a negative error value on failure.
+ *
+ * When @uninit is set, only the extents are swapped; the page is neither
+ * read nor written. The whole operation runs inside one journal handle
+ * started and stopped in this function.
+ */
+static int
+move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+ pgoff_t orig_page_offset, int data_offset_in_page,
+ int block_len_in_page, int uninit)
+{
+ struct inode *orig_inode = o_filp->f_dentry->d_inode;
+ struct address_space *mapping = orig_inode->i_mapping;
+ struct buffer_head *bh;
+ struct page *page = NULL;
+ const struct address_space_operations *a_ops = mapping->a_ops;
+ handle_t *handle;
+ ext4_lblk_t orig_blk_offset;
+ /* This initial value is overwritten below before offs is first used */
+ long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
+ unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+ unsigned int w_flags = 0;
+ unsigned int tmp_data_len, data_len;
+ void *fsdata;
+ int ret, i, jblocks;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+ /*
+ * It needs twice the amount of ordinary journal buffers because
+ * inode and donor_inode may change each different metadata blocks.
+ */
+ jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+ handle = ext4_journal_start(orig_inode, jblocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+
+ if (segment_eq(get_fs(), KERNEL_DS))
+ w_flags |= AOP_FLAG_UNINTERRUPTIBLE;
+
+ /* First block to swap, as a block index into the original file */
+ orig_blk_offset = orig_page_offset * blocks_per_page +
+ data_offset_in_page;
+
+ /*
+ * If orig extent is uninitialized one,
+ * it's not necessary to force the page into memory
+ * and then force it to be written out again.
+ * Just swap data blocks between orig and donor.
+ */
+ if (uninit) {
+ ret = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page);
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+ goto out2;
+ }
+
+ /* Byte offset of the first block to swap */
+ offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+
+ /* Calculate data_len */
+ if ((orig_blk_offset + block_len_in_page - 1) ==
+ ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+ /* Replace the last block */
+ tmp_data_len = orig_inode->i_size & (blocksize - 1);
+ /*
+ * If tmp_data_len equals zero, i_size is a multiple of
+ * blocksize. So we set appropriate value.
+ */
+ if (tmp_data_len == 0)
+ tmp_data_len = blocksize;
+
+ data_len = tmp_data_len +
+ ((block_len_in_page - 1) << orig_inode->i_blkbits);
+ } else {
+ data_len = block_len_in_page << orig_inode->i_blkbits;
+ }
+
+ ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+ &page, &fsdata);
+ if (unlikely(ret < 0))
+ goto out;
+
+ /*
+ * NOTE(review): the return value of readpage() is not checked, and
+ * PageUptodate() is not re-tested after lock_page() -- confirm a
+ * failed read cannot reach the write-out below.
+ */
+ if (!PageUptodate(page)) {
+ mapping->a_ops->readpage(o_filp, page);
+ lock_page(page);
+ }
+
+ /*
+ * try_to_release_page() doesn't call releasepage in writeback mode.
+ * We should care about the order of writing to the same file
+ * by multiple move extent processes.
+ * It needs to call wait_on_page_writeback() to wait for the
+ * writeback of the page.
+ */
+ if (PageWriteback(page))
+ wait_on_page_writeback(page);
+
+ /* Release old bh and drop refs */
+ try_to_release_page(page, 0);
+
+ ret = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page);
+ if (ret < 0)
+ goto out;
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+
+ /* Skip to the buffer for the first swapped block in this page */
+ bh = page_buffers(page);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+
+ /* Look up each swapped block again, after the extents were replaced */
+ for (i = 0; i < block_len_in_page; i++) {
+ ret = ext4_get_block(orig_inode,
+ (sector_t)(orig_blk_offset + i), bh, 0);
+ if (ret < 0)
+ goto out;
+
+ if (bh->b_this_page != NULL)
+ bh = bh->b_this_page;
+ }
+
+ ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+ page, fsdata);
+ /* Skip the page cleanup at "out" once write_end() has been called */
+ page = NULL;
+
+out:
+ if (unlikely(page)) {
+ if (PageLocked(page))
+ unlock_page(page);
+ page_cache_release(page);
+ }
+out2:
+ ext4_journal_stop(handle);
+
+ /* Non-negative results (e.g. from write_end()) are reported as 0 */
+ return ret < 0 ? ret : 0;
+}
+
+/**
+ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode:		original inode
+ * @donor_inode:	donor inode
+ * @orig_start:		logical start offset in block for orig
+ * @donor_start:	logical start offset in block for donor
+ * @len:		the number of blocks to be moved (may be shortened)
+ * @moved_len:		moved block length, must be 0 on entry
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other.
+ *
+ * @orig_start and *@len are in units of blocks, so they are checked against
+ * the file sizes converted to (rounded-up) block counts, not against
+ * i_size in bytes. When the requested range runs past the end of the
+ * smaller of the two files, *@len is shortened to end there.
+ *
+ * Return 0 on success, or a negative error value on failure.
+ */
+static int
+mext_check_arguments(struct inode *orig_inode,
+		     struct inode *donor_inode, __u64 orig_start,
+		     __u64 donor_start, __u64 *len, __u64 moved_len)
+{
+	unsigned int blkbits = orig_inode->i_blkbits;
+	__u64 blocksize = 1ULL << blkbits;
+	/* 64-bit so that a rounded-up block count cannot wrap */
+	__u64 orig_blocks, donor_blocks;
+
+	/* Regular file check */
+	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+		ext4_debug("ext4 move extent: The argument files should be "
+			"regular file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Ext4 move extent does not support swapfile */
+	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+		ext4_debug("ext4 move extent: The argument files should "
+			"not be swapfile [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Files should be in the same ext4 FS */
+	if (orig_inode->i_sb != donor_inode->i_sb) {
+		ext4_debug("ext4 move extent: The argument files "
+			"should be in same FS [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* orig and donor should be different file */
+	if (orig_inode->i_ino == donor_inode->i_ino) {
+		ext4_debug("ext4 move extent: The argument files should not "
+			"be same file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/* Ext4 move extent supports only extent based file */
+	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+		ext4_debug("ext4 move extent: orig file is not extents "
+			"based file [ino:orig %lu]\n", orig_inode->i_ino);
+		return -EOPNOTSUPP;
+	} else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+		ext4_debug("ext4 move extent: donor file is not extents "
+			"based file [ino:donor %lu]\n", donor_inode->i_ino);
+		return -EOPNOTSUPP;
+	}
+
+	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
+		ext4_debug("ext4 move extent: File size is 0 byte\n");
+		return -EINVAL;
+	}
+
+	/* Start offset should be same */
+	if (orig_start != donor_start) {
+		ext4_debug("ext4 move extent: orig and donor's start "
+			"offset are not same [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	if (moved_len) {
+		ext4_debug("ext4 move extent: moved_len should be 0 "
+			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+			donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/*
+	 * The individual limits are tested first, so by the time the sum
+	 * orig_start + *len is evaluated it cannot overflow a __u64.
+	 */
+	if ((orig_start > MAX_DEFRAG_SIZE) ||
+	    (donor_start > MAX_DEFRAG_SIZE) ||
+	    (*len > MAX_DEFRAG_SIZE) ||
+	    (orig_start + *len > MAX_DEFRAG_SIZE)) {
+		ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
+			"[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	/*
+	 * The range is limited by the smaller file. Sizes are rounded up to
+	 * whole blocks so that a partially filled last block still counts.
+	 */
+	if (orig_inode->i_size > donor_inode->i_size) {
+		donor_blocks = ((__u64)donor_inode->i_size + blocksize - 1)
+				>> blkbits;
+
+		if (orig_start >= donor_blocks) {
+			ext4_debug("ext4 move extent: orig start offset "
+				"[%llu] should be less than donor file blocks "
+				"[%llu] [ino:orig %lu, donor %lu]\n",
+				orig_start, donor_blocks,
+				orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+
+		if (orig_start + *len > donor_blocks) {
+			ext4_debug("ext4 move extent: End offset [%llu] should "
+				"be less than donor file blocks [%llu]."
+				"So adjust length from %llu to %llu "
+				"[ino:orig %lu, donor %lu]\n",
+				orig_start + *len, donor_blocks,
+				*len, donor_blocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = donor_blocks - orig_start;
+		}
+	} else {
+		orig_blocks = ((__u64)orig_inode->i_size + blocksize - 1)
+				>> blkbits;
+
+		if (orig_start >= orig_blocks) {
+			ext4_debug("ext4 move extent: start offset [%llu] "
+				"should be less than original file blocks "
+				"[%llu] [ino:orig %lu, donor %lu]\n",
+				orig_start, orig_blocks,
+				orig_inode->i_ino, donor_inode->i_ino);
+			return -EINVAL;
+		}
+
+		if (orig_start + *len > orig_blocks) {
+			ext4_debug("ext4 move extent: Adjust length "
+				"from %llu to %llu. Because it should be "
+				"less than original file blocks "
+				"[ino:orig %lu, donor %lu]\n",
+				*len, orig_blocks - orig_start,
+				orig_inode->i_ino, donor_inode->i_ino);
+			*len = orig_blocks - orig_start;
+		}
+	}
+
+	if (!*len) {
+		ext4_debug("ext4 move extent: len shoudld not be 0 "
+			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+			donor_inode->i_ino);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
+ *
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ *
+ * Take i_mutex of both inodes, the one with the smaller i_ino first.
+ * If one of them is NULL, or both are the same inode, only a single
+ * i_mutex is taken (none when both are NULL). This function is moved
+ * from fs/inode.c.
+ */
+static void
+mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+{
+	struct inode *first, *second;
+
+	/* Fewer than two distinct inodes: lock the one we have, if any */
+	if (!inode1 || !inode2 || inode1 == inode2) {
+		struct inode *only = inode1 ? inode1 : inode2;
+
+		if (only)
+			mutex_lock(&only->i_mutex);
+		return;
+	}
+
+	/* Order the pair by inode number; inode2 goes first on a tie */
+	if (inode1->i_ino < inode2->i_ino) {
+		first = inode1;
+		second = inode2;
+	} else {
+		first = inode2;
+		second = inode1;
+	}
+
+	mutex_lock_nested(&first->i_mutex, I_MUTEX_PARENT);
+	mutex_lock_nested(&second->i_mutex, I_MUTEX_CHILD);
+}
+
+/**
+ * mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
+ *
+ * @inode1: the inode that is released first
+ * @inode2: the inode that is released second
+ *
+ * A NULL inode is skipped, and when both arguments are the same inode its
+ * i_mutex is released only once. This function is moved from fs/inode.c.
+ */
+
+static void
+mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+	if (inode1 != NULL)
+		mutex_unlock(&inode1->i_mutex);
+
+	/* Nothing more to release: no second inode, or it was the first */
+	if (inode2 == NULL || inode2 == inode1)
+		return;
+
+	mutex_unlock(&inode2->i_mutex);
+}
+
+/**
+ * ext4_move_extents - Exchange the specified range of a file
+ *
+ * @o_filp: file structure of the original file
+ * @d_filp: file structure of the donor file
+ * @orig_start: start offset in block for orig
+ * @donor_start: start offset in block for donor
+ * @len: the number of blocks to be moved
+ * @moved_len: moved block length
+ *
+ * This function returns 0 and moved block length is set in moved_len
+ * if succeed, otherwise returns error value.
+ *
+ * Note: ext4_move_extents() proceeds the following order.
+ * 1:ext4_move_extents() calculates the last block number of moving extent
+ * function by the start block number (orig_start) and the number of blocks
+ * to be moved (len) specified as arguments.
+ * If the {orig, donor}_start points a hole, the extent's start offset
+ * pointed by ext_cur (current extent), holecheck_path, orig_path are set
+ * after hole behind.
+ * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
+ * or the ext_cur exceeds the block_end which is last logical block number.
+ * 3:To get the length of contiguous area, call mext_next_extent()
+ * specified with the ext_cur (initial value is holecheck_path) repeatedly,
+ * until find non-contiguous extent, the start logical block number exceeds
+ * the block_end or the extent points to the last extent.
+ * 4:Exchange the original inode data with donor inode data
+ * from orig_page_offset to seq_end_page.
+ * The start indexes of data are specified as arguments.
+ * That of the original inode is orig_page_offset,
+ * and the donor inode is also orig_page_offset
+ * (To easily handle blocksize != pagesize case, the offset for the
+ * donor inode is block unit).
+ * 5:Update holecheck_path and orig_path to points a next proceeding extent,
+ * then returns to step 2.
+ * 6:Release holecheck_path, orig_path and set the len to moved_len
+ * which shows the number of moved blocks.
+ * The moved_len is useful for the command to calculate the file offset
+ * for starting next move extent ioctl.
+ * 7:Return 0 on success, or a negative error value on failure.
+ */
+int
+ext4_move_extents(struct file *o_filp, struct file *d_filp,
+ __u64 orig_start, __u64 donor_start, __u64 len,
+ __u64 *moved_len)
+{
+ struct inode *orig_inode = o_filp->f_dentry->d_inode;
+ struct inode *donor_inode = d_filp->f_dentry->d_inode;
+ struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
+ struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
+ ext4_lblk_t block_start = orig_start;
+ ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+ ext4_lblk_t rest_blocks;
+ pgoff_t orig_page_offset = 0, seq_end_page;
+ int ret, depth, last_extent = 0;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int data_offset_in_page;
+ int block_len_in_page;
+ int uninit;
+
+ /* protect orig and donor against a truncate */
+ mext_inode_double_lock(orig_inode, donor_inode);
+
+ mext_double_down_read(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+ ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len, *moved_len);
+ mext_double_up_read(orig_inode, donor_inode);
+ if (ret)
+ goto out2;
+
+ /*
+ * Shorten len when the range runs past the last block of the file.
+ * block_end itself is left unchanged.
+ */
+ file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+ block_end = block_start + len - 1;
+ if (file_end < block_end)
+ len -= block_end - file_end;
+
+ /*
+ * NOTE(review): get_ext_path() is handed the path variable and ret by
+ * name, so it is presumably a macro that assigns both -- confirm.
+ */
+ get_ext_path(orig_path, orig_inode, block_start, ret);
+ if (orig_path == NULL)
+ goto out2;
+
+ /* Get path structure to check the hole */
+ get_ext_path(holecheck_path, orig_inode, block_start, ret);
+ if (holecheck_path == NULL)
+ goto out;
+
+ depth = ext_depth(orig_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+ if (ext_cur == NULL) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Get proper extent whose ee_block is beyond block_start
+ * if block_start was within the hole.
+ * Both paths are advanced so that they keep pointing at the same
+ * extent; a negative return from mext_next_extent() is an error.
+ */
+ if (le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+ last_extent = mext_next_extent(orig_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ last_extent = mext_next_extent(orig_inode, orig_path,
+ &ext_dummy);
+ if (last_extent < 0) {
+ ret = last_extent;
+ goto out;
+ }
+ }
+ seq_start = block_start;
+
+ /* No blocks within the specified range. */
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+ ext4_debug("ext4 move extent: The specified range of file "
+ "may be the hole\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Adjust start blocks: the part of ext_cur inside [block_start, block_end] */
+ add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+ /* Adjust tail blocks */
+ if (seq_start + seq_blocks - 1 > block_end)
+ seq_blocks = block_end - seq_start + 1;
+
+ ext_prev = ext_cur;
+ last_extent = mext_next_extent(orig_inode, holecheck_path,
+ &ext_cur);
+ if (last_extent < 0) {
+ ret = last_extent;
+ break;
+ }
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+
+ /*
+ * Extend the length of contiguous block (seq_blocks)
+ * if extents are contiguous.
+ */
+ if (ext4_can_extents_be_merged(orig_inode,
+ ext_prev, ext_cur) &&
+ block_end >= le32_to_cpu(ext_cur->ee_block) &&
+ !last_extent)
+ continue;
+
+ /* Is original extent is uninitialized */
+ uninit = ext4_ext_is_uninitialized(ext_prev);
+
+ data_offset_in_page = seq_start % blocks_per_page;
+
+ /*
+ * Calculate data blocks count that should be swapped
+ * at the first page.
+ */
+ if (data_offset_in_page + seq_blocks > blocks_per_page) {
+ /* Swapped blocks are across pages */
+ block_len_in_page =
+ blocks_per_page - data_offset_in_page;
+ } else {
+ /* Swapped blocks are in a page */
+ block_len_in_page = seq_blocks;
+ }
+
+ orig_page_offset = seq_start >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ seq_end_page = (seq_start + seq_blocks - 1) >>
+ (PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
+ /* The next sequence starts at the extent just found */
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ rest_blocks = seq_blocks;
+
+ /* Discard preallocations of two inodes */
+ down_write(&EXT4_I(orig_inode)->i_data_sem);
+ ext4_discard_preallocations(orig_inode);
+ up_write(&EXT4_I(orig_inode)->i_data_sem);
+
+ down_write(&EXT4_I(donor_inode)->i_data_sem);
+ ext4_discard_preallocations(donor_inode);
+ up_write(&EXT4_I(donor_inode)->i_data_sem);
+
+ while (orig_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+ ret = move_extent_par_page(o_filp, donor_inode,
+ orig_page_offset,
+ data_offset_in_page,
+ block_len_in_page, uninit);
+ if (ret < 0)
+ goto out;
+ orig_page_offset++;
+ /* Count how many blocks we have exchanged */
+ *moved_len += block_len_in_page;
+ BUG_ON(*moved_len > len);
+
+ /* Later pages start at block 0; the last one may be partial */
+ data_offset_in_page = 0;
+ rest_blocks -= block_len_in_page;
+ if (rest_blocks > blocks_per_page)
+ block_len_in_page = blocks_per_page;
+ else
+ block_len_in_page = rest_blocks;
+ }
+
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+ get_ext_path(holecheck_path, orig_inode,
+ seq_start, ret);
+ if (holecheck_path == NULL)
+ break;
+ depth = holecheck_path->p_depth;
+
+ /* Decrease buffer counter */
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ get_ext_path(orig_path, orig_inode, seq_start, ret);
+ if (orig_path == NULL)
+ break;
+
+ /* Restart the accumulation from the freshly looked-up extent */
+ ext_cur = holecheck_path[depth].p_ext;
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+ seq_blocks = 0;
+
+ }
+out:
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+ }
+ if (holecheck_path) {
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+out2:
+ mext_inode_double_unlock(orig_inode, donor_inode);
+
+ if (ret)
+ return ret;
+
+ /*
+ * All of the specified blocks must be exchanged in succeed.
+ * This triggers whenever the loop ended without an error but moved
+ * fewer than len blocks.
+ */
+ BUG_ON(*moved_len != len);
+
+ return 0;
+}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 22098e1cd085..de04013d16ff 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -37,7 +37,6 @@
#include "ext4.h"
#include "ext4_jbd2.h"
-#include "namei.h"
#include "xattr.h"
#include "acl.h"
@@ -750,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
ext4fs_dirhash(de->name, de->name_len, &h);
map_tail--;
map_tail->hash = h.hash;
- map_tail->offs = (u16) ((char *) de - base);
+ map_tail->offs = ((char *) de - base)>>2;
map_tail->size = le16_to_cpu(de->rec_len);
count++;
cond_resched();
@@ -1148,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
unsigned rec_len = 0;
while (count--) {
- struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+ (from + (map->offs<<2));
rec_len = EXT4_DIR_REC_LEN(de->name_len);
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
@@ -1782,7 +1782,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode (handle, dir, mode);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -1816,7 +1816,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode(handle, dir, mode);
+ inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
@@ -1853,7 +1853,8 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
+ &dentry->d_name, 0);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -1997,7 +1998,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
if (!ext4_handle_valid(handle))
return 0;
- lock_super(sb);
+ mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
if (!list_empty(&EXT4_I(inode)->i_orphan))
goto out_unlock;
@@ -2006,9 +2007,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
/* @@@ FIXME: Observation from aviro:
* I think I can trigger J_ASSERT in ext4_orphan_add(). We block
- * here (on lock_super()), so race with ext4_link() which might bump
+ * here (on s_orphan_lock), so race with ext4_link() which might bump
* ->i_nlink. For, say it, character device. Not a regular file,
* not a directory, not a symlink and ->i_nlink > 0.
+ *
+ * tytso, 4/25/2009: I'm not sure how that could happen;
+ * shouldn't the fs core protect us from these sort of
+ * unlink()/link() races?
*/
J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2050,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
out_unlock:
- unlock_super(sb);
+ mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
ext4_std_error(inode->i_sb, err);
return err;
}
@@ -2066,11 +2071,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
if (!ext4_handle_valid(handle))
return 0;
- lock_super(inode->i_sb);
- if (list_empty(&ei->i_orphan)) {
- unlock_super(inode->i_sb);
- return 0;
- }
+ mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+ if (list_empty(&ei->i_orphan))
+ goto out;
ino_next = NEXT_ORPHAN(inode);
prev = ei->i_orphan.prev;
@@ -2120,7 +2123,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
out_err:
ext4_std_error(inode->i_sb, err);
out:
- unlock_super(inode->i_sb);
+ mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
return err;
out_brelse:
@@ -2262,7 +2265,8 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
+ inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
+ &dentry->d_name, 0);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -2533,6 +2537,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.removexattr = generic_removexattr,
#endif
.permission = ext4_permission,
+ .fiemap = ext4_fiemap,
};
const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff36a00..000000000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* linux/fs/ext4/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- * Ben Dooks <ben@simtec.co.uk>
- *
-*/
-
-extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 546c7dd869e1..27eb289eea37 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include "ext4_jbd2.h"
-#include "group.h"
#define outside(b, first, last) ((b) < (first) || (b) >= (last))
#define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb,
if (IS_ERR(handle))
return PTR_ERR(handle);
- lock_super(sb);
+ mutex_lock(&sbi->s_resize_lock);
if (input->group != sbi->s_groups_count) {
err = -EBUSY;
goto exit_journal;
@@ -302,7 +301,7 @@ exit_bh:
brelse(bh);
exit_journal:
- unlock_super(sb);
+ mutex_unlock(&sbi->s_resize_lock);
if ((err2 = ext4_journal_stop(handle)) && !err)
err = err2;
@@ -643,11 +642,12 @@ exit_free:
* important part is that the new block and inode counts are in the backup
* superblocks, and the location of the new group metadata in the GDT backups.
*
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted. We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time. The resize which changed s_groups_count will backup again.
+ * We do not need to take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted. We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time. The resize
+ * which changed s_groups_count will backup again.
*/
static void update_backups(struct super_block *sb,
int blk_off, char *data, int size)
@@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
goto exit_put;
}
- lock_super(sb);
+ mutex_lock(&sbi->s_resize_lock);
if (input->group != sbi->s_groups_count) {
ext4_warning(sb, __func__,
"multiple resizers run on filesystem!");
@@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
/*
* OK, now we've set up the new group. Time to make it active.
*
- * Current kernels don't lock all allocations via lock_super(),
+ * We do not lock all allocations via s_resize_lock
* so we have to be safe wrt. concurrent accesses the group
* data. So we need to be careful to set all of the relevant
* group descriptor data etc. *before* we enable the group.
@@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
*
* The precise rules we use are:
*
- * * Writers of s_groups_count *must* hold lock_super
+ * * Writers of s_groups_count *must* hold s_resize_lock
* AND
* * Writers must perform a smp_wmb() after updating all dependent
* data and before modifying the groups count
*
- * * Readers must hold lock_super() over the access
+ * * Readers must hold s_resize_lock over the access
* OR
* * Readers must perform an smp_rmb() after reading the groups count
* and before reading any dependent data.
@@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
sb->s_dirt = 1;
exit_journal:
- unlock_super(sb);
+ mutex_unlock(&sbi->s_resize_lock);
if ((err2 = ext4_journal_stop(handle)) && !err)
err = err2;
if (!err) {
@@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
- * taking lock_super() below. */
+ * taking the s_resize_lock below. */
o_blocks_count = ext4_blocks_count(es);
o_groups_count = EXT4_SB(sb)->s_groups_count;
@@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
goto exit_put;
}
- lock_super(sb);
+ mutex_lock(&EXT4_SB(sb)->s_resize_lock);
if (o_blocks_count != ext4_blocks_count(es)) {
ext4_warning(sb, __func__,
"multiple resizers run on filesystem!");
- unlock_super(sb);
+ mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
ext4_journal_stop(handle);
err = -EBUSY;
goto exit_put;
@@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
EXT4_SB(sb)->s_sbh))) {
ext4_warning(sb, __func__,
"error %d on journal write access", err);
- unlock_super(sb);
+ mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
ext4_journal_stop(handle);
goto exit_put;
}
ext4_blocks_count_set(es, o_blocks_count + add);
ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
sb->s_dirt = 1;
- unlock_super(sb);
+ mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
o_blocks_count + add);
/* We add the blocks to the bitmap and set the group need init bit */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222..23013d303f81 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/time.h>
+#include <linux/vmalloc.h>
#include <linux/jbd2.h>
#include <linux/slab.h>
#include <linux/init.h>
@@ -36,7 +37,6 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
-#include <linux/marker.h>
#include <linux/log2.h>
#include <linux/crc16.h>
#include <asm/uaccess.h>
@@ -45,16 +45,23 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "namei.h"
-#include "group.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ext4.h>
+
+static int default_mb_history_length = 1000;
+
+module_param_named(default_mb_history_length, default_mb_history_length,
+ int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+ "Default number of entries saved for mb_history");
struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
- struct ext4_super_block *es, int sync);
+static int ext4_commit_super(struct super_block *sb, int sync);
static void ext4_mark_recovery_complete(struct super_block *sb,
struct ext4_super_block *es);
static void ext4_clear_journal_err(struct super_block *sb,
@@ -74,7 +81,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
{
return le32_to_cpu(bg->bg_block_bitmap_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
}
ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -82,7 +89,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
{
return le32_to_cpu(bg->bg_inode_bitmap_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
}
ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -90,7 +97,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
{
return le32_to_cpu(bg->bg_inode_table_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+ (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
}
__u32 ext4_free_blks_count(struct super_block *sb,
@@ -98,7 +105,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
{
return le16_to_cpu(bg->bg_free_blocks_count_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+ (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
}
__u32 ext4_free_inodes_count(struct super_block *sb,
@@ -106,7 +113,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
{
return le16_to_cpu(bg->bg_free_inodes_count_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+ (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
}
__u32 ext4_used_dirs_count(struct super_block *sb,
@@ -114,7 +121,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
{
return le16_to_cpu(bg->bg_used_dirs_count_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+ (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
}
__u32 ext4_itable_unused_count(struct super_block *sb,
@@ -122,7 +129,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
{
return le16_to_cpu(bg->bg_itable_unused_lo) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+ (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
}
void ext4_block_bitmap_set(struct super_block *sb,
@@ -202,8 +209,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
journal = EXT4_SB(sb)->s_journal;
if (journal) {
if (is_journal_aborted(journal)) {
- ext4_abort(sb, __func__,
- "Detected aborted journal");
+ ext4_abort(sb, __func__, "Detected aborted journal");
return ERR_PTR(-EROFS);
}
return jbd2_journal_start(journal, nblocks);
@@ -297,15 +303,15 @@ static void ext4_handle_error(struct super_block *sb)
if (!test_opt(sb, ERRORS_CONT)) {
journal_t *journal = EXT4_SB(sb)->s_journal;
- EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+ EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
if (journal)
jbd2_journal_abort(journal, -EIO);
}
if (test_opt(sb, ERRORS_RO)) {
- printk(KERN_CRIT "Remounting filesystem read-only\n");
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
sb->s_flags |= MS_RDONLY;
}
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
if (test_opt(sb, ERRORS_PANIC))
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
@@ -395,8 +401,6 @@ void ext4_abort(struct super_block *sb, const char *function,
{
va_list args;
- printk(KERN_CRIT "ext4_abort called.\n");
-
va_start(args, fmt);
printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
vprintk(fmt, args);
@@ -409,14 +413,26 @@ void ext4_abort(struct super_block *sb, const char *function,
if (sb->s_flags & MS_RDONLY)
return;
- printk(KERN_CRIT "Remounting filesystem read-only\n");
+ ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
sb->s_flags |= MS_RDONLY;
- EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+ EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
if (EXT4_SB(sb)->s_journal)
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
}
+void ext4_msg (struct super_block * sb, const char *prefix,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+ vprintk(fmt, args);
+ printk("\n");
+ va_end(args);
+}
+
void ext4_warning(struct super_block *sb, const char *function,
const char *fmt, ...)
{
@@ -431,7 +447,7 @@ void ext4_warning(struct super_block *sb, const char *function,
}
void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
- const char *function, const char *fmt, ...)
+ const char *function, const char *fmt, ...)
__releases(bitlock)
__acquires(bitlock)
{
@@ -447,7 +463,7 @@ __acquires(bitlock)
if (test_opt(sb, ERRORS_CONT)) {
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super(sb, es, 0);
+ ext4_commit_super(sb, 0);
return;
}
ext4_unlock_group(sb, grp);
@@ -467,7 +483,6 @@ __acquires(bitlock)
return;
}
-
void ext4_update_dynamic_rev(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -496,7 +511,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
/*
* Open the external journal device
*/
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
{
struct block_device *bdev;
char b[BDEVNAME_SIZE];
@@ -507,7 +522,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
return bdev;
fail:
- printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
+ ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
__bdevname(dev, b), PTR_ERR(bdev));
return NULL;
}
@@ -543,8 +558,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
{
struct list_head *l;
- printk(KERN_ERR "sb orphan head is %d\n",
- le32_to_cpu(sbi->s_es->s_last_orphan));
+ ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+ le32_to_cpu(sbi->s_es->s_last_orphan));
printk(KERN_ERR "sb_info orphan list:\n");
list_for_each(l, &sbi->s_orphan) {
@@ -563,6 +578,12 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
int i, err;
+ lock_super(sb);
+ lock_kernel();
+ if (sb->s_dirt)
+ ext4_commit_super(sb, 1);
+
+ ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
@@ -576,7 +597,7 @@ static void ext4_put_super(struct super_block *sb)
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
}
if (sbi->s_proc) {
remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -586,7 +607,10 @@ static void ext4_put_super(struct super_block *sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
- kfree(sbi->s_flex_groups);
+ if (is_vmalloc_addr(sbi->s_flex_groups))
+ vfree(sbi->s_flex_groups);
+ else
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -625,11 +649,8 @@ static void ext4_put_super(struct super_block *sb)
unlock_super(sb);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
- lock_super(sb);
- lock_kernel();
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
- return;
}
static struct kmem_cache *ext4_inode_cachep;
@@ -644,6 +665,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
+
#ifdef CONFIG_EXT4_FS_POSIX_ACL
ei->i_acl = EXT4_ACL_NOT_CACHED;
ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -664,14 +686,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_allocated_meta_blocks = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+
return &ei->vfs_inode;
}
static void ext4_destroy_inode(struct inode *inode)
{
if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
- printk("EXT4 Inode %p: orphan list check failed!\n",
- EXT4_I(inode));
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "Inode %lu (%p): orphan list check failed!",
+ inode->i_ino, EXT4_I(inode));
print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
EXT4_I(inode), sizeof(struct ext4_inode_info),
true);
@@ -870,12 +894,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noauto_da_alloc");
ext4_show_quota_options(seq, sb);
+
return 0;
}
-
static struct inode *ext4_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+ u64 ino, u32 generation)
{
struct inode *inode;
@@ -904,14 +928,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
}
static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+ int fh_len, int fh_type)
{
return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
ext4_nfs_get_inode);
}
static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+ int fh_len, int fh_type)
{
return generic_fh_to_parent(sb, fid, fh_len, fh_type,
ext4_nfs_get_inode);
@@ -923,7 +947,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
* which would prevent try_to_free_buffers() from freeing them, we must use
* jbd2 layer's try_to_free_buffers() function to release them.
*/
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+ gfp_t wait)
{
journal_t *journal = EXT4_SB(sb)->s_journal;
@@ -992,7 +1017,6 @@ static const struct super_operations ext4_sops = {
.dirty_inode = ext4_dirty_inode,
.delete_inode = ext4_delete_inode,
.put_super = ext4_put_super,
- .write_super = ext4_write_super,
.sync_fs = ext4_sync_fs,
.freeze_fs = ext4_freeze,
.unfreeze_fs = ext4_unfreeze,
@@ -1007,6 +1031,25 @@ static const struct super_operations ext4_sops = {
.bdev_try_to_free_page = bdev_try_to_free_page,
};
+static const struct super_operations ext4_nojournal_sops = {
+ .alloc_inode = ext4_alloc_inode,
+ .destroy_inode = ext4_destroy_inode,
+ .write_inode = ext4_write_inode,
+ .dirty_inode = ext4_dirty_inode,
+ .delete_inode = ext4_delete_inode,
+ .write_super = ext4_write_super,
+ .put_super = ext4_put_super,
+ .statfs = ext4_statfs,
+ .remount_fs = ext4_remount,
+ .clear_inode = ext4_clear_inode,
+ .show_options = ext4_show_options,
+#ifdef CONFIG_QUOTA
+ .quota_read = ext4_quota_read,
+ .quota_write = ext4_quota_write,
+#endif
+ .bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
static const struct export_operations ext4_export_ops = {
.fh_to_dentry = ext4_fh_to_dentry,
.fh_to_parent = ext4_fh_to_parent,
@@ -1023,12 +1066,13 @@ enum {
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore,
+ Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio
};
@@ -1069,6 +1113,7 @@ static const match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"},
+ {Opt_mb_history_length, "mb_history_length=%u"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -1087,6 +1132,8 @@ static const match_table_t tokens = {
{Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
+ {Opt_block_validity, "block_validity"},
+ {Opt_noblock_validity, "noblock_validity"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
{Opt_journal_ioprio, "journal_ioprio=%u"},
{Opt_auto_da_alloc, "auto_da_alloc=%u"},
@@ -1102,8 +1149,9 @@ static ext4_fsblk_t get_sb_block(void **data)
if (!options || strncmp(options, "sb=", 3) != 0)
return 1; /* Default location */
+
options += 3;
- /*todo: use simple_strtoll with >32bit ext4 */
+ /* TODO: use simple_strtoll with >32bit ext4 */
sb_block = simple_strtoul(options, &options, 0);
if (*options && *options != ',') {
printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1113,6 +1161,7 @@ static ext4_fsblk_t get_sb_block(void **data)
if (*options == ',')
options++;
*data = (void *) options;
+
return sb_block;
}
@@ -1206,8 +1255,7 @@ static int parse_options(char *options, struct super_block *sb,
#else
case Opt_user_xattr:
case Opt_nouser_xattr:
- printk(KERN_ERR "EXT4 (no)user_xattr options "
- "not supported\n");
+ ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
break;
#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1220,8 +1268,7 @@ static int parse_options(char *options, struct super_block *sb,
#else
case Opt_acl:
case Opt_noacl:
- printk(KERN_ERR "EXT4 (no)acl options "
- "not supported\n");
+ ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
break;
#endif
case Opt_journal_update:
@@ -1231,16 +1278,16 @@ static int parse_options(char *options, struct super_block *sb,
user to specify an existing inode to be the
journal file. */
if (is_remount) {
- printk(KERN_ERR "EXT4-fs: cannot specify "
- "journal on remount\n");
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
return 0;
}
set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
break;
case Opt_journal_dev:
if (is_remount) {
- printk(KERN_ERR "EXT4-fs: cannot specify "
- "journal on remount\n");
+ ext4_msg(sb, KERN_ERR,
+ "Cannot specify journal on remount");
return 0;
}
if (match_int(&args[0], &option))
@@ -1294,9 +1341,8 @@ static int parse_options(char *options, struct super_block *sb,
if (is_remount) {
if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
!= data_opt) {
- printk(KERN_ERR
- "EXT4-fs: cannot change data "
- "mode on remount\n");
+ ext4_msg(sb, KERN_ERR,
+ "Cannot change data mode on remount");
return 0;
}
} else {
@@ -1310,6 +1356,13 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_data_err_ignore:
clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
break;
+ case Opt_mb_history_length:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0)
+ return 0;
+ sbi->s_mb_history_max = option;
+ break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1319,31 +1372,31 @@ static int parse_options(char *options, struct super_block *sb,
set_qf_name:
if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
- printk(KERN_ERR
- "EXT4-fs: Cannot change journaled "
- "quota options when quota turned on.\n");
+ ext4_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
return 0;
}
qname = match_strdup(&args[0]);
if (!qname) {
- printk(KERN_ERR
- "EXT4-fs: not enough memory for "
- "storing quotafile name.\n");
+ ext4_msg(sb, KERN_ERR,
+ "Not enough memory for "
+ "storing quotafile name");
return 0;
}
if (sbi->s_qf_names[qtype] &&
strcmp(sbi->s_qf_names[qtype], qname)) {
- printk(KERN_ERR
- "EXT4-fs: %s quota file already "
- "specified.\n", QTYPE2NAME(qtype));
+ ext4_msg(sb, KERN_ERR,
+ "%s quota file already "
+ "specified", QTYPE2NAME(qtype));
kfree(qname);
return 0;
}
sbi->s_qf_names[qtype] = qname;
if (strchr(sbi->s_qf_names[qtype], '/')) {
- printk(KERN_ERR
- "EXT4-fs: quotafile must be on "
- "filesystem root.\n");
+ ext4_msg(sb, KERN_ERR,
+ "quotafile must be on "
+ "filesystem root");
kfree(sbi->s_qf_names[qtype]);
sbi->s_qf_names[qtype] = NULL;
return 0;
@@ -1358,9 +1411,9 @@ set_qf_name:
clear_qf_name:
if (sb_any_quota_loaded(sb) &&
sbi->s_qf_names[qtype]) {
- printk(KERN_ERR "EXT4-fs: Cannot change "
+ ext4_msg(sb, KERN_ERR, "Cannot change "
"journaled quota options when "
- "quota turned on.\n");
+ "quota turned on");
return 0;
}
/*
@@ -1377,9 +1430,9 @@ clear_qf_name:
set_qf_format:
if (sb_any_quota_loaded(sb) &&
sbi->s_jquota_fmt != qfmt) {
- printk(KERN_ERR "EXT4-fs: Cannot change "
+ ext4_msg(sb, KERN_ERR, "Cannot change "
"journaled quota options when "
- "quota turned on.\n");
+ "quota turned on");
return 0;
}
sbi->s_jquota_fmt = qfmt;
@@ -1395,8 +1448,8 @@ set_qf_format:
break;
case Opt_noquota:
if (sb_any_quota_loaded(sb)) {
- printk(KERN_ERR "EXT4-fs: Cannot change quota "
- "options when quota turned on.\n");
+ ext4_msg(sb, KERN_ERR, "Cannot change quota "
+ "options when quota turned on");
return 0;
}
clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1407,8 +1460,8 @@ set_qf_format:
case Opt_quota:
case Opt_usrquota:
case Opt_grpquota:
- printk(KERN_ERR
- "EXT4-fs: quota options not supported.\n");
+ ext4_msg(sb, KERN_ERR,
+ "quota options not supported");
break;
case Opt_usrjquota:
case Opt_grpjquota:
@@ -1416,15 +1469,14 @@ set_qf_format:
case Opt_offgrpjquota:
case Opt_jqfmt_vfsold:
case Opt_jqfmt_vfsv0:
- printk(KERN_ERR
- "EXT4-fs: journaled quota options not "
- "supported.\n");
+ ext4_msg(sb, KERN_ERR,
+ "journaled quota options not supported");
break;
case Opt_noquota:
break;
#endif
case Opt_abort:
- set_opt(sbi->s_mount_opt, ABORT);
+ sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
break;
case Opt_nobarrier:
clear_opt(sbi->s_mount_opt, BARRIER);
@@ -1443,8 +1495,9 @@ set_qf_format:
break;
case Opt_resize:
if (!is_remount) {
- printk("EXT4-fs: resize option only available "
- "for remount\n");
+ ext4_msg(sb, KERN_ERR,
+ "resize option only available "
+ "for remount");
return 0;
}
if (match_int(&args[0], &option) != 0)
@@ -1474,14 +1527,21 @@ set_qf_format:
case Opt_delalloc:
set_opt(sbi->s_mount_opt, DELALLOC);
break;
+ case Opt_block_validity:
+ set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+ break;
+ case Opt_noblock_validity:
+ clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+ break;
case Opt_inode_readahead_blks:
if (match_int(&args[0], &option))
return 0;
if (option < 0 || option > (1 << 30))
return 0;
- if (option & (option - 1)) {
- printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
- " must be a power of 2\n");
+ if (!is_power_of_2(option)) {
+ ext4_msg(sb, KERN_ERR,
+ "EXT4-fs: inode_readahead_blks"
+ " must be a power of 2");
return 0;
}
sbi->s_inode_readahead_blks = option;
@@ -1508,9 +1568,9 @@ set_qf_format:
set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
default:
- printk(KERN_ERR
- "EXT4-fs: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+ "or missing value", p);
return 0;
}
}
@@ -1528,21 +1588,21 @@ set_qf_format:
(sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
(sbi->s_qf_names[GRPQUOTA] &&
(sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
- printk(KERN_ERR "EXT4-fs: old and new quota "
- "format mixing.\n");
+ ext4_msg(sb, KERN_ERR, "old and new quota "
+ "format mixing");
return 0;
}
if (!sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT4-fs: journaled quota format "
- "not specified.\n");
+ ext4_msg(sb, KERN_ERR, "journaled quota format "
+ "not specified");
return 0;
}
} else {
if (sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT4-fs: journaled quota format "
+ ext4_msg(sb, KERN_ERR, "journaled quota format "
"specified with no journaling "
- "enabled.\n");
+ "enabled");
return 0;
}
}
@@ -1557,32 +1617,32 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int res = 0;
if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
- printk(KERN_ERR "EXT4-fs warning: revision level too high, "
- "forcing read-only mode\n");
+ ext4_msg(sb, KERN_ERR, "revision level too high, "
+ "forcing read-only mode");
res = MS_RDONLY;
}
if (read_only)
return res;
if (!(sbi->s_mount_state & EXT4_VALID_FS))
- printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
- "running e2fsck is recommended\n");
+ ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+ "running e2fsck is recommended");
else if ((sbi->s_mount_state & EXT4_ERROR_FS))
- printk(KERN_WARNING
- "EXT4-fs warning: mounting fs with errors, "
- "running e2fsck is recommended\n");
+ ext4_msg(sb, KERN_WARNING,
+ "warning: mounting fs with errors, "
+ "running e2fsck is recommended");
else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
- printk(KERN_WARNING
- "EXT4-fs warning: maximal mount count reached, "
- "running e2fsck is recommended\n");
+ ext4_msg(sb, KERN_WARNING,
+ "warning: maximal mount count reached, "
+ "running e2fsck is recommended");
else if (le32_to_cpu(es->s_checkinterval) &&
(le32_to_cpu(es->s_lastcheck) +
le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- printk(KERN_WARNING
- "EXT4-fs warning: checktime reached, "
- "running e2fsck is recommended\n");
- if (!sbi->s_journal)
+ ext4_msg(sb, KERN_WARNING,
+ "warning: checktime reached, "
+ "running e2fsck is recommended");
+ if (!sbi->s_journal)
es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1592,10 +1652,10 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
if (sbi->s_journal)
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
- "bpg=%lu, ipg=%lu, mo=%04lx]\n",
+ "bpg=%lu, ipg=%lu, mo=%04x]\n",
sb->s_blocksize,
sbi->s_groups_count,
EXT4_BLOCKS_PER_GROUP(sb),
@@ -1603,11 +1663,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
sbi->s_mount_opt);
if (EXT4_SB(sb)->s_journal) {
- printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
- sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+ ext4_msg(sb, KERN_INFO, "%s journal on %s",
+ EXT4_SB(sb)->s_journal->j_inode ? "internal" :
"external", EXT4_SB(sb)->s_journal->j_devname);
} else {
- printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+ ext4_msg(sb, KERN_INFO, "no journal");
}
return res;
}
@@ -1616,10 +1676,10 @@ static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
- struct buffer_head *bh;
ext4_group_t flex_group_count;
ext4_group_t flex_group;
int groups_per_flex = 0;
+ size_t size;
int i;
if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,16 +1694,21 @@ static int ext4_fill_flex_info(struct super_block *sb)
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
- sbi->s_flex_groups = kzalloc(flex_group_count *
- sizeof(struct flex_groups), GFP_KERNEL);
+ size = flex_group_count * sizeof(struct flex_groups);
+ sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ sbi->s_flex_groups = vmalloc(size);
+ if (sbi->s_flex_groups)
+ memset(sbi->s_flex_groups, 0, size);
+ }
if (sbi->s_flex_groups == NULL) {
- printk(KERN_ERR "EXT4-fs: not enough memory for "
- "%u flex groups\n", flex_group_count);
+ ext4_msg(sb, KERN_ERR, "not enough memory for "
+ "%u flex groups", flex_group_count);
goto failed;
}
for (i = 0; i < sbi->s_groups_count; i++) {
- gdp = ext4_get_group_desc(sb, i, &bh);
+ gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
@@ -1724,44 +1789,44 @@ static int ext4_check_descriptors(struct super_block *sb)
block_bitmap = ext4_block_bitmap(sb, gdp);
if (block_bitmap < first_block || block_bitmap > last_block) {
- printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Block bitmap for group %u not in group "
- "(block %llu)!\n", i, block_bitmap);
+ "(block %llu)!", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
if (inode_bitmap < first_block || inode_bitmap > last_block) {
- printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Inode bitmap for group %u not in group "
- "(block %llu)!\n", i, inode_bitmap);
+ "(block %llu)!", i, inode_bitmap);
return 0;
}
inode_table = ext4_inode_table(sb, gdp);
if (inode_table < first_block ||
inode_table + sbi->s_itb_per_group - 1 > last_block) {
- printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Inode table for group %u not in group "
- "(block %llu)!\n", i, inode_table);
+ "(block %llu)!", i, inode_table);
return 0;
}
- spin_lock(sb_bgl_lock(sbi, i));
+ ext4_lock_group(sb, i);
if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
- printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
- "Checksum for group %u failed (%u!=%u)\n",
- i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
- gdp)), le16_to_cpu(gdp->bg_checksum));
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Checksum for group %u failed (%u!=%u)",
+ i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+ gdp)), le16_to_cpu(gdp->bg_checksum));
if (!(sb->s_flags & MS_RDONLY)) {
- spin_unlock(sb_bgl_lock(sbi, i));
+ ext4_unlock_group(sb, i);
return 0;
}
}
- spin_unlock(sb_bgl_lock(sbi, i));
+ ext4_unlock_group(sb, i);
if (!flexbg_flag)
first_block += EXT4_BLOCKS_PER_GROUP(sb);
}
ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
- sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+ sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
return 1;
}
@@ -1796,8 +1861,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
if (bdev_read_only(sb->s_bdev)) {
- printk(KERN_ERR "EXT4-fs: write access "
- "unavailable, skipping orphan cleanup.\n");
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, skipping orphan cleanup");
return;
}
@@ -1811,8 +1876,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
if (s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
- sb->s_id);
+ ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
sb->s_flags &= ~MS_RDONLY;
}
#ifdef CONFIG_QUOTA
@@ -1823,9 +1887,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
if (EXT4_SB(sb)->s_qf_names[i]) {
int ret = ext4_quota_on_mount(sb, i);
if (ret < 0)
- printk(KERN_ERR
- "EXT4-fs: Cannot turn on journaled "
- "quota: error %d\n", ret);
+ ext4_msg(sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: error %d", ret);
}
}
#endif
@@ -1842,16 +1906,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
vfs_dq_init(inode);
if (inode->i_nlink) {
- printk(KERN_DEBUG
- "%s: truncating inode %lu to %lld bytes\n",
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: truncating inode %lu to %lld bytes",
__func__, inode->i_ino, inode->i_size);
jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
ext4_truncate(inode);
nr_truncates++;
} else {
- printk(KERN_DEBUG
- "%s: deleting unreferenced inode %lu\n",
+ ext4_msg(sb, KERN_DEBUG,
+ "%s: deleting unreferenced inode %lu",
__func__, inode->i_ino);
jbd_debug(2, "deleting unreferenced inode %lu\n",
inode->i_ino);
@@ -1863,11 +1927,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
if (nr_orphans)
- printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
- sb->s_id, PLURAL(nr_orphans));
+ ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+ PLURAL(nr_orphans));
if (nr_truncates)
- printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
- sb->s_id, PLURAL(nr_truncates));
+ ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+ PLURAL(nr_truncates));
#ifdef CONFIG_QUOTA
/* Turn quotas off */
for (i = 0; i < MAXQUOTAS; i++) {
@@ -1877,6 +1941,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
#endif
sb->s_flags = s_flags; /* Restore MS_RDONLY status */
}
+
/*
* Maximal extent format file size.
* Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1927,19 +1992,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
loff_t res = EXT4_NDIR_BLOCKS;
int meta_blocks;
loff_t upper_limit;
- /* This is calculated to be the largest file size for a
- * dense, bitmapped file such that the total number of
- * sectors in the file, including data and all indirect blocks,
- * does not exceed 2^48 -1
- * __u32 i_blocks_lo and _u16 i_blocks_high representing the
- * total number of 512 bytes blocks of the file
+ /* This is calculated to be the largest file size for a dense, block
+ * mapped file such that the file's total number of 512-byte sectors,
+ * including data and all indirect blocks, does not exceed (2^48 - 1).
+ *
+ * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
+ * number of 512-byte sectors of the file.
*/
if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * !has_huge_files or CONFIG_LBD is not enabled
- * implies the inode i_block represent total blocks in
- * 512 bytes 32 == size of vfs inode i_blocks * 8
+ * !has_huge_files or CONFIG_LBD not enabled implies that
+ * the inode i_blocks field represents total file blocks in
+ * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
*/
upper_limit = (1LL << 32) - 1;
@@ -1981,7 +2046,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
- ext4_fsblk_t logical_sb_block, int nr)
+ ext4_fsblk_t logical_sb_block, int nr)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t bg, first_meta_bg;
@@ -1995,6 +2060,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
bg = sbi->s_desc_per_block * nr;
if (ext4_bg_has_super(sb, bg))
has_super = 1;
+
return (has_super + ext4_group_first_block_no(sb, bg));
}
@@ -2091,8 +2157,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
if (parse_strtoul(buf, 0x40000000, &t))
return -EINVAL;
- /* inode_readahead_blks must be a power of 2 */
- if (t & (t-1))
+ if (!is_power_of_2(t))
return -EINVAL;
sbi->s_inode_readahead_blks = t;
@@ -2100,7 +2165,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
}
static ssize_t sbi_ui_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
+ struct ext4_sb_info *sbi, char *buf)
{
unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
@@ -2141,6 +2206,7 @@ EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
@@ -2153,6 +2219,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
ATTR_LIST(mb_max_to_scan),
ATTR_LIST(mb_min_to_scan),
@@ -2205,7 +2272,6 @@ static struct kobj_type ext4_ktype = {
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
-
{
struct buffer_head *bh;
struct ext4_super_block *es = NULL;
@@ -2256,7 +2322,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
if (!blocksize) {
- printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+ ext4_msg(sb, KERN_ERR, "unable to set blocksize");
goto out_fail;
}
@@ -2272,7 +2338,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (!(bh = sb_bread(sb, logical_sb_block))) {
- printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
+ ext4_msg(sb, KERN_ERR, "unable to read superblock");
goto out_fail;
}
/*
@@ -2321,6 +2387,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_mb_history_max = default_mb_history_length;
set_opt(sbi->s_mount_opt, BARRIER);
@@ -2330,7 +2397,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
set_opt(sbi->s_mount_opt, DELALLOC);
-
if (!parse_options((char *) data, sb, &journal_devnum,
&journal_ioprio, NULL, 0))
goto failed_mount;
@@ -2342,9 +2408,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
- printk(KERN_WARNING
- "EXT4-fs warning: feature flags set on rev 0 fs, "
- "running e2fsck is recommended\n");
+ ext4_msg(sb, KERN_WARNING,
+ "feature flags set on rev 0 fs, "
+ "running e2fsck is recommended");
/*
* Check feature flags regardless of the revision level, since we
@@ -2353,16 +2419,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
if (features) {
- printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
- "unsupported optional features (%x).\n", sb->s_id,
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount because of "
+ "unsupported optional features (%x)",
(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
~EXT4_FEATURE_INCOMPAT_SUPP));
goto failed_mount;
}
features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
- "unsupported optional features (%x).\n", sb->s_id,
+ ext4_msg(sb, KERN_ERR,
+ "Couldn't mount RDWR because of "
+ "unsupported optional features (%x)",
(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
~EXT4_FEATURE_RO_COMPAT_SUPP));
goto failed_mount;
@@ -2376,9 +2444,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
if (sizeof(root->i_blocks) < sizeof(u64) &&
!(sb->s_flags & MS_RDONLY)) {
- printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
+ ext4_msg(sb, KERN_ERR, "Filesystem with huge "
"files cannot be mounted read-write "
- "without CONFIG_LBD.\n", sb->s_id);
+ "without CONFIG_LBD");
goto failed_mount;
}
}
@@ -2386,17 +2454,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (blocksize < EXT4_MIN_BLOCK_SIZE ||
blocksize > EXT4_MAX_BLOCK_SIZE) {
- printk(KERN_ERR
- "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
- blocksize, sb->s_id);
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported filesystem blocksize %d", blocksize);
goto failed_mount;
}
if (sb->s_blocksize != blocksize) {
-
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+ ext4_msg(sb, KERN_ERR, "bad block size %d",
blocksize);
goto failed_mount;
}
@@ -2406,15 +2472,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
offset = do_div(logical_sb_block, blocksize);
bh = sb_bread(sb, logical_sb_block);
if (!bh) {
- printk(KERN_ERR
- "EXT4-fs: Can't read superblock on 2nd try.\n");
+ ext4_msg(sb, KERN_ERR,
+ "Can't read superblock on 2nd try");
goto failed_mount;
}
es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- printk(KERN_ERR
- "EXT4-fs: Magic mismatch, very weird !\n");
+ ext4_msg(sb, KERN_ERR,
+ "Magic mismatch, very weird!");
goto failed_mount;
}
}
@@ -2432,30 +2498,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
(sbi->s_inode_size > blocksize)) {
- printk(KERN_ERR
- "EXT4-fs: unsupported inode size: %d\n",
+ ext4_msg(sb, KERN_ERR,
+ "unsupported inode size: %d",
sbi->s_inode_size);
goto failed_mount;
}
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
}
+
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
!is_power_of_2(sbi->s_desc_size)) {
- printk(KERN_ERR
- "EXT4-fs: unsupported descriptor size %lu\n",
+ ext4_msg(sb, KERN_ERR,
+ "unsupported descriptor size %lu",
sbi->s_desc_size);
goto failed_mount;
}
} else
sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
goto cantfind_ext4;
+
sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0)
goto cantfind_ext4;
@@ -2466,6 +2535,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_mount_state = le16_to_cpu(es->s_state);
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2483,25 +2553,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (sbi->s_blocks_per_group > blocksize * 8) {
- printk(KERN_ERR
- "EXT4-fs: #blocks per group too big: %lu\n",
+ ext4_msg(sb, KERN_ERR,
+ "#blocks per group too big: %lu",
sbi->s_blocks_per_group);
goto failed_mount;
}
if (sbi->s_inodes_per_group > blocksize * 8) {
- printk(KERN_ERR
- "EXT4-fs: #inodes per group too big: %lu\n",
+ ext4_msg(sb, KERN_ERR,
+ "#inodes per group too big: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
if (ext4_blocks_count(es) >
(sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
- printk(KERN_ERR "EXT4-fs: filesystem on %s:"
- " too large to mount safely\n", sb->s_id);
+ ext4_msg(sb, KERN_ERR, "filesystem"
+ " too large to mount safely");
if (sizeof(sector_t) < 8)
- printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
- "enabled\n");
+ ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
goto failed_mount;
}
@@ -2511,21 +2580,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* check blocks count against device size */
blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
if (blocks_count && ext4_blocks_count(es) > blocks_count) {
- printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
- "exceeds size of device (%llu blocks)\n",
+ ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+ "exceeds size of device (%llu blocks)",
ext4_blocks_count(es), blocks_count);
goto failed_mount;
}
- /*
- * It makes no sense for the first data block to be beyond the end
- * of the filesystem.
- */
- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
- "block %u is beyond end of filesystem (%llu)\n",
- le32_to_cpu(es->s_first_data_block),
- ext4_blocks_count(es));
+ /*
+ * It makes no sense for the first data block to be beyond the end
+ * of the filesystem.
+ */
+ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+ ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
+ "block %u is beyond end of filesystem (%llu)",
+ le32_to_cpu(es->s_first_data_block),
+ ext4_blocks_count(es));
goto failed_mount;
}
blocks_count = (ext4_blocks_count(es) -
@@ -2533,9 +2602,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_BLOCKS_PER_GROUP(sb) - 1);
do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
"(block count %llu, first data block %u, "
- "blocks per group %lu)\n", sbi->s_groups_count,
+ "blocks per group %lu)", sbi->s_groups_count,
ext4_blocks_count(es),
le32_to_cpu(es->s_first_data_block),
EXT4_BLOCKS_PER_GROUP(sb));
@@ -2547,7 +2616,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
- printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ ext4_msg(sb, KERN_ERR, "not enough memory");
goto failed_mount;
}
@@ -2562,21 +2631,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
block = descriptor_loc(sb, logical_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
- printk(KERN_ERR "EXT4-fs: "
- "can't read group descriptor %d\n", i);
+ ext4_msg(sb, KERN_ERR,
+ "can't read group descriptor %d", i);
db_count = i;
goto failed_mount2;
}
}
if (!ext4_check_descriptors(sb)) {
- printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
goto failed_mount2;
}
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
if (!ext4_fill_flex_info(sb)) {
- printk(KERN_ERR
- "EXT4-fs: unable to initialize "
- "flex_bg meta info!\n");
+ ext4_msg(sb, KERN_ERR,
+ "unable to initialize "
+ "flex_bg meta info!");
goto failed_mount2;
}
@@ -2598,7 +2667,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
}
if (err) {
- printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
goto failed_mount3;
}
@@ -2607,7 +2676,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/*
* set up enough so that it can read an inode
*/
- sb->s_op = &ext4_sops;
+ if (!test_opt(sb, NOLOAD) &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ sb->s_op = &ext4_sops;
+ else
+ sb->s_op = &ext4_nojournal_sops;
sb->s_export_op = &ext4_export_ops;
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
@@ -2615,6 +2688,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->dq_op = &ext4_quota_operations;
#endif
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+ mutex_init(&sbi->s_orphan_lock);
+ mutex_init(&sbi->s_resize_lock);
sb->s_root = NULL;
@@ -2632,13 +2707,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount3;
if (!(sb->s_flags & MS_RDONLY) &&
EXT4_SB(sb)->s_journal->j_failed_commit) {
- printk(KERN_CRIT "EXT4-fs error (device %s): "
+ ext4_msg(sb, KERN_CRIT, "error: "
"ext4_fill_super: Journal transaction "
- "%u is corrupt\n", sb->s_id,
+ "%u is corrupt",
EXT4_SB(sb)->s_journal->j_failed_commit);
if (test_opt(sb, ERRORS_RO)) {
- printk(KERN_CRIT
- "Mounting filesystem read-only\n");
+ ext4_msg(sb, KERN_CRIT,
+ "Mounting filesystem read-only");
sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2646,14 +2721,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (test_opt(sb, ERRORS_PANIC)) {
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
goto failed_mount4;
}
}
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
- printk(KERN_ERR "EXT4-fs: required journal recovery "
- "suppressed and not mounted read-only\n");
+ ext4_msg(sb, KERN_ERR, "required journal recovery "
+ "suppressed and not mounted read-only");
goto failed_mount4;
} else {
clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2666,7 +2741,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (ext4_blocks_count(es) > 0xffffffffULL &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
- printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
+ ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
goto failed_mount4;
}
@@ -2704,8 +2779,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
case EXT4_MOUNT_WRITEBACK_DATA:
if (!jbd2_journal_check_available_features
(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
- printk(KERN_ERR "EXT4-fs: Journal does not support "
- "requested data journaling mode\n");
+ ext4_msg(sb, KERN_ERR, "Journal does not support "
+ "requested data journaling mode");
goto failed_mount4;
}
default:
@@ -2717,8 +2792,8 @@ no_journal:
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
- printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
- "its supported only with writeback mode\n");
+ ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
+ "its supported only with writeback mode");
clear_opt(sbi->s_mount_opt, NOBH);
}
}
@@ -2729,18 +2804,18 @@ no_journal:
root = ext4_iget(sb, EXT4_ROOT_INO);
if (IS_ERR(root)) {
- printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+ ext4_msg(sb, KERN_ERR, "get root inode failed");
ret = PTR_ERR(root);
goto failed_mount4;
}
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
iput(root);
- printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
+ ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
goto failed_mount4;
}
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
- printk(KERN_ERR "EXT4-fs: get root dentry failed\n");
+ ext4_msg(sb, KERN_ERR, "get root dentry failed");
iput(root);
ret = -ENOMEM;
goto failed_mount4;
@@ -2769,22 +2844,29 @@ no_journal:
sbi->s_inode_size) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
- printk(KERN_INFO "EXT4-fs: required extra inode space not"
- "available.\n");
+ ext4_msg(sb, KERN_INFO, "required extra inode space not"
+ "available");
}
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
- "requested data journaling mode\n");
+ ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
+ "requested data journaling mode");
clear_opt(sbi->s_mount_opt, DELALLOC);
} else if (test_opt(sb, DELALLOC))
- printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+ ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
+
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+ "zone (%d)\n", err);
+ goto failed_mount4;
+ }
ext4_ext_init(sb);
err = ext4_mb_init(sb, needs_recovery);
if (err) {
- printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
- err);
+ ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+ err);
goto failed_mount4;
}
@@ -2798,19 +2880,11 @@ no_journal:
goto failed_mount4;
};
- /*
- * akpm: core read_super() calls in here with the superblock locked.
- * That deadlocks, because orphan cleanup needs to lock the superblock
- * in numerous places. Here we just pop the lock - it's relatively
- * harmless, because we are now ready to accept write_super() requests,
- * and aviro says that's the only reason for hanging onto the
- * superblock lock.
- */
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
if (needs_recovery) {
- printk(KERN_INFO "EXT4-fs: recovery complete.\n");
+ ext4_msg(sb, KERN_INFO, "recovery complete");
ext4_mark_recovery_complete(sb, es);
}
if (EXT4_SB(sb)->s_journal) {
@@ -2823,25 +2897,30 @@ no_journal:
} else
descr = "out journal";
- printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
- sb->s_id, descr);
+ ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
lock_kernel();
return 0;
cantfind_ext4:
if (!silent)
- printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
- sb->s_id);
+ ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
failed_mount4:
- printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+ ext4_msg(sb, KERN_ERR, "mount failed");
+ ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3:
+ if (sbi->s_flex_groups) {
+ if (is_vmalloc_addr(sbi->s_flex_groups))
+ vfree(sbi->s_flex_groups);
+ else
+ kfree(sbi->s_flex_groups);
+ }
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2862,6 +2941,7 @@ failed_mount:
brelse(bh);
out_fail:
sb->s_fs_info = NULL;
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
lock_kernel();
return ret;
@@ -2906,27 +2986,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
journal_inode = ext4_iget(sb, journal_inum);
if (IS_ERR(journal_inode)) {
- printk(KERN_ERR "EXT4-fs: no journal found.\n");
+ ext4_msg(sb, KERN_ERR, "no journal found");
return NULL;
}
if (!journal_inode->i_nlink) {
make_bad_inode(journal_inode);
iput(journal_inode);
- printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+ ext4_msg(sb, KERN_ERR, "journal inode is deleted");
return NULL;
}
jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode)) {
- printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+ ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
return NULL;
}
journal = jbd2_journal_init_inode(journal_inode);
if (!journal) {
- printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+ ext4_msg(sb, KERN_ERR, "Could not load journal inode");
iput(journal_inode);
return NULL;
}
@@ -2950,22 +3030,22 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
- bdev = ext4_blkdev_get(j_dev);
+ bdev = ext4_blkdev_get(j_dev, sb);
if (bdev == NULL)
return NULL;
if (bd_claim(bdev, sb)) {
- printk(KERN_ERR
- "EXT4-fs: failed to claim external journal device.\n");
+ ext4_msg(sb, KERN_ERR,
+ "failed to claim external journal device");
blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
return NULL;
}
blocksize = sb->s_blocksize;
- hblock = bdev_hardsect_size(bdev);
+ hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
- printk(KERN_ERR
- "EXT4-fs: blocksize too small for journal device.\n");
+ ext4_msg(sb, KERN_ERR,
+ "blocksize too small for journal device");
goto out_bdev;
}
@@ -2973,8 +3053,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
set_blocksize(bdev, blocksize);
if (!(bh = __bread(bdev, sb_block, blocksize))) {
- printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
- "external journal\n");
+ ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
+ "external journal");
goto out_bdev;
}
@@ -2982,14 +3062,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- printk(KERN_ERR "EXT4-fs: external journal has "
- "bad superblock\n");
+ ext4_msg(sb, KERN_ERR, "external journal has "
+ "bad superblock");
brelse(bh);
goto out_bdev;
}
if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
- printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+ ext4_msg(sb, KERN_ERR, "journal UUID does not match");
brelse(bh);
goto out_bdev;
}
@@ -3001,25 +3081,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
start, len, blocksize);
if (!journal) {
- printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+ ext4_msg(sb, KERN_ERR, "failed to create device journal");
goto out_bdev;
}
journal->j_private = sb;
ll_rw_block(READ, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
- printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+ ext4_msg(sb, KERN_ERR, "I/O error on journal device");
goto out_journal;
}
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
- printk(KERN_ERR "EXT4-fs: External journal has more than one "
- "user (unsupported) - %d\n",
+ ext4_msg(sb, KERN_ERR, "External journal has more than one "
+ "user (unsupported) - %d",
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
EXT4_SB(sb)->journal_bdev = bdev;
ext4_init_journal_params(sb, journal);
return journal;
+
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
@@ -3041,8 +3122,8 @@ static int ext4_load_journal(struct super_block *sb,
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
- printk(KERN_INFO "EXT4-fs: external journal device major/minor "
- "numbers have changed\n");
+ ext4_msg(sb, KERN_INFO, "external journal device major/minor "
+ "numbers have changed");
journal_dev = new_decode_dev(journal_devnum);
} else
journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3054,24 +3135,23 @@ static int ext4_load_journal(struct super_block *sb,
* crash? For recovery, we need to check in advance whether we
* can get read-write access to the device.
*/
-
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
if (sb->s_flags & MS_RDONLY) {
- printk(KERN_INFO "EXT4-fs: INFO: recovery "
- "required on readonly filesystem.\n");
+ ext4_msg(sb, KERN_INFO, "INFO: recovery "
+ "required on readonly filesystem");
if (really_read_only) {
- printk(KERN_ERR "EXT4-fs: write access "
- "unavailable, cannot proceed.\n");
+ ext4_msg(sb, KERN_ERR, "write access "
+ "unavailable, cannot proceed");
return -EROFS;
}
- printk(KERN_INFO "EXT4-fs: write access will "
- "be enabled during recovery.\n");
+ ext4_msg(sb, KERN_INFO, "write access will "
+ "be enabled during recovery");
}
}
if (journal_inum && journal_dev) {
- printk(KERN_ERR "EXT4-fs: filesystem has both journal "
- "and inode journals!\n");
+ ext4_msg(sb, KERN_ERR, "filesystem has both journal "
+ "and inode journals!");
return -EINVAL;
}
@@ -3084,14 +3164,14 @@ static int ext4_load_journal(struct super_block *sb,
}
if (journal->j_flags & JBD2_BARRIER)
- printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+ ext4_msg(sb, KERN_INFO, "barriers enabled");
else
- printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+ ext4_msg(sb, KERN_INFO, "barriers disabled");
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
err = jbd2_journal_update_format(journal);
if (err) {
- printk(KERN_ERR "EXT4-fs: error updating journal.\n");
+ ext4_msg(sb, KERN_ERR, "error updating journal");
jbd2_journal_destroy(journal);
return err;
}
@@ -3103,7 +3183,7 @@ static int ext4_load_journal(struct super_block *sb,
err = jbd2_journal_load(journal);
if (err) {
- printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+ ext4_msg(sb, KERN_ERR, "error loading journal");
jbd2_journal_destroy(journal);
return err;
}
@@ -3114,18 +3194,17 @@ static int ext4_load_journal(struct super_block *sb,
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
es->s_journal_dev = cpu_to_le32(journal_devnum);
- sb->s_dirt = 1;
/* Make sure we flush the recovery flag to disk. */
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
}
return 0;
}
-static int ext4_commit_super(struct super_block *sb,
- struct ext4_super_block *es, int sync)
+static int ext4_commit_super(struct super_block *sb, int sync)
{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
int error = 0;
@@ -3140,8 +3219,8 @@ static int ext4_commit_super(struct super_block *sb,
* be remapped. Nothing we can do but to retry the
* write and hope for the best.
*/
- printk(KERN_ERR "EXT4-fs: previous I/O error to "
- "superblock detected for %s.\n", sb->s_id);
+ ext4_msg(sb, KERN_ERR, "previous I/O error to "
+ "superblock detected");
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
@@ -3154,7 +3233,7 @@ static int ext4_commit_super(struct super_block *sb,
&EXT4_SB(sb)->s_freeblocks_counter));
es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
&EXT4_SB(sb)->s_freeinodes_counter));
-
+ sb->s_dirt = 0;
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
if (sync) {
@@ -3164,8 +3243,8 @@ static int ext4_commit_super(struct super_block *sb,
error = buffer_write_io_error(sbh);
if (error) {
- printk(KERN_ERR "EXT4-fs: I/O error while writing "
- "superblock for %s.\n", sb->s_id);
+ ext4_msg(sb, KERN_ERR, "I/O error while writing "
+ "superblock");
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
@@ -3173,7 +3252,6 @@ static int ext4_commit_super(struct super_block *sb,
return error;
}
-
/*
* Have we just finished recovery? If so, and if we are mounting (or
* remounting) the filesystem readonly, then we will end up with a
@@ -3192,14 +3270,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
if (jbd2_journal_flush(journal) < 0)
goto out;
- lock_super(sb);
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
sb->s_flags & MS_RDONLY) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- sb->s_dirt = 0;
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
}
- unlock_super(sb);
out:
jbd2_journal_unlock_updates(journal);
@@ -3238,7 +3313,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
jbd2_journal_clear_err(journal);
}
@@ -3257,29 +3332,17 @@ int ext4_force_commit(struct super_block *sb)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal) {
- sb->s_dirt = 0;
+ if (journal)
ret = ext4_journal_force_commit(journal);
- }
return ret;
}
-/*
- * Ext4 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point. (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
- */
static void ext4_write_super(struct super_block *sb)
{
- if (EXT4_SB(sb)->s_journal) {
- if (mutex_trylock(&sb->s_lock) != 0)
- BUG();
- sb->s_dirt = 0;
- } else {
- ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
- }
+ lock_super(sb);
+ ext4_commit_super(sb, 1);
+ unlock_super(sb);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3287,17 +3350,10 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
int ret = 0;
tid_t target;
- trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
- sb->s_dirt = 0;
- if (EXT4_SB(sb)->s_journal) {
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
- &target)) {
- if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
- target);
- }
- } else {
- ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+ trace_ext4_sync_fs(sb, wait);
+ if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ if (wait)
+ jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
}
return ret;
}
@@ -3310,34 +3366,32 @@ static int ext4_freeze(struct super_block *sb)
{
int error = 0;
journal_t *journal;
- sb->s_dirt = 0;
- if (!(sb->s_flags & MS_RDONLY)) {
- journal = EXT4_SB(sb)->s_journal;
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
- if (journal) {
- /* Now we set up the journal barrier. */
- jbd2_journal_lock_updates(journal);
+ journal = EXT4_SB(sb)->s_journal;
- /*
- * We don't want to clear needs_recovery flag when we
- * failed to flush the journal.
- */
- error = jbd2_journal_flush(journal);
- if (error < 0)
- goto out;
- }
+ /* Now we set up the journal barrier. */
+ jbd2_journal_lock_updates(journal);
- /* Journal blocked and flushed, clear needs_recovery flag. */
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
- if (error)
- goto out;
+ /*
+ * Don't clear the needs_recovery flag if we failed to flush
+ * the journal.
+ */
+ error = jbd2_journal_flush(journal);
+ if (error < 0) {
+ out:
+ jbd2_journal_unlock_updates(journal);
+ return error;
}
+
+ /* Journal blocked and flushed, clear needs_recovery flag. */
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ error = ext4_commit_super(sb, 1);
+ if (error)
+ goto out;
return 0;
-out:
- jbd2_journal_unlock_updates(journal);
- return error;
}
/*
@@ -3346,14 +3400,15 @@ out:
*/
static int ext4_unfreeze(struct super_block *sb)
{
- if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
- lock_super(sb);
- /* Reser the needs_recovery flag before the fs is unlocked. */
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
- ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
- unlock_super(sb);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
- }
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ lock_super(sb);
+ /* Reset the needs_recovery flag before the fs is unlocked. */
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_commit_super(sb, 1);
+ unlock_super(sb);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return 0;
}
@@ -3371,7 +3426,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
int i;
#endif
+ lock_kernel();
+
/* Store the original options */
+ lock_super(sb);
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
old_opts.s_resuid = sbi->s_resuid;
@@ -3396,7 +3454,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, __func__, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -3411,7 +3469,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
n_blocks_count > ext4_blocks_count(es)) {
- if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
+ if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
err = -EROFS;
goto restore_opts;
}
@@ -3432,22 +3490,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
(sbi->s_mount_state & EXT4_VALID_FS))
es->s_state = cpu_to_le16(sbi->s_mount_state);
- /*
- * We have to unlock super so that we can wait for
- * transactions.
- */
- if (sbi->s_journal) {
- unlock_super(sb);
+ if (sbi->s_journal)
ext4_mark_recovery_complete(sb, es);
- lock_super(sb);
- }
} else {
int ret;
if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
~EXT4_FEATURE_RO_COMPAT_SUPP))) {
- printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+ ext4_msg(sb, KERN_WARNING, "couldn't "
"remount RDWR because of unsupported "
- "optional features (%x).\n", sb->s_id,
+ "optional features (%x)",
(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
~EXT4_FEATURE_RO_COMPAT_SUPP));
err = -EROFS;
@@ -3456,17 +3507,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
/*
* Make sure the group descriptor checksums
- * are sane. If they aren't, refuse to
- * remount r/w.
+ * are sane. If they aren't, refuse to remount r/w.
*/
for (g = 0; g < sbi->s_groups_count; g++) {
struct ext4_group_desc *gdp =
ext4_get_group_desc(sb, g, NULL);
if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
- printk(KERN_ERR
- "EXT4-fs: ext4_remount: "
- "Checksum for group %u failed (%u!=%u)\n",
+ ext4_msg(sb, KERN_ERR,
+ "ext4_remount: Checksum for group %u failed (%u!=%u)",
g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
le16_to_cpu(gdp->bg_checksum));
err = -EINVAL;
@@ -3480,11 +3529,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* require a full umount/remount for now.
*/
if (es->s_last_orphan) {
- printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+ ext4_msg(sb, KERN_WARNING, "Couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
- "umount/remount instead.\n",
- sb->s_id);
+ "umount/remount instead");
err = -EINVAL;
goto restore_opts;
}
@@ -3504,8 +3552,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags &= ~MS_RDONLY;
}
}
+ ext4_setup_system_zone(sb);
if (sbi->s_journal == NULL)
- ext4_commit_super(sb, es, 1);
+ ext4_commit_super(sb, 1);
#ifdef CONFIG_QUOTA
/* Release old quota file names */
@@ -3514,7 +3563,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_qf_names[i] != sbi->s_qf_names[i])
kfree(old_opts.s_qf_names[i]);
#endif
+ unlock_super(sb);
+ unlock_kernel();
return 0;
+
restore_opts:
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3532,6 +3584,8 @@ restore_opts:
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
+ unlock_super(sb);
+ unlock_kernel();
return err;
}
@@ -3545,9 +3599,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
if (test_opt(sb, MINIX_DF)) {
sbi->s_overhead_last = 0;
} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
- ext4_group_t ngroups = sbi->s_groups_count, i;
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
ext4_fsblk_t overhead = 0;
- smp_rmb();
/*
* Compute the overhead (FS structures). This is constant
@@ -3599,11 +3652,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+
return 0;
}
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
- * is locked for write. Otherwise the are possible deadlocks:
+/* Helper function for writing quotas on sync - we need to start transaction
+ * before quota file is locked for write. Otherwise there are possible deadlocks:
* Process 1 Process 2
* ext4_create() quota_sync()
* jbd2_journal_start() write_dquot()
@@ -3627,7 +3681,7 @@ static int ext4_write_dquot(struct dquot *dquot)
inode = dquot_to_inode(dquot);
handle = ext4_journal_start(inode,
- EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+ EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit(dquot);
@@ -3643,7 +3697,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
handle_t *handle;
handle = ext4_journal_start(dquot_to_inode(dquot),
- EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+ EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_acquire(dquot);
@@ -3659,7 +3713,7 @@ static int ext4_release_dquot(struct dquot *dquot)
handle_t *handle;
handle = ext4_journal_start(dquot_to_inode(dquot),
- EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+ EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
@@ -3707,7 +3761,7 @@ static int ext4_write_info(struct super_block *sb, int type)
static int ext4_quota_on_mount(struct super_block *sb, int type)
{
return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
- EXT4_SB(sb)->s_jquota_fmt, type);
+ EXT4_SB(sb)->s_jquota_fmt, type);
}
/*
@@ -3738,9 +3792,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (EXT4_SB(sb)->s_qf_names[type]) {
/* Quotafile not in fs root? */
if (path.dentry->d_parent != sb->s_root)
- printk(KERN_WARNING
- "EXT4-fs: Quota file not on filesystem root. "
- "Journaled quota will not work.\n");
+ ext4_msg(sb, KERN_WARNING,
+ "Quota file not on filesystem root. "
+ "Journaled quota will not work");
}
/*
@@ -3823,8 +3877,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
handle_t *handle = journal_current_handle();
if (EXT4_SB(sb)->s_journal && !handle) {
- printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
- " cancelled because transaction is not started.\n",
+ ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+ " cancelled because transaction is not started",
(unsigned long long)off, (unsigned long long)len);
return -EIO;
}
@@ -3878,10 +3932,10 @@ out:
#endif
-static int ext4_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
{
- return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+ return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
}
static struct file_system_type ext4_fs_type = {
@@ -3893,14 +3947,14 @@ static struct file_system_type ext4_fs_type = {
};
#ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data,struct vfsmount *mnt)
{
- printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
- "to mount using ext4\n");
- printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
- "will go away by 2.6.31\n");
- return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+ printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
+ "to mount using ext4\n", dev_name);
+ printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
+ "will go away by 2.6.31\n", dev_name);
+ return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
}
static struct file_system_type ext4dev_fs_type = {
@@ -3917,13 +3971,16 @@ static int __init init_ext4_fs(void)
{
int err;
+ err = init_ext4_system_zone();
+ if (err)
+ return err;
ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
if (!ext4_kset)
- return -ENOMEM;
+ goto out4;
ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = init_ext4_mballoc();
if (err)
- return err;
+ goto out3;
err = init_ext4_xattr();
if (err)
@@ -3948,6 +4005,11 @@ out1:
exit_ext4_xattr();
out2:
exit_ext4_mballoc();
+out3:
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+out4:
+ exit_ext4_system_zone();
return err;
}
@@ -3962,6 +4024,7 @@ static void __exit exit_ext4_fs(void)
exit_ext4_mballoc();
remove_proc_entry("fs/ext4", NULL);
kset_unregister(ext4_kset);
+ exit_ext4_system_zone();
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");