diff options
Diffstat (limited to 'fs/btrfs/reada.c')
-rw-r--r-- | fs/btrfs/reada.c | 268 |
1 files changed, 134 insertions, 134 deletions
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 619f92963e27..b892914968c1 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -72,7 +72,7 @@ struct reada_extent { spinlock_t lock; struct reada_zone *zones[BTRFS_MAX_MIRRORS]; int nzones; - struct btrfs_device *scheduled_for; + int scheduled; }; struct reada_zone { @@ -101,67 +101,53 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info); static void __reada_start_machine(struct btrfs_fs_info *fs_info); static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation); + struct btrfs_key *top, u64 generation); /* recurses */ /* in case of err, eb might be NULL */ -static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) +static void __readahead_hook(struct btrfs_fs_info *fs_info, + struct reada_extent *re, struct extent_buffer *eb, + u64 start, int err) { int level = 0; int nritems; int i; u64 bytenr; u64 generation; - struct reada_extent *re; - struct btrfs_fs_info *fs_info = root->fs_info; struct list_head list; - unsigned long index = start >> PAGE_CACHE_SHIFT; - struct btrfs_device *for_dev; if (eb) level = btrfs_header_level(eb); - /* find extent */ - spin_lock(&fs_info->reada_lock); - re = radix_tree_lookup(&fs_info->reada_tree, index); - if (re) - re->refcnt++; - spin_unlock(&fs_info->reada_lock); - - if (!re) - return -1; - spin_lock(&re->lock); /* * just take the full list from the extent. afterwards we * don't need the lock anymore */ list_replace_init(&re->extctl, &list); - for_dev = re->scheduled_for; - re->scheduled_for = NULL; + re->scheduled = 0; spin_unlock(&re->lock); - if (err == 0) { - nritems = level ? btrfs_header_nritems(eb) : 0; - generation = btrfs_header_generation(eb); - /* - * FIXME: currently we just set nritems to 0 if this is a leaf, - * effectively ignoring the content. In a next step we could - * trigger more readahead depending from the content, e.g. - * fetch the checksums for the extents in the leaf. - */ - } else { - /* - * this is the error case, the extent buffer has not been - * read correctly. We won't access anything from it and - * just cleanup our data structures. Effectively this will - * cut the branch below this node from read ahead. - */ - nritems = 0; - generation = 0; - } + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. + */ + if (err) + goto cleanup; + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + if (!level) + goto cleanup; + + nritems = btrfs_header_nritems(eb); + generation = btrfs_header_generation(eb); for (i = 0; i < nritems; i++) { struct reada_extctl *rec; u64 n_gen; @@ -188,19 +174,20 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, */ #ifdef DEBUG if (rec->generation != generation) { - btrfs_debug(root->fs_info, - "generation mismatch for (%llu,%d,%llu) %llu != %llu", - key.objectid, key.type, key.offset, - rec->generation, generation); + btrfs_debug(fs_info, + "generation mismatch for (%llu,%d,%llu) %llu != %llu", + key.objectid, key.type, key.offset, + rec->generation, generation); } #endif if (rec->generation == generation && btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) - reada_add_block(rc, bytenr, &next_key, - level - 1, n_gen); + reada_add_block(rc, bytenr, &next_key, n_gen); } } + +cleanup: /* * free extctl records */ @@ -222,26 +209,37 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, reada_extent_put(fs_info, re); /* one ref for each entry */ } - reada_extent_put(fs_info, re); /* our ref */ - if (for_dev) - atomic_dec(&for_dev->reada_in_flight); - return 0; + return; } /* * start is passed separately in case eb in NULL, which may be the case with * failed I/O */ -int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, - u64 start, int err) +int btree_readahead_hook(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, u64 start, int err) { - int ret; + int ret = 0; + struct reada_extent *re; - ret = __readahead_hook(root, eb, start, err); + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, + start >> PAGE_CACHE_SHIFT); + if (re) + re->refcnt++; + spin_unlock(&fs_info->reada_lock); + if (!re) { + ret = -1; + goto start_machine; + } - reada_start_machine(root->fs_info); + __readahead_hook(fs_info, re, eb, start, err); + reada_extent_put(fs_info, re); /* our ref */ +start_machine: + reada_start_machine(fs_info); return ret; } @@ -260,18 +258,14 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) + if (ret == 1 && logical >= zone->start && logical <= zone->end) { kref_get(&zone->refcnt); - spin_unlock(&fs_info->reada_lock); - - if (ret == 1) { - if (logical >= zone->start && logical < zone->end) - return zone; - spin_lock(&fs_info->reada_lock); - kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); + return zone; } + spin_unlock(&fs_info->reada_lock); + cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return NULL; @@ -280,7 +274,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, end = start + cache->key.offset - 1; btrfs_put_block_group(cache); - zone = kzalloc(sizeof(*zone), GFP_NOFS); + zone = kzalloc(sizeof(*zone), GFP_KERNEL); if (!zone) return NULL; @@ -307,8 +301,10 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, kfree(zone); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, logical >> PAGE_CACHE_SHIFT, 1); - if (ret == 1) + if (ret == 1 && logical >= zone->start && logical <= zone->end) kref_get(&zone->refcnt); + else + zone = NULL; } spin_unlock(&fs_info->reada_lock); @@ -317,7 +313,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 logical, - struct btrfs_key *top, int level) + struct btrfs_key *top) { int ret; struct reada_extent *re = NULL; @@ -330,9 +326,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, u64 length; int real_stripes; int nzones = 0; - int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; int dev_replace_is_ongoing; + int have_zone = 0; spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); @@ -343,7 +339,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (re) return re; - re = kzalloc(sizeof(*re), GFP_NOFS); + re = kzalloc(sizeof(*re), GFP_KERNEL); if (!re) return NULL; @@ -375,11 +371,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct reada_zone *zone; dev = bbio->stripes[nzones].dev; + + /* cannot read ahead on missing device. */ + if (!dev->bdev) + continue; + zone = reada_find_zone(fs_info, dev, logical, bbio); if (!zone) - break; + continue; - re->zones[nzones] = zone; + re->zones[re->nzones++] = zone; spin_lock(&zone->lock); if (!zone->elems) kref_get(&zone->refcnt); @@ -389,14 +390,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - re->nzones = nzones; - if (nzones == 0) { + if (re->nzones == 0) { /* not a single zone found, error and out */ goto error; } /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_lock(&fs_info->dev_replace); + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { @@ -404,19 +404,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); - for (i = 0; i < nzones; ++i) { - dev = bbio->stripes[i].dev; + for (nzones = 0; nzones < re->nzones; ++nzones) { + dev = re->zones[nzones]->device; + if (dev == prev_dev) { /* * in case of DUP, just add the first zone. As both @@ -427,15 +428,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, */ continue; } - if (!dev->bdev) { - /* - * cannot read ahead on missing device, but for RAID5/6, - * REQ_GET_READ_MIRRORS return 1. So don't skip missing - * device for such case. - */ - if (nzones > 1) - continue; - } + if (!dev->bdev) + continue; + if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { /* @@ -447,8 +442,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { - while (--i >= 0) { - dev = bbio->stripes[i].dev; + while (--nzones >= 0) { + dev = re->zones[nzones]->device; BUG_ON(dev == NULL); /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); @@ -456,21 +451,24 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); goto error; } + have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_unlock(&fs_info->dev_replace); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + + if (!have_zone) + goto error; btrfs_put_bbio(bbio); return re; error: - while (nzones) { + for (nzones = 0; nzones < re->nzones; ++nzones) { struct reada_zone *zone; - --nzones; zone = re->zones[nzones]; kref_get(&zone->refcnt); spin_lock(&zone->lock); @@ -531,8 +529,6 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info, kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - if (re->scheduled_for) - atomic_dec(&re->scheduled_for->reada_in_flight); kfree(re); } @@ -556,17 +552,17 @@ static void reada_control_release(struct kref *kref) } static int reada_add_block(struct reada_control *rc, u64 logical, - struct btrfs_key *top, int level, u64 generation) + struct btrfs_key *top, u64 generation) { struct btrfs_root *root = rc->root; struct reada_extent *re; struct reada_extctl *rec; - re = reada_find_extent(root, logical, top, level); /* takes one ref */ + re = reada_find_extent(root, logical, top); /* takes one ref */ if (!re) return -1; - rec = kzalloc(sizeof(*rec), GFP_NOFS); + rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (!rec) { reada_extent_put(root->fs_info, re); return -ENOMEM; @@ -662,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, u64 logical; int ret; int i; - int need_kick = 0; spin_lock(&fs_info->reada_lock); if (dev->reada_curr_zone == NULL) { @@ -679,7 +674,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, */ ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, dev->reada_next >> PAGE_CACHE_SHIFT, 1); - if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + if (ret == 0 || re->logical > dev->reada_curr_zone->end) { ret = reada_pick_zone(dev); if (!ret) { spin_unlock(&fs_info->reada_lock); @@ -698,6 +693,15 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); + spin_lock(&re->lock); + if (re->scheduled || list_empty(&re->extctl)) { + spin_unlock(&re->lock); + reada_extent_put(fs_info, re); + return 0; + } + re->scheduled = 1; + spin_unlock(&re->lock); + /* * find mirror num */ @@ -709,29 +713,20 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } logical = re->logical; - spin_lock(&re->lock); - if (re->scheduled_for == NULL) { - re->scheduled_for = dev; - need_kick = 1; - } - spin_unlock(&re->lock); - - reada_extent_put(fs_info, re); - - if (!need_kick) - return 0; - atomic_inc(&dev->reada_in_flight); ret = reada_tree_block_flagged(fs_info->extent_root, logical, mirror_num, &eb); if (ret) - __readahead_hook(fs_info->extent_root, NULL, logical, ret); + __readahead_hook(fs_info, re, NULL, logical, ret); else if (eb) - __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + __readahead_hook(fs_info, re, eb, eb->start, ret); if (eb) free_extent_buffer(eb); + atomic_dec(&dev->reada_in_flight); + reada_extent_put(fs_info, re); + return 1; } @@ -752,6 +747,8 @@ static void reada_start_machine_worker(struct btrfs_work *work) set_task_ioprio(current, BTRFS_IOPRIO_READA); __reada_start_machine(fs_info); set_task_ioprio(current, old_ioprio); + + atomic_dec(&fs_info->reada_works_cnt); } static void __reada_start_machine(struct btrfs_fs_info *fs_info) @@ -783,15 +780,19 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) * enqueue to workers to finish it. This will distribute the load to * the cores. */ - for (i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { reada_start_machine(fs_info); + if (atomic_read(&fs_info->reada_works_cnt) > + BTRFS_MAX_MIRRORS * 2) + break; + } } static void reada_start_machine(struct btrfs_fs_info *fs_info) { struct reada_machine_work *rmw; - rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + rmw = kzalloc(sizeof(*rmw), GFP_KERNEL); if (!rmw) { /* FIXME we cannot handle this properly right now */ BUG(); @@ -801,6 +802,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); + atomic_inc(&fs_info->reada_works_cnt); } #ifdef DEBUG @@ -848,10 +850,9 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; printk(KERN_DEBUG - " re: logical %llu size %u empty %d for %lld", + " re: logical %llu size %u empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), re->scheduled_for ? - re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", @@ -878,27 +879,21 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) index, 1); if (ret == 0) break; - if (!re->scheduled_for) { + if (!re->scheduled) { index = (re->logical >> PAGE_CACHE_SHIFT) + 1; continue; } printk(KERN_DEBUG - "re: logical %llu size %u list empty %d for %lld", + "re: logical %llu size %u list empty %d scheduled %d", re->logical, fs_info->tree_root->nodesize, - list_empty(&re->extctl), - re->scheduled_for ? re->scheduled_for->devid : -1); + list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", re->zones[i]->start, re->zones[i]->end); - for (i = 0; i < re->nzones; ++i) { - printk(KERN_CONT " zone %llu-%llu devs", - re->zones[i]->start, - re->zones[i]->end); - for (j = 0; j < re->zones[i]->ndevs; ++j) { - printk(KERN_CONT " %lld", - re->zones[i]->devs[j]->devid); - } + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); } } printk(KERN_CONT "\n"); @@ -917,7 +912,6 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct reada_control *rc; u64 start; u64 generation; - int level; int ret; struct extent_buffer *node; static struct btrfs_key max_key = { @@ -926,7 +920,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, .offset = (u64)-1 }; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = kzalloc(sizeof(*rc), GFP_KERNEL); if (!rc) return ERR_PTR(-ENOMEM); @@ -940,11 +934,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, node = btrfs_root_node(root); start = node->start; - level = btrfs_header_level(node); generation = btrfs_header_generation(node); free_extent_buffer(node); - ret = reada_add_block(rc, start, &max_key, level, generation); + ret = reada_add_block(rc, start, &max_key, generation); if (ret) { kfree(rc); return ERR_PTR(ret); @@ -959,8 +952,11 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; + struct btrfs_fs_info *fs_info = rc->root->fs_info; while (atomic_read(&rc->elems)) { + if (!atomic_read(&fs_info->reada_works_cnt)) + reada_start_machine(fs_info); wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 5 * HZ); dump_devs(rc->root->fs_info, @@ -977,9 +973,13 @@ int btrfs_reada_wait(void *handle) int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; + struct btrfs_fs_info *fs_info = rc->root->fs_info; while (atomic_read(&rc->elems)) { - wait_event(rc->wait, atomic_read(&rc->elems) == 0); + if (!atomic_read(&fs_info->reada_works_cnt)) + reada_start_machine(fs_info); + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + (HZ + 9) / 10); } kref_put(&rc->refcnt, reada_control_release); |