summary | refs | log | tree | commit | diff | stats
path: root/mm/memory_hotplug.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r-- mm/memory_hotplug.c 661
1 files changed, 204 insertions, 457 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b63d7d1239df..8dccc317aac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -52,32 +52,19 @@ static void generic_online_page(struct page *page);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
-/* The same as the cpu_hotplug lock, but for memory hotplug. */
-static struct {
- struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing mem hotplug operation.
- */
- int refcount;
+DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} mem_hotplug = {
- .active_writer = NULL,
- .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
- .refcount = 0,
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "mem_hotplug.lock" },
-#endif
-};
+void get_online_mems(void)
+{
+ percpu_down_read(&mem_hotplug_lock);
+}
-/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
-#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
-#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
-#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+void put_online_mems(void)
+{
+ percpu_up_read(&mem_hotplug_lock);
+}
+
+bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
@@ -97,60 +84,16 @@ static int __init setup_memhp_default_state(char *str)
}
__setup("memhp_default_state=", setup_memhp_default_state);
-void get_online_mems(void)
-{
- might_sleep();
- if (mem_hotplug.active_writer == current)
- return;
- memhp_lock_acquire_read();
- mutex_lock(&mem_hotplug.lock);
- mem_hotplug.refcount++;
- mutex_unlock(&mem_hotplug.lock);
-
-}
-
-void put_online_mems(void)
-{
- if (mem_hotplug.active_writer == current)
- return;
- mutex_lock(&mem_hotplug.lock);
-
- if (WARN_ON(!mem_hotplug.refcount))
- mem_hotplug.refcount++; /* try to fix things up */
-
- if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
- wake_up_process(mem_hotplug.active_writer);
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
-
-}
-
-/* Serializes write accesses to mem_hotplug.active_writer. */
-static DEFINE_MUTEX(memory_add_remove_lock);
-
void mem_hotplug_begin(void)
{
- mutex_lock(&memory_add_remove_lock);
-
- mem_hotplug.active_writer = current;
-
- memhp_lock_acquire();
- for (;;) {
- mutex_lock(&mem_hotplug.lock);
- if (likely(!mem_hotplug.refcount))
- break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&mem_hotplug.lock);
- schedule();
- }
+ cpus_read_lock();
+ percpu_down_write(&mem_hotplug_lock);
}
void mem_hotplug_done(void)
{
- mem_hotplug.active_writer = NULL;
- mutex_unlock(&mem_hotplug.lock);
- memhp_lock_release();
- mutex_unlock(&memory_add_remove_lock);
+ percpu_up_write(&mem_hotplug_lock);
+ cpus_read_unlock();
}
/* add this memory to iomem resource */
@@ -300,229 +243,38 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long old_zone_end_pfn;
-
- zone_span_writelock(zone);
-
- old_zone_end_pfn = zone_end_pfn(zone);
- if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
- zone->zone_start_pfn = start_pfn;
-
- zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
- zone->zone_start_pfn;
-
- zone_span_writeunlock(zone);
-}
-
-static void resize_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- zone_span_writelock(zone);
-
- if (end_pfn - start_pfn) {
- zone->zone_start_pfn = start_pfn;
- zone->spanned_pages = end_pfn - start_pfn;
- } else {
- /*
- * make it consist as free_area_init_core(),
- * if spanned_pages = 0, then keep start_pfn = 0
- */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
- }
-
- zone_span_writeunlock(zone);
-}
-
-static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- enum zone_type zid = zone_idx(zone);
- int nid = zone->zone_pgdat->node_id;
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- set_page_links(pfn_to_page(pfn), zid, nid, pfn);
-}
-
-/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
-static int __ref ensure_zone_is_initialized(struct zone *zone,
- unsigned long start_pfn, unsigned long num_pages)
-{
- if (!zone_is_initialized(zone))
- return init_currently_empty_zone(zone, start_pfn, num_pages);
-
- return 0;
-}
-
-static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- int ret;
- unsigned long flags;
- unsigned long z1_start_pfn;
-
- ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
-
- pgdat_resize_lock(z1->zone_pgdat, &flags);
-
- /* can't move pfns which are higher than @z2 */
- if (end_pfn > zone_end_pfn(z2))
- goto out_fail;
- /* the move out part must be at the left most of @z2 */
- if (start_pfn > z2->zone_start_pfn)
- goto out_fail;
- /* must included/overlap */
- if (end_pfn <= z2->zone_start_pfn)
- goto out_fail;
-
- /* use start_pfn for z1's start_pfn if z1 is empty */
- if (!zone_is_empty(z1))
- z1_start_pfn = z1->zone_start_pfn;
- else
- z1_start_pfn = start_pfn;
-
- resize_zone(z1, z1_start_pfn, end_pfn);
- resize_zone(z2, end_pfn, zone_end_pfn(z2));
-
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
-
- fix_zone_id(z1, start_pfn, end_pfn);
-
- return 0;
-out_fail:
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
- return -1;
-}
-
-static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
- unsigned long start_pfn, unsigned long end_pfn)
+static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
+ bool want_memblock)
{
int ret;
- unsigned long flags;
- unsigned long z2_end_pfn;
-
- ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
- if (ret)
- return ret;
-
- pgdat_resize_lock(z1->zone_pgdat, &flags);
-
- /* can't move pfns which are lower than @z1 */
- if (z1->zone_start_pfn > start_pfn)
- goto out_fail;
- /* the move out part mast at the right most of @z1 */
- if (zone_end_pfn(z1) > end_pfn)
- goto out_fail;
- /* must included/overlap */
- if (start_pfn >= zone_end_pfn(z1))
- goto out_fail;
-
- /* use end_pfn for z2's end_pfn if z2 is empty */
- if (!zone_is_empty(z2))
- z2_end_pfn = zone_end_pfn(z2);
- else
- z2_end_pfn = end_pfn;
-
- resize_zone(z1, z1->zone_start_pfn, start_pfn);
- resize_zone(z2, start_pfn, z2_end_pfn);
-
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
-
- fix_zone_id(z2, start_pfn, end_pfn);
-
- return 0;
-out_fail:
- pgdat_resize_unlock(z1->zone_pgdat, &flags);
- return -1;
-}
-
-static struct zone * __meminit move_pfn_range(int zone_shift,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- struct zone *zone = page_zone(pfn_to_page(start_pfn));
- int ret = 0;
-
- if (zone_shift < 0)
- ret = move_pfn_range_left(zone + zone_shift, zone,
- start_pfn, end_pfn);
- else if (zone_shift)
- ret = move_pfn_range_right(zone, zone + zone_shift,
- start_pfn, end_pfn);
-
- if (ret)
- return NULL;
-
- return zone + zone_shift;
-}
-
-static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
-
- if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
- pgdat->node_start_pfn = start_pfn;
-
- pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
- pgdat->node_start_pfn;
-}
+ int i;
-static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- int nr_pages = PAGES_PER_SECTION;
- int nid = pgdat->node_id;
- int zone_type;
- unsigned long flags, pfn;
- int ret;
+ if (pfn_valid(phys_start_pfn))
+ return -EEXIST;
- zone_type = zone - pgdat->node_zones;
- ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
- if (ret)
+ ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn);
+ if (ret < 0)
return ret;
- pgdat_resize_lock(zone->zone_pgdat, &flags);
- grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
- grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
- phys_start_pfn + nr_pages);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
- memmap_init_zone(nr_pages, nid, zone_type,
- phys_start_pfn, MEMMAP_HOTPLUG);
-
- /* online_page_range is called later and expects pages reserved */
- for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
+ /*
+ * Make all the pages reserved so that nobody will stumble over a
+ * half-initialized state.
+ * FIXME: We also have to associate it with a node because pfn_to_nid
+ * relies on the page having the proper node.
+ */
+ for (i = 0; i < PAGES_PER_SECTION; i++) {
+ unsigned long pfn = phys_start_pfn + i;
+ struct page *page;
if (!pfn_valid(pfn))
continue;
- SetPageReserved(pfn_to_page(pfn));
+ page = pfn_to_page(pfn);
+ set_page_node(page, nid);
+ SetPageReserved(page);
}
- return 0;
-}
-
-static int __meminit __add_section(int nid, struct zone *zone,
- unsigned long phys_start_pfn)
-{
- int ret;
-
- if (pfn_valid(phys_start_pfn))
- return -EEXIST;
-
- ret = sparse_add_one_section(zone, phys_start_pfn);
-
- if (ret < 0)
- return ret;
- ret = __add_zone(zone, phys_start_pfn);
-
- if (ret < 0)
- return ret;
+ if (!want_memblock)
+ return 0;
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
@@ -533,16 +285,14 @@ static int __meminit __add_section(int nid, struct zone *zone,
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages)
+int __ref __add_pages(int nid, unsigned long phys_start_pfn,
+ unsigned long nr_pages, bool want_memblock)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
struct vmem_altmap *altmap;
- clear_zone_contiguous(zone);
-
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -562,7 +312,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, section_nr_to_pfn(i));
+ err = __add_section(nid, section_nr_to_pfn(i), want_memblock);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -575,7 +325,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
}
vmemmap_populate_print_last();
out:
- set_zone_contiguous(zone);
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
@@ -772,11 +521,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
- int zone_type;
unsigned long flags;
- zone_type = zone - pgdat->node_zones;
-
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
@@ -939,33 +685,20 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
struct page *page;
+
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
(*online_page_callback)(page);
onlined_pages++;
}
+
+ online_mem_sections(start_pfn, start_pfn + nr_pages);
+
*(unsigned long *)arg = onlined_pages;
return 0;
}
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
- * normal memory.
- */
-static bool can_online_high_movable(struct zone *zone)
-{
- return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure every online node has NORMAL memory */
-static bool can_online_high_movable(struct zone *zone)
-{
- return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1040,39 +773,144 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
- enum zone_type target, int *zone_shift)
+bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
{
- struct zone *zone = page_zone(pfn_to_page(pfn));
- enum zone_type idx = zone_idx(zone);
- int i;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+ struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
- *zone_shift = 0;
+ /*
+ * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
+ * physically before ZONE_MOVABLE. All we need is they do not
+ * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
+ * though so let's stick with it for simplicity for now.
+ * TODO make sure we do not overlap with ZONE_DEVICE
+ */
+ if (online_type == MMOP_ONLINE_KERNEL) {
+ if (zone_is_empty(movable_zone))
+ return true;
+ return movable_zone->zone_start_pfn >= pfn + nr_pages;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ return zone_end_pfn(default_zone) <= pfn;
+ }
- if (idx < target) {
- /* pages must be at end of current zone */
- if (pfn + nr_pages != zone_end_pfn(zone))
- return false;
+ /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
+ return online_type == MMOP_ONLINE_KEEP;
+}
+
+static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = zone_end_pfn(zone);
+
+ if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
+ zone->zone_start_pfn = start_pfn;
+
+ zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+}
- /* no zones in use between current zone and target */
- for (i = idx + 1; i < target; i++)
- if (zone_is_initialized(zone - idx + i))
- return false;
+static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long old_end_pfn = pgdat_end_pfn(pgdat);
+
+ if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
+ pgdat->node_start_pfn = start_pfn;
+
+ pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
+}
+
+void __ref move_pfn_range_to_zone(struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ int nid = pgdat->node_id;
+ unsigned long flags;
+
+ if (zone_is_empty(zone))
+ init_currently_empty_zone(zone, start_pfn, nr_pages);
+
+ clear_zone_contiguous(zone);
+
+ /* TODO: the pgdat resize lock is irqsave while the zone span lock is not; this asymmetry predates this change */
+ pgdat_resize_lock(pgdat, &flags);
+ zone_span_writelock(zone);
+ resize_zone_range(zone, start_pfn, nr_pages);
+ zone_span_writeunlock(zone);
+ resize_pgdat_range(pgdat, start_pfn, nr_pages);
+ pgdat_resize_unlock(pgdat, &flags);
+
+ /*
+ * TODO: now we have a visible range of pages which are not associated
+ * with their zone properly. Not nice, but set_pfnblock_flags_mask
+ * expects the zone to span the pfn range. All the pages in the range
+ * are reserved so nobody should be touching them and we should be safe.
+ */
+ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG);
+
+ set_zone_contiguous(zone);
+}
+
+/*
+ * Returns a default kernel memory zone for the given pfn range.
+ * If no kernel zone covers this pfn range, ZONE_NORMAL is used
+ * as the fallback.
+ */
+struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ int zid;
+
+ for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (zone_intersects(zone, start_pfn, nr_pages))
+ return zone;
}
- if (target < idx) {
- /* pages must be at beginning of current zone */
- if (pfn != zone->zone_start_pfn)
- return false;
+ return &pgdat->node_zones[ZONE_NORMAL];
+}
+
+static inline bool movable_pfn_range(int nid, struct zone *default_zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
+ MMOP_ONLINE_KERNEL))
+ return true;
+
+ if (!movable_node_is_enabled())
+ return false;
- /* no zones in use between current zone and target */
- for (i = target + 1; i < idx; i++)
- if (zone_is_initialized(zone - idx + i))
- return false;
+ return !zone_intersects(default_zone, start_pfn, nr_pages);
+}
+
+/*
+ * Associates the given pfn range with the given node and the zone appropriate
+ * for the given online type.
+ */
+static struct zone * __meminit move_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
+
+ if (online_type == MMOP_ONLINE_KEEP) {
+ struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
+ /*
+ * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but uses the
+ * movable zone if that is not possible (e.g. we are within
+ * or past the existing movable zone). movable_node overrides
+ * this default and prefers the movable zone.
+ */
+ if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
+ zone = movable_zone;
+ } else if (online_type == MMOP_ONLINE_MOVABLE) {
+ zone = &pgdat->node_zones[ZONE_MOVABLE];
}
- *zone_shift = target - idx;
- return true;
+ move_pfn_range_to_zone(zone, start_pfn, nr_pages);
+ return zone;
}
/* Must be protected by mem_hotplug_begin() */
@@ -1085,38 +923,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int nid;
int ret;
struct memory_notify arg;
- int zone_shift = 0;
-
- /*
- * This doesn't need a lock to do pfn_to_page().
- * The section can't be removed here because of the
- * memory_block->state_mutex.
- */
- zone = page_zone(pfn_to_page(pfn));
- if ((zone_idx(zone) > ZONE_NORMAL ||
- online_type == MMOP_ONLINE_MOVABLE) &&
- !can_online_high_movable(zone))
+ nid = pfn_to_nid(pfn);
+ if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
return -EINVAL;
- if (online_type == MMOP_ONLINE_KERNEL) {
- if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift))
- return -EINVAL;
- } else if (online_type == MMOP_ONLINE_MOVABLE) {
- if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift))
- return -EINVAL;
- }
-
- zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
- if (!zone)
- return -EINVAL;
+ /* associate pfn range with the zone */
+ zone = move_pfn_range(online_type, nid, pfn, nr_pages);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = zone_to_nid(zone);
-
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
if (ret)
@@ -1311,39 +1129,6 @@ static int check_hotplug_memory_range(u64 start, u64 size)
return 0;
}
-/*
- * If movable zone has already been setup, newly added memory should be check.
- * If its address is higher than movable zone, it should be added as movable.
- * Without this check, movable zone may overlap with other zone.
- */
-static int should_add_memory_movable(int nid, u64 start, u64 size)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- pg_data_t *pgdat = NODE_DATA(nid);
- struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE;
-
- if (zone_is_empty(movable_zone))
- return 0;
-
- if (movable_zone->zone_start_pfn <= start_pfn)
- return 1;
-
- return 0;
-}
-
-int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
- bool for_device)
-{
-#ifdef CONFIG_ZONE_DEVICE
- if (for_device)
- return ZONE_DEVICE;
-#endif
- if (should_add_memory_movable(nid, start, size))
- return ZONE_MOVABLE;
-
- return zone_default;
-}
-
static int online_memory_block(struct memory_block *mem, void *arg)
{
return device_online(&mem->dev);
@@ -1389,7 +1174,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
}
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, false);
+ ret = arch_add_memory(nid, start, size, true);
if (ret < 0)
goto error;
@@ -1398,7 +1183,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
node_set_online(nid);
if (new_node) {
- ret = register_one_node(nid);
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+
+ ret = __register_one_node(nid);
+ if (ret)
+ goto register_fail;
+
+ /*
+ * Link memory sections under this node. This is already
+ * done when creating a memory section in register_new_memory,
+ * but that depends on having the node registered, so offline
+ * nodes have to go through register_node.
+ * TODO clean up this mess.
+ */
+ ret = link_mem_sections(nid, start_pfn, nr_pages);
+register_fail:
/*
* If sysfs file of new node can't create, cpu on the node
* can't be hot-added. There is no rollback way now.
@@ -1419,7 +1219,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
error:
/* rollback pgdat allocation and others */
- if (new_pgdat)
+ if (new_pgdat && pgdat)
rollback_node_hotadd(nid, pgdat);
memblock_remove(start, size);
@@ -1571,34 +1371,19 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
static struct page *new_node_page(struct page *page, unsigned long private,
int **result)
{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
- struct page *new_page = NULL;
/*
- * TODO: allocate a destination hugepage from a nearest neighbor node,
- * accordance with memory policy of the user process if possible. For
- * now as a simple work-around, we use the next node for destination.
+ * try to allocate from a different node but reuse this node if there
+ * are no other online nodes to be used (e.g. we are offlining a part
+ * of the only existing node)
*/
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node_in(nid, nmask));
-
node_clear(nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(nid, nmask);
- if (PageHighMem(page)
- || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
- gfp_mask |= __GFP_HIGHMEM;
-
- if (!nodes_empty(nmask))
- new_page = __alloc_pages_nodemask(gfp_mask, 0,
- node_zonelist(nid, gfp_mask), &nmask);
- if (!new_page)
- new_page = __alloc_pages(gfp_mask, 0,
- node_zonelist(nid, gfp_mask));
-
- return new_page;
+ return new_page_nodemask(page, nid, &nmask);
}
#define NR_OFFLINE_AT_ONCE_PAGES (256)
@@ -1725,47 +1510,12 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
- * normal memory.
- */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
- return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure the node has NORMAL memory if it is still online */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
- struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long present_pages = 0;
- enum zone_type zt;
-
- for (zt = 0; zt <= ZONE_NORMAL; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
-
- if (present_pages > nr_pages)
- return true;
-
- present_pages = 0;
- for (; zt <= ZONE_MOVABLE; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
-
- /*
- * we can't offline the last normal memory until all
- * higher memory is offlined.
- */
- return present_pages == 0;
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
static int __init cmdline_parse_movable_node(char *p)
{
-#ifdef CONFIG_MOVABLE_NODE
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
movable_node_enabled = true;
#else
- pr_warn("movable_node option not supported\n");
+ pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
#endif
return 0;
}
@@ -1887,9 +1637,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
- if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
- return -EINVAL;
-
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
@@ -1919,7 +1666,7 @@ repeat:
goto failed_removal;
ret = 0;
if (drain) {
- lru_add_drain_all();
+ lru_add_drain_all_cpuslocked();
cond_resched();
drain_all_pages(zone);
}
@@ -1940,7 +1687,7 @@ repeat:
}
}
/* drain all zone's lru pagevec, this is asynchronous... */
- lru_add_drain_all();
+ lru_add_drain_all_cpuslocked();
yield();
/* drain pcp pages, this is synchronous. */
drain_all_pages(zone);