From f29ad6a99b596b8169744d107bf088e8be9e8d0d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:40 -0800 Subject: swap_info: private to swapfile.c The swap_info_struct is mostly private to mm/swapfile.c, with only one other in-tree user: get_swap_bio(). Adjust its interface to map_swap_page(), so that we can then remove get_swap_info_struct(). But there is a popular user out-of-tree, TuxOnIce: so leave the declaration of swap_info_struct in linux/swap.h. Signed-off-by: Hugh Dickins Cc: Nigel Cunningham Cc: KAMEZAWA Hiroyuki Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c590eef7912..f83f1c6f6196 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1283,12 +1283,22 @@ static void drain_mmlist(void) /* * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset `offset'. + * corresponds to page offset `offset'. Note that the type of this function + * is sector_t, but it returns page offset into the bdev, not sector offset. */ -sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) +sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) { - struct swap_extent *se = sis->curr_swap_extent; - struct swap_extent *start_se = se; + struct swap_info_struct *sis; + struct swap_extent *start_se; + struct swap_extent *se; + pgoff_t offset; + + sis = swap_info + swp_type(entry); + *bdev = sis->bdev; + + offset = swp_offset(entry); + start_se = sis->curr_swap_extent; + se = start_se; for ( ; ; ) { struct list_head *lh; @@ -1314,12 +1324,14 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) sector_t swapdev_block(int swap_type, pgoff_t offset) { struct swap_info_struct *sis; + struct block_device *bdev; if (swap_type >= nr_swapfiles) return 0; sis = swap_info + swap_type; - return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; + return (sis->flags & SWP_WRITEOK) ? + map_swap_page(swp_entry(swap_type, offset), &bdev) : 0; } #endif /* CONFIG_HIBERNATION */ @@ -2159,13 +2171,6 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_CACHE); } - -struct swap_info_struct * -get_swap_info_struct(unsigned type) -{ - return &swap_info[type]; -} - /* * swap_lock prevents swap_map being freed. Don't grab an extra * reference on the swaphandle, it doesn't matter if it becomes unused. -- cgit v1.2.3 From efa90a981bbc891efad96db2a75b5487e00852ca Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:41 -0800 Subject: swap_info: change to array of pointers The swap_info_struct is only 76 or 104 bytes, but it does seem wrong to reserve an array of about 30 of them in bss, when most people will want only one. Change swap_info[] to an array of pointers. That does need a "type" field in the structure: pack it as a char with next type and short prio (aha, char is unsigned by default on PowerPC). Use the (admittedly peculiar) name "type" throughout for this index. /proc/swaps does not take swap_lock: I wouldn't want it to, but do take care with barriers when adding a new item to the array (never removed). Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 204 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 113 insertions(+), 91 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index f83f1c6f6196..dc88a7e4257e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -49,7 +49,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; static struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -79,12 +79,11 @@ static inline unsigned short encode_swapmap(int count, bool has_cache) return ret; } -/* returnes 1 if swap entry is freed */ +/* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) { - int type = si - swap_info; - swp_entry_t entry = swp_entry(type, offset); + swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; @@ -120,7 +119,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) down_read(&swap_unplug_sem); entry.val = page_private(page); if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)].bdev; + struct block_device *bdev = swap_info[swp_type(entry)]->bdev; struct backing_dev_info *bdi; /* @@ -467,10 +466,10 @@ swp_entry_t get_swap_page(void) nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - si = swap_info + type; + si = swap_info[type]; next = si->next; if (next < 0 || - (!wrapped && si->prio != swap_info[next].prio)) { + (!wrapped && si->prio != swap_info[next]->prio)) { next = swap_list.head; wrapped++; } @@ -503,8 +502,8 @@ swp_entry_t get_swap_page_of_type(int type) pgoff_t offset; spin_lock(&swap_lock); - si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + si = swap_info[type]; + if (si && (si->flags & SWP_WRITEOK)) { nr_swap_pages--; /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, SWAP_MAP); @@ -528,7 +527,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) type = swp_type(entry); if (type >= nr_swapfiles) goto bad_nofile; - p = & swap_info[type]; + p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -581,8 +580,9 @@ static int swap_entry_free(struct swap_info_struct *p, p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; + if (swap_list.next >= 0 && + p->prio > swap_info[swap_list.next]->prio) + swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; } @@ -741,14 +741,14 @@ int free_swap_and_cache(swp_entry_t entry) int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) { struct block_device *bdev = NULL; - int i; + int type; if (device) bdev = bdget(device); spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *sis = swap_info + i; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; @@ -758,7 +758,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); - return i; + return type; } if (bdev == sis->bdev) { struct swap_extent *se; @@ -771,7 +771,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) spin_unlock(&swap_lock); bdput(bdev); - return i; + return type; } } } @@ -792,15 +792,17 @@ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; - if (type < nr_swapfiles) { - spin_lock(&swap_lock); - if (swap_info[type].flags & SWP_WRITEOK) { - n = swap_info[type].pages; + spin_lock(&swap_lock); + if ((unsigned int)type < nr_swapfiles) { + struct swap_info_struct *sis = swap_info[type]; + + if (sis->flags & SWP_WRITEOK) { + n = sis->pages; if (free) - n -= swap_info[type].inuse_pages; + n -= sis->inuse_pages; } - spin_unlock(&swap_lock); } + spin_unlock(&swap_lock); return n; } #endif @@ -1024,7 +1026,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, */ static int try_to_unuse(unsigned int type) { - struct swap_info_struct * si = &swap_info[type]; + struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; unsigned short *swap_map; unsigned short swcount; @@ -1270,10 +1272,10 @@ retry: static void drain_mmlist(void) { struct list_head *p, *next; - unsigned int i; + unsigned int type; - for (i = 0; i < nr_swapfiles; i++) - if (swap_info[i].inuse_pages) + for (type = 0; type < nr_swapfiles; type++) + if (swap_info[type]->inuse_pages) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -1293,7 +1295,7 @@ sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) struct swap_extent *se; pgoff_t offset; - sis = swap_info + swp_type(entry); + sis = swap_info[swp_type(entry)]; *bdev = sis->bdev; offset = swp_offset(entry); @@ -1321,17 +1323,15 @@ sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev * corresponding to given index in swap_info (swap type). */ -sector_t swapdev_block(int swap_type, pgoff_t offset) +sector_t swapdev_block(int type, pgoff_t offset) { - struct swap_info_struct *sis; struct block_device *bdev; - if (swap_type >= nr_swapfiles) + if ((unsigned int)type >= nr_swapfiles) return 0; - - sis = swap_info + swap_type; - return (sis->flags & SWP_WRITEOK) ? - map_swap_page(swp_entry(swap_type, offset), &bdev) : 0; + if (!(swap_info[type]->flags & SWP_WRITEOK)) + return 0; + return map_swap_page(swp_entry(type, offset), &bdev); } #endif /* CONFIG_HIBERNATION */ @@ -1547,8 +1547,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mapping = victim->f_mapping; prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type].next) { - p = swap_info + type; + for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { + p = swap_info[type]; if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) break; @@ -1567,18 +1567,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) { + if (prev < 0) swap_list.head = p->next; - } else { - swap_info[prev].next = p->next; - } + else + swap_info[prev]->next = p->next; if (type == swap_list.next) { /* just pick something that's safe... */ swap_list.next = swap_list.head; } if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i].next) - swap_info[i].prio = p->prio--; + for (i = p->next; i >= 0; i = swap_info[i]->next) + swap_info[i]->prio = p->prio--; least_priority++; } nr_swap_pages -= p->pages; @@ -1596,16 +1595,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->prio < 0) p->prio = --least_priority; prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; prev = i; } p->next = i; if (prev < 0) - swap_list.head = swap_list.next = p - swap_info; + swap_list.head = swap_list.next = type; else - swap_info[prev].next = p - swap_info; + swap_info[prev]->next = type; nr_swap_pages += p->pages; total_swap_pages += p->pages; p->flags |= SWP_WRITEOK; @@ -1665,8 +1664,8 @@ out: /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { - struct swap_info_struct *ptr = swap_info; - int i; + struct swap_info_struct *si; + int type; loff_t l = *pos; mutex_lock(&swapon_mutex); @@ -1674,11 +1673,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (i = 0; i < nr_swapfiles; i++, ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (type = 0; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) - return ptr; + return si; } return NULL; @@ -1686,21 +1687,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { - struct swap_info_struct *ptr; - struct swap_info_struct *endptr = swap_info + nr_swapfiles; + struct swap_info_struct *si = v; + int type; if (v == SEQ_START_TOKEN) - ptr = swap_info; - else { - ptr = v; - ptr++; - } + type = 0; + else + type = si->type + 1; - for (; ptr < endptr; ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; - return ptr; + return si; } return NULL; @@ -1713,24 +1714,24 @@ static void swap_stop(struct seq_file *swap, void *v) static int swap_show(struct seq_file *swap, void *v) { - struct swap_info_struct *ptr = v; + struct swap_info_struct *si = v; struct file *file; int len; - if (ptr == SEQ_START_TOKEN) { + if (si == SEQ_START_TOKEN) { seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); return 0; } - file = ptr->swap_file; + file = si->swap_file; len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? "partition" : "file\t", - ptr->pages << (PAGE_SHIFT - 10), - ptr->inuse_pages << (PAGE_SHIFT - 10), - ptr->prio); + si->pages << (PAGE_SHIFT - 10), + si->inuse_pages << (PAGE_SHIFT - 10), + si->prio); return 0; } @@ -1798,23 +1799,45 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + spin_lock(&swap_lock); - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) - if (!(p->flags & SWP_USED)) + for (type = 0; type < nr_swapfiles; type++) { + if (!(swap_info[type]->flags & SWP_USED)) break; + } error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); + kfree(p); goto out; } - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->extent_list); + if (type >= nr_swapfiles) { + p->type = type; + swap_info[type] = p; + /* + * Write swap_info[type] before nr_swapfiles, in case a + * racing procfs swap_start() or swap_next() is reading them. + * (We never shrink nr_swapfiles, we never free this entry.) + */ + smp_wmb(); + nr_swapfiles++; + } else { + kfree(p); + p = swap_info[type]; + /* + * Do not memset this entry: a racing procfs swap_next() + * would be relying on p->type to remain valid. + */ + } p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); + name = getname(specialfile); error = PTR_ERR(name); if (IS_ERR(name)) { @@ -1834,7 +1857,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EBUSY; for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = &swap_info[i]; + struct swap_info_struct *q = swap_info[i]; if (i == type || !q->swap_file) continue; @@ -1909,6 +1932,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->lowest_bit = 1; p->cluster_next = 1; + p->cluster_nr = 0; /* * Find out how many pages are allowed for a single swap @@ -2015,18 +2039,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* insert swap space into swap_list: */ prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) { + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; - } prev = i; } p->next = i; - if (prev < 0) { - swap_list.head = swap_list.next = p - swap_info; - } else { - swap_info[prev].next = p - swap_info; - } + if (prev < 0) + swap_list.head = swap_list.next = type; + else + swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); error = 0; @@ -2063,15 +2085,15 @@ out: void si_swapinfo(struct sysinfo *val) { - unsigned int i; + unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - if (!(swap_info[i].flags & SWP_USED) || - (swap_info[i].flags & SWP_WRITEOK)) - continue; - nr_to_be_unused += swap_info[i].inuse_pages; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; } val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -2104,7 +2126,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; - p = type + swap_info; + p = swap_info[type]; offset = swp_offset(entry); spin_lock(&swap_lock); @@ -2186,7 +2208,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) if (!our_page_cluster) /* no readahead */ return 0; - si = &swap_info[swp_type(entry)]; + si = swap_info[swp_type(entry)]; target = swp_offset(entry); base = (target >> our_page_cluster) << our_page_cluster; end = base + (1 << our_page_cluster); -- cgit v1.2.3 From 9625a5f289f7c3c100b59c317e2bcc3c7e2e51fb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:42 -0800 Subject: swap_info: include first_swap_extent Make better use of the space by folding first swap_extent into its swap_info_struct, instead of just the list_head: swap partitions need only that one, and for others it's used as a circular list anyway. [jirislaby@gmail.com: fix crash on double swapon] Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 70 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index dc88a7e4257e..16de84b56644 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -145,23 +145,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; + sector_t start_block; + sector_t nr_blocks; int err = 0; - list_for_each_entry(se, &si->extent_list, list) { - sector_t start_block = se->start_block << (PAGE_SHIFT - 9); - sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + /* Do not discard the swap header page! */ + se = &si->first_swap_extent; + start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); + nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); + if (nr_blocks) { + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); + if (err) + return err; + cond_resched(); + } - if (se->start_page == 0) { - /* Do not discard the swap header page! */ - start_block += 1 << (PAGE_SHIFT - 9); - nr_blocks -= 1 << (PAGE_SHIFT - 9); - if (!nr_blocks) - continue; - } + list_for_each_entry(se, &si->first_swap_extent.list, list) { + start_block = se->start_block << (PAGE_SHIFT - 9); + nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, - DISCARD_FL_BARRIER); + nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); if (err) break; @@ -200,14 +205,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, - DISCARD_FL_BARRIER)) + nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) break; } lh = se->list.next; - if (lh == &si->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); } } @@ -761,10 +763,8 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) return type; } if (bdev == sis->bdev) { - struct swap_extent *se; + struct swap_extent *se = &sis->first_swap_extent; - se = list_entry(sis->extent_list.next, - struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) *bdev_p = bdgrab(sis->bdev); @@ -1310,8 +1310,6 @@ sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) return se->start_block + (offset - se->start_page); } lh = se->list.next; - if (lh == &sis->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ @@ -1340,10 +1338,10 @@ sector_t swapdev_block(int type, pgoff_t offset) */ static void destroy_swap_extents(struct swap_info_struct *sis) { - while (!list_empty(&sis->extent_list)) { + while (!list_empty(&sis->first_swap_extent.list)) { struct swap_extent *se; - se = list_entry(sis->extent_list.next, + se = list_entry(sis->first_swap_extent.list.next, struct swap_extent, list); list_del(&se->list); kfree(se); @@ -1364,8 +1362,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, struct swap_extent *new_se; struct list_head *lh; - lh = sis->extent_list.prev; /* The highest page extent */ - if (lh != &sis->extent_list) { + if (start_page == 0) { + se = &sis->first_swap_extent; + sis->curr_swap_extent = se; + se->start_page = 0; + se->nr_pages = nr_pages; + se->start_block = start_block; + return 1; + } else { + lh = sis->first_swap_extent.list.prev; /* Highest extent */ se = list_entry(lh, struct swap_extent, list); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { @@ -1385,7 +1390,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - list_add_tail(&new_se->list, &sis->extent_list); + list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } @@ -1437,7 +1442,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; - goto done; + goto out; } blkbits = inode->i_blkbits; @@ -1508,15 +1513,12 @@ reprobe: sis->max = page_no; sis->pages = page_no - 1; sis->highest_bit = page_no - 1; -done: - sis->curr_swap_extent = list_entry(sis->extent_list.prev, - struct swap_extent, list); - goto out; +out: + return ret; bad_bmap: printk(KERN_ERR "swapon: swapfile has holes\n"); ret = -EINVAL; -out: - return ret; + goto out; } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) @@ -1815,7 +1817,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) kfree(p); goto out; } - INIT_LIST_HEAD(&p->extent_list); if (type >= nr_swapfiles) { p->type = type; swap_info[type] = p; @@ -1834,6 +1835,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * would be relying on p->type to remain valid. */ } + INIT_LIST_HEAD(&p->first_swap_extent.list); p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); -- cgit v1.2.3 From 73c34b6accc8427584f5d7db4d5acb230ed8c912 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:43 -0800 Subject: swap_info: miscellaneous minor cleanups Move CONFIG_HIBERNATION's swapdev_block() into the main CONFIG_HIBERNATION block, remove extraneous whitespace and return, fix typo in a comment. Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 51 ++++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 27 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 16de84b56644..fa5f10b9c28b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -519,9 +519,9 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct * swap_info_get(swp_entry_t entry) +static struct swap_info_struct *swap_info_get(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; if (!entry.val) @@ -599,7 +599,7 @@ static int swap_entry_free(struct swap_info_struct *p, */ void swap_free(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; p = swap_info_get(entry); if (p) { @@ -629,7 +629,6 @@ void swapcache_free(swp_entry_t entry, struct page *page) } spin_unlock(&swap_lock); } - return; } /* @@ -782,6 +781,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) return -ENODEV; } +/* + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev + * corresponding to given index in swap_info (swap type). + */ +sector_t swapdev_block(int type, pgoff_t offset) +{ + struct block_device *bdev; + + if ((unsigned int)type >= nr_swapfiles) + return 0; + if (!(swap_info[type]->flags & SWP_WRITEOK)) + return 0; + return map_swap_page(swp_entry(type, offset), &bdev); +} + /* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) @@ -805,7 +819,7 @@ unsigned int count_swap_pages(int type, int free) spin_unlock(&swap_lock); return n; } -#endif +#endif /* CONFIG_HIBERNATION */ /* * No need to decide whether this PTE shares the swap entry with others, @@ -1316,23 +1330,6 @@ sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) } } -#ifdef CONFIG_HIBERNATION -/* - * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev - * corresponding to given index in swap_info (swap type). - */ -sector_t swapdev_block(int type, pgoff_t offset) -{ - struct block_device *bdev; - - if ((unsigned int)type >= nr_swapfiles) - return 0; - if (!(swap_info[type]->flags & SWP_WRITEOK)) - return 0; - return map_swap_page(swp_entry(type, offset), &bdev); -} -#endif /* CONFIG_HIBERNATION */ - /* * Free all of a swapdev's extent information */ @@ -1523,12 +1520,12 @@ bad_bmap: SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { - struct swap_info_struct * p = NULL; + struct swap_info_struct *p = NULL; unsigned short *swap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; - char * pathname; + char *pathname; int i, type, prev; int err; @@ -1780,7 +1777,7 @@ late_initcall(max_swapfiles_check); */ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { - struct swap_info_struct * p; + struct swap_info_struct *p; char *name = NULL; struct block_device *bdev = NULL; struct file *swap_file = NULL; @@ -2116,7 +2113,7 @@ void si_swapinfo(struct sysinfo *val) */ static int __swap_duplicate(swp_entry_t entry, bool cache) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; int result = -EINVAL; int count; @@ -2185,7 +2182,7 @@ void swap_duplicate(swp_entry_t entry) /* * @entry: swap entry for which we allocate swap cache. * - * Called when allocating swap cache for exising swap entry, + * Called when allocating swap cache for existing swap entry, * This can return error codes. Returns 0 at success. * -EBUSY means there is a swap cache. * Note: return code is different from swap_duplicate(). -- cgit v1.2.3 From 253d553ba75ab26b3e9e2f70cbf6fbf0813f7e86 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:44 -0800 Subject: swap_info: SWAP_HAS_CACHE cleanups Though swap_count() is useful, I'm finding that swap_has_cache() and encode_swapmap() obscure what happens in the swap_map entry, just at those points where I need to understand it. Remove them, and pass more usable "usage" values to scan_swap_map(), swap_entry_free() and __swap_duplicate(), instead of the SWAP_MAP and SWAP_CACHE enum. Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 155 ++++++++++++++++++++++++---------------------------------- 1 file changed, 64 insertions(+), 91 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index fa5f10b9c28b..52497490a7ca 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,30 +53,9 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); -/* For reference count accounting in swap_map */ -/* enum for swap_map[] handling. internal use only */ -enum { - SWAP_MAP = 0, /* ops for reference from swap users */ - SWAP_CACHE, /* ops for reference from swap cache */ -}; - static inline int swap_count(unsigned short ent) { - return ent & SWAP_COUNT_MASK; -} - -static inline bool swap_has_cache(unsigned short ent) -{ - return !!(ent & SWAP_HAS_CACHE); -} - -static inline unsigned short encode_swapmap(int count, bool has_cache) -{ - unsigned short ret = count; - - if (has_cache) - return SWAP_HAS_CACHE | ret; - return ret; + return ent & ~SWAP_HAS_CACHE; } /* returns 1 if swap entry is freed */ @@ -224,7 +203,7 @@ static int wait_for_discard(void *word) #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si, - int cache) + unsigned short usage) { unsigned long offset; unsigned long scan_base; @@ -355,10 +334,7 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ - si->swap_map[offset] = encode_swapmap(0, true); - else /* at suspend */ - si->swap_map[offset] = encode_swapmap(1, false); + si->swap_map[offset] = usage; si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -483,7 +459,7 @@ swp_entry_t get_swap_page(void) swap_list.next = next; /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_CACHE); + offset = scan_swap_map(si, SWAP_HAS_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -508,7 +484,7 @@ swp_entry_t get_swap_page_of_type(int type) if (si && (si->flags & SWP_WRITEOK)) { nr_swap_pages--; /* This is called for allocating swap entry, not cache */ - offset = scan_swap_map(si, SWAP_MAP); + offset = scan_swap_map(si, 1); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -555,29 +531,31 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, - swp_entry_t ent, int cache) +static unsigned short swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned short usage) { - unsigned long offset = swp_offset(ent); - int count = swap_count(p->swap_map[offset]); - bool has_cache; + unsigned long offset = swp_offset(entry); + unsigned short count; + unsigned short has_cache; - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if (cache == SWAP_MAP) { /* dropping usage count of swap */ - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = encode_swapmap(count, has_cache); - } - } else { /* dropping swap cache flag */ + if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); - p->swap_map[offset] = encode_swapmap(count, false); + has_cache = 0; + } else if (count < SWAP_MAP_MAX) + count--; + + if (!count) + mem_cgroup_uncharge_swap(entry); + + usage = count | has_cache; + p->swap_map[offset] = usage; - } - /* return code. */ - count = p->swap_map[offset]; /* free if no reference */ - if (!count) { + if (!usage) { if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -588,9 +566,8 @@ static int swap_entry_free(struct swap_info_struct *p, nr_swap_pages++; p->inuse_pages--; } - if (!swap_count(count)) - mem_cgroup_uncharge_swap(ent); - return count; + + return usage; } /* @@ -603,7 +580,7 @@ void swap_free(swp_entry_t entry) p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_MAP); + swap_entry_free(p, entry, 1); spin_unlock(&swap_lock); } } @@ -614,19 +591,13 @@ void swap_free(swp_entry_t entry) void swapcache_free(swp_entry_t entry, struct page *page) { struct swap_info_struct *p; - int ret; + unsigned short count; p = swap_info_get(entry); if (p) { - ret = swap_entry_free(p, entry, SWAP_CACHE); - if (page) { - bool swapout; - if (ret) - swapout = true; /* the end of swap out */ - else - swapout = false; /* no more swap users! */ - mem_cgroup_uncharge_swapcache(page, entry, swapout); - } + count = swap_entry_free(p, entry, SWAP_HAS_CACHE); + if (page) + mem_cgroup_uncharge_swapcache(page, entry, count != 0); spin_unlock(&swap_lock); } } @@ -705,7 +676,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { + if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -1212,7 +1183,7 @@ static int try_to_unuse(unsigned int type) if (swap_count(*swap_map) == SWAP_MAP_MAX) { spin_lock(&swap_lock); - *swap_map = encode_swapmap(0, true); + *swap_map = SWAP_HAS_CACHE; spin_unlock(&swap_lock); reset_overflow = 1; } @@ -2111,16 +2082,16 @@ void si_swapinfo(struct sysinfo *val) * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT */ -static int __swap_duplicate(swp_entry_t entry, bool cache) +static int __swap_duplicate(swp_entry_t entry, unsigned short usage) { struct swap_info_struct *p; unsigned long offset, type; - int result = -EINVAL; - int count; - bool has_cache; + unsigned short count; + unsigned short has_cache; + int err = -EINVAL; if (non_swap_entry(entry)) - return -EINVAL; + goto out; type = swp_type(entry); if (type >= nr_swapfiles) @@ -2129,54 +2100,56 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) offset = swp_offset(entry); spin_lock(&swap_lock); - if (unlikely(offset >= p->max)) goto unlock_out; - count = swap_count(p->swap_map[offset]); - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + err = 0; - if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + if (usage == SWAP_HAS_CACHE) { /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) { - p->swap_map[offset] = encode_swapmap(count, true); - result = 0; - } else if (has_cache) /* someone added cache */ - result = -EEXIST; - else if (!count) /* no users */ - result = -ENOENT; + if (!has_cache && count) + has_cache = SWAP_HAS_CACHE; + else if (has_cache) /* someone else added cache */ + err = -EEXIST; + else /* no users remaining */ + err = -ENOENT; } else if (count || has_cache) { - if (count < SWAP_MAP_MAX - 1) { - p->swap_map[offset] = encode_swapmap(count + 1, - has_cache); - result = 0; - } else if (count <= SWAP_MAP_MAX) { + + if (count < SWAP_MAP_MAX - 1) + count++; + else if (count <= SWAP_MAP_MAX) { if (swap_overflow++ < 5) printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, - has_cache); - result = 0; - } + count = SWAP_MAP_MAX; + } else + err = -EINVAL; } else - result = -ENOENT; /* unused swap entry */ + err = -ENOENT; /* unused swap entry */ + + p->swap_map[offset] = count | has_cache; + unlock_out: spin_unlock(&swap_lock); out: - return result; + return err; bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } + /* * increase reference count of swap entry by 1. */ void swap_duplicate(swp_entry_t entry) { - __swap_duplicate(entry, SWAP_MAP); + __swap_duplicate(entry, 1); } /* @@ -2189,7 +2162,7 @@ void swap_duplicate(swp_entry_t entry) */ int swapcache_prepare(swp_entry_t entry) { - return __swap_duplicate(entry, SWAP_CACHE); + return __swap_duplicate(entry, SWAP_HAS_CACHE); } /* -- cgit v1.2.3 From 8d69aaee80c123b460918816cbfa2e83224c3646 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:45 -0800 Subject: swap_info: swap_map of chars not shorts Halve the vmalloc'ed swap_map array from unsigned shorts to unsigned chars: it's still very unusual to reach a swap count of 126, and the next patch allows it to be extended indefinitely. Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 52497490a7ca..c0d7b9ed0c16 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,7 +53,7 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); -static inline int swap_count(unsigned short ent) +static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; } @@ -203,7 +203,7 @@ static int wait_for_discard(void *word) #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned short usage) + unsigned char usage) { unsigned long offset; unsigned long scan_base; @@ -531,12 +531,12 @@ out: return NULL; } -static unsigned short swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned short usage) +static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) { unsigned long offset = swp_offset(entry); - unsigned short count; - unsigned short has_cache; + unsigned char count; + unsigned char has_cache; count = p->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; @@ -591,7 +591,7 @@ void swap_free(swp_entry_t entry) void swapcache_free(swp_entry_t entry, struct page *page) { struct swap_info_struct *p; - unsigned short count; + unsigned char count; p = swap_info_get(entry); if (p) { @@ -975,7 +975,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int max = si->max; unsigned int i = prev; - int count; + unsigned char count; /* * No need for swap_lock here: we're just looking @@ -1013,8 +1013,8 @@ static int try_to_unuse(unsigned int type) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned short *swap_map; - unsigned short swcount; + unsigned char *swap_map; + unsigned char swcount; struct page *page; swp_entry_t entry; unsigned int i = 0; @@ -1174,6 +1174,12 @@ static int try_to_unuse(unsigned int type) * If that's wrong, then we should worry more about * exit_mmap() and do_munmap() cases described above: * we might be resetting SWAP_MAP_MAX too early here. + * + * Yes, that's wrong: though very unlikely, swap count 0x7ffe + * could surely occur if pid_max raised from PID_MAX_DEFAULT; + * and we are now lowering SWAP_MAP_MAX to 0x7e, making it + * much easier to reach. But the next patch will fix that. + * * We know "Undead"s can happen, they're okay, so don't * report them; but do report if we reset SWAP_MAP_MAX. */ @@ -1492,7 +1498,7 @@ bad_bmap: SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned short *swap_map; + unsigned char *swap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; @@ -1762,7 +1768,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages = 1; unsigned long swapfilepages; - unsigned short *swap_map = NULL; + unsigned char *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; int did_down = 0; @@ -1938,13 +1944,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); + swap_map = vmalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap; } - memset(swap_map, 0, maxpages * sizeof(short)); + memset(swap_map, 0, maxpages); for (i = 0; i < swap_header->info.nr_badpages; i++) { int page_nr = swap_header->info.badpages[i]; if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { @@ -2082,12 +2088,12 @@ void si_swapinfo(struct sysinfo *val) * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT */ -static int __swap_duplicate(swp_entry_t entry, unsigned short usage) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; unsigned long offset, type; - unsigned short count; - unsigned short has_cache; + unsigned char count; + unsigned char has_cache; int err = -EINVAL; if (non_swap_entry(entry)) -- cgit v1.2.3 From 570a335b8e22579e2a51a68136d2b1f907a20eec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:46 -0800 Subject: swap_info: swap count continuations Swap is duplicated (reference count incremented by one) whenever the same swap page is inserted into another mm (when forking finds a swap entry in place of a pte, or when reclaim unmaps a pte to insert the swap entry). swap_info_struct's vmalloc'ed swap_map is the array of these reference counts: but what happens when the unsigned short (or unsigned char since the preceding patch) is full? (and its high bit is kept for a cache flag) We then lose track of it, never freeing, leaving it in use until swapoff: at which point we _hope_ that a single pass will have found all instances, assume there are no more, and will lose user data if we're wrong. Swapping of KSM pages has not yet been enabled; but it is implemented, and makes it very easy for a user to overflow the maximum swap count: possible with ordinary process pages, but unlikely, even when pid_max has been raised from PID_MAX_DEFAULT. This patch implements swap count continuations: when the count overflows, a continuation page is allocated and linked to the original vmalloc'ed map page, and this used to hold the continuation counts for that entry and its neighbours. These continuation pages are seldom referenced: the common paths all work on the original swap_map, only referring to a continuation page when the low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 304 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 250 insertions(+), 54 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index c0d7b9ed0c16..cc5e7ebf2d2c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -35,11 +35,14 @@ #include #include +static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +static void free_swap_count_continuations(struct swap_info_struct *); + static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; -static int swap_overflow; static int least_priority; static const char Bad_file[] = "Bad swap file entry "; @@ -55,7 +58,7 @@ static DEFINE_MUTEX(swapon_mutex); static inline unsigned char swap_count(unsigned char ent) { - return ent & ~SWAP_HAS_CACHE; + return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ } /* returns 1 if swap entry is freed */ @@ -545,8 +548,15 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; - } else if (count < SWAP_MAP_MAX) - count--; + } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if (count == COUNT_CONTINUED) { + if (swap_count_continued(p, offset, count)) + count = SWAP_MAP_MAX | COUNT_CONTINUED; + else + count = SWAP_MAP_MAX; + } else + count--; + } if (!count) mem_cgroup_uncharge_swap(entry); @@ -604,6 +614,8 @@ void swapcache_free(swp_entry_t entry, struct page *page) /* * How many references to page are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. */ static inline int page_swapcount(struct page *page) { @@ -1019,7 +1031,6 @@ static int try_to_unuse(unsigned int type) swp_entry_t entry; unsigned int i = 0; int retval = 0; - int reset_overflow = 0; int shmem; /* @@ -1034,8 +1045,7 @@ static int try_to_unuse(unsigned int type) * together, child after parent. If we race with dup_mmap(), we * prefer to resolve parent before child, lest we miss entries * duplicated after we scanned child: using last mm would invert - * that. Though it's only a serious concern when an overflowed - * swap count is reset from SWAP_MAP_MAX, preventing a rescan. + * that. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -1164,36 +1174,6 @@ static int try_to_unuse(unsigned int type) break; } - /* - * How could swap count reach 0x7ffe ? - * There's no way to repeat a swap page within an mm - * (except in shmem, where it's the shared object which takes - * the reference count)? - * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned - * short is too small....) - * If that's wrong, then we should worry more about - * exit_mmap() and do_munmap() cases described above: - * we might be resetting SWAP_MAP_MAX too early here. - * - * Yes, that's wrong: though very unlikely, swap count 0x7ffe - * could surely occur if pid_max raised from PID_MAX_DEFAULT; - * and we are now lowering SWAP_MAP_MAX to 0x7e, making it - * much easier to reach. But the next patch will fix that. - * - * We know "Undead"s can happen, they're okay, so don't - * report them; but do report if we reset SWAP_MAP_MAX. - */ - /* We might release the lock_page() in unuse_mm(). */ - if (!PageSwapCache(page) || page_private(page) != entry.val) - goto retry; - - if (swap_count(*swap_map) == SWAP_MAP_MAX) { - spin_lock(&swap_lock); - *swap_map = SWAP_HAS_CACHE; - spin_unlock(&swap_lock); - reset_overflow = 1; - } - /* * If a reference remains (rare), we would like to leave * the page in the swap cache; but try_to_unmap could @@ -1235,7 +1215,6 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); -retry: unlock_page(page); page_cache_release(page); @@ -1247,10 +1226,6 @@ retry: } mmput(start_mm); - if (reset_overflow) { - printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); - swap_overflow = 0; - } return retval; } @@ -1593,6 +1568,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) up_write(&swap_unplug_sem); destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) + free_swap_count_continuations(p); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); drain_mmlist(); @@ -2079,14 +2057,13 @@ void si_swapinfo(struct sysinfo *val) /* * Verify that a swap entry is valid and increment its swap map count. * - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as - * "permanent", but will be reclaimed by the next swapoff. * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { @@ -2126,15 +2103,14 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) } else if (count || has_cache) { - if (count < SWAP_MAP_MAX - 1) - count++; - else if (count <= SWAP_MAP_MAX) { - if (swap_overflow++ < 5) - printk(KERN_WARNING - "swap_dup: swap entry overflow\n"); - count = SWAP_MAP_MAX; - } else + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + count += usage; + else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) err = -EINVAL; + else if (swap_count_continued(p, offset, count)) + count = COUNT_CONTINUED; + else + err = -ENOMEM; } else err = -ENOENT; /* unused swap entry */ @@ -2153,9 +2129,13 @@ bad_file: /* * increase reference count of swap entry by 1. */ -void swap_duplicate(swp_entry_t entry) +int swap_duplicate(swp_entry_t entry) { - __swap_duplicate(entry, 1); + int err = 0; + + while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + return err; } /* @@ -2222,3 +2202,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) *offset = ++toff; return nr_pages? ++nr_pages: 0; } + +/* + * add_swap_count_continuation - called when a swap count is duplicated + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's + * page of the original vmalloc'ed swap_map, to hold the continuation count + * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called + * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. + * + * These continuation pages are seldom referenced: the common paths all work + * on the original swap_map, only referring to a continuation page when the + * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. + * + * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding + * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) + * can be called after dropping locks. + */ +int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) +{ + struct swap_info_struct *si; + struct page *head; + struct page *page; + struct page *list_page; + pgoff_t offset; + unsigned char count; + + /* + * When debugging, it's easier to use __GFP_ZERO here; but it's better + * for latency not to zero a page while GFP_ATOMIC and holding locks. + */ + page = alloc_page(gfp_mask | __GFP_HIGHMEM); + + si = swap_info_get(entry); + if (!si) { + /* + * An acceptable race has occurred since the failing + * __swap_duplicate(): the swap entry has been freed, + * perhaps even the whole swap_map cleared for swapoff. + */ + goto outer; + } + + offset = swp_offset(entry); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; + + if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { + /* + * The higher the swap count, the more likely it is that tasks + * will race to add swap count continuation: we need to avoid + * over-provisioning. + */ + goto out; + } + + if (!page) { + spin_unlock(&swap_lock); + return -ENOMEM; + } + + /* + * We are fortunate that although vmalloc_to_page uses pte_offset_map, + * no architecture is using highmem pages for kernel pagetables: so it + * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. + */ + head = vmalloc_to_page(si->swap_map + offset); + offset &= ~PAGE_MASK; + + /* + * Page allocation does not initialize the page's lru field, + * but it does always reset its private field. + */ + if (!page_private(head)) { + BUG_ON(count & COUNT_CONTINUED); + INIT_LIST_HEAD(&head->lru); + set_page_private(head, SWP_CONTINUED); + si->flags |= SWP_CONTINUED; + } + + list_for_each_entry(list_page, &head->lru, lru) { + unsigned char *map; + + /* + * If the previous map said no continuation, but we've found + * a continuation page, free our allocation and use this one. + */ + if (!(count & COUNT_CONTINUED)) + goto out; + + map = kmap_atomic(list_page, KM_USER0) + offset; + count = *map; + kunmap_atomic(map, KM_USER0); + + /* + * If this continuation count now has some space in it, + * free our allocation and use this one. + */ + if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) + goto out; + } + + list_add_tail(&page->lru, &head->lru); + page = NULL; /* now it's attached, don't free it */ +out: + spin_unlock(&swap_lock); +outer: + if (page) + __free_page(page); + return 0; +} + +/* + * swap_count_continued - when the original swap_map count is incremented + * from SWAP_MAP_MAX, check if there is already a continuation page to carry + * into, carry if so, or else fail until a new continuation page is allocated; + * when the original swap_map count is decremented from 0 with continuation, + * borrow from the continuation and report whether it still holds more. + * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + */ +static bool swap_count_continued(struct swap_info_struct *si, + pgoff_t offset, unsigned char count) +{ + struct page *head; + struct page *page; + unsigned char *map; + + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head) != SWP_CONTINUED) { + BUG_ON(count & COUNT_CONTINUED); + return false; /* need to add count continuation */ + } + + offset &= ~PAGE_MASK; + page = list_entry(head->lru.next, struct page, lru); + map = kmap_atomic(page, KM_USER0) + offset; + + if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ + goto init_map; /* jump over SWAP_CONT_MAX checks */ + + if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ + /* + * Think of how you add 1 to 999 + */ + while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + if (*map == SWAP_CONT_MAX) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + if (page == head) + return false; /* add count continuation */ + map = kmap_atomic(page, KM_USER0) + offset; +init_map: *map = 0; /* we didn't zero the page */ + } + *map += 1; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return true; /* incremented */ + + } else { /* decrementing */ + /* + * Think of how you subtract 1 from 1000 + */ + BUG_ON(count != COUNT_CONTINUED); + while (*map == COUNT_CONTINUED) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + BUG_ON(*map == 0); + *map -= 1; + if (*map == 0) + count = 0; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = SWAP_CONT_MAX | count; + count = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return count == COUNT_CONTINUED; + } +} + +/* + * free_swap_count_continuations - swapoff free all the continuation pages + * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. + */ +static void free_swap_count_continuations(struct swap_info_struct *si) +{ + pgoff_t offset; + + for (offset = 0; offset < si->max; offset += PAGE_SIZE) { + struct page *head; + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head)) { + struct list_head *this, *next; + list_for_each_safe(this, next, &head->lru) { + struct page *page; + page = list_entry(this, struct page, lru); + list_del(this); + __free_page(page); + } + } + } +} -- cgit v1.2.3 From aaa468653b4a0d11c603c48d716f765177a5a9e4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:47 -0800 Subject: swap_info: note SWAP_MAP_SHMEM While we're fiddling with the swap_map values, let's assign a particular value to shmem/tmpfs swap pages: their swap counts are never incremented, and it helps swapoff's try_to_unuse() a little if it can immediately distinguish those pages from process pages. Since we've no use for SWAP_MAP_BAD | COUNT_CONTINUED, we might as well use that 0xbf value for SWAP_MAP_SHMEM. Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index cc5e7ebf2d2c..58bec6600167 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -548,6 +548,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); has_cache = 0; + } else if (count == SWAP_MAP_SHMEM) { + /* + * Or we could insist on shmem.c using a special + * swap_shmem_free() and free_shmem_swap_and_cache()... + */ + count = 0; } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { if (count == COUNT_CONTINUED) { if (swap_count_continued(p, offset, count)) @@ -1031,7 +1037,6 @@ static int try_to_unuse(unsigned int type) swp_entry_t entry; unsigned int i = 0; int retval = 0; - int shmem; /* * When searching mms for an entry, a good strategy is to @@ -1107,17 +1112,18 @@ static int try_to_unuse(unsigned int type) /* * Remove all references to entry. - * Whenever we reach init_mm, there's no address space - * to search, but use it as a reminder to search shmem. */ - shmem = 0; swcount = *swap_map; - if (swap_count(swcount)) { - if (start_mm == &init_mm) - shmem = shmem_unuse(entry, page); - else - retval = unuse_mm(start_mm, entry, page); + if (swap_count(swcount) == SWAP_MAP_SHMEM) { + retval = shmem_unuse(entry, page); + /* page has already been unlocked and released */ + if (retval < 0) + break; + continue; } + if (swap_count(swcount) && start_mm != &init_mm) + retval = unuse_mm(start_mm, entry, page); + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; @@ -1128,7 +1134,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && !shmem && + while (swap_count(*swap_map) && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1142,10 +1148,9 @@ static int try_to_unuse(unsigned int type) swcount = *swap_map; if (!swap_count(swcount)) /* any usage ? */ ; - else if (mm == &init_mm) { + else if (mm == &init_mm) set_start_mm = 1; - shmem = shmem_unuse(entry, page); - } else + else retval = unuse_mm(mm, entry, page); if (set_start_mm && *swap_map < swcount) { @@ -1161,13 +1166,6 @@ static int try_to_unuse(unsigned int type) mmput(start_mm); start_mm = new_start_mm; } - if (shmem) { - /* page has already been unlocked and released */ - if (shmem > 0) - continue; - retval = shmem; - break; - } if (retval) { unlock_page(page); page_cache_release(page); @@ -2126,6 +2124,15 @@ bad_file: goto out; } +/* + * Help swapoff by noting that swap entry belongs to shmem/tmpfs + * (in which case its reference count is never incremented). + */ +void swap_shmem_alloc(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP_SHMEM); +} + /* * increase reference count of swap entry by 1. */ -- cgit v1.2.3 From d4906e1aa516cc965292b43b5a26122dd4344e7e Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 14 Dec 2009 17:58:49 -0800 Subject: swap: rework map_swap_page() again Seems that page_io.c doesn't really need to know that page_private(page) is the swp_entry 'val'. Rework map_swap_page() to do what its name says and map a page to a page offset in the swap space. The only other caller of map_swap_page() is internal to mm/swapfile.c and it does want to map a swap entry to the 'sector'. So rename map_swap_page() to map_swap_entry(), make it 'static' and and implement map_swap_page() as a wrapper around that. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 58bec6600167..d5eb2e85600b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,6 +38,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); +static sector_t map_swap_entry(swp_entry_t, struct block_device**); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -782,7 +783,7 @@ sector_t swapdev_block(int type, pgoff_t offset) return 0; if (!(swap_info[type]->flags & SWP_WRITEOK)) return 0; - return map_swap_page(swp_entry(type, offset), &bdev); + return map_swap_entry(swp_entry(type, offset), &bdev); } /* @@ -1249,10 +1250,11 @@ static void drain_mmlist(void) /* * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset `offset'. Note that the type of this function - * is sector_t, but it returns page offset into the bdev, not sector offset. + * corresponds to page offset for the specified swap entry. + * Note that the type of this function is sector_t, but it returns page offset + * into the bdev, not sector offset. */ -sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) +static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) { struct swap_info_struct *sis; struct swap_extent *start_se; @@ -1280,6 +1282,16 @@ sector_t map_swap_page(swp_entry_t entry, struct block_device **bdev) } } +/* + * Returns the page offset into bdev for the specified page's swap entry. + */ +sector_t map_swap_page(struct page *page, struct block_device **bdev) +{ + swp_entry_t entry; + entry.val = page_private(page); + return map_swap_entry(entry, bdev); +} + /* * Free all of a swapdev's extent information */ -- cgit v1.2.3 From 3ca7b3c5b64d35fe02c35b5d44c2c58b49499fee Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:58:57 -0800 Subject: mm: define PAGE_MAPPING_FLAGS At present we define PageAnon(page) by the low PAGE_MAPPING_ANON bit set in page->mapping, with the higher bits a pointer to the anon_vma; and have defined PageKsm(page) as that with NULL anon_vma. But KSM swapping will need to store a pointer there: so in preparation for that, now define PAGE_MAPPING_FLAGS as the low two bits, including PAGE_MAPPING_KSM (always set along with PAGE_MAPPING_ANON, until some other use for the bit emerges). Declare page_rmapping(page) to return the pointer part of page->mapping, and page_anon_vma(page) to return the anon_vma pointer when that's what it is. Use these in a few appropriate places: notably, unuse_vma() has been testing page->mapping, but is better to be testing page_anon_vma() (cases may be added in which flag bits are set without any pointer). Signed-off-by: Hugh Dickins Cc: Izik Eidus Cc: Andrea Arcangeli Cc: Nick Piggin Cc: KOSAKI Motohiro Reviewed-by: Rik van Riel Cc: Lee Schermerhorn Cc: Andi Kleen Cc: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index d5eb2e85600b..e74112e8e5f4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -938,7 +938,7 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned long addr, end, next; int ret; - if (page->mapping) { + if (page_anon_vma(page)) { addr = page_address_in_vma(page, vma); if (addr == -EFAULT) return 0; -- cgit v1.2.3 From 5ad6468801d28c4d4ac9f48ec19297817c915f6a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:59:24 -0800 Subject: ksm: let shared pages be swappable Initial implementation for swapping out KSM's shared pages: add page_referenced_ksm() and try_to_unmap_ksm(), which rmap.c calls when faced with a PageKsm page. Most of what's needed can be got from the rmap_items listed from the stable_node of the ksm page, without discovering the actual vma: so in this patch just fake up a struct vma for page_referenced_one() or try_to_unmap_one(), then refine that in the next patch. Add VM_NONLINEAR to ksm_madvise()'s list of exclusions: it has always been implicit there (being only set with VM_SHARED, already excluded), but let's make it explicit, to help justify the lack of nonlinear unmap. Rely on the page lock to protect against concurrent modifications to that page's node of the stable tree. The awkward part is not swapout but swapin: do_swap_page() and page_add_anon_rmap() now have to allow for new possibilities - perhaps a ksm page still in swapcache, perhaps a swapcache page associated with one location in one anon_vma now needed for another location or anon_vma. (And the vma might even be no longer VM_MERGEABLE when that happens.) ksm_might_need_to_copy() checks for that case, and supplies a duplicate page when necessary, simply leaving it to a subsequent pass of ksmd to rediscover the identity and merge them back into one ksm page. Disappointingly primitive: but the alternative would have to accumulate unswappable info about the swapped out ksm pages, limiting swappability. Remove page_add_ksm_rmap(): page_add_anon_rmap() now has to allow for the particular case it was handling, so just use it instead. Signed-off-by: Hugh Dickins Cc: Izik Eidus Cc: Andrea Arcangeli Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index e74112e8e5f4..6c0585b16418 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -650,6 +651,8 @@ int reuse_swap_page(struct page *page) int count; VM_BUG_ON(!PageLocked(page)); + if (unlikely(PageKsm(page))) + return 0; count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); @@ -658,7 +661,7 @@ int reuse_swap_page(struct page *page) SetPageDirty(page); } } - return count == 1; + return count <= 1; } /* @@ -1185,6 +1188,12 @@ static int try_to_unuse(unsigned int type) * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. + * + * Given how unuse_vma() targets one particular offset + * in an anon_vma, once the anon_vma has been determined, + * this splitting happens to be just what is needed to + * handle where KSM pages have been swapped out: re-reading + * is unnecessarily slow, but we can fix that later on. */ if (swap_count(*swap_map) && PageDirty(page) && PageSwapCache(page)) { -- cgit v1.2.3