diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 288 |
1 files changed, 91 insertions, 197 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b45a95363a84..ac65bb5e38ac 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -244,16 +244,66 @@ struct file_region { long to; }; +/* Must be called with resv->lock held. Calling this with count_only == true + * will count the number of pages to be added but will not modify the linked + * list. + */ +static long add_reservation_in_range(struct resv_map *resv, long f, long t, + bool count_only) +{ + long chg = 0; + struct list_head *head = &resv->regions; + struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* We overlap with this area, if it extends further than + * us then we must extend ourselves. Account for its + * existing reservation. + */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + + if (!count_only && rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + + if (!count_only) { + nrg->from = f; + nrg->to = t; + } + + return chg; +} + /* * Add the huge page range represented by [f, t) to the reserve - * map. In the normal case, existing regions will be expanded - * to accommodate the specified range. Sufficient regions should - * exist for expansion due to the previous call to region_chg - * with the same range. However, it is possible that region_del - * could have been called after region_chg and modifed the map - * in such a way that no region exists to be expanded. In this - * case, pull a region descriptor from the cache associated with - * the map and use that for the new range. + * map. Existing regions will be expanded to accommodate the specified + * range, or a region will be taken from the cache. Sufficient regions + * must exist in the cache due to the previous call to region_chg with + * the same range. * * Return the number of new huge pages added to the map. This * number is greater than or equal to zero. @@ -261,7 +311,7 @@ struct file_region { static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; - struct file_region *rg, *nrg, *trg; + struct file_region *rg, *nrg; long add = 0; spin_lock(&resv->lock); @@ -272,9 +322,8 @@ static long region_add(struct resv_map *resv, long f, long t) /* * If no region exists which can be expanded to include the - * specified range, the list must have been modified by an - * interleving call to region_del(). Pull a region descriptor - * from the cache and use it for this range. + * specified range, pull a region descriptor from the cache + * and use it for this range. */ if (&rg->link == head || t < rg->from) { VM_BUG_ON(resv->region_cache_count <= 0); @@ -292,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t) goto out_locked; } - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - break; - - /* If this area reaches higher then extend our area to - * include it completely. If this is not the first area - * which we intend to reuse, free it. */ - if (rg->to > t) - t = rg->to; - if (rg != nrg) { - /* Decrement return value by the deleted range. - * Another range will span this area so that by - * end of routine add will be >= zero - */ - add -= (rg->to - rg->from); - list_del(&rg->link); - kfree(rg); - } - } - - add += (nrg->from - f); /* Added to beginning of region */ - nrg->from = f; - add += t - nrg->to; /* Added to end of region */ - nrg->to = t; + add = add_reservation_in_range(resv, f, t, false); out_locked: resv->adds_in_progress--; @@ -339,15 +357,9 @@ out_locked: * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the - * map. However, if the existing regions in the map can not - * be expanded to represent the new range, a new file_region - * structure is added to the map as a placeholder. This is - * so that the subsequent region_add call will have all the - * regions it needs and will not fail. - * - * Upon entry, region_chg will also examine the cache of region descriptors - * associated with the map. If there are not enough descriptors cached, one - * will be allocated for the in progress add operation. + * map. A new file_region structure is added to the cache + * as a placeholder, so that the subsequent region_add + * call will have all the regions it needs and will not fail. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). This number is greater or equal to @@ -356,11 +368,8 @@ out_locked: */ static long region_chg(struct resv_map *resv, long f, long t) { - struct list_head *head = &resv->regions; - struct file_region *rg, *nrg = NULL; long chg = 0; -retry: spin_lock(&resv->lock); retry_locked: resv->adds_in_progress++; @@ -378,10 +387,8 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) { - kfree(nrg); + if (!trg) return -ENOMEM; - } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -389,61 +396,8 @@ retry_locked: goto retry_locked; } - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; + chg = add_reservation_in_range(resv, f, t, true); - /* If we are below the current region then a new region is required. - * Subtle, allocate a new region at the position but make it zero - * size such that we can guarantee to record the reservation. */ - if (&rg->link == head || t < rg->from) { - if (!nrg) { - resv->adds_in_progress--; - spin_unlock(&resv->lock); - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - goto retry; - } - - list_add(&nrg->link, rg->link.prev); - chg = t - f; - goto out_nrg; - } - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - chg = t - f; - - /* Check for and consume any regions we now overlap with. */ - list_for_each_entry(rg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - goto out; - - /* We overlap with this area, if it extends further than - * us then we must extend ourselves. Account for its - * existing reservation. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; - } - chg -= rg->to - rg->from; - } - -out: - spin_unlock(&resv->lock); - /* We already know we raced and no longer need the new region */ - kfree(nrg); - return chg; -out_nrg: spin_unlock(&resv->lock); return chg; } @@ -1069,85 +1023,12 @@ static void free_gigantic_page(struct page *page, unsigned int order) } #ifdef CONFIG_CONTIG_ALLOC -static int __alloc_gigantic_page(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) -{ - unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); -} - -static bool pfn_range_valid_gigantic(struct zone *z, - unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long i, end_pfn = start_pfn + nr_pages; - struct page *page; - - for (i = start_pfn; i < end_pfn; i++) { - page = pfn_to_online_page(i); - if (!page) - return false; - - if (page_zone(page) != z) - return false; - - if (PageReserved(page)) - return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; - } - - return true; -} - -static bool zone_spans_last_pfn(const struct zone *zone, - unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long last_pfn = start_pfn + nr_pages - 1; - return zone_spans_pfn(zone, last_pfn); -} - static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned int order = huge_page_order(h); - unsigned long nr_pages = 1 << order; - unsigned long ret, pfn, flags; - struct zonelist *zonelist; - struct zone *zone; - struct zoneref *z; + unsigned long nr_pages = 1UL << huge_page_order(h); - zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { - spin_lock_irqsave(&zone->lock, flags); - - pfn = ALIGN(zone->zone_start_pfn, nr_pages); - while (zone_spans_last_pfn(zone, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { - /* - * We release the zone lock here because - * alloc_contig_range() will also lock the zone - * at some point. If there's an allocation - * spinning on this lock, it may win the race - * and cause alloc_contig_range() to fail... - */ - spin_unlock_irqrestore(&zone->lock, flags); - ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); - if (!ret) - return pfn_to_page(pfn); - spin_lock_irqsave(&zone->lock, flags); - } - pfn += nr_pages; - } - - spin_unlock_irqrestore(&zone->lock, flags); - } - - return NULL; + return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); @@ -3915,7 +3796,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -4042,8 +3923,7 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; @@ -4051,7 +3931,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, key[0] = (unsigned long) mapping; key[1] = idx; - hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } @@ -4060,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } @@ -4105,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -4459,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } } + + /* + * If subpage information not requested, update counters + * and skip the same_page loop below. + */ + if (!pages && !vmas && !pfn_offset && + (vaddr + huge_page_size(h) < vma->vm_end) && + (remainder >= pages_per_huge_page(h))) { + vaddr += huge_page_size(h); + remainder -= pages_per_huge_page(h); + i += pages_per_huge_page(h); + spin_unlock(ptl); + continue; + } + same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); @@ -4842,7 +4736,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4872,7 +4766,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); return pte; } |