diff options
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- | mm/filemap.c | 450 |
1 files changed, 278 insertions, 172 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 00f01d8ead47..9e44a49bbd74 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> @@ -58,6 +59,8 @@ #include <asm/mman.h> +#include "swap.h" + /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -114,7 +117,7 @@ * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) + * ->memcg->move_lock (page_remove_rmap->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) @@ -1359,8 +1362,6 @@ repeat: /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. - * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required - * for pte entries, pass NULL for pmd entries. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is @@ -1369,13 +1370,13 @@ repeat: * should be called while holding the ptl for the migration entry referencing * the page. * - * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, - spinlock_t *ptl) +void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) + __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; @@ -1409,10 +1410,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ - if (ptep) - pte_unmap_unlock(ptep, ptl); - else - spin_unlock(ptl); + spin_unlock(ptl); for (;;) { unsigned int flags; @@ -1625,36 +1623,6 @@ void folio_end_writeback(struct folio *folio) } EXPORT_SYMBOL(folio_end_writeback); -/* - * After completing I/O on a page, call this routine to update the page - * flags appropriately - */ -void page_endio(struct page *page, bool is_write, int err) -{ - struct folio *folio = page_folio(page); - - if (!is_write) { - if (!err) { - folio_mark_uptodate(folio); - } else { - folio_clear_uptodate(folio); - folio_set_error(folio); - } - folio_unlock(folio); - } else { - if (err) { - struct address_space *mapping; - - folio_set_error(folio); - mapping = folio_mapping(folio); - if (mapping) - mapping_set_error(mapping, err); - } - folio_end_writeback(folio); - } -} -EXPORT_SYMBOL_GPL(page_endio); - /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock @@ -1760,9 +1728,7 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). - * In the rare case of index wrap-around, 0 will be returned. 0 will also - * be returned if index == 0 and there is a gap at the index. We can not - * wrap-around if passed index == 0. + * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1772,13 +1738,12 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == 0 && index != 0) - return xas.xa_index; + break; + if (xas.xa_index == 0) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index + 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_next_miss); @@ -1799,9 +1764,7 @@ EXPORT_SYMBOL(page_cache_next_miss); * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). - * In the rare case of wrap-around, ULONG_MAX will be returned. ULONG_MAX - * will also be returned if index == ULONG_MAX and there is a gap at the - * index. We can not wrap-around if passed index == ULONG_MAX. + * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) @@ -1811,13 +1774,12 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) - return xas.xa_index; - if (xas.xa_index == ULONG_MAX && index != ULONG_MAX) - return xas.xa_index; + break; + if (xas.xa_index == ULONG_MAX) + break; } - /* No gaps in range and no wrap-around, return index beyond range */ - return xas.xa_index - 1; + return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); @@ -2767,6 +2729,48 @@ put_folios: } EXPORT_SYMBOL_GPL(filemap_read); +int kiocb_write_and_wait(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) + return -EAGAIN; + return 0; + } + + return filemap_write_and_wait_range(mapping, pos, end); +} + +int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos; + loff_t end = pos + count - 1; + int ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* we could block if there are any pages in the range */ + if (filemap_range_has_page(mapping, pos, end)) + return -EAGAIN; + } else { + ret = filemap_write_and_wait_range(mapping, pos, end); + if (ret) + return ret; + } + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, + end >> PAGE_SHIFT); +} + /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block @@ -2802,18 +2806,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iocb->ki_pos, - iocb->ki_pos + count - 1)) - return -EAGAIN; - } else { - retval = filemap_write_and_wait_range(mapping, - iocb->ki_pos, - iocb->ki_pos + count - 1); - if (retval < 0) - return retval; - } - + retval = kiocb_write_and_wait(iocb, count); + if (retval < 0) + return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); @@ -3436,13 +3431,6 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, if (pmd_none(*vmf->pmd)) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); - /* See comment in handle_pte_fault() */ - if (pmd_devmap_trans_unstable(vmf->pmd)) { - folio_unlock(folio); - folio_put(folio); - return true; - } - return false; } @@ -3529,6 +3517,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); + if (!vmf->pte) { + folio_unlock(folio); + folio_put(folio); + goto out; + } do { again: page = folio_file_page(folio, xas.xa_index); @@ -3547,7 +3540,7 @@ again: * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) + if (!pte_none(ptep_get(vmf->pte))) goto unlock; /* We're about to handle the fault */ @@ -3806,7 +3799,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ -void dio_warn_stale_pagecache(struct file *filp) +static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; @@ -3823,48 +3816,33 @@ void dio_warn_stale_pagecache(struct file *filp) } } -ssize_t -generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos; - ssize_t written; - size_t write_len; - pgoff_t end; + struct address_space *mapping = iocb->ki_filp->f_mapping; - write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_SHIFT; + if (mapping->nrpages && + invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, + (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) + dio_warn_stale_pagecache(iocb->ki_filp); +} - if (iocb->ki_flags & IOCB_NOWAIT) { - /* If there are pages to writeback, return */ - if (filemap_range_has_page(file->f_mapping, pos, - pos + write_len - 1)) - return -EAGAIN; - } else { - written = filemap_write_and_wait_range(mapping, pos, - pos + write_len - 1); - if (written) - goto out; - } +ssize_t +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + size_t write_len = iov_iter_count(from); + ssize_t written; /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ + written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; - goto out; + return written; } written = mapping->a_ops->direct_IO(iocb, from); @@ -3886,11 +3864,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * * Skip invalidation for async writes or if mapping has no pages. */ - if (written > 0 && mapping->nrpages && - invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) - dio_warn_stale_pagecache(file); - if (written > 0) { + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + + kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -3901,7 +3879,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); -out: return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -3980,7 +3957,10 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); - return written ? written : status; + if (!written) + return status; + iocb->ki_pos += written; + return written; } EXPORT_SYMBOL(generic_perform_write); @@ -4009,25 +3989,19 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t written = 0; - ssize_t err; - ssize_t status; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_privs(file); - if (err) - goto out; + struct inode *inode = mapping->host; + ssize_t ret; - err = file_update_time(file); - if (err) - goto out; + ret = file_remove_privs(file); + if (ret) + return ret; - if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos, endbyte; + ret = file_update_time(file); + if (ret) + return ret; - written = generic_file_direct_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to @@ -4035,49 +4009,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ - if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) - goto out; - - pos = iocb->ki_pos; - status = generic_perform_write(iocb, from); - /* - * If generic_perform_write() returned a synchronous error - * then we want to return the number of bytes which were - * direct-written, or the error code if that was zero. Note - * that this differs from normal direct-io semantics, which - * will return -EFOO even if some bytes were written. - */ - if (unlikely(status < 0)) { - err = status; - goto out; - } - /* - * We need to ensure that the page cache pages are written to - * disk and invalidated to preserve the expected O_DIRECT - * semantics. - */ - endbyte = pos + status - 1; - err = filemap_write_and_wait_range(mapping, pos, endbyte); - if (err == 0) { - iocb->ki_pos = endbyte + 1; - written += status; - invalidate_mapping_pages(mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - } else { - /* - * We don't know how much we wrote, so just return - * the number of bytes which were direct-written - */ - } - } else { - written = generic_perform_write(iocb, from); - if (likely(written > 0)) - iocb->ki_pos += written; + if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) + return ret; + return direct_write_fallback(iocb, from, ret, + generic_perform_write(iocb, from)); } -out: - current->backing_dev_info = NULL; - return written ? written : err; + + return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); @@ -4142,3 +4080,171 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); + +#ifdef CONFIG_CACHESTAT_SYSCALL +/** + * filemap_cachestat() - compute the page cache statistics of a mapping + * @mapping: The mapping to compute the statistics for. + * @first_index: The starting page cache index. + * @last_index: The final page index (inclusive). + * @cs: the cachestat struct to write the result to. + * + * This will query the page cache statistics of a mapping in the + * page range of [first_index, last_index] (inclusive). The statistics + * queried include: number of dirty pages, number of pages marked for + * writeback, and the number of (recently) evicted pages. + */ +static void filemap_cachestat(struct address_space *mapping, + pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) +{ + XA_STATE(xas, &mapping->i_pages, first_index); + struct folio *folio; + + rcu_read_lock(); + xas_for_each(&xas, folio, last_index) { + unsigned long nr_pages; + pgoff_t folio_first_index, folio_last_index; + + if (xas_retry(&xas, folio)) + continue; + + if (xa_is_value(folio)) { + /* page is evicted */ + void *shadow = (void *)folio; + bool workingset; /* not used */ + int order = xa_get_order(xas.xa, xas.xa_index); + + nr_pages = 1 << order; + folio_first_index = round_down(xas.xa_index, 1 << order); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + cs->nr_evicted += nr_pages; + +#ifdef CONFIG_SWAP /* implies CONFIG_MMU */ + if (shmem_mapping(mapping)) { + /* shmem file - in swap cache */ + swp_entry_t swp = radix_to_swp_entry(folio); + + shadow = get_shadow_from_swap_cache(swp); + } +#endif + if (workingset_test_recent(shadow, true, &workingset)) + cs->nr_recently_evicted += nr_pages; + + goto resched; + } + + nr_pages = folio_nr_pages(folio); + folio_first_index = folio_pgoff(folio); + folio_last_index = folio_first_index + nr_pages - 1; + + /* Folios might straddle the range boundaries, only count covered pages */ + if (folio_first_index < first_index) + nr_pages -= first_index - folio_first_index; + + if (folio_last_index > last_index) + nr_pages -= folio_last_index - last_index; + + /* page is in cache */ + cs->nr_cache += nr_pages; + + if (folio_test_dirty(folio)) + cs->nr_dirty += nr_pages; + + if (folio_test_writeback(folio)) + cs->nr_writeback += nr_pages; + +resched: + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * The cachestat(2) system call. + * + * cachestat() returns the page cache statistics of a file in the + * bytes range specified by `off` and `len`: number of cached pages, + * number of dirty pages, number of pages marked for writeback, + * number of evicted pages, and number of recently evicted pages. + * + * An evicted page is a page that is previously in the page cache + * but has been evicted since. A page is recently evicted if its last + * eviction was recent enough that its reentry to the cache would + * indicate that it is actively being used by the system, and that + * there is memory pressure on the system. + * + * `off` and `len` must be non-negative integers. If `len` > 0, + * the queried range is [`off`, `off` + `len`]. If `len` == 0, + * we will query in the range from `off` to the end of the file. + * + * The `flags` argument is unused for now, but is included for future + * extensibility. User should pass 0 (i.e no flag specified). + * + * Currently, hugetlbfs is not supported. + * + * Because the status of a page can change after cachestat() checks it + * but before it returns to the application, the returned values may + * contain stale information. + * + * return values: + * zero - success + * -EFAULT - cstat or cstat_range points to an illegal address + * -EINVAL - invalid flags + * -EBADF - invalid file descriptor + * -EOPNOTSUPP - file descriptor is of a hugetlbfs file + */ +SYSCALL_DEFINE4(cachestat, unsigned int, fd, + struct cachestat_range __user *, cstat_range, + struct cachestat __user *, cstat, unsigned int, flags) +{ + struct fd f = fdget(fd); + struct address_space *mapping; + struct cachestat_range csr; + struct cachestat cs; + pgoff_t first_index, last_index; + + if (!f.file) + return -EBADF; + + if (copy_from_user(&csr, cstat_range, + sizeof(struct cachestat_range))) { + fdput(f); + return -EFAULT; + } + + /* hugetlbfs is not supported */ + if (is_file_hugepages(f.file)) { + fdput(f); + return -EOPNOTSUPP; + } + + if (flags != 0) { + fdput(f); + return -EINVAL; + } + + first_index = csr.off >> PAGE_SHIFT; + last_index = + csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; + memset(&cs, 0, sizeof(struct cachestat)); + mapping = f.file->f_mapping; + filemap_cachestat(mapping, first_index, last_index, &cs); + fdput(f); + + if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_CACHESTAT_SYSCALL */ |