From ec4dd3eb35759f9fbeb5c1abb01403b2fde64cc9 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Mon, 4 Feb 2008 22:28:56 -0800 Subject: maps4: add proportional set size accounting in smaps The "proportional set size" (PSS) of a process is the count of pages it has in memory, where each page is divided by the number of processes sharing it. So if a process has 1000 pages all to itself, and 1000 shared with one other process, its PSS will be 1500. - lwn.net: "ELC: How much memory are applications really using?" The PSS proposed by Matt Mackall is a very nice metic for measuring an process's memory footprint. So collect and export it via /proc//smaps. Matt Mackall's pagemap/kpagemap and John Berthels's exmap can also do the job. They are comprehensive tools. But for PSS, let's do it in the simple way. Cc: John Berthels Cc: Bernardo Innocenti Cc: Padraig Brady Cc: Denys Vlasenko Cc: Balbir Singh Signed-off-by: Matt Mackall Signed-off-by: Fengguang Wu Cc: Hugh Dickins Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8043a3eab52c..8952ce70315e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -114,6 +114,25 @@ static void pad_len_spaces(struct seq_file *m, int len) seq_printf(m, "%*c", len, ' '); } +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + struct mem_size_stats { unsigned long resident; @@ -122,6 +141,7 @@ struct mem_size_stats unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; + u64 pss; }; struct pmd_walker { @@ -195,6 +215,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats seq_printf(m, "Size: %8lu kB\n" "Rss: %8lu kB\n" + "Pss: %8lu kB\n" "Shared_Clean: %8lu kB\n" "Shared_Dirty: %8lu kB\n" "Private_Clean: %8lu kB\n" @@ -202,6 +223,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats "Referenced: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss->resident >> 10, + (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), mss->shared_clean >> 10, mss->shared_dirty >> 10, mss->private_clean >> 10, @@ -226,6 +248,7 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte, ptent; spinlock_t *ptl; struct page *page; + int mapcount; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -242,16 +265,19 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) mss->referenced += PAGE_SIZE; - if (page_mapcount(page) >= 2) { + mapcount = page_mapcount(page); + if (mapcount >= 2) { if (pte_dirty(ptent)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; } else { if (pte_dirty(ptent)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT); } } pte_unmap_unlock(pte - 1, ptl); -- cgit v1.2.3 From b3ae5acbbb98d95c1300c8ced56d15f97d09c506 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:01 -0800 Subject: maps4: use pagewalker in clear_refs and smaps Use the generic pagewalker for smaps and clear_refs Signed-off-by: Matt Mackall Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 95 ++++++++++-------------------------------------------- 1 file changed, 17 insertions(+), 78 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8952ce70315e..791b2d400be5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -135,6 +135,7 @@ static void pad_len_spaces(struct seq_file *m, int len) struct mem_size_stats { + struct vm_area_struct *vma; unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; @@ -144,13 +145,6 @@ struct mem_size_stats u64 pss; }; -struct pmd_walker { - struct vm_area_struct *vma; - void *private; - void (*action)(struct vm_area_struct *, pmd_t *, unsigned long, - unsigned long, void *); -}; - static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { struct proc_maps_private *priv = m->private; @@ -240,11 +234,11 @@ static int show_map(struct seq_file *m, void *v) return show_map_internal(m, v, NULL); } -static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - void *private) +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) { struct mem_size_stats *mss = private; + struct vm_area_struct *vma = mss->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -282,12 +276,13 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap_unlock(pte - 1, ptl); cond_resched(); + return 0; } -static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - void *private) +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, void *private) { + struct vm_area_struct *vma = private; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -308,71 +303,10 @@ static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap_unlock(pte - 1, ptl); cond_resched(); + return 0; } -static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud, - unsigned long addr, unsigned long end) -{ - pmd_t *pmd; - unsigned long next; - - for (pmd = pmd_offset(pud, addr); addr != end; - pmd++, addr = next) { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; - walker->action(walker->vma, pmd, addr, next, walker->private); - } -} - -static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd, - unsigned long addr, unsigned long end) -{ - pud_t *pud; - unsigned long next; - - for (pud = pud_offset(pgd, addr); addr != end; - pud++, addr = next) { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - walk_pmd_range(walker, pud, addr, next); - } -} - -/* - * walk_page_range - walk the page tables of a VMA with a callback - * @vma - VMA to walk - * @action - callback invoked for every bottom-level (PTE) page table - * @private - private data passed to the callback function - * - * Recursively walk the page table for the memory area in a VMA, calling - * a callback for every bottom-level (PTE) page table. - */ -static inline void walk_page_range(struct vm_area_struct *vma, - void (*action)(struct vm_area_struct *, - pmd_t *, unsigned long, - unsigned long, void *), - void *private) -{ - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - struct pmd_walker walker = { - .vma = vma, - .private = private, - .action = action, - }; - pgd_t *pgd; - unsigned long next; - - for (pgd = pgd_offset(vma->vm_mm, addr); addr != end; - pgd++, addr = next) { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - walk_pud_range(&walker, pgd, addr, next); - } -} +static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; static int show_smap(struct seq_file *m, void *v) { @@ -380,11 +314,15 @@ static int show_smap(struct seq_file *m, void *v) struct mem_size_stats mss; memset(&mss, 0, sizeof mss); + mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma, smaps_pte_range, &mss); + walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, + &smaps_walk, &mss); return show_map_internal(m, v, &mss); } +static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; + void clear_refs_smap(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -392,7 +330,8 @@ void clear_refs_smap(struct mm_struct *mm) down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma, clear_refs_pte_range, NULL); + walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, + &clear_refs_walk, vma); flush_tlb_mm(mm); up_read(&mm->mmap_sem); } -- cgit v1.2.3 From 4752c369789250eafcd7813e11c8fb689235b0d2 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:02 -0800 Subject: maps4: simplify interdependence of maps and smaps This pulls the shared map display code out of show_map and puts it in show_smap where it belongs. Signed-off-by: Matt Mackall Cc: Jeremy Fitzhardinge Acked-by: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 791b2d400be5..abc44f39f1d6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -145,7 +145,7 @@ struct mem_size_stats u64 pss; }; -static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) +static int show_map(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; @@ -205,35 +205,11 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats } seq_putc(m, '\n'); - if (mss) - seq_printf(m, - "Size: %8lu kB\n" - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - mss->resident >> 10, - (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10); - if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; return 0; } -static int show_map(struct seq_file *m, void *v) -{ - return show_map_internal(m, v, NULL); -} - static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, void *private) { @@ -312,13 +288,37 @@ static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss; + int ret; memset(&mss, 0, sizeof mss); mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, &smaps_walk, &mss); - return show_map_internal(m, v, &mss); + + ret = show_map(m, v); + if (ret) + return ret; + + seq_printf(m, + "Size: %8lu kB\n" + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, + mss.shared_dirty >> 10, + mss.private_clean >> 10, + mss.private_dirty >> 10, + mss.referenced >> 10); + + return ret; } static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; -- cgit v1.2.3 From f248dcb34d7b7ac255db70071a20be9d9c6ad491 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:03 -0800 Subject: maps4: move clear_refs code to task_mmu.c This puts all the clear_refs code where it belongs and probably lets things compile on MMU-less systems as well. Signed-off-by: Matt Mackall Cc: Jeremy Fitzhardinge Cc: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 44 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 8 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index abc44f39f1d6..fcdbd233f252 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -323,19 +323,47 @@ static int show_smap(struct seq_file *m, void *v) static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; -void clear_refs_smap(struct mm_struct *mm) +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; + struct mm_struct *mm; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, - &clear_refs_walk, vma); - flush_tlb_mm(mm); - up_read(&mm->mmap_sem); + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + if (!simple_strtol(buffer, &end, 0)) + return -EINVAL; + if (*end == '\n') + end++; + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (!is_vm_hugetlb_page(vma)) + walk_page_range(mm, vma->vm_start, vma->vm_end, + &clear_refs_walk, vma); + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); + mmput(mm); + } + put_task_struct(task); + if (end - buffer == 0) + return -EIO; + return end - buffer; } +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, +}; + static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; -- cgit v1.2.3 From a6198797cc3fd659b2d81cdf6bb6b9bba9cd93e9 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:03 -0800 Subject: maps4: regroup task_mmu by interface Reorder source so that all the code and data for each interface is together. Signed-off-by: Matt Mackall Cc: Jeremy Fitzhardinge Cc: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 417 +++++++++++++++++++++++++++-------------------------- 1 file changed, 210 insertions(+), 207 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fcdbd233f252..308fc5451e43 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -114,36 +114,122 @@ static void pad_len_spaces(struct seq_file *m, int len) seq_printf(m, "%*c", len, ' '); } -/* - * Proportional Set Size(PSS): my share of RSS. - * - * PSS of a process is the count of pages it has in memory, where each - * page is divided by the number of processes sharing it. So if a - * process has 1000 pages all to itself, and 1000 shared with one other - * process, its PSS will be 1500. - * - * To keep (accumulated) division errors low, we adopt a 64bit - * fixed-point pss counter to minimize division errors. So (pss >> - * PSS_SHIFT) would be the real byte count. - * - * A shift of 12 before division means (assuming 4K page size): - * - 1M 3-user-pages add up to 8KB errors; - * - supports mapcount up to 2^24, or 16M; - * - supports PSS up to 2^52 bytes, or 4PB. - */ -#define PSS_SHIFT 12 +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma && vma != priv->tail_vma) { + struct mm_struct *mm = vma->vm_mm; + up_read(&mm->mmap_sem); + mmput(mm); + } +} -struct mem_size_stats +static void *m_start(struct seq_file *m, loff_t *pos) { - struct vm_area_struct *vma; - unsigned long resident; - unsigned long shared_clean; - unsigned long shared_dirty; - unsigned long private_clean; - unsigned long private_dirty; - unsigned long referenced; - u64 pss; -}; + struct proc_maps_private *priv = m->private; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *vma, *tail_vma = NULL; + loff_t l = *pos; + + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + + /* + * We remember last_addr rather than next_addr to hit with + * mmap_cache most of the time. We have zero last_addr at + * the beginning and also after lseek. We will have -1 last_addr + * after the end of the vmas. + */ + + if (last_addr == -1UL) + return NULL; + + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = mm_for_maps(priv->task); + if (!mm) + return NULL; + + tail_vma = get_gate_vma(priv->task); + priv->tail_vma = tail_vma; + + /* Start with last addr hint */ + vma = find_vma(mm, last_addr); + if (last_addr && vma) { + vma = vma->vm_next; + goto out; + } + + /* + * Check the vma index is within the range and do + * sequential scan until m_index. + */ + vma = NULL; + if ((unsigned long)l < mm->map_count) { + vma = mm->mmap; + while (l-- && vma) + vma = vma->vm_next; + goto out; + } + + if (l != mm->map_count) + tail_vma = NULL; /* After gate vma */ + +out: + if (vma) + return vma; + + /* End of vmas has been reached */ + m->version = (tail_vma != NULL)? 0: -1UL; + up_read(&mm->mmap_sem); + mmput(mm); + return tail_vma; +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct vm_area_struct *tail_vma = priv->tail_vma; + + (*pos)++; + if (vma && (vma != tail_vma) && vma->vm_next) + return vma->vm_next; + vma_stop(priv, vma); + return (vma != tail_vma)? tail_vma: NULL; +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} static int show_map(struct seq_file *m, void *v) { @@ -210,6 +296,56 @@ static int show_map(struct seq_file *m, void *v) return 0; } +static struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +const struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + +struct mem_size_stats +{ + struct vm_area_struct *vma; + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + u64 pss; +}; + static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, void *private) { @@ -255,33 +391,6 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } -static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, void *private) -{ - struct vm_area_struct *vma = private; - pte_t *pte, ptent; - spinlock_t *ptl; - struct page *page; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - if (!pte_present(ptent)) - continue; - - page = vm_normal_page(vma, addr, ptent); - if (!page) - continue; - - /* Clear accessed and referenced bits. */ - ptep_test_and_clear_young(vma, addr, pte); - ClearPageReferenced(page); - } - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - return 0; -} - static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; static int show_smap(struct seq_file *m, void *v) @@ -321,6 +430,52 @@ static int show_smap(struct seq_file *m, void *v) return ret; } +static struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_smap +}; + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +const struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, void *private) +{ + struct vm_area_struct *vma = private; + pte_t *pte, ptent; + spinlock_t *ptl; + struct page *page; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* Clear accessed and referenced bits. */ + ptep_test_and_clear_young(vma, addr, pte); + ClearPageReferenced(page); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, @@ -364,147 +519,6 @@ const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, }; -static void *m_start(struct seq_file *m, loff_t *pos) -{ - struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; - struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma = NULL; - loff_t l = *pos; - - /* Clear the per syscall fields in priv */ - priv->task = NULL; - priv->tail_vma = NULL; - - /* - * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at - * the beginning and also after lseek. We will have -1 last_addr - * after the end of the vmas. - */ - - if (last_addr == -1UL) - return NULL; - - priv->task = get_pid_task(priv->pid, PIDTYPE_PID); - if (!priv->task) - return NULL; - - mm = mm_for_maps(priv->task); - if (!mm) - return NULL; - - priv->tail_vma = tail_vma = get_gate_vma(priv->task); - - /* Start with last addr hint */ - if (last_addr && (vma = find_vma(mm, last_addr))) { - vma = vma->vm_next; - goto out; - } - - /* - * Check the vma index is within the range and do - * sequential scan until m_index. - */ - vma = NULL; - if ((unsigned long)l < mm->map_count) { - vma = mm->mmap; - while (l-- && vma) - vma = vma->vm_next; - goto out; - } - - if (l != mm->map_count) - tail_vma = NULL; /* After gate vma */ - -out: - if (vma) - return vma; - - /* End of vmas has been reached */ - m->version = (tail_vma != NULL)? 0: -1UL; - up_read(&mm->mmap_sem); - mmput(mm); - return tail_vma; -} - -static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) -{ - if (vma && vma != priv->tail_vma) { - struct mm_struct *mm = vma->vm_mm; - up_read(&mm->mmap_sem); - mmput(mm); - } -} - -static void *m_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = priv->tail_vma; - - (*pos)++; - if (vma && (vma != tail_vma) && vma->vm_next) - return vma->vm_next; - vma_stop(priv, vma); - return (vma != tail_vma)? tail_vma: NULL; -} - -static void m_stop(struct seq_file *m, void *v) -{ - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - - vma_stop(priv, vma); - if (priv->task) - put_task_struct(priv->task); -} - -static struct seq_operations proc_pid_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_map -}; - -static struct seq_operations proc_pid_smaps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_smap -}; - -static int do_maps_open(struct inode *inode, struct file *file, - struct seq_operations *ops) -{ - struct proc_maps_private *priv; - int ret = -ENOMEM; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } - } - return ret; -} - -static int maps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_pid_maps_op); -} - -const struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); @@ -539,14 +553,3 @@ const struct file_operations proc_numa_maps_operations = { }; #endif -static int smaps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_pid_smaps_op); -} - -const struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; -- cgit v1.2.3 From 85863e475e59afb027b0113290e3796ee6020b7d Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:04 -0800 Subject: maps4: add /proc/pid/pagemap interface This interface provides a mapping for each page in an address space to its physical page frame number, allowing precise determination of what pages are mapped and what pages are shared between processes. New in this version: - headers gone again (as recommended by Dave Hansen and Alan Cox) - 64-bit entries (as per discussion with Andi Kleen) - swap pte information exported (from Dave Hansen) - page walker callback for holes (from Dave Hansen) - direct put_user I/O (as suggested by Rusty Russell) This patch folds in cleanups and swap PTE support from Dave Hansen . Signed-off-by: Matt Mackall Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 199 insertions(+), 1 deletion(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 308fc5451e43..bbd9b145051d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -5,7 +5,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -519,6 +522,202 @@ const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, }; +struct pagemapread { + char __user *out, *end; +}; + +#define PM_ENTRY_BYTES sizeof(u64) +#define PM_RESERVED_BITS 3 +#define PM_RESERVED_OFFSET (64 - PM_RESERVED_BITS) +#define PM_RESERVED_MASK (((1LL<out + PM_ENTRY_BYTES >= pm->end) { + if (copy_to_user(pm->out, &pfn, pm->end - pm->out)) + return -EFAULT; + pm->out = pm->end; + return PM_END_OF_BUFFER; + } + + if (put_user(pfn, pm->out)) + return -EFAULT; + pm->out += PM_ENTRY_BYTES; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + void *private) +{ + struct pagemapread *pm = private; + unsigned long addr; + int err = 0; + for (addr = start; addr < end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + if (err) + break; + } + return err; +} + +u64 swap_pte_to_pagemap_entry(pte_t pte) +{ + swp_entry_t e = pte_to_swp_entry(pte); + return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); +} + +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) +{ + struct pagemapread *pm = private; + pte_t *pte; + int err = 0; + + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + pte = pte_offset_map(pmd, addr); + if (is_swap_pte(*pte)) + pfn = swap_pte_to_pagemap_entry(*pte); + else if (pte_present(*pte)) + pfn = pte_pfn(*pte); + /* unmap so we're not in atomic when we copy to userspace */ + pte_unmap(pte); + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + +static struct mm_walk pagemap_walk = { + .pmd_entry = pagemap_pte_range, + .pte_hole = pagemap_pte_hole +}; + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit + * entry representing the corresponding physical page frame number + * (PFN) if the page is present. If there is a swap entry for the + * physical page, then an encoding of the swap file number and the + * page's offset into the swap file are returned. If no page is + * present at all, PM_NOT_PRESENT is returned. This allows determining + * precisely which pages are mapped (or in swap) and comparing mapped + * pages between processes. + * + * Efficient users of this interface will use /proc/pid/maps to + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. + */ +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct page **pages, *page; + unsigned long uaddr, uend; + struct mm_struct *mm; + struct pagemapread pm; + int pagecount; + int ret = -ESRCH; + + if (!task) + goto out; + + ret = -EACCES; + if (!ptrace_may_attach(task)) + goto out; + + ret = -EINVAL; + /* file position must be aligned */ + if (*ppos % PM_ENTRY_BYTES) + goto out; + + ret = 0; + mm = get_task_mm(task); + if (!mm) + goto out; + + ret = -ENOMEM; + uaddr = (unsigned long)buf & PAGE_MASK; + uend = (unsigned long)(buf + count); + pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; + pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_task; + + down_read(¤t->mm->mmap_sem); + ret = get_user_pages(current, current->mm, uaddr, pagecount, + 1, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); + + if (ret < 0) + goto out_free; + + pm.out = buf; + pm.end = buf + count; + + if (!ptrace_may_attach(task)) { + ret = -EIO; + } else { + unsigned long src = *ppos; + unsigned long svpfn = src / PM_ENTRY_BYTES; + unsigned long start_vaddr = svpfn << PAGE_SHIFT; + unsigned long end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer. + */ + ret = walk_page_range(mm, start_vaddr, end_vaddr, + &pagemap_walk, &pm); + if (ret == PM_END_OF_BUFFER) + ret = 0; + /* don't need mmap_sem for these, but this looks cleaner */ + *ppos += pm.out - buf; + if (!ret) + ret = pm.out - buf; + } + + for (; pagecount; pagecount--) { + page = pages[pagecount-1]; + if (!PageReserved(page)) + SetPageDirty(page); + page_cache_release(page); + } + mmput(mm); +out_free: + kfree(pages); +out_task: + put_task_struct(task); +out: + return ret; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); @@ -552,4 +751,3 @@ const struct file_operations proc_numa_maps_operations = { .release = seq_release_private, }; #endif - -- cgit v1.2.3 From 1e88328111aae3ea408f346763ba9f9bad71f876 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:07 -0800 Subject: maps4: make page monitoring /proc file optional Make /proc/ page monitoring configurable This puts the following files under an embedded config option: /proc/pid/clear_refs /proc/pid/smaps /proc/pid/pagemap /proc/kpagecount /proc/kpageflags [akpm@linux-foundation.org: Kconfig fix] Signed-off-by: Matt Mackall Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bbd9b145051d..38338ed98cc6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -337,6 +337,7 @@ const struct file_operations proc_maps_operations = { */ #define PSS_SHIFT 12 +#ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { struct vm_area_struct *vma; @@ -717,6 +718,7 @@ const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, }; +#endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -- cgit v1.2.3