From 49137f2efb5cf68724bccaba531ab3d59acd71f9 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Mar 2010 21:46:15 +0100 Subject: Open segment file before using it logfs_recover_sb() needs it open. Signed-off-by: Joern Engel --- fs/logfs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/logfs/super.c b/fs/logfs/super.c index c66beab78dee..018728120bb3 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -289,6 +289,10 @@ static int logfs_make_writeable(struct super_block *sb) { int err; + err = logfs_open_segfile(sb); + if (err) + return err; + /* Repair any broken superblock copies */ err = logfs_recover_sb(sb); if (err) @@ -299,10 +303,6 @@ static int logfs_make_writeable(struct super_block *sb) if (err) return err; - err = logfs_open_segfile(sb); - if (err) - return err; - /* Do one GC pass before any data gets dirtied */ logfs_gc_pass(sb); -- cgit v1.2.3 From 59fe27c0a8173a74b105debc803b97582028c90b Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Wed, 17 Mar 2010 13:47:45 +0100 Subject: Limit max_pages for insane devices Intel SSDs have a limit of 0xffff as queue_max_hw_sectors(q). Such a limit may make sense from a hardware pov, but it causes bio_alloc() to return NULL. Signed-off-by: Joern Engel --- fs/logfs/dev_bdev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 9718c22f186d..f99f5dcce546 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -97,8 +97,10 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); int i; + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); /* FIXME: handle this */ + BUG_ON(!bio); for (i = 0; i < nr_pages; i++) { if (i >= max_pages) { @@ -191,8 +193,10 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); int i; + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); /* FIXME: handle this */ + BUG_ON(!bio); for (i = 0; i < nr_pages; i++) { if (i >= max_pages) { -- cgit v1.2.3 From e07bf553f37cd4fa470b499ff34d800956df2d48 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Wed, 17 Mar 2010 15:29:38 +0100 Subject: Plug memory leak in writeseg_end_io Signed-off-by: Joern Engel --- fs/logfs/dev_bdev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index f99f5dcce546..a5d0c56d3ebc 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -80,6 +80,7 @@ static void writeseg_end_io(struct bio *bio, int err) prefetchw(&bvec->bv_page->flags); end_page_writeback(page); + page_cache_release(page); } while (bvec >= bio->bi_io_vec); bio_put(bio); if (atomic_dec_and_test(&super->s_pending_writes)) -- cgit v1.2.3 From e326068806ee044cc617b1dc24be1293fca3fbf6 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Wed, 17 Mar 2010 16:00:07 +0100 Subject: Prevent schedule while atomic in __logfs_readdir Apparently filldir can sleep, which forbids kmap_atomic. Signed-off-by: Joern Engel --- fs/logfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 56a8bfbb0120..c76b4b5c7ff6 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -303,12 +303,12 @@ static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir) (filler_t *)logfs_readpage, NULL); if (IS_ERR(page)) return PTR_ERR(page); - dd = kmap_atomic(page, KM_USER0); + dd = kmap(page); BUG_ON(dd->namelen == 0); full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen), pos, be64_to_cpu(dd->ino), dd->type); - kunmap_atomic(dd, KM_USER0); + kunmap(page); page_cache_release(page); if (full) break; -- cgit v1.2.3 From faaa27ab919799929732c356a92a160f8657ecc6 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 26 Mar 2010 10:18:36 +0100 Subject: Write out both superblocks on mismatch If the first superblock is wrong and the second gets written, there will still be a mismatch on next mount. Write both to make sure they match. Signed-off-by: Joern Engel --- fs/logfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 018728120bb3..006670fe9e8b 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -277,7 +277,7 @@ static int logfs_recover_sb(struct super_block *sb) } if (valid0 && valid1 && ds_cmp(ds0, ds1)) { printk(KERN_INFO"Superblocks don't match - fixing.\n"); - return write_one_sb(sb, super->s_devops->find_last_sb); + return logfs_write_sb(sb); } /* If neither is valid now, something's wrong. Didn't we properly * check them before?!? */ -- cgit v1.2.3 From 7db8064c17b92e95aec2e333096c035db9ddd4fe Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 26 Mar 2010 14:45:55 +0100 Subject: Fix logfs_get_sb_final error path rootdir was already allocated, so we must iput it again. Found by Al Viro. Signed-off-by: Joern Engel --- fs/logfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 006670fe9e8b..2845c41d70d4 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -328,7 +328,7 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) sb->s_root = d_alloc_root(rootdir); if (!sb->s_root) - goto fail; + goto fail2; super->s_erase_page = alloc_pages(GFP_KERNEL, 0); if (!super->s_erase_page) -- cgit v1.2.3 From 6f2e9e6a950a165a7d2c399ab7557e6745ef2bfd Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 26 Mar 2010 14:50:08 +0100 Subject: Use deactivate_locked_super Found by Al Viro. Signed-off-by: Joern Engel --- fs/logfs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 2845c41d70d4..9d856c49afc5 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -572,8 +572,7 @@ int logfs_get_sb_device(struct file_system_type *type, int flags, return 0; err1: - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); return err; err0: kfree(super); -- cgit v1.2.3 From 193219172691e29813821dc8433317768c6ed1a3 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Sat, 27 Mar 2010 09:56:58 +0100 Subject: Prevent data corruption in logfs_rewrite_block() The comment was correct, so make the code match the comment. As the new comment indicates, we might be able to do a little less work. But for the current -rc series let's keep it simple and just fix the bug. Signed-off-by: Joern Engel --- fs/logfs/readwrite.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 7a23b3e7c0a7..c3a3a6814b84 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1594,7 +1594,6 @@ int logfs_delete(struct inode *inode, pgoff_t index, return ret; } -/* Rewrite cannot mark the inode dirty but has to write it immediatly. */ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, gc_level_t gc_level, long flags) { @@ -1611,6 +1610,18 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, if (level != 0) alloc_indirect_block(inode, page, 0); err = logfs_write_buf(inode, page, flags); + if (!err && shrink_level(gc_level) == 0) { + /* Rewrite cannot mark the inode dirty but has to + * write it immediatly. + * Q: Can't we just create an alias for the inode + * instead? And if not, why not? + */ + if (inode->i_ino == LOGFS_INO_MASTER) + logfs_write_anchor(inode->i_sb); + else { + err = __logfs_write_inode(inode, flags); + } + } } logfs_put_write_page(page); return err; -- cgit v1.2.3 From 81def6b9862764924a99ac1b680e73ac8c80ac64 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Sun, 28 Mar 2010 12:47:09 +0200 Subject: Simplify and fix pad_wbuf A comment in the old code read: /* The math in this function can surely use some love */ And indeed it did. In the case that area->a_used_bytes is exactly 4096 bytes below segment size it fell apart. pad_wbuf is now split into two helpers that are significantly less complicated. Signed-off-by: Joern Engel --- fs/logfs/segment.c | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 1a14f9910d55..ff9d7f3b4d36 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -93,49 +93,57 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, } while (len); } -/* - * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. - */ -static void pad_wbuf(struct logfs_area *area, int final) +static void pad_partial_page(struct logfs_area *area) { struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); struct page *page; u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); pgoff_t index = ofs >> PAGE_SHIFT; long offset = ofs & (PAGE_SIZE-1); u32 len = PAGE_SIZE - offset; - if (len == PAGE_SIZE) { - /* The math in this function can surely use some love */ - len = 0; - } - if (len) { - BUG_ON(area->a_used_bytes >= super->s_segsize); - - page = get_mapping_page(area->a_sb, index, 0); + if (len % PAGE_SIZE) { + page = get_mapping_page(sb, index, 0); BUG_ON(!page); /* FIXME: reserve a pool */ memset(page_address(page) + offset, 0xff, len); SetPagePrivate(page); page_cache_release(page); } +} - if (!final) - return; +static void pad_full_pages(struct logfs_area *area) +{ + struct super_block *sb = area->a_sb; + struct logfs_super *super = logfs_super(sb); + u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); + u32 len = super->s_segsize - area->a_used_bytes; + pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT; + pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT; + struct page *page; - area->a_used_bytes += len; - for ( ; area->a_used_bytes < super->s_segsize; - area->a_used_bytes += PAGE_SIZE) { - /* Memset another page */ - index++; - page = get_mapping_page(area->a_sb, index, 0); + while (no_indizes) { + page = get_mapping_page(sb, index, 0); BUG_ON(!page); /* FIXME: reserve a pool */ - memset(page_address(page), 0xff, PAGE_SIZE); + SetPageUptodate(page); + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); SetPagePrivate(page); page_cache_release(page); + index++; + no_indizes--; } } +/* + * bdev_writeseg will write full pages. Memset the tail to prevent data leaks. + * Also make sure we allocate (and memset) all pages for final writeout. + */ +static void pad_wbuf(struct logfs_area *area, int final) +{ + pad_partial_page(area); + if (final) + pad_full_pages(area); +} + /* * We have to be careful with the alias tree. Since lookup is done by bix, * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with -- cgit v1.2.3 From 723b2ff40876678b49e61df34fb1d8001e34639d Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Sun, 28 Mar 2010 18:10:07 +0200 Subject: [LogFS] Clear PagePrivate when moving journal do_logfs_journal_wl_pass() must call freeseg(), thereby clear PagePrivate on all pages of the current journal segment. Signed-off-by: Joern Engel --- fs/logfs/journal.c | 1 + fs/logfs/logfs.h | 1 + fs/logfs/segment.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 6ad30a4c9052..15454ac7bd93 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -821,6 +821,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb) logfs_set_segment_reserved(sb, segno); } /* Manually move journal_area */ + freeseg(sb, area->a_segno); area->a_segno = super->s_journal_seg[0]; area->a_is_open = 0; area->a_used_bytes = 0; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 129779431373..b84b0eec6024 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -587,6 +587,7 @@ void move_page_to_btree(struct page *page); int logfs_init_mapping(struct super_block *sb); void logfs_sync_area(struct logfs_area *area); void logfs_sync_segments(struct super_block *sb); +void freeseg(struct super_block *sb, u32 segno); /* area handling */ int logfs_init_areas(struct super_block *sb); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index ff9d7f3b4d36..0ecd8f07c11e 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -691,7 +691,7 @@ int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) return 0; } -static void freeseg(struct super_block *sb, u32 segno) +void freeseg(struct super_block *sb, u32 segno) { struct logfs_super *super = logfs_super(sb); struct address_space *mapping = super->s_mapping_inode->i_mapping; -- cgit v1.2.3 From 0943846ae05603efd98550f2d475e9c98191bde8 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Mon, 29 Mar 2010 21:13:28 +0200 Subject: [LogFS] Move reserved segments with journal Fixes a GC livelock. Signed-off-by: Joern Engel --- fs/logfs/journal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 15454ac7bd93..25b1345c4652 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -800,6 +800,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb) { struct logfs_super *super = logfs_super(sb); struct logfs_area *area = super->s_journal_area; + struct btree_head32 *head = &super->s_reserved_segments; u32 segno, ec; int i, err; @@ -807,6 +808,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb) /* Drop old segments */ journal_for_each(i) if (super->s_journal_seg[i]) { + btree_remove32(head, super->s_journal_seg[i]); logfs_set_segment_unreserved(sb, super->s_journal_seg[i], super->s_journal_ec[i]); @@ -819,6 +821,8 @@ void do_logfs_journal_wl_pass(struct super_block *sb) super->s_journal_seg[i] = segno; super->s_journal_ec[i] = ec; logfs_set_segment_reserved(sb, segno); + err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); + BUG_ON(err); /* mempool should prevent this */ } /* Manually move journal_area */ freeseg(sb, area->a_segno); -- cgit v1.2.3 From 6be7fa06eb4d721df734bd0946b5e63b27c0589b Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Mon, 29 Mar 2010 21:14:52 +0200 Subject: [LogFS] Erase new journal segments If the device contains on old logfs image and the journal is moved to segment that have never been used by the current logfs and not all journal segments are erased before the next mount, the old content can confuse mount code. To prevent this, always erase the new journal segments. Signed-off-by: Joern Engel --- fs/logfs/journal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 25b1345c4652..d57c7b07b60b 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -823,6 +823,8 @@ void do_logfs_journal_wl_pass(struct super_block *sb) logfs_set_segment_reserved(sb, segno); err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); BUG_ON(err); /* mempool should prevent this */ + err = logfs_erase_segment(sb, segno, 1); + BUG_ON(err); /* FIXME: remount-ro would be nicer */ } /* Manually move journal_area */ freeseg(sb, area->a_segno); -- cgit v1.2.3 From b7b7fa43103a9fb30dbcc60cbd5161fdfc25f904 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 29 Mar 2010 15:12:39 -0400 Subject: reiserfs: Fix locking BUG during mount failure Commit 8ebc423238341b52912c7295b045a32477b33f09 (reiserfs: kill-the-BKL) introduced a bug in the mount failure case. The error label releases the lock before calling journal_release_error, but it requires that the lock be held. do_journal_release unlocks and retakes it. When it releases it without it held, we trigger a BUG(). The error_alloc label skips the unlock since the lock isn't held yet but none of the other conditions that are clean up exist yet either. This patch returns immediately after the kzalloc failure and moves the reiserfs_write_unlock after the journal_release_error call. This was reported in https://bugzilla.novell.com/show_bug.cgi?id=591807 Reported-by: Thomas Siedentopf Signed-off-by: Jeff Mahoney Cc: Thomas Siedentopf Cc: Andrew Morton Cc: 2.6.33.x Signed-off-by: Frederic Weisbecker --- fs/reiserfs/super.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 04bf5d791bda..ab190511bc18 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1618,10 +1618,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) save_mount_options(s, data); sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); - if (!sbi) { - errval = -ENOMEM; - goto error_alloc; - } + if (!sbi) + return -ENOMEM; s->s_fs_info = sbi; /* Set default values for options: non-aggressive tails, RO on errors */ REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); @@ -1878,12 +1876,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); error: - reiserfs_write_unlock(s); -error_alloc: if (jinit_done) { /* kill the commit thread, free journal ram */ journal_release_error(NULL, s); } + reiserfs_write_unlock(s); + reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) brelse(SB_BUFFER_WITH_SB(s)); -- cgit v1.2.3 From 30d1872d9eb3663b4cf7bdebcbf5cd465674cced Mon Sep 17 00:00:00 2001 From: Nikolaus Schulz Date: Thu, 1 Apr 2010 02:21:10 +0900 Subject: fat: fix buffer overflow in vfat_create_shortname() When using the string representation of a random counter as part of the base name, ensure that it is no longer than 4 bytes. Since we are repeatedly decrementing the counter in a loop until we have found a unique base name, the counter may wrap around zero; therefore, it is not enough to mask its higher bits before entering the loop, this must be done inside the loop. [hirofumi@mail.parknet.co.jp: use snprintf()] Signed-off-by: Nikolaus Schulz Cc: stable@kernel.org Signed-off-by: OGAWA Hirofumi Signed-off-by: Linus Torvalds --- fs/fat/namei_vfat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index c1ef50154868..6fcc7e71fbaa 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -309,7 +309,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls, { struct fat_mount_options *opts = &MSDOS_SB(dir->i_sb)->options; wchar_t *ip, *ext_start, *end, *name_start; - unsigned char base[9], ext[4], buf[8], *p; + unsigned char base[9], ext[4], buf[5], *p; unsigned char charbuf[NLS_MAX_CHARSET_SIZE]; int chl, chi; int sz = 0, extlen, baselen, i, numtail_baselen, numtail2_baselen; @@ -467,7 +467,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls, return 0; } - i = jiffies & 0xffff; + i = jiffies; sz = (jiffies >> 16) & 0x7; if (baselen > 2) { baselen = numtail2_baselen; @@ -476,7 +476,7 @@ static int vfat_create_shortname(struct inode *dir, struct nls_table *nls, name_res[baselen + 4] = '~'; name_res[baselen + 5] = '1' + sz; while (1) { - sprintf(buf, "%04X", i); + snprintf(buf, sizeof(buf), "%04X", i & 0xffff); memcpy(&name_res[baselen], buf, 4); if (vfat_find_form(dir, name_res) < 0) break; -- cgit v1.2.3 From b95c35e76b29ba812e5dabdd91592e25ec640e93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Apr 2010 15:13:57 +0200 Subject: oom: fix the unsafe usage of badness() in proc_oom_score() proc_oom_score(task) has a reference to task_struct, but that is all. If this task was already released before we take tasklist_lock - we can't use task->group_leader, it points to nowhere - it is not safe to call badness() even if this task is ->group_leader, has_intersects_mems_allowed() assumes it is safe to iterate over ->thread_group list. - even worse, badness() can hit ->signal == NULL Add the pid_alive() check to ensure __unhash_process() was not called. Also, use "task" instead of task->group_leader. badness() should return the same result for any sub-thread. Currently this is not true, but this should be changed anyway. Signed-off-by: Oleg Nesterov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index a7310841c831..b1f6e62773d3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -442,12 +442,13 @@ static const struct file_operations proc_lstats_operations = { unsigned long badness(struct task_struct *p, unsigned long uptime); static int proc_oom_score(struct task_struct *task, char *buffer) { - unsigned long points; + unsigned long points = 0; struct timespec uptime; do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); - points = badness(task->group_leader, uptime.tv_sec); + if (pid_alive(task)) + points = badness(task, uptime.tv_sec); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } -- cgit v1.2.3 From d82ef020cf31504c816803b1def94eb5ff173363 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 2 Apr 2010 09:11:29 +0900 Subject: proc: pagemap: Hold mmap_sem during page walk In initial design, walk_page_range() was designed just for walking page table and it didn't require mmap_sem. Now, find_vma() etc.. are used in walk_page_range() and we need mmap_sem around it. This patch adds mmap_sem around walk_page_range(). Because /proc//pagemap's callback routine use put_user(), we have to get rid of it to do sane fix. Changelog: 2010/Apr/2 - fixed start_vaddr and end overflow Changelog: 2010/Apr/1 - fixed start_vaddr calculation - removed unnecessary cast. - removed unnecessary change in smaps. - use GFP_TEMPORARY instead of GFP_KERNEL Signed-off-by: KAMEZAWA Hiroyuki Cc: Matt Mackall Cc: KOSAKI Motohiro Cc: San Mehat Cc: Brian Swetland Cc: Dave Hansen Cc: Andrew Morton [ Fixed kmalloc failure return code as per Matt ] Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 87 ++++++++++++++++++++++++------------------------------ 1 file changed, 38 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 183f8ff5f400..096273984c3b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -406,6 +406,7 @@ static int show_smap(struct seq_file *m, void *v) memset(&mss, 0, sizeof mss); mss.vma = vma; + /* mmap_sem is held in m_start */ if (vma->vm_mm && !is_vm_hugetlb_page(vma)) walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); @@ -552,7 +553,8 @@ const struct file_operations proc_clear_refs_operations = { }; struct pagemapread { - u64 __user *out, *end; + int pos, len; + u64 *buffer; }; #define PM_ENTRY_BYTES sizeof(u64) @@ -575,10 +577,8 @@ struct pagemapread { static int add_to_pagemap(unsigned long addr, u64 pfn, struct pagemapread *pm) { - if (put_user(pfn, pm->out)) - return -EFAULT; - pm->out++; - if (pm->out >= pm->end) + pm->buffer[pm->pos++] = pfn; + if (pm->pos >= pm->len) return PM_END_OF_BUFFER; return 0; } @@ -720,21 +720,20 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, * determine which areas of memory are actually mapped and llseek to * skip over unmapped regions. */ +#define PAGEMAP_WALK_SIZE (PMD_SIZE) static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); - struct page **pages, *page; - unsigned long uaddr, uend; struct mm_struct *mm; struct pagemapread pm; - int pagecount; int ret = -ESRCH; struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; unsigned long end_vaddr; + int copied = 0; if (!task) goto out; @@ -757,35 +756,12 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!mm) goto out_task; - - uaddr = (unsigned long)buf & PAGE_MASK; - uend = (unsigned long)(buf + count); - pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; - ret = 0; - if (pagecount == 0) - goto out_mm; - pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); + pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); + pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); ret = -ENOMEM; - if (!pages) + if (!pm.buffer) goto out_mm; - down_read(¤t->mm->mmap_sem); - ret = get_user_pages(current, current->mm, uaddr, pagecount, - 1, 0, pages, NULL); - up_read(¤t->mm->mmap_sem); - - if (ret < 0) - goto out_free; - - if (ret != pagecount) { - pagecount = ret; - ret = -EFAULT; - goto out_pages; - } - - pm.out = (u64 __user *)buf; - pm.end = (u64 __user *)(buf + count); - pagemap_walk.pmd_entry = pagemap_pte_range; pagemap_walk.pte_hole = pagemap_pte_hole; pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; @@ -807,23 +783,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, * user buffer is tracked in "pm", and the walk * will stop when we hit the end of the buffer. */ - ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk); - if (ret == PM_END_OF_BUFFER) - ret = 0; - /* don't need mmap_sem for these, but this looks cleaner */ - *ppos += (char __user *)pm.out - buf; - if (!ret) - ret = (char __user *)pm.out - buf; - -out_pages: - for (; pagecount; pagecount--) { - page = pages[pagecount-1]; - if (!PageReserved(page)) - SetPageDirty(page); - page_cache_release(page); + ret = 0; + while (count && (start_vaddr < end_vaddr)) { + int len; + unsigned long end; + + pm.pos = 0; + end = start_vaddr + PAGEMAP_WALK_SIZE; + /* overflow ? */ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; + down_read(&mm->mmap_sem); + ret = walk_page_range(start_vaddr, end, &pagemap_walk); + up_read(&mm->mmap_sem); + start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); + if (copy_to_user(buf, pm.buffer, len) < 0) { + ret = -EFAULT; + goto out_free; + } + copied += len; + buf += len; + count -= len; } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) + ret = copied; + out_free: - kfree(pages); + kfree(pm.buffer); out_mm: mmput(mm); out_task: -- cgit v1.2.3