summaryrefslogtreecommitdiffstats
Commit message (Collapse)AuthorAgeFilesLines
* vfs: change inode times to use struct timespec64Deepa Dinamani2018-06-0579-346/+448
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | struct timespec is not y2038 safe. Transition vfs to use y2038 safe struct timespec64 instead. The change was made with the help of the following cocinelle script. This catches about 80% of the changes. All the header file and logic changes are included in the first 5 rules. The rest are trivial substitutions. I avoid changing any of the function signatures or any other filesystem specific data structures to keep the patch simple for review. The script can be a little shorter by combining different cases. But, this version was sufficient for my usecase. virtual patch @ depends on patch @ identifier now; @@ - struct timespec + struct timespec64 current_time ( ... ) { - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); ... - return timespec_trunc( + return timespec64_trunc( ... ); } @ depends on patch @ identifier xtime; @@ struct \( iattr \| inode \| kstat \) { ... - struct timespec xtime; + struct timespec64 xtime; ... } @ depends on patch @ identifier t; @@ struct inode_operations { ... int (*update_time) (..., - struct timespec t, + struct timespec64 t, ...); ... } @ depends on patch @ identifier t; identifier fn_update_time =~ "update_time$"; @@ fn_update_time (..., - struct timespec *t, + struct timespec64 *t, ...) { ... } @ depends on patch @ identifier t; @@ lease_get_mtime( ... , - struct timespec *t + struct timespec64 *t ) { ... } @te depends on patch forall@ identifier ts; local idexpression struct inode *inode_node; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn_update_time =~ "update_time$"; identifier fn; expression e, E3; local idexpression struct inode *node1; local idexpression struct inode *node2; local idexpression struct iattr *attr1; local idexpression struct iattr *attr2; local idexpression struct iattr attr; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; @@ ( ( - struct timespec ts; + struct timespec64 ts; | - struct timespec ts = current_time(inode_node); + struct timespec64 ts = current_time(inode_node); ) <+... when != ts ( - timespec_equal(&inode_node->i_xtime, &ts) + timespec64_equal(&inode_node->i_xtime, &ts) | - timespec_equal(&ts, &inode_node->i_xtime) + timespec64_equal(&ts, &inode_node->i_xtime) | - timespec_compare(&inode_node->i_xtime, &ts) + timespec64_compare(&inode_node->i_xtime, &ts) | - timespec_compare(&ts, &inode_node->i_xtime) + timespec64_compare(&ts, &inode_node->i_xtime) | ts = current_time(e) | fn_update_time(..., &ts,...) | inode_node->i_xtime = ts | node1->i_xtime = ts | ts = inode_node->i_xtime | <+... attr1->ia_xtime ...+> = ts | ts = attr1->ia_xtime | ts.tv_sec | ts.tv_nsec | btrfs_set_stack_timespec_sec(..., ts.tv_sec) | btrfs_set_stack_timespec_nsec(..., ts.tv_nsec) | - ts = timespec64_to_timespec( + ts = ... -) | - ts = ktime_to_timespec( + ts = ktime_to_timespec64( ...) | - ts = E3 + ts = timespec_to_timespec64(E3) | - ktime_get_real_ts(&ts) + ktime_get_real_ts64(&ts) | fn(..., - ts + timespec64_to_timespec(ts) ,...) ) ...+> ( <... when != ts - return ts; + return timespec64_to_timespec(ts); ...> ) | - timespec_equal(&node1->i_xtime1, &node2->i_xtime2) + timespec64_equal(&node1->i_xtime2, &node2->i_xtime2) | - timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2) + timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2) | - timespec_compare(&node1->i_xtime1, &node2->i_xtime2) + timespec64_compare(&node1->i_xtime1, &node2->i_xtime2) | node1->i_xtime1 = - timespec_trunc(attr1->ia_xtime1, + timespec64_trunc(attr1->ia_xtime1, ...) | - attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2, + attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2, ...) | - ktime_get_real_ts(&attr1->ia_xtime1) + ktime_get_real_ts64(&attr1->ia_xtime1) | - ktime_get_real_ts(&attr.ia_xtime1) + ktime_get_real_ts64(&attr.ia_xtime1) ) @ depends on patch @ struct inode *node; struct iattr *attr; identifier fn; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; expression e; @@ ( - fn(node->i_xtime); + fn(timespec64_to_timespec(node->i_xtime)); | fn(..., - node->i_xtime); + timespec64_to_timespec(node->i_xtime)); | - e = fn(attr->ia_xtime); + e = fn(timespec64_to_timespec(attr->ia_xtime)); ) @ depends on patch forall @ struct inode *node; struct iattr *attr; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); fn (..., - &attr->ia_xtime, + &ts, ...); ) ...+> } @ depends on patch forall @ struct inode *node; struct iattr *attr; struct kstat *stat; identifier ia_xtime =~ "^ia_[acm]time$"; identifier i_xtime =~ "^i_[acm]time$"; identifier xtime =~ "^[acm]time$"; identifier fn, ret; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime); + &ts); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime); + &ts); | + ts = timespec64_to_timespec(stat->xtime); ret = fn (..., - &stat->xtime); + &ts); ) ...+> } @ depends on patch @ struct inode *node; struct inode *node2; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier i_xtime3 =~ "^i_[acm]time$"; struct iattr *attrp; struct iattr *attrp2; struct iattr attr ; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; struct kstat *stat; struct kstat stat1; struct timespec64 ts; identifier xtime =~ "^[acmb]time$"; expression e; @@ ( ( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ; | node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \); | node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \); | stat->xtime = node2->i_xtime1; | stat1.xtime = node2->i_xtime1; | ( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ; | ( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2; | - e = node->i_xtime1; + e = timespec64_to_timespec( node->i_xtime1 ); | - e = attrp->ia_xtime1; + e = timespec64_to_timespec( attrp->ia_xtime1 ); | node->i_xtime1 = current_time(...); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | - node->i_xtime1 = e; + node->i_xtime1 = timespec_to_timespec64(e); ) Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> Cc: <anton@tuxera.com> Cc: <balbi@kernel.org> Cc: <bfields@fieldses.org> Cc: <darrick.wong@oracle.com> Cc: <dhowells@redhat.com> Cc: <dsterba@suse.com> Cc: <dwmw2@infradead.org> Cc: <hch@lst.de> Cc: <hirofumi@mail.parknet.co.jp> Cc: <hubcap@omnibond.com> Cc: <jack@suse.com> Cc: <jaegeuk@kernel.org> Cc: <jaharkes@cs.cmu.edu> Cc: <jslaby@suse.com> Cc: <keescook@chromium.org> Cc: <mark@fasheh.com> Cc: <miklos@szeredi.hu> Cc: <nico@linaro.org> Cc: <reiserfs-devel@vger.kernel.org> Cc: <richard@nod.at> Cc: <sage@redhat.com> Cc: <sfrench@samba.org> Cc: <swhiteho@redhat.com> Cc: <tj@kernel.org> Cc: <trond.myklebust@primarydata.com> Cc: <tytso@mit.edu> Cc: <viro@zeniv.linux.org.uk>
* pstore: Convert internal records to timespec64Kees Cook2018-06-055-23/+32
| | | | | | | This prepares pstore for converting the VFS layer to timespec64. Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
* udf: Simplify calls to udf_disk_stamp_to_timeDeepa Dinamani2018-05-254-37/+21
| | | | | | | | | | | | | | | | | | | | | | | Subsequent patches in the series convert inode timestamps to use struct timespec64 instead of struct timespec as part of solving the y2038 problem. commit fd3cfad374d4 ("udf: Convert udf_disk_stamp_to_time() to use mktime64()") eliminated the NULL return condition from udf_disk_stamp_to_time(). udf_time_to_disk_time() is always called with a valid dest pointer and the return value is ignored. Further, caller can as well check the dest pointer being passed in rather than return argument. Make both the functions return void. This will make the inode timestamp conversion simpler. Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> Cc: jack@suse.com ---- Changes from v1: * fixed the pointer error pointed by Jan
* fs: nfs: get rid of memcpys for inode timesDeepa Dinamani2018-05-251-7/+7
| | | | | | | | | | | Subsequent patches in the series convert inode timestamps to use struct timespec64 instead of struct timespec as part of solving the y2038 problem. This will lead to type mismatch for memcpys. Use regular assignments instead. Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> Cc: trond.myklebust@primarydata.com
* ceph: make inode time prints to be long longDeepa Dinamani2018-05-251-21/+21
| | | | | | | | | | | | | Subsequent patches in the series convert inode timestamps to use struct timespec64 instead of struct timespec as part of solving the y2038 problem. Convert these print formats to use long long types to avoid warnings and errors on conversion. Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> Cc: zyan@redhat.com Cc: ceph-devel@vger.kernel.org
* lustre: Use long long type to print inode timeDeepa Dinamani2018-05-255-16/+20
| | | | | | | | | | | | Subsequent patches in the series convert inode timestamps to use struct timespec64 instead of struct timespec as part of solving the y2038 problem. Convert these print formats to use long long types to avoid warnings and errors on conversion. Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> CC: andreas.dilger@intel.com
* fs: add timespec64_truncate()Deepa Dinamani2018-05-252-0/+25
| | | | | | | | | | | | | As vfs moves to using struct timespec64 to represent times, update the argument to timespec_truncate() to use struct timespec64. Also change the name of the function. The rest of the implementation logic is the same. Move this to fs/inode.c instead of kernel/time/time.c as all the users of this api are filesystems. Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> Cc: <viro@zeniv.linux.org.uk>
* Linux 4.17-rc6v4.17-rc6Linus Torvalds2018-05-201-1/+1
|
* Merge branch 'parisc-4.17-5' of ↵Linus Torvalds2018-05-203-4/+3
|\ | | | | | | | | | | | | | | | | | | | | | | | | git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux Pull parisc fixlets from Helge Deller: "Three small section mismatch fixes, one of them was found by 0-day test infrastructure" * 'parisc-4.17-5' of git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux: parisc: Move ccio_cujo20_fixup() into init section parisc: Move setup_profiling_timer() out of init section parisc: Move find_pa_parent_type() out of init section
| * parisc: Move ccio_cujo20_fixup() into init sectionHelge Deller2018-05-181-1/+1
| | | | | | | | | | | | | | ccio_cujo20_fixup() is called by dino_probe() only, which is in init section already. Signed-off-by: Helge Deller <deller@gmx.de>
| * parisc: Move setup_profiling_timer() out of init sectionHelge Deller2018-05-181-2/+1
| | | | | | | | | | | | | | | | No other architecture has setup_profiling_timer() in the init section, thus on parisc we face this section mismatch warning: Reference from the function devm_device_add_group() to the function .init.text:setup_profiling_timer() Signed-off-by: Helge Deller <deller@gmx.de>
| * parisc: Move find_pa_parent_type() out of init sectionHelge Deller2018-05-181-1/+1
| | | | | | | | | | | | | | | | | | The 0-DAY kernel test infrastructure reported that inet_put_port() may reference the find_pa_parent_type() function, so it can't be moved into the init section. Fixes: b86db40e1ecc ("parisc: Move various functions and strings to init section") Signed-off-by: Helge Deller <deller@gmx.de>
* | Merge tag 'for-4.17-rc5-tag' of ↵Linus Torvalds2018-05-207-48/+180
|\ \ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs fixes from David Sterba: "We've accumulated some fixes during the last week, some of them were in the works for a longer time but there are some newer ones too. Most of the fixes have a reproducer and fix user visible problems, also candidates for stable kernels. They IMHO qualify for a late rc, though I did not expect that many" * tag 'for-4.17-rc5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: fix crash when trying to resume balance without the resume flag btrfs: Fix delalloc inodes invalidation during transaction abort btrfs: Split btrfs_del_delalloc_inode into 2 functions btrfs: fix reading stale metadata blocks after degraded raid1 mounts btrfs: property: Set incompat flag if lzo/zstd compression is set Btrfs: fix duplicate extents after fsync of file with prealloc extents Btrfs: fix xattr loss after power failure Btrfs: send, fix invalid access to commit roots due to concurrent snapshotting
| * | btrfs: fix crash when trying to resume balance without the resume flagAnand Jain2018-05-171-0/+9
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | We set the BTRFS_BALANCE_RESUME flag in the btrfs_recover_balance() only, which isn't called during the remount. So when resuming from the paused balance we hit the bug: kernel: kernel BUG at fs/btrfs/volumes.c:3890! :: kernel: balance_kthread+0x51/0x60 [btrfs] kernel: kthread+0x111/0x130 :: kernel: RIP: btrfs_balance+0x12e1/0x1570 [btrfs] RSP: ffffba7d0090bde8 Reproducer: On a mounted filesystem: btrfs balance start --full-balance /btrfs btrfs balance pause /btrfs mount -o remount,ro /dev/sdb /btrfs mount -o remount,rw /dev/sdb /btrfs To fix this set the BTRFS_BALANCE_RESUME flag in btrfs_resume_balance_async(). CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
| * | btrfs: Fix delalloc inodes invalidation during transaction abortNikolay Borisov2018-05-171-11/+15
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When a transaction is aborted btrfs_cleanup_transaction is called to cleanup all the various in-flight bits and pieces which migth be active. One of those is delalloc inodes - inodes which have dirty pages which haven't been persisted yet. Currently the process of freeing such delalloc inodes in exceptional circumstances such as transaction abort boiled down to calling btrfs_invalidate_inodes whose sole job is to invalidate the dentries for all inodes related to a root. This is in fact wrong and insufficient since such delalloc inodes will likely have pending pages or ordered-extents and will be linked to the sb->s_inode_list. This means that unmounting a btrfs instance with an aborted transaction could potentially lead inodes/their pages visible to the system long after their superblock has been freed. This in turn leads to a "use-after-free" situation once page shrink is triggered. This situation could be simulated by running generic/019 which would cause such inodes to be left hanging, followed by generic/176 which causes memory pressure and page eviction which lead to touching the freed super block instance. This situation is additionally detected by the unmount code of VFS with the following message: "VFS: Busy inodes after unmount of Self-destruct in 5 seconds. Have a nice day..." Additionally btrfs hits WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); in free_fs_root for the same reason. This patch aims to rectify the sitaution by doing the following: 1. Change btrfs_destroy_delalloc_inodes so that it calls invalidate_inode_pages2 for every inode on the delalloc list, this ensures that all the pages of the inode are released. This function boils down to calling btrfs_releasepage. During test I observed cases where inodes on the delalloc list were having an i_count of 0, so this necessitates using igrab to be sure we are working on a non-freed inode. 2. Since calling btrfs_releasepage might queue delayed iputs move the call out to btrfs_cleanup_transaction in btrfs_error_commit_super before calling run_delayed_iputs for the last time. This is necessary to ensure that delayed iputs are run. Note: this patch is tagged for 4.14 stable but the fix applies to older versions too but needs to be backported manually due to conflicts. CC: stable@vger.kernel.org # 4.14.x: 2b8773313494: btrfs: Split btrfs_del_delalloc_inode into 2 functions CC: stable@vger.kernel.org # 4.14.x Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add comment to igrab ] Signed-off-by: David Sterba <dsterba@suse.com>
| * | btrfs: Split btrfs_del_delalloc_inode into 2 functionsNikolay Borisov2018-05-172-3/+12
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This is in preparation of fixing delalloc inodes leakage on transaction abort. Also export the new function. Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
| * | btrfs: fix reading stale metadata blocks after degraded raid1 mountsLiu Bo2018-05-171-3/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | If a btree block, aka. extent buffer, is not available in the extent buffer cache, it'll be read out from the disk instead, i.e. btrfs_search_slot() read_block_for_search() # hold parent and its lock, go to read child btrfs_release_path() read_tree_block() # read child Unfortunately, the parent lock got released before reading child, so commit 5bdd3536cbbe ("Btrfs: Fix block generation verification race") had used 0 as parent transid to read the child block. It forces read_tree_block() not to check if parent transid is different with the generation id of the child that it reads out from disk. A simple PoC is included in btrfs/124, 0. A two-disk raid1 btrfs, 1. Right after mkfs.btrfs, block A is allocated to be device tree's root. 2. Mount this filesystem and put it in use, after a while, device tree's root got COW but block A hasn't been allocated/overwritten yet. 3. Umount it and reload the btrfs module to remove both disks from the global @fs_devices list. 4. mount -odegraded dev1 and write some data, so now block A is allocated to be a leaf in checksum tree. Note that only dev1 has the latest metadata of this filesystem. 5. Umount it and mount it again normally (with both disks), since raid1 can pick up one disk by the writer task's pid, if btrfs_search_slot() needs to read block A, dev2 which does NOT have the latest metadata might be read for block A, then we got a stale block A. 6. As parent transid is not checked, block A is marked as uptodate and put into the extent buffer cache, so the future search won't bother to read disk again, which means it'll make changes on this stale one and make it dirty and flush it onto disk. To avoid the problem, parent transid needs to be passed to read_tree_block(). In order to get a valid parent transid, we need to hold the parent's lock until finishing reading child. This patch needs to be slightly adapted for stable kernels, the &first_key parameter added to read_tree_block() is from 4.16+ (581c1760415c4). The fix is to replace 0 by 'gen'. Fixes: 5bdd3536cbbe ("Btrfs: Fix block generation verification race") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Qu Wenruo <wqu@suse.com> [ update changelog ] Signed-off-by: David Sterba <dsterba@suse.com>
| * | btrfs: property: Set incompat flag if lzo/zstd compression is setMisono Tomohiro2018-05-171-4/+8
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Incompat flag of LZO/ZSTD compression should be set at: 1. mount time (-o compress/compress-force) 2. when defrag is done 3. when property is set Currently 3. is missing and this commit adds this. This could lead to a filesystem that uses ZSTD but is not marked as such. If a kernel without a ZSTD support encounteres a ZSTD compressed extent, it will handle that but this could be confusing to the user. Typically the filesystem is mounted with the ZSTD option, but the discrepancy can arise when a filesystem is never mounted with ZSTD and then the property on some file is set (and some new extents are written). A simple mount with -o compress=zstd will fix that up on an unpatched kernel. Same goes for LZO, but this has been around for a very long time (2.6.37) so it's unlikely that a pre-LZO kernel would be used. Fixes: 5c1aab1dd544 ("btrfs: Add zstd support") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Tomohiro Misono <misono.tomohiro@jp.fujitsu.com> Reviewed-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ add user visible impact ] Signed-off-by: David Sterba <dsterba@suse.com>
| * | Btrfs: fix duplicate extents after fsync of file with prealloc extentsFilipe Manana2018-05-171-25/+112
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In commit 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay"), on fsync, we started to always log all prealloc extents beyond an inode's i_size in order to avoid losing them after a power failure. However under some cases this can lead to the log replay code to create duplicate extent items, with different lengths, in the extent tree. That happens because, as of that commit, we can now log extent items based on extent maps that are not on the "modified" list of extent maps of the inode's extent map tree. Logging extent items based on extent maps is used during the fast fsync path to save time and for this to work reliably it requires that the extent maps are not merged with other adjacent extent maps - having the extent maps in the list of modified extents gives such guarantee. Consider the following example, captured during a long run of fsstress, which illustrates this problem. We have inode 271, in the filesystem tree (root 5), for which all of the following operations and discussion apply to. A buffered write starts at offset 312391 with a length of 933471 bytes (end offset at 1245862). At this point we have, for this inode, the following extent maps with the their field values: em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613, block_len 0, orig_block_len 0 em B, start 40960, orig_start 40960, len 376832, block_start 1106399232, block_len 376832, orig_block_len 376832 em C, start 417792, orig_start 417792, len 782336, block_start 18446744073709551613, block_len 0, orig_block_len 0 em D, start 1200128, orig_start 1200128, len 835584, block_start 1106776064, block_len 835584, orig_block_len 835584 em E, start 2035712, orig_start 2035712, len 245760, block_start 1107611648, block_len 245760, orig_block_len 245760 Extent map A corresponds to a hole and extent maps D and E correspond to preallocated extents. Extent map D ends where extent map E begins (1106776064 + 835584 = 1107611648), but these extent maps were not merged because they are in the inode's list of modified extent maps. An fsync against this inode is made, which triggers the fast path (BTRFS_INODE_NEEDS_FULL_SYNC is not set). This fsync triggers writeback of the data previously written using buffered IO, and when the respective ordered extent finishes, btrfs_drop_extents() is called against the (aligned) range 311296..1249279. This causes a split of extent map D at btrfs_drop_extent_cache(), replacing extent map D with a new extent map D', also added to the list of modified extents, with the following values: em D', start 1249280, orig_start of 1200128, block_start 1106825216 (= 1106776064 + 1249280 - 1200128), orig_block_len 835584, block_len 786432 (835584 - (1249280 - 1200128)) Then, during the fast fsync, btrfs_log_changed_extents() is called and extent maps D' and E are removed from the list of modified extents. The flag EXTENT_FLAG_LOGGING is also set on them. After the extents are logged clear_em_logging() is called on each of them, and that makes extent map E to be merged with extent map D' (try_merge_map()), resulting in D' being deleted and E adjusted to: em E, start 1249280, orig_start 1200128, len 1032192, block_start 1106825216, block_len 1032192, orig_block_len 245760 A direct IO write at offset 1847296 and length of 360448 bytes (end offset at 2207744) starts, and at that moment the following extent maps exist for our inode: em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613, block_len 0, orig_block_len 0 em B, start 40960, orig_start 40960, len 270336, block_start 1106399232, block_len 270336, orig_block_len 376832 em C, start 311296, orig_start 311296, len 937984, block_start 1112842240, block_len 937984, orig_block_len 937984 em E (prealloc), start 1249280, orig_start 1200128, len 1032192, block_start 1106825216, block_len 1032192, orig_block_len 245760 The dio write results in drop_extent_cache() being called twice. The first time for a range that starts at offset 1847296 and ends at offset 2035711 (length of 188416), which results in a double split of extent map E, replacing it with two new extent maps: em F, start 1249280, orig_start 1200128, block_start 1106825216, block_len 598016, orig_block_len 598016 em G, start 2035712, orig_start 1200128, block_start 1107611648, block_len 245760, orig_block_len 1032192 It also creates a new extent map that represents a part of the requested IO (through create_io_em()): em H, start 1847296, len 188416, block_start 1107423232, block_len 188416 The second call to drop_extent_cache() has a range with a start offset of 2035712 and end offset of 2207743 (length of 172032). This leads to replacing extent map G with a new extent map I with the following values: em I, start 2207744, orig_start 1200128, block_start 1107783680, block_len 73728, orig_block_len 1032192 It also creates a new extent map that represents the second part of the requested IO (through create_io_em()): em J, start 2035712, len 172032, block_start 1107611648, block_len 172032 The dio write set the inode's i_size to 2207744 bytes. After the dio write the inode has the following extent maps: em A, start 0, orig_start 0, len 40960, block_start 18446744073709551613, block_len 0, orig_block_len 0 em B, start 40960, orig_start 40960, len 270336, block_start 1106399232, block_len 270336, orig_block_len 376832 em C, start 311296, orig_start 311296, len 937984, block_start 1112842240, block_len 937984, orig_block_len 937984 em F, start 1249280, orig_start 1200128, len 598016, block_start 1106825216, block_len 598016, orig_block_len 598016 em H, start 1847296, orig_start 1200128, len 188416, block_start 1107423232, block_len 188416, orig_block_len 835584 em J, start 2035712, orig_start 2035712, len 172032, block_start 1107611648, block_len 172032, orig_block_len 245760 em I, start 2207744, orig_start 1200128, len 73728, block_start 1107783680, block_len 73728, orig_block_len 1032192 Now do some change to the file, like adding a xattr for example and then fsync it again. This triggers a fast fsync path, and as of commit 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay"), we use the extent map I to log a file extent item because it's a prealloc extent and it starts at an offset matching the inode's i_size. However when we log it, we create a file extent item with a value for the disk byte location that is wrong, as can be seen from the following output of "btrfs inspect-internal dump-tree": item 1 key (271 EXTENT_DATA 2207744) itemoff 3782 itemsize 53 generation 22 type 2 (prealloc) prealloc data disk byte 1106776064 nr 1032192 prealloc data offset 1007616 nr 73728 Here the disk byte value corresponds to calculation based on some fields from the extent map I: 1106776064 = block_start (1107783680) - 1007616 (extent_offset) extent_offset = 2207744 (start) - 1200128 (orig_start) = 1007616 The disk byte value of 1106776064 clashes with disk byte values of the file extent items at offsets 1249280 and 1847296 in the fs tree: item 6 key (271 EXTENT_DATA 1249280) itemoff 3568 itemsize 53 generation 20 type 2 (prealloc) prealloc data disk byte 1106776064 nr 835584 prealloc data offset 49152 nr 598016 item 7 key (271 EXTENT_DATA 1847296) itemoff 3515 itemsize 53 generation 20 type 1 (regular) extent data disk byte 1106776064 nr 835584 extent data offset 647168 nr 188416 ram 835584 extent compression 0 (none) item 8 key (271 EXTENT_DATA 2035712) itemoff 3462 itemsize 53 generation 20 type 1 (regular) extent data disk byte 1107611648 nr 245760 extent data offset 0 nr 172032 ram 245760 extent compression 0 (none) item 9 key (271 EXTENT_DATA 2207744) itemoff 3409 itemsize 53 generation 20 type 2 (prealloc) prealloc data disk byte 1107611648 nr 245760 prealloc data offset 172032 nr 73728 Instead of the disk byte value of 1106776064, the value of 1107611648 should have been logged. Also the data offset value should have been 172032 and not 1007616. After a log replay we end up getting two extent items in the extent tree with different lengths, one of 835584, which is correct and existed before the log replay, and another one of 1032192 which is wrong and is based on the logged file extent item: item 12 key (1106776064 EXTENT_ITEM 835584) itemoff 3406 itemsize 53 refs 2 gen 15 flags DATA extent data backref root 5 objectid 271 offset 1200128 count 2 item 13 key (1106776064 EXTENT_ITEM 1032192) itemoff 3353 itemsize 53 refs 1 gen 22 flags DATA extent data backref root 5 objectid 271 offset 1200128 count 1 Obviously this leads to many problems and a filesystem check reports many errors: (...) checking extents Extent back ref already exists for 1106776064 parent 0 root 5 owner 271 offset 1200128 num_refs 1 extent item 1106776064 has multiple extent items ref mismatch on [1106776064 835584] extent item 2, found 3 Incorrect local backref count on 1106776064 root 5 owner 271 offset 1200128 found 2 wanted 1 back 0x55b1d0ad7680 Backref 1106776064 root 5 owner 271 offset 1200128 num_refs 0 not found in extent tree Incorrect local backref count on 1106776064 root 5 owner 271 offset 1200128 found 1 wanted 0 back 0x55b1d0ad4e70 Backref bytes do not match extent backref, bytenr=1106776064, ref bytes=835584, backref bytes=1032192 backpointer mismatch on [1106776064 835584] checking free space cache block group 1103101952 has wrong amount of free space failed to load free space cache for block group 1103101952 checking fs roots (...) So fix this by logging the prealloc extents beyond the inode's i_size based on searches in the subvolume tree instead of the extent maps. Fixes: 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
| * | Btrfs: fix xattr loss after power failureFilipe Manana2018-05-141-0/+7
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | If a file has xattrs, we fsync it, to ensure we clear the flags BTRFS_INODE_NEEDS_FULL_SYNC and BTRFS_INODE_COPY_EVERYTHING from its inode, the current transaction commits and then we fsync it (without either of those bits being set in its inode), we end up not logging all its xattrs. This results in deleting all xattrs when replying the log after a power failure. Trivial reproducer $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ touch /mnt/foobar $ setfattr -n user.xa -v qwerty /mnt/foobar $ xfs_io -c "fsync" /mnt/foobar $ sync $ xfs_io -c "pwrite -S 0xab 0 64K" /mnt/foobar $ xfs_io -c "fsync" /mnt/foobar <power failure> $ mount /dev/sdb /mnt $ getfattr --absolute-names --dump /mnt/foobar <empty output> $ So fix this by making sure all xattrs are logged if we log a file's inode item and neither the flags BTRFS_INODE_NEEDS_FULL_SYNC nor BTRFS_INODE_COPY_EVERYTHING were set in the inode. Fixes: 36283bf777d9 ("Btrfs: fix fsync xattr loss in the fast fsync path") Cc: <stable@vger.kernel.org> # 4.2+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
| * | Btrfs: send, fix invalid access to commit roots due to concurrent snapshottingRobbie Ko2018-05-141-2/+14
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | [BUG] btrfs incremental send BUG happens when creating a snapshot of snapshot that is being used by send. [REASON] The problem can happen if while we are doing a send one of the snapshots used (parent or send) is snapshotted, because snapshoting implies COWing the root of the source subvolume/snapshot. 1. When doing an incremental send, the send process will get the commit roots from the parent and send snapshots, and add references to them through extent_buffer_get(). 2. When a snapshot/subvolume is snapshotted, its root node is COWed (transaction.c:create_pending_snapshot()). 3. COWing releases the space used by the node immediately, through: __btrfs_cow_block() --btrfs_free_tree_block() ----btrfs_add_free_space(bytenr of node) 4. Because send doesn't hold a transaction open, it's possible that the transaction used to create the snapshot commits, switches the commit root and the old space used by the previous root node gets assigned to some other node allocation. Allocation of a new node will use the existing extent buffer found in memory, which we previously got a reference through extent_buffer_get(), and allow the extent buffer's content (pages) to be modified: btrfs_alloc_tree_block --btrfs_reserve_extent ----find_free_extent (get bytenr of old node) --btrfs_init_new_buffer (use bytenr of old node) ----btrfs_find_create_tree_block ------alloc_extent_buffer --------find_extent_buffer (get old node) 5. So send can access invalid memory content and have unpredictable behaviour. [FIX] So we fix the problem by copying the commit roots of the send and parent snapshots and use those copies. CallTrace looks like this: ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.c:1861! invalid opcode: 0000 [#1] SMP CPU: 6 PID: 24235 Comm: btrfs Tainted: P O 3.10.105 #23721 ffff88046652d680 ti: ffff88041b720000 task.ti: ffff88041b720000 RIP: 0010:[<ffffffffa08dd0e8>] read_node_slot+0x108/0x110 [btrfs] RSP: 0018:ffff88041b723b68 EFLAGS: 00010246 RAX: ffff88043ca6b000 RBX: ffff88041b723c50 RCX: ffff880000000000 RDX: 000000000000004c RSI: ffff880314b133f8 RDI: ffff880458b24000 RBP: 0000000000000000 R08: 0000000000000001 R09: ffff88041b723c66 R10: 0000000000000001 R11: 0000000000001000 R12: ffff8803f3e48890 R13: ffff8803f3e48880 R14: ffff880466351800 R15: 0000000000000001 FS: 00007f8c321dc8c0(0000) GS:ffff88047fcc0000(0000) CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 R2: 00007efd1006d000 CR3: 0000000213a24000 CR4: 00000000003407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffff88041b723c50 ffff8803f3e48880 ffff8803f3e48890 ffff8803f3e48880 ffff880466351800 0000000000000001 ffffffffa08dd9d7 ffff88041b723c50 ffff8803f3e48880 ffff88041b723c66 ffffffffa08dde85 a9ff88042d2c4400 Call Trace: [<ffffffffa08dd9d7>] ? tree_move_down.isra.33+0x27/0x50 [btrfs] [<ffffffffa08dde85>] ? tree_advance+0xb5/0xc0 [btrfs] [<ffffffffa08e83d4>] ? btrfs_compare_trees+0x2d4/0x760 [btrfs] [<ffffffffa0982050>] ? finish_inode_if_needed+0x870/0x870 [btrfs] [<ffffffffa09841ea>] ? btrfs_ioctl_send+0xeda/0x1050 [btrfs] [<ffffffffa094bd3d>] ? btrfs_ioctl+0x1e3d/0x33f0 [btrfs] [<ffffffff81111133>] ? handle_pte_fault+0x373/0x990 [<ffffffff8153a096>] ? atomic_notifier_call_chain+0x16/0x20 [<ffffffff81063256>] ? set_task_cpu+0xb6/0x1d0 [<ffffffff811122c3>] ? handle_mm_fault+0x143/0x2a0 [<ffffffff81539cc0>] ? __do_page_fault+0x1d0/0x500 [<ffffffff81062f07>] ? check_preempt_curr+0x57/0x90 [<ffffffff8115075a>] ? do_vfs_ioctl+0x4aa/0x990 [<ffffffff81034f83>] ? do_fork+0x113/0x3b0 [<ffffffff812dd7d7>] ? trace_hardirqs_off_thunk+0x3a/0x6c [<ffffffff81150cc8>] ? SyS_ioctl+0x88/0xa0 [<ffffffff8153e422>] ? system_call_fastpath+0x16/0x1b ---[ end trace 29576629ee80b2e1 ]--- Fixes: 7069830a9e38 ("Btrfs: add btrfs_compare_trees function") CC: stable@vger.kernel.org # 3.6+ Signed-off-by: Robbie Ko <robbieko@synology.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
* | | Merge branch 'fixes' of git://git.armlinux.org.uk/~rmk/linux-armLinus Torvalds2018-05-209-44/+64
|\ \ \ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Pull ARM fixes from Russell King: - Łukasz Stelmach spotted a couple of issues with the decompressor. - a couple of kdump fixes found while testing kdump - replace some perl with shell code - resolve SIGFPE breakage - kprobes fixes * 'fixes' of git://git.armlinux.org.uk/~rmk/linux-arm: ARM: fix kill( ,SIGFPE) breakage ARM: 8772/1: kprobes: Prohibit kprobes on get_user functions ARM: 8771/1: kprobes: Prohibit kprobes on do_undefinstr ARM: 8770/1: kprobes: Prohibit probing on optimized_callback ARM: 8769/1: kprobes: Fix to use get_kprobe_ctlblk after irq-disabed ARM: replace unnecessary perl with sed and the shell $(( )) operator ARM: kexec: record parent context registers for non-crash CPUs ARM: kexec: fix kdump register saving on panic() ARM: 8758/1: decompressor: restore r1 and r2 just before jumping to the kernel ARM: 8753/1: decompressor: add a missing parameter to the addruart macro
| * | | ARM: fix kill( ,SIGFPE) breakageRussell King2018-05-192-14/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Commit 7771c6645700 ("signal/arm: Document conflicts with SI_USER and SIGFPE") broke the siginfo structure for userspace triggered signals, causing the strace testsuite to regress. Fix this by eliminating the FPE_FIXME definition (which is at the root of the breakage) and use FPE_FLTINV instead for the case where the hardware appears to be reporting nonsense. Fixes: 7771c6645700 ("signal/arm: Document conflicts with SI_USER and SIGFPE") Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: 8772/1: kprobes: Prohibit kprobes on get_user functionsMasami Hiramatsu2018-05-192-0/+20
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Since do_undefinstr() uses get_user to get the undefined instruction, it can be called before kprobes processes recursive check. This can cause an infinit recursive exception. Prohibit probing on get_user functions. Fixes: 24ba613c9d6c ("ARM kprobes: core code") Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> Cc: stable@vger.kernel.org Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: 8771/1: kprobes: Prohibit kprobes on do_undefinstrMasami Hiramatsu2018-05-191-1/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Prohibit kprobes on do_undefinstr because kprobes on arm is implemented by undefined instruction. This means if we probe do_undefinstr(), it can cause infinit recursive exception. Fixes: 24ba613c9d6c ("ARM kprobes: core code") Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> Cc: stable@vger.kernel.org Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: 8770/1: kprobes: Prohibit probing on optimized_callbackMasami Hiramatsu2018-05-191-0/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Prohibit probing on optimized_callback() because it is called from kprobes itself. If we put a kprobes on it, that will cause a recursive call loop. Mark it NOKPROBE_SYMBOL. Fixes: 0dc016dbd820 ("ARM: kprobes: enable OPTPROBES for ARM 32") Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> Cc: stable@vger.kernel.org Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: 8769/1: kprobes: Fix to use get_kprobe_ctlblk after irq-disabedMasami Hiramatsu2018-05-191-1/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Since get_kprobe_ctlblk() uses smp_processor_id() to access per-cpu variable, it hits smp_processor_id sanity check as below. [ 7.006928] BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 [ 7.007859] caller is debug_smp_processor_id+0x20/0x24 [ 7.008438] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.16.0-rc1-00192-g4eb17253e4b5 #1 [ 7.008890] Hardware name: Generic DT based system [ 7.009917] [<c0313f0c>] (unwind_backtrace) from [<c030e6d8>] (show_stack+0x20/0x24) [ 7.010473] [<c030e6d8>] (show_stack) from [<c0c64694>] (dump_stack+0x84/0x98) [ 7.010990] [<c0c64694>] (dump_stack) from [<c071ca5c>] (check_preemption_disabled+0x138/0x13c) [ 7.011592] [<c071ca5c>] (check_preemption_disabled) from [<c071ca80>] (debug_smp_processor_id+0x20/0x24) [ 7.012214] [<c071ca80>] (debug_smp_processor_id) from [<c03335e0>] (optimized_callback+0x2c/0xe4) [ 7.013077] [<c03335e0>] (optimized_callback) from [<bf0021b0>] (0xbf0021b0) To fix this issue, call get_kprobe_ctlblk() right after irq-disabled since that disables preemption. Fixes: 0dc016dbd820 ("ARM: kprobes: enable OPTPROBES for ARM 32") Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org> Cc: stable@vger.kernel.org Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: replace unnecessary perl with sed and the shell $(( )) operatorRussell King2018-05-191-5/+3
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | You can build a kernel in a cross compiling environment that doesn't have perl in the $PATH. Commit 429f7a062e3b broke that for 32 bit ARM. Fix it. As reported by Stephen Rothwell, it appears that the symbols can be either part of the BSS section or absolute symbols depending on the binutils version. When they're an absolute symbol, the $(( )) operator errors out and the build fails. Fix this as well. Fixes: 429f7a062e3b ("ARM: decompressor: fix BSS size calculation") Reported-by: Rob Landley <rob@landley.net> Reported-by: Stephen Rothwell <sfr@canb.auug.org.au> Acked-by: Rob Landley <rob@landley.net> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: kexec: record parent context registers for non-crash CPUsRussell King2018-05-191-1/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | How we got to machine_crash_nonpanic_core() (iow, from an IPI, etc) is not interesting for debugging a crash. The more interesting context is the parent context prior to the IPI being received. Record the parent context register state rather than the register state in machine_crash_nonpanic_core(), which is more relevant to the failing condition. Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: kexec: fix kdump register saving on panic()Russell King2018-05-191-12/+22
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When a panic() occurs, the kexec code uses smp_send_stop() to stop the other CPUs, but this results in the CPU register state not being saved, and gdb is unable to inspect the state of other CPUs. Commit 0ee59413c967 ("x86/panic: replace smp_send_stop() with kdump friendly version in panic path") addressed the issue on x86, but ignored other architectures. Address the issue on ARM by splitting out the crash stop implementation to crash_smp_send_stop() and adding the necessary protection. Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: 8758/1: decompressor: restore r1 and r2 just before jumping to the kernelŁukasz Stelmach2018-05-191-2/+2
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The hypervisor setup before __enter_kernel destroys the value sotred in r1. The value needs to be restored just before the jump. Fixes: 6b52f7bdb888 ("ARM: hyp-stub: Use r1 for the soft-restart address") Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
| * | | ARM: 8753/1: decompressor: add a missing parameter to the addruart macroŁukasz Stelmach2018-05-191-8/+8
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In commit 639da5ee374b ("ARM: add an extra temp register to the low level debugging addruart macro") an additional temporary register was added to the addruart macro, but the decompressor code wasn't updated. Fixes: 639da5ee374b ("ARM: add an extra temp register to the low level debugging addruart macro") Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com> Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
* | | | Merge branch 'x86-urgent-for-linus' of ↵Linus Torvalds2018-05-2013-129/+585
|\ \ \ \ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull x86 fixes from Thomas Gleixner: "An unfortunately larger set of fixes, but a large portion is selftests: - Fix the missing clusterid initializaiton for x2apic cluster management which caused boot failures due to IPIs being sent to the wrong cluster - Drop TX_COMPAT when a 64bit executable is exec()'ed from a compat task - Wrap access to __supported_pte_mask in __startup_64() where clang compile fails due to a non PC relative access being generated. - Two fixes for 5 level paging fallout in the decompressor: - Handle GOT correctly for paging_prepare() and cleanup_trampoline() - Fix the page table handling in cleanup_trampoline() to avoid page table corruption. - Stop special casing protection key 0 as this is inconsistent with the manpage and also inconsistent with the allocation map handling. - Override the protection key wen moving away from PROT_EXEC to prevent inaccessible memory. - Fix and update the protection key selftests to address breakage and to cover the above issue - Add a MOV SS self test" [ Part of the x86 fixes were in the earlier core pull due to dependencies ] * 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits) x86/mm: Drop TS_COMPAT on 64-bit exec() syscall x86/apic/x2apic: Initialize cluster ID properly x86/boot/compressed/64: Fix moving page table out of trampoline memory x86/boot/compressed/64: Set up GOT for paging_prepare() and cleanup_trampoline() x86/pkeys: Do not special case protection key 0 x86/pkeys/selftests: Add a test for pkey 0 x86/pkeys/selftests: Save off 'prot' for allocations x86/pkeys/selftests: Fix pointer math x86/pkeys: Override pkey when moving away from PROT_EXEC x86/pkeys/selftests: Fix pkey exhaustion test off-by-one x86/pkeys/selftests: Add PROT_EXEC test x86/pkeys/selftests: Factor out "instruction page" x86/pkeys/selftests: Allow faults on unknown keys x86/pkeys/selftests: Avoid printf-in-signal deadlocks x86/pkeys/selftests: Remove dead debugging code, fix dprint_in_signal x86/pkeys/selftests: Stop using assert() x86/pkeys/selftests: Give better unexpected fault error messages x86/selftests: Add mov_to_ss test x86/mpx/selftests: Adjust the self-test to fresh distros that export the MPX ABI x86/pkeys/selftests: Adjust the self-test to fresh distros that export the pkeys ABI ...
| * | | | x86/mm: Drop TS_COMPAT on 64-bit exec() syscallDmitry Safonov2018-05-191-0/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The x86 mmap() code selects the mmap base for an allocation depending on the bitness of the syscall. For 64bit sycalls it select mm->mmap_base and for 32bit mm->mmap_compat_base. exec() calls mmap() which in turn uses in_compat_syscall() to check whether the mapping is for a 32bit or a 64bit task. The decision is made on the following criteria: ia32 child->thread.status & TS_COMPAT x32 child->pt_regs.orig_ax & __X32_SYSCALL_BIT ia64 !ia32 && !x32 __set_personality_x32() was dropping TS_COMPAT flag, but set_personality_64bit() has kept compat syscall flag making in_compat_syscall() return true during the first exec() syscall. Which in result has user-visible effects, mentioned by Alexey: 1) It breaks ASAN $ gcc -fsanitize=address wrap.c -o wrap-asan $ ./wrap32 ./wrap-asan true ==1217==Shadow memory range interleaves with an existing memory mapping. ASan cannot proceed correctly. ABORTING. ==1217==ASan shadow was supposed to be located in the [0x00007fff7000-0x10007fff7fff] range. ==1217==Process memory map follows: 0x000000400000-0x000000401000 /home/izbyshev/test/gcc/asan-exec-from-32bit/wrap-asan 0x000000600000-0x000000601000 /home/izbyshev/test/gcc/asan-exec-from-32bit/wrap-asan 0x000000601000-0x000000602000 /home/izbyshev/test/gcc/asan-exec-from-32bit/wrap-asan 0x0000f7dbd000-0x0000f7de2000 /lib64/ld-2.27.so 0x0000f7fe2000-0x0000f7fe3000 /lib64/ld-2.27.so 0x0000f7fe3000-0x0000f7fe4000 /lib64/ld-2.27.so 0x0000f7fe4000-0x0000f7fe5000 0x7fed9abff000-0x7fed9af54000 0x7fed9af54000-0x7fed9af6b000 /lib64/libgcc_s.so.1 [snip] 2) It doesn't seem to be great for security if an attacker always knows that ld.so is going to be mapped into the first 4GB in this case (the same thing happens for PIEs as well). The testcase: $ cat wrap.c int main(int argc, char *argv[]) { execvp(argv[1], &argv[1]); return 127; } $ gcc wrap.c -o wrap $ LD_SHOW_AUXV=1 ./wrap ./wrap true |& grep AT_BASE AT_BASE: 0x7f63b8309000 AT_BASE: 0x7faec143c000 AT_BASE: 0x7fbdb25fa000 $ gcc -m32 wrap.c -o wrap32 $ LD_SHOW_AUXV=1 ./wrap32 ./wrap true |& grep AT_BASE AT_BASE: 0xf7eff000 AT_BASE: 0xf7cee000 AT_BASE: 0x7f8b9774e000 Fixes: 1b028f784e8c ("x86/mm: Introduce mmap_compat_base() for 32-bit mmap()") Fixes: ada26481dfe6 ("x86/mm: Make in_compat_syscall() work during exec") Reported-by: Alexey Izbyshev <izbyshev@ispras.ru> Bisected-by: Alexander Monakov <amonakov@ispras.ru> Investigated-by: Andy Lutomirski <luto@kernel.org> Signed-off-by: Dmitry Safonov <dima@arista.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org> Cc: Borislav Petkov <bp@suse.de> Cc: Alexander Monakov <amonakov@ispras.ru> Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: stable@vger.kernel.org Cc: linux-mm@kvack.org Cc: Andy Lutomirski <luto@kernel.org> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Cyrill Gorcunov <gorcunov@openvz.org> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Link: https://lkml.kernel.org/r/20180517233510.24996-1-dima@arista.com
| * | | | x86/apic/x2apic: Initialize cluster ID properlyThomas Gleixner2018-05-171-0/+1
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Rick bisected a regression on large systems which use the x2apic cluster mode for interrupt delivery to the commit wich reworked the cluster management. The problem is caused by a missing initialization of the clusterid field in the shared cluster data structures. So all structures end up with cluster ID 0 which only allows sharing between all CPUs which belong to cluster 0. All other CPUs with a cluster ID > 0 cannot share the data structure because they cannot find existing data with their cluster ID. This causes malfunction with IPIs because IPIs are sent to the wrong cluster and the caller waits for ever that the target CPU handles the IPI. Add the missing initialization when a upcoming CPU is the first in a cluster so that the later booting CPUs can find the data and share it for proper operation. Fixes: 023a611748fd ("x86/apic/x2apic: Simplify cluster management") Reported-by: Rick Warner <rick@microway.com> Bisected-by: Rick Warner <rick@microway.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Tested-by: Rick Warner <rick@microway.com> Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1805171418210.1947@nanos.tec.linutronix.de
| * | | | x86/boot/compressed/64: Fix moving page table out of trampoline memoryKirill A. Shutemov2018-05-162-11/+14
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | cleanup_trampoline() relocates the top-level page table out of trampoline memory. We use 'top_pgtable' as our new top-level page table. But if the 'top_pgtable' would be referenced from C in a usual way, the address of the table will be calculated relative to RIP. After kernel gets relocated, the address will be in the middle of decompression buffer and the page table may get overwritten. This leads to a crash. We calculate the address of other page tables relative to the relocation address. It makes them safe. We should do the same for 'top_pgtable'. Calculate the address of 'top_pgtable' in assembly and pass down to cleanup_trampoline(). Move the page table to .pgtable section where the rest of page tables are. The section is @nobits so we save 4k in kernel image. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Cc: Hugh Dickins <hughd@google.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Fixes: e9d0e6330eb8 ("x86/boot/compressed/64: Prepare new top-level page table for trampoline") Link: http://lkml.kernel.org/r/20180516080131.27913-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/boot/compressed/64: Set up GOT for paging_prepare() and cleanup_trampoline()Kirill A. Shutemov2018-05-161-13/+55
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Eric and Hugh have reported instant reboot due to my recent changes in decompression code. The root cause is that I didn't realize that we need to adjust GOT to be able to run C code that early. The problem is only visible with an older toolchain. Binutils >= 2.24 is able to eliminate GOT references by replacing them with RIP-relative address loads: https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;a=commitdiff;h=80d873266dec We need to adjust GOT two times: - before calling paging_prepare() using the initial load address - before calling C code from the relocated kernel Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Reported-by: Hugh Dickins <hughd@google.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Fixes: 194a9749c73d ("x86/boot/compressed/64: Handle 5-level paging boot if kernel is above 4G") Link: http://lkml.kernel.org/r/20180516080131.27913-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys: Do not special case protection key 0Dave Hansen2018-05-142-4/+4
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | mm_pkey_is_allocated() treats pkey 0 as unallocated. That is inconsistent with the manpages, and also inconsistent with mm->context.pkey_allocation_map. Stop special casing it and only disallow values that are actually bad (< 0). The end-user visible effect of this is that you can now use mprotect_pkey() to set pkey=0. This is a bit nicer than what Ram proposed[1] because it is simpler and removes special-casing for pkey 0. On the other hand, it does allow applications to pkey_free() pkey-0, but that's just a silly thing to do, so we are not going to protect against it. The scenario that could happen is similar to what happens if you free any other pkey that is in use: it might get reallocated later and used to protect some other data. The most likely scenario is that pkey-0 comes back from pkey_alloc(), an access-disable or write-disable bit is set in PKRU for it, and the next stack access will SIGSEGV. It's not horribly different from if you mprotect()'d your stack or heap to be unreadable or unwritable, which is generally very foolish, but also not explicitly prevented by the kernel. 1. http://lkml.kernel.org/r/1522112702-27853-1-git-send-email-linuxram@us.ibm.com Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org>p Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Fixes: 58ab9a088dda ("x86/pkeys: Check against max pkey to avoid overflows") Link: http://lkml.kernel.org/r/20180509171358.47FD785E@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Add a test for pkey 0Dave Hansen2018-05-141-0/+30
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Protection key 0 is the default key for all memory and will not normally come back from pkey_alloc(). But, you might still want pass it to mprotect_pkey(). This check ensures that you can use pkey 0. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171356.9E40B254@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Save off 'prot' for allocationsDave Hansen2018-05-141-5/+9
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This makes it possible to to tell what 'prot' a given allocation is supposed to have. That way, if we want to change just the pkey, we know what 'prot' to pass to mprotect_pkey(). Also, keep a record of the most recent allocation so the tests can easily find it. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171354.AA23E228@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Fix pointer mathDave Hansen2018-05-141-7/+7
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | We dump out the entire area of the siginfo where the si_pkey_ptr is supposed to be. But, we do some math on the poitner, which is a u32. We intended to do byte math, not u32 math on the pointer. Cast it over to a u8* so it works. Also, move this block of code to below th si_code check. It doesn't hurt anything, but the si_pkey field is gibberish for other signal types. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171352.9BE09819@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys: Override pkey when moving away from PROT_EXECDave Hansen2018-05-142-11/+22
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | I got a bug report that the following code (roughly) was causing a SIGSEGV: mprotect(ptr, size, PROT_EXEC); mprotect(ptr, size, PROT_NONE); mprotect(ptr, size, PROT_READ); *ptr = 100; The problem is hit when the mprotect(PROT_EXEC) is implicitly assigned a protection key to the VMA, and made that key ACCESS_DENY|WRITE_DENY. The PROT_NONE mprotect() failed to remove the protection key, and the PROT_NONE-> PROT_READ left the PTE usable, but the pkey still in place and left the memory inaccessible. To fix this, we ensure that we always "override" the pkee at mprotect() if the VMA does not have execute-only permissions, but the VMA has the execute-only pkey. We had a check for PROT_READ/WRITE, but it did not work for PROT_NONE. This entirely removes the PROT_* checks, which ensures that PROT_NONE now works. Reported-by: Shakeel Butt <shakeelb@google.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Fixes: 62b5f7d013f ("mm/core, x86/mm/pkeys: Add execute-only protection keys support") Link: http://lkml.kernel.org/r/20180509171351.084C5A71@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Fix pkey exhaustion test off-by-oneDave Hansen2018-05-141-5/+8
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In our "exhaust all pkeys" test, we make sure that there is the expected number available. Turns out that the test did not cover the execute-only key, but discussed it anyway. It did *not* discuss the test-allocated key. Now that we have a test for the mprotect(PROT_EXEC) case, this off-by-one issue showed itself. Correct the off-by- one and add the explanation for the case we missed. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171350.E1656B95@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Add PROT_EXEC testDave Hansen2018-05-141-0/+44
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Under the covers, implement executable-only memory with protection keys when userspace calls mprotect(PROT_EXEC). But, we did not have a selftest for that. Now we do. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171348.9EEE4BEF@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Factor out "instruction page"Dave Hansen2018-05-141-4/+17
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | We currently have an execute-only test, but it is for the explicit mprotect_pkey() interface. We will soon add a test for the implicit mprotect(PROT_EXEC) enterface. We need this code in both tests. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171347.C64AB733@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Allow faults on unknown keysDave Hansen2018-05-141-1/+9
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The exec-only pkey is allocated inside the kernel and userspace is not told what it is. So, allow PK faults to occur that have an unknown key. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171345.7FC7DA00@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Avoid printf-in-signal deadlocksDave Hansen2018-05-141-12/+8
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | printf() and friends are unusable in signal handlers. They deadlock. The pkey selftest does not do any normal printing in signal handlers, only extra debugging. So, just print the format string so we get *some* output when debugging. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171344.C53FD2F3@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Remove dead debugging code, fix dprint_in_signalDave Hansen2018-05-141-16/+0
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | There is some noisy debug code at the end of the signal handler. It was disabled by an early, unconditional "return". However, that return also hid a dprint_in_signal=0, which kept dprint_in_signal=1 and effectively locked us into permanent dprint_in_signal=1 behavior. Remove the return and the dead code, fixing dprint_in_signal. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171342.846B9B2E@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Stop using assert()Dave Hansen2018-05-141-4/+8
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | If we use assert(), the program "crashes". That can be scary to users, so stop doing it. Just exit with a >0 exit code instead. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171340.E63EF7DA@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * | | | x86/pkeys/selftests: Give better unexpected fault error messagesDave Hansen2018-05-141-6/+7
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | do_not_expect_pk_fault() is a helper that we call when we do not expect a PK fault to have occurred. But, it is a function, which means that it obscures the line numbers from pkey_assert(). It also gives no details. Replace it with an implementation that gives nice line numbers and also lets callers pass in a more descriptive message about what happened that caused the unexpected fault. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Michael Ellermen <mpe@ellerman.id.au> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180509171338.55D13B64@viggo.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>