From fe2e082f5da5b4a0a92ae32978f81507ef37ec66 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Tue, 20 Aug 2019 00:16:40 -0500 Subject: ecryptfs: fix a memory leak bug in parse_tag_1_packet() In parse_tag_1_packet(), if tag 1 packet contains a key larger than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES, no cleanup is executed, leading to a memory leak on the allocated 'auth_tok_list_item'. To fix this issue, go to the label 'out_free' to perform the cleanup work. Cc: stable@vger.kernel.org Fixes: dddfa461fc89 ("[PATCH] eCryptfs: Public key; packet management") Signed-off-by: Wenwen Wang Signed-off-by: Tyler Hicks --- fs/ecryptfs/keystore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 216fbe6a4837..4dc09638de8f 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1304,7 +1304,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, printk(KERN_WARNING "Tag 1 packet contains key larger " "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); rc = -EINVAL; - goto out; + goto out_free; } memcpy((*new_auth_tok)->session_key.encrypted_key, &data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2))); -- cgit v1.2.3 From b4a81b87a4cfe2bb26a4a943b748d96a43ef20e8 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Tue, 20 Aug 2019 00:33:54 -0500 Subject: ecryptfs: fix a memory leak bug in ecryptfs_init_messaging() In ecryptfs_init_messaging(), if the allocation for 'ecryptfs_msg_ctx_arr' fails, the previously allocated 'ecryptfs_daemon_hash' is not deallocated, leading to a memory leak bug. To fix this issue, free 'ecryptfs_daemon_hash' before returning the error. Cc: stable@vger.kernel.org Fixes: 88b4a07e6610 ("[PATCH] eCryptfs: Public key transport mechanism") Signed-off-by: Wenwen Wang Signed-off-by: Tyler Hicks --- fs/ecryptfs/messaging.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index d668e60b85b5..c05ca39aa449 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -379,6 +379,7 @@ int __init ecryptfs_init_messaging(void) * ecryptfs_message_buf_len), GFP_KERNEL); if (!ecryptfs_msg_ctx_arr) { + kfree(ecryptfs_daemon_hash); rc = -ENOMEM; goto out; } -- cgit v1.2.3 From 042f057fe2dcf38682d85d9f88df00d1a8d45dbd Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Wed, 11 Dec 2019 10:56:04 -0800 Subject: drivers: ipmi: Support raw i2c packet in IPMB Many IPMB devices don't support smbus protocol and this driver only supports the smbus protocol at the moment. Added support for the i2c protocol as well. There will be a variable "i2c-protocol" passed by the device tree or ACPI table which determines whether the protocol is i2c or smbus. Signed-off-by: Vijay Khemka Reviewed-by: Asmaa Mnebhi Message-Id: <20191211185604.1266063-1-vijaykhemka@fb.com> [IPMB.txt had moved to driver-api/ipmb.rst, I adjusted] Signed-off-by: Corey Minyard --- Documentation/driver-api/ipmb.rst | 4 ++++ drivers/char/ipmi/ipmb_dev_int.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/Documentation/driver-api/ipmb.rst b/Documentation/driver-api/ipmb.rst index 3ec3baed84c4..209c49e05116 100644 --- a/Documentation/driver-api/ipmb.rst +++ b/Documentation/driver-api/ipmb.rst @@ -71,9 +71,13 @@ b) Example for device tree:: ipmb@10 { compatible = "ipmb-dev"; reg = <0x10>; + i2c-protocol; }; }; +If xmit of data to be done using raw i2c block vs smbus +then "i2c-protocol" needs to be defined as above. + 2) Manually from Linux:: modprobe ipmb-dev-int diff --git a/drivers/char/ipmi/ipmb_dev_int.c b/drivers/char/ipmi/ipmb_dev_int.c index 1ff4fb1def7c..86674292b213 100644 --- a/drivers/char/ipmi/ipmb_dev_int.c +++ b/drivers/char/ipmi/ipmb_dev_int.c @@ -63,6 +63,7 @@ struct ipmb_dev { spinlock_t lock; wait_queue_head_t wait_queue; struct mutex file_mutex; + bool is_i2c_protocol; }; static inline struct ipmb_dev *to_ipmb_dev(struct file *file) @@ -112,6 +113,25 @@ static ssize_t ipmb_read(struct file *file, char __user *buf, size_t count, return ret < 0 ? ret : count; } +static int ipmb_i2c_write(struct i2c_client *client, u8 *msg, u8 addr) +{ + struct i2c_msg i2c_msg; + + /* + * subtract 1 byte (rq_sa) from the length of the msg passed to + * raw i2c_transfer + */ + i2c_msg.len = msg[IPMB_MSG_LEN_IDX] - 1; + + /* Assign message to buffer except first 2 bytes (length and address) */ + i2c_msg.buf = msg + 2; + + i2c_msg.addr = addr; + i2c_msg.flags = client->flags & I2C_CLIENT_PEC; + + return i2c_transfer(client->adapter, &i2c_msg, 1); +} + static ssize_t ipmb_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -133,6 +153,12 @@ static ssize_t ipmb_write(struct file *file, const char __user *buf, rq_sa = GET_7BIT_ADDR(msg[RQ_SA_8BIT_IDX]); netf_rq_lun = msg[NETFN_LUN_IDX]; + /* Check i2c block transfer vs smbus */ + if (ipmb_dev->is_i2c_protocol) { + ret = ipmb_i2c_write(ipmb_dev->client, msg, rq_sa); + return (ret == 1) ? count : ret; + } + /* * subtract rq_sa and netf_rq_lun from the length of the msg passed to * i2c_smbus_xfer @@ -302,6 +328,9 @@ static int ipmb_probe(struct i2c_client *client, if (ret) return ret; + ipmb_dev->is_i2c_protocol + = device_property_read_bool(&client->dev, "i2c-protocol"); + ipmb_dev->client = client; i2c_set_clientdata(client, ipmb_dev); ret = i2c_slave_register(client, ipmb_slave_cb); -- cgit v1.2.3 From 380665becdeeb4f455c23582b7f32e6b3cea27d2 Mon Sep 17 00:00:00 2001 From: Vijay Khemka Date: Wed, 11 Dec 2019 11:01:55 -0800 Subject: drivers: ipmi: Modify max length of IPMB packet As per IPMB specification, maximum packet size supported is 255, modified Max length to 240 from 128 to accommodate more data. Signed-off-by: Vijay Khemka Message-Id: <20191211190155.1279610-1-vijaykhemka@fb.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmb_dev_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmb_dev_int.c b/drivers/char/ipmi/ipmb_dev_int.c index 86674292b213..9fdae83e59e0 100644 --- a/drivers/char/ipmi/ipmb_dev_int.c +++ b/drivers/char/ipmi/ipmb_dev_int.c @@ -19,7 +19,7 @@ #include #include -#define MAX_MSG_LEN 128 +#define MAX_MSG_LEN 240 #define IPMB_REQUEST_LEN_MIN 7 #define NETFN_RSP_BIT_MASK 0x4 #define REQUEST_QUEUE_MAX_LEN 256 -- cgit v1.2.3 From 6b8526d3abc02c08a2f888e8c20b7ac9e5776dfe Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Mon, 23 Dec 2019 10:42:19 -0600 Subject: ipmi:ssif: Handle a possible NULL pointer reference In error cases a NULL can be passed to memcpy. The length will always be zero, so it doesn't really matter, but go ahead and check for NULL, anyway, to be more precise and avoid static analysis errors. Reported-by: kbuild test robot Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ssif.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 22c6a2e61236..8ac390c2b514 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -775,10 +775,14 @@ static void msg_done_handler(struct ssif_info *ssif_info, int result, flags = ipmi_ssif_lock_cond(ssif_info, &oflags); msg = ssif_info->curr_msg; if (msg) { + if (data) { + if (len > IPMI_MAX_MSG_LENGTH) + len = IPMI_MAX_MSG_LENGTH; + memcpy(msg->rsp, data, len); + } else { + len = 0; + } msg->rsp_size = len; - if (msg->rsp_size > IPMI_MAX_MSG_LENGTH) - msg->rsp_size = IPMI_MAX_MSG_LENGTH; - memcpy(msg->rsp, data, msg->rsp_size); ssif_info->curr_msg = NULL; } -- cgit v1.2.3 From 3f666c56c6b8cc40a5e9002aac484b8f5b83c402 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 3 Jan 2020 13:33:07 -0500 Subject: dax: Pass dax_dev instead of bdev to dax_writeback_mapping_range() As of now dax_writeback_mapping_range() takes "struct block_device" as a parameter and dax_dev is searched from bdev name. This also involves taking a fresh reference on dax_dev and putting that reference at the end of function. We are developing a new filesystem virtio-fs and using dax to access host page cache directly. But there is no block device. IOW, we want to make use of dax but want to get rid of this assumption that there is always a block device associated with dax_dev. So pass in "struct dax_device" as parameter instead of bdev. ext2/ext4/xfs are current users and they already have a reference on dax_device. So there is no need to take reference and drop reference to dax_device on each call of this function. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Vivek Goyal Link: https://lore.kernel.org/r/20200103183307.GB13350@redhat.com Signed-off-by: Dan Williams --- fs/dax.c | 8 +------- fs/ext2/inode.c | 5 +++-- fs/ext4/inode.c | 2 +- fs/xfs/xfs_aops.c | 2 +- include/linux/dax.h | 4 ++-- 5 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 1f1f0201cad1..1a64c19de0ad 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -937,12 +937,11 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, - struct block_device *bdev, struct writeback_control *wbc) + struct dax_device *dax_dev, struct writeback_control *wbc) { XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; - struct dax_device *dax_dev; void *entry; int ret = 0; unsigned int scanned = 0; @@ -953,10 +952,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) - return -EIO; - trace_dax_writeback_range(inode, xas.xa_index, end_index); tag_pages_for_writeback(mapping, xas.xa_index, end_index); @@ -977,7 +972,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, xas_lock_irq(&xas); } xas_unlock_irq(&xas); - put_dax(dax_dev); trace_dax_writeback_range_done(inode, xas.xa_index, end_index); return ret; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 119667e65890..c885cf7d724b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -960,8 +960,9 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) static int ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return dax_writeback_mapping_range(mapping, - mapping->host->i_sb->s_bdev, wbc); + struct ext2_sb_info *sbi = EXT2_SB(mapping->host->i_sb); + + return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); } const struct address_space_operations ext2_aops = { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28f28de0c1b6..9992af776fbd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2866,7 +2866,7 @@ static int ext4_dax_writepages(struct address_space *mapping, percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); + ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); percpu_up_read(&sbi->s_journal_flag_rwsem); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a688eb5c5ae..58e937be24ce 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -587,7 +587,7 @@ xfs_dax_writepages( xfs_iflags_clear(ip, XFS_ITRUNCATED); return dax_writeback_mapping_range(mapping, - xfs_inode_buftarg(ip)->bt_bdev, wbc); + xfs_inode_buftarg(ip)->bt_daxdev, wbc); } STATIC sector_t diff --git a/include/linux/dax.h b/include/linux/dax.h index 9bd8528bd305..d5932e47c597 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -141,7 +141,7 @@ static inline void fs_put_dax(struct dax_device *dax_dev) struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); int dax_writeback_mapping_range(struct address_space *mapping, - struct block_device *bdev, struct writeback_control *wbc); + struct dax_device *dax_dev, struct writeback_control *wbc); struct page *dax_layout_busy_page(struct address_space *mapping); dax_entry_t dax_lock_page(struct page *page); @@ -180,7 +180,7 @@ static inline struct page *dax_layout_busy_page(struct address_space *mapping) } static inline int dax_writeback_mapping_range(struct address_space *mapping, - struct block_device *bdev, struct writeback_control *wbc) + struct dax_device *dax_dev, struct writeback_control *wbc) { return -EOPNOTSUPP; } -- cgit v1.2.3 From dbaf10027ae92a66f0dfad33e1e3453daa16373f Mon Sep 17 00:00:00 2001 From: Nishad Kamdar Date: Wed, 25 Dec 2019 17:50:58 +0530 Subject: vfio-ccw: Use the correct style for SPDX License Identifier This patch corrects the SPDX License Identifier style in header file related to S/390 common i/o drivers. It assigns explicit block comment to the SPDX License Identifier. Changes made by using a script provided by Joe Perches here: https://lkml.org/lkml/2019/2/7/46. Fixes: 3cd90214b70f ("vfio: ccw: add tracepoints for interesting error paths") Suggested-by: Joe Perches Signed-off-by: Nishad Kamdar Message-Id: <20191225122054.GA4598@nishad> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_trace.h b/drivers/s390/cio/vfio_ccw_trace.h index 30162a318a8a..f5d31887d413 100644 --- a/drivers/s390/cio/vfio_ccw_trace.h +++ b/drivers/s390/cio/vfio_ccw_trace.h @@ -1,5 +1,5 @@ -/* SPDX-License-Identifier: GPL-2.0 - * Tracepoints for vfio_ccw driver +/* SPDX-License-Identifier: GPL-2.0 */ +/* Tracepoints for vfio_ccw driver * * Copyright IBM Corp. 2018 * -- cgit v1.2.3 From f01b16a85bfae2e6b4f32de0a1f37ac4050dc316 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 6 Jan 2020 13:11:17 -0500 Subject: dax: Get rid of fs_dax_get_by_host() helper Looks like nobody is using fs_dax_get_by_host() except fs_dax_get_by_bdev() and it can easily use dax_get_by_host() instead. IIUC, fs_dax_get_by_host() was only introduced so that one could compile with CONFIG_FS_DAX=n and CONFIG_DAX=m. fs_dax_get_by_bdev() achieves the same purpose and hence it looks like fs_dax_get_by_host() is not needed anymore. Signed-off-by: Vivek Goyal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20200106181117.GA16248@redhat.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 2 +- include/linux/dax.h | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 26a654dbc69a..0aa4b6bc5101 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -61,7 +61,7 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) { if (!blk_queue_dax(bdev->bd_queue)) return NULL; - return fs_dax_get_by_host(bdev->bd_disk->disk_name); + return dax_get_by_host(bdev->bd_disk->disk_name); } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); #endif diff --git a/include/linux/dax.h b/include/linux/dax.h index d5932e47c597..328c2dbb4409 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -129,11 +129,6 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, sectors); } -static inline struct dax_device *fs_dax_get_by_host(const char *host) -{ - return dax_get_by_host(host); -} - static inline void fs_put_dax(struct dax_device *dax_dev) { put_dax(dax_dev); @@ -160,11 +155,6 @@ static inline bool generic_fsdax_supported(struct dax_device *dax_dev, return false; } -static inline struct dax_device *fs_dax_get_by_host(const char *host) -{ - return NULL; -} - static inline void fs_put_dax(struct dax_device *dax_dev) { } -- cgit v1.2.3 From e0354d147e5889b5faa12e64fa38187aed39aad4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 14 Jan 2020 14:40:31 +0000 Subject: drivers: ipmi: fix off-by-one bounds check that leads to a out-of-bounds write The end of buffer check is off-by-one since the check is against an index that is pre-incremented before a store to buf[]. Fix this adjusting the bounds check appropriately. Addresses-Coverity: ("Out-of-bounds write") Fixes: 51bd6f291583 ("Add support for IPMB driver") Signed-off-by: Colin Ian King Message-Id: <20200114144031.358003-1-colin.king@canonical.com> Reviewed-by: Asmaa Mnebhi Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmb_dev_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmb_dev_int.c b/drivers/char/ipmi/ipmb_dev_int.c index 9fdae83e59e0..382b28f1cf2f 100644 --- a/drivers/char/ipmi/ipmb_dev_int.c +++ b/drivers/char/ipmi/ipmb_dev_int.c @@ -279,7 +279,7 @@ static int ipmb_slave_cb(struct i2c_client *client, break; case I2C_SLAVE_WRITE_RECEIVED: - if (ipmb_dev->msg_idx >= sizeof(struct ipmb_msg)) + if (ipmb_dev->msg_idx >= sizeof(struct ipmb_msg) - 1) break; buf[++ipmb_dev->msg_idx] = *val; -- cgit v1.2.3 From db735fc4036bbe1fbe606819b5f0ff26cc76cdff Mon Sep 17 00:00:00 2001 From: Sean Paul Date: Tue, 21 Jan 2020 11:18:48 -0800 Subject: drm/msm: Set dma maximum segment size for mdss Turning on CONFIG_DMA_API_DEBUG_SG results in the following error: [ 12.078665] msm ae00000.mdss: DMA-API: mapping sg segment longer than device claims to support [len=3526656] [max=65536] [ 12.089870] WARNING: CPU: 6 PID: 334 at /mnt/host/source/src/third_party/kernel/v4.19/kernel/dma/debug.c:1301 debug_dma_map_sg+0x1dc/0x318 [ 12.102655] Modules linked in: joydev [ 12.106442] CPU: 6 PID: 334 Comm: frecon Not tainted 4.19.0 #2 [ 12.112450] Hardware name: Google Cheza (rev3+) (DT) [ 12.117566] pstate: 60400009 (nZCv daif +PAN -UAO) [ 12.122506] pc : debug_dma_map_sg+0x1dc/0x318 [ 12.126995] lr : debug_dma_map_sg+0x1dc/0x318 [ 12.131487] sp : ffffff800cc3ba80 [ 12.134913] x29: ffffff800cc3ba80 x28: 0000000000000000 [ 12.140395] x27: 0000000000000004 x26: 0000000000000004 [ 12.145868] x25: ffffff8008e55b18 x24: 0000000000000000 [ 12.151337] x23: 00000000ffffffff x22: ffffff800921c000 [ 12.156809] x21: ffffffc0fa75b080 x20: ffffffc0f7195090 [ 12.162280] x19: ffffffc0f1c53280 x18: 0000000000000000 [ 12.167749] x17: 0000000000000000 x16: 0000000000000000 [ 12.173218] x15: 0000000000000000 x14: 0720072007200720 [ 12.178689] x13: 0720072007200720 x12: 0720072007200720 [ 12.184161] x11: 0720072007200720 x10: 0720072007200720 [ 12.189641] x9 : ffffffc0f1fc6b60 x8 : 0000000000000000 [ 12.195110] x7 : ffffff8008132ce0 x6 : 0000000000000000 [ 12.200585] x5 : 0000000000000000 x4 : ffffff8008134734 [ 12.206058] x3 : ffffff800cc3b830 x2 : ffffffc0f1fc6240 [ 12.211532] x1 : 25045a74f48a7400 x0 : 25045a74f48a7400 [ 12.217006] Call trace: [ 12.219535] debug_dma_map_sg+0x1dc/0x318 [ 12.223671] get_pages+0x19c/0x20c [ 12.227177] msm_gem_fault+0x64/0xfc [ 12.230874] __do_fault+0x3c/0x140 [ 12.234383] __handle_mm_fault+0x70c/0xdb8 [ 12.238603] handle_mm_fault+0xac/0xc4 [ 12.242473] do_page_fault+0x1bc/0x3d4 [ 12.246342] do_translation_fault+0x54/0x88 [ 12.250652] do_mem_abort+0x60/0xf0 [ 12.254250] el0_da+0x20/0x24 [ 12.257317] irq event stamp: 67260 [ 12.260828] hardirqs last enabled at (67259): [] console_unlock+0x214/0x608 [ 12.269693] hardirqs last disabled at (67260): [] do_debug_exception+0x5c/0x178 [ 12.278820] softirqs last enabled at (67256): [] __do_softirq+0x4d4/0x520 [ 12.287510] softirqs last disabled at (67249): [] irq_exit+0xa8/0x100 [ 12.295742] ---[ end trace e63cfc40c313ffab ]--- The root of the problem is that the default segment size for sgt is (UINT_MAX & PAGE_MASK), and the default segment size for device dma is 64K. As such, if you compare the 2, you would deduce that the sg segment will overflow the device's capacity. In reality, the hardware can accommodate the larger sg segments, it's just not initializing its max segment properly. This patch initializes the max segment size for the mdss device, which gets rid of that pesky warning. Reported-by: Stephen Boyd Tested-by: Stephen Boyd Tested-by: Sai Prakash Ranjan Reviewed-by: Rob Clark Signed-off-by: Sean Paul Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20200121111813.REPOST.1.I92c66a35fb13f368095b05287bdabdbe88ca6922@changeid --- drivers/gpu/drm/msm/msm_drv.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index f50fefb87040..c80ddd7019b5 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -441,6 +441,14 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv) if (ret) goto err_msm_uninit; + if (!dev->dma_parms) { + dev->dma_parms = devm_kzalloc(dev, sizeof(*dev->dma_parms), + GFP_KERNEL); + if (!dev->dma_parms) + return -ENOMEM; + } + dma_set_max_seg_size(dev, DMA_BIT_MASK(32)); + msm_gem_shrinker_init(ddev); switch (get_mdp_ver(pdev)) { -- cgit v1.2.3 From 3543d7ddd55fe12c37e8a9db846216c51846015b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 23 Jan 2020 14:51:12 +0000 Subject: arm64: dts: fast models: Fix FVP PCI interrupt-map property The interrupt map for the FVP's PCI node is missing the parent-unit-address cells for each of the INTx entries, leading to the kernel code failing to parse the entries correctly. Add the missing zero cells, which are pretty useless as far as the GIC is concerned, but that the spec requires. This allows INTx to be usable on the model, and VFIO to work correctly. Fixes: fa083b99eb28 ("arm64: dts: fast models: Add DTS fo Base RevC FVP") Signed-off-by: Marc Zyngier Signed-off-by: Sudeep Holla --- arch/arm64/boot/dts/arm/fvp-base-revc.dts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts b/arch/arm64/boot/dts/arm/fvp-base-revc.dts index 62ab0d54ff71..335fff762451 100644 --- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts +++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts @@ -161,10 +161,10 @@ bus-range = <0x0 0x1>; reg = <0x0 0x40000000 0x0 0x10000000>; ranges = <0x2000000 0x0 0x50000000 0x0 0x50000000 0x0 0x10000000>; - interrupt-map = <0 0 0 1 &gic GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>, - <0 0 0 2 &gic GIC_SPI 169 IRQ_TYPE_LEVEL_HIGH>, - <0 0 0 3 &gic GIC_SPI 170 IRQ_TYPE_LEVEL_HIGH>, - <0 0 0 4 &gic GIC_SPI 171 IRQ_TYPE_LEVEL_HIGH>; + interrupt-map = <0 0 0 1 &gic 0 0 GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 2 &gic 0 0 GIC_SPI 169 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 3 &gic 0 0 GIC_SPI 170 IRQ_TYPE_LEVEL_HIGH>, + <0 0 0 4 &gic 0 0 GIC_SPI 171 IRQ_TYPE_LEVEL_HIGH>; interrupt-map-mask = <0x0 0x0 0x0 0x7>; msi-map = <0x0 &its 0x0 0x10000>; iommu-map = <0x0 &smmu 0x0 0x10000>; -- cgit v1.2.3 From cf913e9683273f2640501094fa63a67e29f437b3 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Sun, 26 Jan 2020 07:59:37 +0100 Subject: Revert "drm/sun4i: drv: Allow framebuffer modifiers in mode config" This reverts commit 9db9c0cf5895e4ddde2814360cae7bea9282edd2. Setting mode_config.allow_fb_modifiers manually is completely unnecessary. It is set automatically by drm_universal_plane_init() based on the fact if modifier list is provided or not. Even more, it breaks DE2 and DE3 as they don't support any modifiers beside linear. Modifiers aware applications can be confused by provided empty modifier list - at least linear modifier should be included, but it's not for DE2 and DE3. Fixes: 9db9c0cf5895 ("drm/sun4i: drv: Allow framebuffer modifiers in mode config") Signed-off-by: Jernej Skrabec Reviewed-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20200126065937.9564-1-jernej.skrabec@siol.net --- drivers/gpu/drm/sun4i/sun4i_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c index a5757b11b730..5b54eff12cc0 100644 --- a/drivers/gpu/drm/sun4i/sun4i_drv.c +++ b/drivers/gpu/drm/sun4i/sun4i_drv.c @@ -85,7 +85,6 @@ static int sun4i_drv_bind(struct device *dev) } drm_mode_config_init(drm); - drm->mode_config.allow_fb_modifiers = true; ret = component_bind_all(drm->dev, drm); if (ret) { -- cgit v1.2.3 From 488603b815a7514c7009e6fc339d74ed4a30f343 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Sat, 11 Jan 2020 04:53:38 -0500 Subject: sched/core: Don't skip remote tick for idle CPUs This will be used in the next patch to get a loadavg update from nohz cpus. The delta check is skipped because idle_sched_class doesn't update se.exec_start. Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-2-git-send-email-swood@redhat.com --- kernel/sched/core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc1dfc007604..cf8b33dc4513 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3669,22 +3669,24 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + if (!tick_nohz_tick_stopped_cpu(cpu)) goto out_requeue; rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr) || cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); - delta = rq_clock_task(rq) - curr->se.exec_start; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } curr->sched_class->task_tick(rq, curr, 0); out_unlock: -- cgit v1.2.3 From ebc0f83c78a2d26384401ecf2d2fa48063c0ee27 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sat, 11 Jan 2020 04:53:39 -0500 Subject: timers/nohz: Update NOHZ load in remote tick The way loadavg is tracked during nohz only pays attention to the load upon entering nohz. This can be particularly noticeable if full nohz is entered while non-idle, and then the cpu goes idle and stays that way for a long time. Use the remote tick to ensure that full nohz cpus report their deltas within a reasonable time. [ swood: Added changelog and removed recheck of stopped tick. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-3-git-send-email-swood@redhat.com --- include/linux/sched/nohz.h | 2 ++ kernel/sched/core.c | 4 +++- kernel/sched/loadavg.c | 33 +++++++++++++++++++++++---------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 1abe91ff6e4a..6d67e9a5af6b 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -15,9 +15,11 @@ static inline void nohz_balance_enter_idle(int cpu) { } #ifdef CONFIG_NO_HZ_COMMON void calc_load_nohz_start(void); +void calc_load_nohz_remote(struct rq *rq); void calc_load_nohz_stop(void); #else static inline void calc_load_nohz_start(void) { } +static inline void calc_load_nohz_remote(struct rq *rq) { } static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf8b33dc4513..4ff03c27779e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3677,6 +3677,7 @@ static void sched_tick_remote(struct work_struct *work) if (cpu_is_offline(cpu)) goto out_unlock; + curr = rq->curr; update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -3689,10 +3690,11 @@ static void sched_tick_remote(struct work_struct *work) } curr->sched_class->task_tick(rq, curr, 0); + calc_load_nohz_remote(rq); out_unlock: rq_unlock_irq(rq, &rf); - out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 28a516575c18..de22da666ac7 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_nohz_start(void) +static void calc_load_nohz_fold(struct rq *rq) { - struct rq *this_rq = this_rq(); long delta; - /* - * We're going into NO_HZ mode, if there's any pending delta, fold it - * into the pending NO_HZ delta. - */ - delta = calc_load_fold_active(this_rq, 0); + delta = calc_load_fold_active(rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -248,6 +243,24 @@ void calc_load_nohz_start(void) } } +void calc_load_nohz_start(void) +{ + /* + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. + */ + calc_load_nohz_fold(this_rq()); +} + +/* + * Keep track of the load for NOHZ_FULL, must be called between + * calc_load_nohz_{start,stop}(). + */ +void calc_load_nohz_remote(struct rq *rq) +{ + calc_load_nohz_fold(rq); +} + void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -268,7 +281,7 @@ void calc_load_nohz_stop(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_nohz_fold(void) +static long calc_load_nohz_read(void) { int idx = calc_load_read_idx(); long delta = 0; @@ -323,7 +336,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_nohz_fold(void) { return 0; } +static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks) /* * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. */ - delta = calc_load_nohz_fold(); + delta = calc_load_nohz_read(); if (delta) atomic_long_add(delta, &calc_load_tasks); -- cgit v1.2.3 From b396f52326de20ec974471b7b19168867b365cbf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 14 Jan 2020 10:13:20 +0000 Subject: sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains The CPU load balancer balances between different domains to spread load and strives to have equal balance everywhere. Communicating tasks can migrate so they are topologically close to each other but these decisions are independent. On a lightly loaded NUMA machine, two communicating tasks pulled together at wakeup time can be pushed apart by the load balancer. In isolation, the load balancer decision is fine but it ignores the tasks data locality and the wakeup/LB paths continually conflict. NUMA balancing is also a factor but it also simply conflicts with the load balancer. This patch allows a fixed degree of imbalance of two tasks to exist between NUMA domains regardless of utilisation levels. In many cases, this prevents communicating tasks being pulled apart. It was evaluated whether the imbalance should be scaled to the domain size. However, no additional benefit was measured across a range of workloads and machines and scaling adds the risk that lower domains have to be rebalanced. While this could change again in the future, such a change should specify the use case and benefit. The most obvious impact is on netperf TCP_STREAM -- two simple communicating tasks with some softirq offload depending on the transmission rate. 2-socket Haswell machine 48 core, HT enabled netperf-tcp -- mmtests config config-network-netperf-unbound baseline lbnuma-v3 Hmean 64 568.73 ( 0.00%) 577.56 * 1.55%* Hmean 128 1089.98 ( 0.00%) 1128.06 * 3.49%* Hmean 256 2061.72 ( 0.00%) 2104.39 * 2.07%* Hmean 1024 7254.27 ( 0.00%) 7557.52 * 4.18%* Hmean 2048 11729.20 ( 0.00%) 13350.67 * 13.82%* Hmean 3312 15309.08 ( 0.00%) 18058.95 * 17.96%* Hmean 4096 17338.75 ( 0.00%) 20483.66 * 18.14%* Hmean 8192 25047.12 ( 0.00%) 27806.84 * 11.02%* Hmean 16384 27359.55 ( 0.00%) 33071.88 * 20.88%* Stddev 64 2.16 ( 0.00%) 2.02 ( 6.53%) Stddev 128 2.31 ( 0.00%) 2.19 ( 5.05%) Stddev 256 11.88 ( 0.00%) 3.22 ( 72.88%) Stddev 1024 23.68 ( 0.00%) 7.24 ( 69.43%) Stddev 2048 79.46 ( 0.00%) 71.49 ( 10.03%) Stddev 3312 26.71 ( 0.00%) 57.80 (-116.41%) Stddev 4096 185.57 ( 0.00%) 96.15 ( 48.19%) Stddev 8192 245.80 ( 0.00%) 100.73 ( 59.02%) Stddev 16384 207.31 ( 0.00%) 141.65 ( 31.67%) In this case, there was a sizable improvement to performance and a general reduction in variance. However, this is not univeral. For most machines, the impact was roughly a 3% performance gain. Ops NUMA base-page range updates 19796.00 292.00 Ops NUMA PTE updates 19796.00 292.00 Ops NUMA PMD updates 0.00 0.00 Ops NUMA hint faults 16113.00 143.00 Ops NUMA hint local faults % 8407.00 142.00 Ops NUMA hint local percent 52.18 99.30 Ops NUMA pages migrated 4244.00 1.00 Without the patch, only 52.18% of sampled accesses are local. In an earlier changelog, 100% of sampled accesses are local and indeed on most machines, this was still the case. In this specific case, the local sampled rates was 99.3% but note the "base-page range updates" and "PTE updates". The activity with the patch is negligible as were the number of faults. The small number of pages migrated were related to shared libraries. A 2-socket Broadwell showed better results on average but are not presented for brevity as the performance was similar except it showed 100% of the sampled NUMA hints were local. The patch holds up for a 4-socket Haswell, an AMD EPYC and AMD Epyc 2 machine. For dbench, the impact depends on the filesystem used and the number of clients. On XFS, there is little difference as the clients typically communicate with workqueues which have a separate class of scheduler problem at the moment. For ext4, performance is generally better, particularly for small numbers of clients as NUMA balancing activity is negligible with the patch applied. A more interesting example is the Facebook schbench which uses a number of messaging threads to communicate with worker threads. In this configuration, one messaging thread is used per NUMA node and the number of worker threads is varied. The 50, 75, 90, 95, 99, 99.5 and 99.9 percentiles for response latency is then reported. Lat 50.00th-qrtle-1 44.00 ( 0.00%) 37.00 ( 15.91%) Lat 75.00th-qrtle-1 53.00 ( 0.00%) 41.00 ( 22.64%) Lat 90.00th-qrtle-1 57.00 ( 0.00%) 42.00 ( 26.32%) Lat 95.00th-qrtle-1 63.00 ( 0.00%) 43.00 ( 31.75%) Lat 99.00th-qrtle-1 76.00 ( 0.00%) 51.00 ( 32.89%) Lat 99.50th-qrtle-1 89.00 ( 0.00%) 52.00 ( 41.57%) Lat 99.90th-qrtle-1 98.00 ( 0.00%) 55.00 ( 43.88%) Lat 50.00th-qrtle-2 42.00 ( 0.00%) 42.00 ( 0.00%) Lat 75.00th-qrtle-2 48.00 ( 0.00%) 47.00 ( 2.08%) Lat 90.00th-qrtle-2 53.00 ( 0.00%) 52.00 ( 1.89%) Lat 95.00th-qrtle-2 55.00 ( 0.00%) 53.00 ( 3.64%) Lat 99.00th-qrtle-2 62.00 ( 0.00%) 60.00 ( 3.23%) Lat 99.50th-qrtle-2 63.00 ( 0.00%) 63.00 ( 0.00%) Lat 99.90th-qrtle-2 68.00 ( 0.00%) 66.00 ( 2.94% For higher worker threads, the differences become negligible but it's interesting to note the difference in wakeup latency at low utilisation and mpstat confirms that activity was almost all on one node until the number of worker threads increase. Hackbench generally showed neutral results across a range of machines. This is different to earlier versions of the patch which allowed imbalances for higher degrees of utilisation. perf bench pipe showed negligible differences in overall performance as the differences are very close to the noise. An earlier prototype of the patch showed major regressions for NAS C-class when running with only half of the available CPUs -- 20-30% performance hits were measured at the time. With this version of the patch, the impact is negligible with small gains/losses within the noise measured. This is because the number of threads far exceeds the small imbalance the aptch cares about. Similarly, there were report of regressions for the autonuma benchmark against earlier versions but again, normal load balancing now applies for that workload. In general, the patch simply seeks to avoid unnecessary cross-node migrations in the basic case where imbalances are very small. For low utilisation communicating workloads, this patch generally behaves better with less NUMA balancing activity. For high utilisation, there is no change in behaviour. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Reviewed-by: Srikar Dronamraju Acked-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200114101319.GO3466@techsingularity.net --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4e0d775375..25dffc03f0f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8658,10 +8658,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * Try to use spare capacity of local group without overloading it or * emptying busiest. - * XXX Spreading tasks across NUMA nodes is not always the best policy - * and special care should be taken for SD_NUMA domain level before - * spreading the tasks. For now, load_balance() fully relies on - * NUMA_BALANCING and fbq_classify_group/rq to override the decision. */ if (local->group_type == group_has_spare) { if (busiest->group_type > group_fully_busy) { @@ -8701,16 +8697,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); env->imbalance = nr_diff >> 1; - return; - } + } else { - /* - * If there is no overload, we just want to even the number of - * idle cpus. - */ - env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1); + } + + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) { + unsigned int imbalance_min; + + /* + * Compute an allowed imbalance based on a simple + * pair of communicating tasks that should remain + * local and ignore them. + * + * NOTE: Generally this would have been based on + * the domain size and this was evaluated. However, + * the benefit is similar across a range of workloads + * and machines but scaling by the domain size adds + * the risk that lower domains have to be rebalanced. + */ + imbalance_min = 2; + if (busiest->sum_nr_running <= imbalance_min) + env->imbalance = 0; + } + return; } -- cgit v1.2.3 From b562d140649966d4daedd0483a8fe59ad3bb465a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 14 Jan 2020 21:09:47 +0000 Subject: sched/uclamp: Reject negative values in cpu_uclamp_write() The check to ensure that the new written value into cpu.uclamp.{min,max} is within range, [0:100], wasn't working because of the signed comparison 7301 if (req.percent > UCLAMP_PERCENT_SCALE) { 7302 req.ret = -ERANGE; 7303 return req; 7304 } # echo -1 > cpu.uclamp.min # cat cpu.uclamp.min 42949671.96 Cast req.percent into u64 to force the comparison to be unsigned and work as intended in capacity_from_percent(). # echo -1 > cpu.uclamp.min sh: write error: Numerical result out of range Fixes: 2480c093130f ("sched/uclamp: Extend CPU's cgroup controller") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200114210947.14083-1-qais.yousef@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ff03c27779e..55b9a9c53b91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7264,7 +7264,7 @@ capacity_from_percent(char *buf) &req.percent); if (req.ret) return req; - if (req.percent > UCLAMP_PERCENT_SCALE) { + if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { req.ret = -ERANGE; return req; } -- cgit v1.2.3 From e938b9c94164e4d981039f1cf6007d7453883e5a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jan 2020 08:50:27 +0800 Subject: sched/nohz: Optimize get_nohz_timer_target() On a machine, CPU 0 is used for housekeeping, the other 39 CPUs in the same socket are in nohz_full mode. We can observe huge time burn in the loop for seaching nearest busy housekeeper cpu by ftrace. 2) | get_nohz_timer_target() { 2) 0.240 us | housekeeping_test_cpu(); 2) 0.458 us | housekeeping_test_cpu(); ... 2) 0.292 us | housekeeping_test_cpu(); 2) 0.240 us | housekeeping_test_cpu(); 2) 0.227 us | housekeeping_any_cpu(); 2) + 43.460 us | } This patch optimizes the searching logic by finding a nearest housekeeper CPU in the housekeeping cpumask, it can minimize the worst searching time from ~44us to < 10us in my testing. In addition, the last iterated busy housekeeper can become a random candidate while current CPU is a better fallback if it is a housekeeper. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/1578876627-11938-1-git-send-email-wanpengli@tencent.com --- kernel/sched/core.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55b9a9c53b91..a8a5d5b6f5cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -552,27 +552,32 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu = smp_processor_id(); + int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) - return cpu; + if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (!idle_cpu(cpu)) + return cpu; + default_cpu = cpu; + } rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu_and(i, sched_domain_span(sd), + housekeeping_cpumask(HK_FLAG_TIMER)) { if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { + if (!idle_cpu(i)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) + default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + cpu = default_cpu; unlock: rcu_read_unlock(); return cpu; -- cgit v1.2.3 From 2a4b03ffc69f2dedc6388e9a6438b5f4c133a40d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 14 Jan 2020 15:13:56 +0100 Subject: sched/fair: Prevent unlimited runtime on throttled group When a running task is moved on a throttled task group and there is no other task enqueued on the CPU, the task can keep running using 100% CPU whatever the allocated bandwidth for the group and although its cfs rq is throttled. Furthermore, the group entity of the cfs_rq and its parents are not enqueued but only set as curr on their respective cfs_rqs. We have the following sequence: sched_move_task -dequeue_task: dequeue task and group_entities. -put_prev_task: put task and group entities. -sched_change_group: move task to new group. -enqueue_task: enqueue only task but not group entities because cfs_rq is throttled. -set_next_task : set task and group_entities as current sched_entity of their cfs_rq. Another impact is that the root cfs_rq runnable_load_avg at root rq stays null because the group_entities are not enqueued. This situation will stay the same until an "external" event triggers a reschedule. Let trigger it immediately instead. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Ben Segall Link: https://lkml.kernel.org/r/1579011236-31256-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8a5d5b6f5cf..89e54f3ed571 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7072,8 +7072,15 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); - if (running) + if (running) { set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. + */ + resched_curr(rq); + } task_rq_unlock(rq, tsk, &rf); } -- cgit v1.2.3 From 8c8c5a4994a306c217fd061cbfc5903399fd4c1c Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 10 Jan 2020 18:19:33 +0100 Subject: dma-contiguous: CMA: give precedence to cmdline Although the device tree might contain a reserved-memory DT node dedicated as the default CMA pool, users might want to change CMA's parameters using the kernel command line for debugging purposes and whatnot. Honor this by bypassing the reserved memory CMA setup, which will ultimately end up freeing the memblock and allow the command line CMA configuration routine to run. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Phil Elwell Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index daa4e6eefdde..8bc6f2d670f9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; + bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; int err; + if (size_cmdline != -1 && default_cma) { + pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", + rmem->name); + return -EBUSY; + } + if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; @@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) /* Architecture specific contiguous memory fixup. */ dma_contiguous_early_fixup(rmem->base, rmem->size); - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + if (default_cma) dma_contiguous_set_default(cma); rmem->ops = &rmem_cma_ops; -- cgit v1.2.3 From 0cd9d33ace336bc424fc30944aa3defd6786e4fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 30 Jan 2020 11:37:33 -0500 Subject: cgroup: init_tasks shouldn't be linked to the root cgroup 5153faac18d2 ("cgroup: remove cgroup_enable_task_cg_lists() optimization") removed lazy initialization of css_sets so that new tasks are always lniked to its css_set. In the process, it incorrectly ended up adding init_tasks to root css_set. They show up as PID 0's in root's cgroup.procs triggering warnings in systemd and generally confusing people. Fix it by skip css_set linking for init_tasks. Signed-off-by: Tejun Heo Reported-by: https://github.com/joanbm Link: https://github.com/systemd/systemd/issues/14682 Fixes: 5153faac18d2 ("cgroup: remove cgroup_enable_task_cg_lists() optimization") Cc: stable@vger.kernel.org # v5.5+ --- kernel/cgroup/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b3744872263e..cf8a36bdf5c8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5932,11 +5932,14 @@ void cgroup_post_fork(struct task_struct *child) spin_lock_irq(&css_set_lock); - WARN_ON_ONCE(!list_empty(&child->cg_list)); - cset = task_css_set(current); /* current is @child's parent */ - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); + /* init tasks are special, only link regular threads */ + if (likely(child->pid)) { + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + } /* * If the cgroup has to be frozen, the new task has too. Let's set -- cgit v1.2.3 From 8ccb5bf7619c6523e7a4384a84b72e7be804298c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Wed, 29 Jan 2020 15:24:48 -0800 Subject: drm/mst: Fix possible NULL pointer dereference in drm_dp_mst_process_up_req() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to DP specification, DP_SINK_EVENT_NOTIFY is also a broadcast message but as this function only handles DP_CONNECTION_STATUS_NOTIFY I will only make the static analyzer that caught this issue happy by not calling drm_dp_get_mst_branch_device_by_guid() with a NULL guid, causing drm_dp_mst_process_up_req() to return in the "if (!mstb)" right bellow. Fixes: 9408cc94eb04 ("drm/dp_mst: Handle UP requests asynchronously") Cc: Lyude Paul Cc: Sean Paul Cc: # v5.5+ Signed-off-by: José Roberto de Souza [added cc to stable] Signed-off-by: Lyude Paul Link: https://patchwork.freedesktop.org/patch/msgid/20200129232448.84704-1-jose.souza@intel.com --- drivers/gpu/drm/drm_dp_mst_topology.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c index e6afe4faeca6..105bb5f40166 100644 --- a/drivers/gpu/drm/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/drm_dp_mst_topology.c @@ -3760,7 +3760,8 @@ drm_dp_mst_process_up_req(struct drm_dp_mst_topology_mgr *mgr, else if (msg->req_type == DP_RESOURCE_STATUS_NOTIFY) guid = msg->u.resource_stat.guid; - mstb = drm_dp_get_mst_branch_device_by_guid(mgr, guid); + if (guid) + mstb = drm_dp_get_mst_branch_device_by_guid(mgr, guid); } else { mstb = drm_dp_get_mst_branch_device(mgr, hdr->lct, hdr->rad); } -- cgit v1.2.3 From 7e0cf7e9936c4358b0863357b90aa12afe6489da Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 29 Nov 2019 14:59:08 +0100 Subject: drm/panfrost: Make sure the shrinker does not reclaim referenced BOs Userspace might tag a BO purgeable while it's still referenced by GPU jobs. We need to make sure the shrinker does not purge such BOs until all jobs referencing it are finished. Fixes: 013b65101315 ("drm/panfrost: Add madvise and shrinker support") Cc: Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Signed-off-by: Rob Herring Link: https://patchwork.freedesktop.org/patch/msgid/20191129135908.2439529-9-boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_drv.c | 1 + drivers/gpu/drm/panfrost/panfrost_gem.h | 6 ++++++ drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c | 3 +++ drivers/gpu/drm/panfrost/panfrost_job.c | 7 ++++++- 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index 88b431a267af..273d67e251c2 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -166,6 +166,7 @@ panfrost_lookup_bos(struct drm_device *dev, break; } + atomic_inc(&bo->gpu_usecount); job->mappings[i] = mapping; } diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.h b/drivers/gpu/drm/panfrost/panfrost_gem.h index ca1bc9019600..b3517ff9630c 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.h +++ b/drivers/gpu/drm/panfrost/panfrost_gem.h @@ -30,6 +30,12 @@ struct panfrost_gem_object { struct mutex lock; } mappings; + /* + * Count the number of jobs referencing this BO so we don't let the + * shrinker reclaim this object prematurely. + */ + atomic_t gpu_usecount; + bool noexec :1; bool is_heap :1; }; diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c index f5dd7b29bc95..288e46c40673 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c @@ -41,6 +41,9 @@ static bool panfrost_gem_purge(struct drm_gem_object *obj) struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj); struct panfrost_gem_object *bo = to_panfrost_bo(obj); + if (atomic_read(&bo->gpu_usecount)) + return false; + if (!mutex_trylock(&shmem->pages_lock)) return false; diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index e364ee00f3d0..4d383831c1fc 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -269,8 +269,13 @@ static void panfrost_job_cleanup(struct kref *ref) dma_fence_put(job->render_done_fence); if (job->mappings) { - for (i = 0; i < job->bo_count; i++) + for (i = 0; i < job->bo_count; i++) { + if (!job->mappings[i]) + break; + + atomic_dec(&job->mappings[i]->obj->gpu_usecount); panfrost_gem_mapping_put(job->mappings[i]); + } kvfree(job->mappings); } -- cgit v1.2.3 From 183edb20e60a73925bf3b60e2f4796898167262f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Mon, 3 Feb 2020 15:45:17 +0000 Subject: cpufreq: Make cpufreq_global_kobject static The cpufreq_global_kobject is only used internally by cpufreq.c after commit 2361be236662 ("cpufreq: Don't create empty /sys/devices/system/cpu/cpufreq directory"). Make it static. Signed-off-by: Yangtao Li [ rjw: Add empty line after cpufreq_global_kobject definition ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 5 ++--- include/linux/cpufreq.h | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 4adac3a8c265..cbe6c94bf158 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -105,6 +105,8 @@ bool have_governor_per_policy(void) } EXPORT_SYMBOL_GPL(have_governor_per_policy); +static struct kobject *cpufreq_global_kobject; + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) @@ -2745,9 +2747,6 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); -struct kobject *cpufreq_global_kobject; -EXPORT_SYMBOL(cpufreq_global_kobject); - static int __init cpufreq_core_init(void) { if (cpufreq_disabled()) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 018dce868de6..0fb561d1b524 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -201,9 +201,6 @@ static inline bool policy_is_shared(struct cpufreq_policy *policy) return cpumask_weight(policy->cpus) > 1; } -/* /sys/devices/system/cpu/cpufreq: entry point for global variables */ -extern struct kobject *cpufreq_global_kobject; - #ifdef CONFIG_CPU_FREQ unsigned int cpufreq_get(unsigned int cpu); unsigned int cpufreq_quick_get(unsigned int cpu); -- cgit v1.2.3 From 2343d1529aff8b552589f622c23932035ed7a05d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 17 Jan 2020 12:01:36 +0100 Subject: crypto: Kconfig - allow tests to be disabled when manager is disabled The library code uses CRYPTO_MANAGER_DISABLE_TESTS to conditionalize its tests, but the library code can also exist without CRYPTO_MANAGER. That means on minimal configs, the test code winds up being built with no way to disable it. Signed-off-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- crypto/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index cdb51d4272d0..c24a47406f8f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -136,8 +136,6 @@ config CRYPTO_USER Userspace configuration for cryptographic instantiations such as cbc(aes). -if CRYPTO_MANAGER2 - config CRYPTO_MANAGER_DISABLE_TESTS bool "Disable run-time self tests" default y @@ -155,8 +153,6 @@ config CRYPTO_MANAGER_EXTRA_TESTS This is intended for developer use only, as these tests take much longer to run than the normal self tests. -endif # if CRYPTO_MANAGER2 - config CRYPTO_GF128MUL tristate -- cgit v1.2.3 From 91ef26f914171cf753330f13724fd9142b5b1640 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 18:11:10 +0100 Subject: dma-direct: relax addressability checks in dma_direct_supported dma_direct_supported tries to find the minimum addressable bitmask based on the end pfn and optional magic that architectures can use to communicate the size of the magic ZONE_DMA that can be used for bounce buffering. But between the DMA offsets that can change per device (or sometimes even region), the fact the ZONE_DMA isn't even guaranteed to be the lowest addresses and failure of having proper interfaces to the MM code this fails at least for one arm subarchitecture. As all the legacy DMA implementations have supported 32-bit DMA masks, and 32-bit masks are guranteed to always work by the API contract (using bounce buffers if needed), we can short cut the complicated check and always return true without breaking existing assumptions. Hopefully we can properly clean up the interaction with the arch defined zones and the bootmem allocator eventually. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: Peter Ujfalusi Signed-off-by: Christoph Hellwig Tested-by: Peter Ujfalusi --- kernel/dma/direct.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6af7ae83c4ad..32ec69cdba54 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -472,28 +472,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, } #endif /* CONFIG_MMU */ -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; - - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(zone_dma_bits); - else - min_mask = DMA_BIT_MASK(32); + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so * use __phys_to_dma() here so that the SME encryption mask isn't * part of the check. */ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); return mask >= __phys_to_dma(dev, min_mask); } -- cgit v1.2.3 From 4a47cbae04844f0c5e2365aa6c217b61850bb832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:44:38 +0100 Subject: dma-direct: improve swiotlb error reporting Untangle the way how dma_direct_map_page calls into swiotlb to be able to properly report errors where the swiotlb DMA address overflows the mask separately from overflows in the !swiotlb case. This means that siotlb_map now has to do a little more work that duplicates dma_direct_map_page, but doing so greatly simplifies the calling convention. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 11 +++-------- kernel/dma/direct.c | 16 +++++++--------- kernel/dma/swiotlb.c | 42 +++++++++++++++++++++++------------------- 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index cde3dc18e21a..046bb94bd4d6 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -64,6 +64,9 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev, size_t size, enum dma_data_direction dir, enum dma_sync_target target); +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs); + #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; extern phys_addr_t io_tlb_start, io_tlb_end; @@ -73,8 +76,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) return paddr >= io_tlb_start && paddr < io_tlb_end; } -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -85,12 +86,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) { return false; } -static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys, - dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return false; -} static inline void swiotlb_exit(void) { } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 32ec69cdba54..594bddd04e01 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -357,13 +357,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -371,8 +364,13 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9280d6f8271e..c19379fabd20 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well. */ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + paddr, size, size, dir, attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = __phys_to_dma(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) -- cgit v1.2.3 From 75467ee48a5e04cf3ae3cb39aea6adee73aeff91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:54:50 +0100 Subject: dma-direct: improve DMA mask overflow reporting Remove the unset dma_mask case as that won't get into mapping calls anymore, and also report the other errors unconditonally and with a slightly improved message. Remove the now pointless report_addr helper. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 594bddd04e01..ac7956c38f69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,18 +23,6 @@ */ unsigned int zone_dma_bits __ro_after_init = 24; -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -371,7 +359,9 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, if (swiotlb_force != SWIOTLB_NO_FORCE) return swiotlb_map(dev, phys, size, dir, attrs); - report_addr(dev, dma_addr, size); + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -409,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } -- cgit v1.2.3 From a20456aef80fa6dda500b46c4bd04e39135097c6 Mon Sep 17 00:00:00 2001 From: Hridya Valsaraju Date: Sat, 1 Feb 2020 17:46:23 -0800 Subject: selinux: fix typo in filesystem name Correct the filesystem name to "binder" to enable genfscon per-file labelling for binderfs. Fixes: 7a4b5194747 ("selinux: allow per-file labelling for binderfs") Signed-off-by: Hridya Valsaraju Acked-by: Stephen Smalley [PM: slight style changes to the subj/description] Signed-off-by: Paul Moore --- security/selinux/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d9e8b2131a65..6ef606a3c7f9 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -698,7 +698,7 @@ static int selinux_set_mnt_opts(struct super_block *sb, if (!strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || - !strcmp(sb->s_type->name, "binderfs") || + !strcmp(sb->s_type->name, "binder") || !strcmp(sb->s_type->name, "pstore")) sbsec->flags |= SE_SBGENFS; -- cgit v1.2.3 From 39a706fbcf2694bfb651bed9041d44c3f4fa8078 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 3 Feb 2020 09:50:23 +0100 Subject: selinux: fix sidtab string cache locking Avoiding taking a lock in an IRQ context is not enough to prevent deadlocks, as discovered by syzbot: === WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected 5.5.0-syzkaller #0 Not tainted ----------------------------------------------------- syz-executor.0/8927 [HC0[0]:SC0[2]:HE1:SE0] is trying to acquire: ffff888027c94098 (&(&s->cache_lock)->rlock){+.+.}, at: spin_lock include/linux/spinlock.h:338 [inline] ffff888027c94098 (&(&s->cache_lock)->rlock){+.+.}, at: sidtab_sid2str_put.part.0+0x36/0x880 security/selinux/ss/sidtab.c:533 and this task is already holding: ffffffff898639b0 (&(&nf_conntrack_locks[i])->rlock){+.-.}, at: spin_lock include/linux/spinlock.h:338 [inline] ffffffff898639b0 (&(&nf_conntrack_locks[i])->rlock){+.-.}, at: nf_conntrack_lock+0x17/0x70 net/netfilter/nf_conntrack_core.c:91 which would create a new lock dependency: (&(&nf_conntrack_locks[i])->rlock){+.-.} -> (&(&s->cache_lock)->rlock){+.+.} but this new dependency connects a SOFTIRQ-irq-safe lock: (&(&nf_conntrack_locks[i])->rlock){+.-.} [...] other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(&s->cache_lock)->rlock); local_irq_disable(); lock(&(&nf_conntrack_locks[i])->rlock); lock(&(&s->cache_lock)->rlock); lock(&(&nf_conntrack_locks[i])->rlock); *** DEADLOCK *** [...] === Fix this by simply locking with irqsave/irqrestore and stop giving up on !in_task(). It makes the locking a bit slower, but it shouldn't make a big difference in real workloads. Under the scenario from [1] (only cache hits) it only increased the runtime overhead from the security_secid_to_secctx() function from ~2% to ~3% (it was ~5-65% before introducing the cache). [1] https://bugzilla.redhat.com/show_bug.cgi?id=1733259 Fixes: d97bd23c2d7d ("selinux: cache the SID -> context string translation") Reported-by: syzbot+61cba5033e2072d61806@syzkaller.appspotmail.com Signed-off-by: Ondrej Mosnacek Acked-by: Stephen Smalley Signed-off-by: Paul Moore --- security/selinux/ss/sidtab.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c index a308ce1e6a13..f511ffccb131 100644 --- a/security/selinux/ss/sidtab.c +++ b/security/selinux/ss/sidtab.c @@ -518,19 +518,13 @@ void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry, const char *str, u32 str_len) { struct sidtab_str_cache *cache, *victim = NULL; + unsigned long flags; /* do not cache invalid contexts */ if (entry->context.len) return; - /* - * Skip the put operation when in non-task context to avoid the need - * to disable interrupts while holding s->cache_lock. - */ - if (!in_task()) - return; - - spin_lock(&s->cache_lock); + spin_lock_irqsave(&s->cache_lock, flags); cache = rcu_dereference_protected(entry->cache, lockdep_is_held(&s->cache_lock)); @@ -561,7 +555,7 @@ void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry, rcu_assign_pointer(entry->cache, cache); out_unlock: - spin_unlock(&s->cache_lock); + spin_unlock_irqrestore(&s->cache_lock, flags); kfree_rcu(victim, rcu_member); } -- cgit v1.2.3 From 96222d53842dfe54869ec4e1b9d4856daf9105a2 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 5 Feb 2020 14:15:58 -0500 Subject: dax: pass NOWAIT flag to iomap_apply fstests generic/471 reports a failure when run with MOUNT_OPTIONS="-o dax". The reason is that the initial pwrite to an empty file with the RWF_NOWAIT flag set does not return -EAGAIN. It turns out that dax_iomap_rw doesn't pass that flag through to iomap_apply. With this patch applied, generic/471 passes for me. Signed-off-by: Jeff Moyer Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/x49r1z86e1d.fsf@segfault.boston.devel.redhat.com Signed-off-by: Dan Williams --- fs/dax.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 1a64c19de0ad..35da144375a0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1201,6 +1201,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, lockdep_assert_held(&inode->i_rwsem); } + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= IOMAP_NOWAIT; + while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, iter, dax_iomap_actor); -- cgit v1.2.3 From d1520889782dff58610c0b6b54d4cf3211ceb690 Mon Sep 17 00:00:00 2001 From: Oleksandr Suvorov Date: Wed, 5 Feb 2020 18:04:36 +0200 Subject: ASoC: fsl_sai: Fix exiting path on probing failure If the imx-sdma driver is built as a module, the fsl-sai device doesn't disable on probing failure, which causes the warning in the next probing: ================================================================== fsl-sai 308a0000.sai: Unbalanced pm_runtime_enable! fsl-sai 308a0000.sai: Unbalanced pm_runtime_enable! fsl-sai 308a0000.sai: Unbalanced pm_runtime_enable! fsl-sai 308a0000.sai: Unbalanced pm_runtime_enable! fsl-sai 308a0000.sai: Unbalanced pm_runtime_enable! fsl-sai 308a0000.sai: Unbalanced pm_runtime_enable! ================================================================== Disabling the device properly fixes the issue. Fixes: 812ad463e089 ("ASoC: fsl_sai: Add support for runtime pm") Signed-off-by: Oleksandr Suvorov Link: https://lore.kernel.org/r/20200205160436.3813642-1-oleksandr.suvorov@toradex.com Signed-off-by: Mark Brown --- sound/soc/fsl/fsl_sai.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/sound/soc/fsl/fsl_sai.c b/sound/soc/fsl/fsl_sai.c index 8c3ea7300972..9d436b0c5718 100644 --- a/sound/soc/fsl/fsl_sai.c +++ b/sound/soc/fsl/fsl_sai.c @@ -1020,12 +1020,24 @@ static int fsl_sai_probe(struct platform_device *pdev) ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component, &fsl_sai_dai, 1); if (ret) - return ret; + goto err_pm_disable; - if (sai->soc_data->use_imx_pcm) - return imx_pcm_dma_init(pdev, IMX_SAI_DMABUF_SIZE); - else - return devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0); + if (sai->soc_data->use_imx_pcm) { + ret = imx_pcm_dma_init(pdev, IMX_SAI_DMABUF_SIZE); + if (ret) + goto err_pm_disable; + } else { + ret = devm_snd_dmaengine_pcm_register(&pdev->dev, NULL, 0); + if (ret) + goto err_pm_disable; + } + + return ret; + +err_pm_disable: + pm_runtime_disable(&pdev->dev); + + return ret; } static int fsl_sai_remove(struct platform_device *pdev) -- cgit v1.2.3 From 4b848f20eda5974020f043ca14bacf7a7e634fc8 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sun, 2 Feb 2020 14:21:33 +0100 Subject: drm/vgem: Close use-after-free race in vgem_gem_create There's two references floating around here (for the object reference, not the handle_count reference, that's a different thing): - The temporary reference held by vgem_gem_create, acquired by creating the object and released by calling drm_gem_object_put_unlocked. - The reference held by the object handle, created by drm_gem_handle_create. This one generally outlives the function, except if a 2nd thread races with a GEM_CLOSE ioctl call. So usually everything is correct, except in that race case, where the access to gem_object->size could be looking at freed data already. Which again isn't a real problem (userspace shot its feet off already with the race, we could return garbage), but maybe someone can exploit this as an information leak. Cc: Dan Carpenter Cc: Hillf Danton Reported-by: syzbot+0dc4444774d419e916c8@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Cc: Emil Velikov Cc: Daniel Vetter Cc: Sean Paul Cc: Chris Wilson Cc: Eric Anholt Cc: Sam Ravnborg Cc: Rob Clark Reviewed-by: Chris Wilson Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200202132133.1891846-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/vgem/vgem_drv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 5bd60ded3d81..909eba43664a 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -196,9 +196,10 @@ static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, return ERR_CAST(obj); ret = drm_gem_handle_create(file, &obj->base, handle); - drm_gem_object_put_unlocked(&obj->base); - if (ret) + if (ret) { + drm_gem_object_put_unlocked(&obj->base); return ERR_PTR(ret); + } return &obj->base; } @@ -221,7 +222,9 @@ static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev, args->size = gem_object->size; args->pitch = pitch; - DRM_DEBUG("Created object of size %lld\n", size); + drm_gem_object_put_unlocked(gem_object); + + DRM_DEBUG("Created object of size %llu\n", args->size); return 0; } -- cgit v1.2.3 From 1cb1edb2f5ba8a3e8d47ded391007c6fe3ac0ad7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Feb 2020 21:16:09 +0300 Subject: io_uring: get rid of delayed mm check Fail fast if can't grab mm, so past that requests always have an mm when required. This allows us to remove req->user altogether. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 77f22c3da30f..1859e866c728 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -553,7 +553,6 @@ struct io_kiocb { * llist_node is only used for poll deferred completions */ struct llist_node llist_node; - bool has_user; bool in_async; bool needs_fixed_file; u8 opcode; @@ -2056,9 +2055,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return iorw->size; } - if (!req->has_user) - return -EFAULT; - #ifdef CONFIG_COMPAT if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, @@ -4446,7 +4442,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } if (!ret) { - req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; req->in_async = true; do { ret = io_issue_sqe(req, NULL, &nxt, false); @@ -4950,6 +4945,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; + int err; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -4966,20 +4962,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted++; if (unlikely(req->opcode >= IORING_OP_LAST)) { - io_cqring_add_event(req, -EINVAL); + err = -EINVAL; +fail_req: + io_cqring_add_event(req, err); io_double_put_req(req); break; } if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); - if (!mm_fault) { - use_mm(ctx->sqo_mm); - *mm = ctx->sqo_mm; + if (unlikely(mm_fault)) { + err = -EFAULT; + goto fail_req; } + use_mm(ctx->sqo_mm); + *mm = ctx->sqo_mm; } - req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, -- cgit v1.2.3 From e1cf35b94c5fd122a8780587559fc6da9fc2dd12 Mon Sep 17 00:00:00 2001 From: Mauro Rossi Date: Mon, 3 Feb 2020 22:31:13 +0100 Subject: drm/edid: fix building error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following building error: CC [M]  drivers/gpu/drm/drm_edid.o ~/pie-x86_kernel/kernel/drivers/gpu/drm/drm_edid.c: In function 'cea_mode_alternate_timings': ~/pie-x86_kernel/kernel/drivers/gpu/drm/drm_edid.c:3275:2: error: call to '__compiletime_assert_3282' declared with attribute error: BUILD_BUG_ON failed: cea_mode_for_vic(8)->vtotal != 262 || cea_mode_for_vic(9)->vtotal != 262 || cea_mode_for_vic(12)->vtotal != 262 || cea_mode_for_vic(13)->vtotal != 262 || cea_mode_for_vic(23)->vtotal != 312 || cea_mode_for_vic(24)->vtotal != 312 || cea_mode_for_vic(27)->vtotal != 312 || cea_mode_for_vic(28)->vtotal != 312 make[4]: *** [~/pie-x86_kernel/kernel/scripts/Makefile.build:265: drivers/gpu/drm/drm_edid.o] Error 1 Fixes: 7befe621ff81 ("drm/edid: Abstract away cea_edid_modes[]") Signed-off-by: Mauro Rossi Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20200203213113.28183-1-issor.oruam@gmail.com --- drivers/gpu/drm/drm_edid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 99769d6c9f84..805fb004c8eb 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -3211,7 +3211,7 @@ static u8 *drm_find_cea_extension(const struct edid *edid) return cea; } -static const struct drm_display_mode *cea_mode_for_vic(u8 vic) +static __always_inline const struct drm_display_mode *cea_mode_for_vic(u8 vic) { BUILD_BUG_ON(1 + ARRAY_SIZE(edid_cea_modes_1) - 1 != 127); BUILD_BUG_ON(193 + ARRAY_SIZE(edid_cea_modes_193) - 1 != 219); -- cgit v1.2.3 From e1d85334d62386e9503e4a0d5d022e2d8e0011a0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 5 Feb 2020 20:57:10 -0800 Subject: io_uring: fix 1-bit bitfields to be unsigned Make bitfields of size 1 bit be unsigned (since there is no room for the sign bit). This clears up the sparse warnings: CHECK ../fs/io_uring.c ../fs/io_uring.c:207:50: error: dubious one-bit signed bitfield ../fs/io_uring.c:208:55: error: dubious one-bit signed bitfield ../fs/io_uring.c:209:63: error: dubious one-bit signed bitfield ../fs/io_uring.c:210:54: error: dubious one-bit signed bitfield ../fs/io_uring.c:211:57: error: dubious one-bit signed bitfield Found by sight and then verified with sparse. Fixes: 69b3e546139a ("io_uring: change io_ring_ctx bool fields into bit fields") Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1859e866c728..a31187e90697 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -204,11 +204,11 @@ struct io_ring_ctx { struct { unsigned int flags; - int compat: 1; - int account_mem: 1; - int cq_overflow_flushed: 1; - int drain_next: 1; - int eventfd_async: 1; + unsigned int compat: 1; + unsigned int account_mem: 1; + unsigned int cq_overflow_flushed: 1; + unsigned int drain_next: 1; + unsigned int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is -- cgit v1.2.3 From 1e95081cb5b4cf77065d37866f57cf3c90a3df78 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Feb 2020 19:51:16 +0300 Subject: io_uring: fix deferred req iovec leak After defer, a request will be prepared, that includes allocating iovec if needed, and then submitted through io_wq_submit_work() but not custom handler (e.g. io_rw_async()/io_sendrecv_async()). However, it'll leak iovec, as it's in io-wq and the code goes as follows: io_read() { if (!io_wq_current_is_worker()) kfree(iovec); } Put all deallocation logic in io_{read,write,send,recv}(), which will leave the memory, if going async with -EAGAIN. It also fixes a leak after failed io_alloc_async_ctx() in io_{recv,send}_msg(). Cc: stable@vger.kernel.org # 5.5 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 47 ++++++++++++----------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a31187e90697..fcb4536a3c8c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2144,17 +2144,6 @@ static int io_alloc_async_ctx(struct io_kiocb *req) return req->io == NULL; } -static void io_rw_async(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct iovec *iov = NULL; - - if (req->io->rw.iov != req->io->rw.fast_iov) - iov = req->io->rw.iov; - io_wq_submit_work(workptr); - kfree(iov); -} - static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) @@ -2167,7 +2156,6 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, io_req_map_rw(req, io_size, iovec, fast_iov, iter); } - req->work.func = io_rw_async; return 0; } @@ -2254,8 +2242,7 @@ copy_iov: } } out_free: - if (!io_wq_current_is_worker()) - kfree(iovec); + kfree(iovec); return ret; } @@ -2360,8 +2347,7 @@ copy_iov: } } out_free: - if (!io_wq_current_is_worker()) - kfree(iovec); + kfree(iovec); return ret; } @@ -2956,19 +2942,6 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } -#if defined(CONFIG_NET) -static void io_sendrecv_async(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct iovec *iov = NULL; - - if (req->io->rw.iov != req->io->rw.fast_iov) - iov = req->io->msg.iov; - io_wq_submit_work(workptr); - kfree(iov); -} -#endif - static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -3037,17 +3010,19 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (force_nonblock && ret == -EAGAIN) { if (req->io) return -EAGAIN; - if (io_alloc_async_ctx(req)) + if (io_alloc_async_ctx(req)) { + if (kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); return -ENOMEM; + } memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->work.func = io_sendrecv_async; return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } - if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); io_cqring_add_event(req, ret); if (ret < 0) @@ -3181,17 +3156,19 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (force_nonblock && ret == -EAGAIN) { if (req->io) return -EAGAIN; - if (io_alloc_async_ctx(req)) + if (io_alloc_async_ctx(req)) { + if (kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); return -ENOMEM; + } memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->work.func = io_sendrecv_async; return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } - if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); io_cqring_add_event(req, ret); if (ret < 0) -- cgit v1.2.3 From f2b18baca9539c6a3116d48b70972c7a2ba5d766 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Jan 2020 12:25:50 +0100 Subject: mac80211: use more bits for ack_frame_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that this wasn't a good idea, I hit a test failure in hwsim due to this. That particular failure was easily worked around, but it raised questions: if an AP needs to, for example, send action frames to each connected station, the current limit is nowhere near enough (especially if those stations are sleeping and the frames are queued for a while.) Shuffle around some bits to make more room for ack_frame_id to allow up to 8192 queued up frames, that's enough for queueing 4 frames to each connected station, even at the maximum of 2007 stations on a single AP. We take the bits from band (which currently only 2 but I leave 3 in case we add another band) and from the hw_queue, which can only need 4 since it has a limit of 16 queues. Fixes: 6912daed05e1 ("mac80211: Shrink the size of ack_frame_id to make room for tx_time_est") Signed-off-by: Johannes Berg Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20200115122549.b9a4ef9f4980.Ied52ed90150220b83a280009c590b65d125d087c@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 +++++------ net/mac80211/cfg.c | 2 +- net/mac80211/tx.c | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/net/mac80211.h b/include/net/mac80211.h index aa145808e57a..77e6b5a83b06 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1004,12 +1004,11 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) struct ieee80211_tx_info { /* common information */ u32 flags; - u8 band; - - u8 hw_queue; - - u16 ack_frame_id:6; - u16 tx_time_est:10; + u32 band:3, + ack_frame_id:13, + hw_queue:4, + tx_time_est:10; + /* 2 free bits */ union { struct { diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 000c742d0527..6aee699deb28 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3450,7 +3450,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x40, GFP_ATOMIC); + 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 4bd1faf4f779..87def9cb91ff 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2442,7 +2442,7 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, spin_lock_irqsave(&local->ack_status_lock, flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x40, GFP_ATOMIC); + 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (id >= 0) { -- cgit v1.2.3 From 2bf973ff9b9aeceb8acda629ae65341820d4b35b Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Fri, 31 Jan 2020 13:12:51 +0200 Subject: mac80211: fix quiet mode activation in action frames Previously I intended to ignore quiet mode in probe response, however I ended up ignoring it instead for action frames. As a matter of fact, this path isn't invoked for probe responses to start with. Just revert this patch. Signed-off-by: Sara Sharon Fixes: 7976b1e9e3bf ("mac80211: ignore quiet mode in probe") Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20200131111300.891737-15-luca@coelho.fi Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5fa13176036f..e041af2f021a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -1311,7 +1311,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!res) { ch_switch.timestamp = timestamp; ch_switch.device_timestamp = device_timestamp; - ch_switch.block_tx = beacon ? csa_ie.mode : 0; + ch_switch.block_tx = csa_ie.mode; ch_switch.chandef = csa_ie.chandef; ch_switch.count = csa_ie.count; ch_switch.delay = csa_ie.max_switch_time; @@ -1404,7 +1404,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = true; sdata->csa_chandef = csa_ie.chandef; - sdata->csa_block_tx = ch_switch.block_tx; + sdata->csa_block_tx = csa_ie.mode; ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) @@ -1438,7 +1438,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, * reset when the disconnection worker runs. */ sdata->vif.csa_active = true; - sdata->csa_block_tx = ch_switch.block_tx; + sdata->csa_block_tx = csa_ie.mode; ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); mutex_unlock(&local->chanctx_mtx); -- cgit v1.2.3 From a04564c99bb4a92f805a58e56b2d22cc4978f152 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 31 Jan 2020 13:12:58 +0200 Subject: mac80211: consider more elements in parsing CRC We only use the parsing CRC for checking if a beacon changed, and elements with an ID > 63 cannot be represented in the filter. Thus, like we did before with WMM and Cisco vendor elements, just statically add these forgotten items to the CRC: - WLAN_EID_VHT_OPERATION - WLAN_EID_OPMODE_NOTIF I guess that in most cases when VHT/HE operation change, the HT operation also changed, and so the change was picked up, but we did notice that pure operating mode notification changes were ignored. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20200131111300.891737-22-luca@coelho.fi [restrict to VHT for the mac80211 branch] Signed-off-by: Johannes Berg --- net/mac80211/util.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 32a7a53833c0..739e90555d8b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1063,16 +1063,22 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elem_parse_failed = true; break; case WLAN_EID_VHT_OPERATION: - if (elen >= sizeof(struct ieee80211_vht_operation)) + if (elen >= sizeof(struct ieee80211_vht_operation)) { elems->vht_operation = (void *)pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_OPMODE_NOTIF: - if (elen > 0) + if (elen > 0) { elems->opmode_notif = pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; -- cgit v1.2.3 From bfb7bac3a8f47100ebe7961bd14e924c96e21ca7 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Mon, 3 Feb 2020 10:56:50 +0000 Subject: cfg80211: check wiphy driver existence for drvinfo report When preparing ethtool drvinfo, check if wiphy driver is defined before dereferencing it. Driver may not exist, e.g. if wiphy is attached to a virtual platform device. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200203105644.28875-1-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- net/wireless/ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index a9c0f368db5d..24e18405cdb4 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -7,9 +7,13 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct device *pdev = wiphy_dev(wdev->wiphy); - strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name, - sizeof(info->driver)); + if (pdev->driver) + strlcpy(info->driver, pdev->driver->name, + sizeof(info->driver)); + else + strlcpy(info->driver, "N/A", sizeof(info->driver)); strlcpy(info->version, init_utsname()->release, sizeof(info->version)); -- cgit v1.2.3 From c4a3922d2d20c710f827d3a115ee338e8d0467df Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 2 Feb 2020 20:30:52 -0800 Subject: netfilter: xt_hashlimit: reduce hashlimit_mutex scope for htable_put() It is unnecessary to hold hashlimit_mutex for htable_destroy() as it is already removed from the global hashtable and its refcount is already zero. Also, switch hinfo->use to refcount_t so that we don't have to hold the mutex until it reaches zero in htable_put(). Reported-and-tested-by: syzbot+adf6c6c2be1c3a718121@syzkaller.appspotmail.com Acked-by: Florian Westphal Signed-off-by: Cong Wang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index bccd47cd7190..cc475a608f81 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ @@ -114,7 +115,7 @@ struct dsthash_ent { struct xt_hashlimit_htable { struct hlist_node node; /* global list of all htables */ - int use; + refcount_t use; u_int8_t family; bool rnd_initialized; @@ -315,7 +316,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, for (i = 0; i < hinfo->cfg.size; i++) INIT_HLIST_HEAD(&hinfo->hash[i]); - hinfo->use = 1; + refcount_set(&hinfo->use, 1); hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; @@ -420,7 +421,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { if (!strcmp(name, hinfo->name) && hinfo->family == family) { - hinfo->use++; + refcount_inc(&hinfo->use); return hinfo; } } @@ -429,12 +430,11 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, static void htable_put(struct xt_hashlimit_htable *hinfo) { - mutex_lock(&hashlimit_mutex); - if (--hinfo->use == 0) { + if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) { hlist_del(&hinfo->node); + mutex_unlock(&hashlimit_mutex); htable_destroy(hinfo); } - mutex_unlock(&hashlimit_mutex); } /* The algorithm used is the Simple Token Bucket Filter (TBF) -- cgit v1.2.3 From 8d0015a7ab76b8b1e89a3e5f5710a6e5103f2dd5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 2 Feb 2020 20:30:53 -0800 Subject: netfilter: xt_hashlimit: limit the max size of hashtable The user-specified hashtable size is unbound, this could easily lead to an OOM or a hung task as we hold the global mutex while allocating and initializing the new hashtable. Add a max value to cap both cfg->size and cfg->max, as suggested by Florian. Reported-and-tested-by: syzbot+adf6c6c2be1c3a718121@syzkaller.appspotmail.com Signed-off-by: Cong Wang Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index cc475a608f81..7a2c4b8408c4 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -837,6 +837,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3); } +#define HASHLIMIT_MAX_SIZE 1048576 + static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, struct xt_hashlimit_htable **hinfo, struct hashlimit_cfg3 *cfg, @@ -847,6 +849,14 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, if (cfg->gc_interval == 0 || cfg->expire == 0) return -EINVAL; + if (cfg->size > HASHLIMIT_MAX_SIZE) { + cfg->size = HASHLIMIT_MAX_SIZE; + pr_info_ratelimited("size too large, truncated to %u\n", cfg->size); + } + if (cfg->max > HASHLIMIT_MAX_SIZE) { + cfg->max = HASHLIMIT_MAX_SIZE; + pr_info_ratelimited("max too large, truncated to %u\n", cfg->max); + } if (par->family == NFPROTO_IPV4) { if (cfg->srcmask > 32 || cfg->dstmask > 32) return -EINVAL; -- cgit v1.2.3 From a7da92c2c8a1faf253a3b3e292fda6910deba540 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 13:06:18 +0100 Subject: netfilter: flowtable: skip offload setup if disabled nftables test case tests/shell/testcases/flowtable/0001flowtable_0 results in a crash. After the refactor, if we leave early via nf_flowtable_hw_offload(), then "struct flow_block_offload" is left in an uninitialized state, but later users assume its initialised. Fixes: a7965d58ddab02 ("netfilter: flowtable: add nf_flow_table_offload_cmd()") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 83e1db37c3b0..06f00cdc3891 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -847,9 +847,6 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, { int err; - if (!nf_flowtable_hw_offload(flowtable)) - return 0; - if (!dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; @@ -876,6 +873,9 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct flow_block_offload bo; int err; + if (!nf_flowtable_hw_offload(flowtable)) + return 0; + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); if (err < 0) return err; -- cgit v1.2.3 From cf3040ca55f2085b0a372a620ee2cb93ae19b686 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Feb 2020 21:31:40 -0700 Subject: io_uring: statx/openat/openat2 don't support fixed files All of these opcodes take a directory file descriptor. We can't easily support fixed files for these operations, and the use case for that probably isn't all that clear (or sensible) anyway. Disable IOSQE_FIXED_FILE for these operations. Reported-by: Stefan Metzmacher Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fcb4536a3c8c..d03846822062 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2516,6 +2516,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EBADF; req->open.dfd = READ_ONCE(sqe->fd); req->open.how.mode = READ_ONCE(sqe->len); @@ -2541,6 +2543,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EBADF; req->open.dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -2736,6 +2740,8 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EBADF; req->open.dfd = READ_ONCE(sqe->fd); req->open.mask = READ_ONCE(sqe->len); @@ -2809,7 +2815,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sqe->rw_flags || sqe->buf_index) return -EINVAL; if (sqe->flags & IOSQE_FIXED_FILE) - return -EINVAL; + return -EBADF; req->close.fd = READ_ONCE(sqe->fd); if (req->file->f_op == &io_uring_fops || -- cgit v1.2.3 From 63e5d81f72af1bf370bf8a6745b0a8d71a7bb37d Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 7 Feb 2020 13:18:28 +0100 Subject: io_uring: flush overflowed CQ events in the io_uring_poll() In io_uring_poll() we must flush overflowed CQ events before to check if there are CQ events available, to avoid missing events. We call the io_cqring_events() that checks and flushes any overflow and returns the number of CQ events available. Signed-off-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d03846822062..2954a8bdf824 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6283,7 +6283,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail) + if (io_cqring_events(ctx, false)) mask |= EPOLLIN | EPOLLRDNORM; return mask; -- cgit v1.2.3 From e96e977992d0ea40b6e70cb63dede85c9078e744 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Feb 2020 19:21:25 +0300 Subject: io_uring: remove unused struct io_async_open struct io_async_open is unused, remove it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2954a8bdf824..ebf3b43fb91b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -450,17 +450,12 @@ struct io_async_rw { ssize_t size; }; -struct io_async_open { - struct filename *filename; -}; - struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; - struct io_async_open open; }; }; -- cgit v1.2.3 From 99bc4c38537d774e667d043c520914082da19abf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Feb 2020 22:04:45 +0300 Subject: io_uring: fix iovec leaks Allocated iovec is freed only in io_{read,write,send,recv)(), and just leaves it if an error occured. There are plenty of such cases: - cancellation of non-head requests - fail grabbing files in __io_queue_sqe() - set REQ_F_NOWAIT and returning in __io_queue_sqe() Add REQ_F_NEED_CLEANUP, which will force such requests with custom allocated resourses go through cleanup handlers on put. Cc: stable@vger.kernel.org # 5.5 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ebf3b43fb91b..5353e96029c7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -478,6 +478,7 @@ enum { REQ_F_MUST_PUNT_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, + REQ_F_NEED_CLEANUP_BIT, }; enum { @@ -516,6 +517,8 @@ enum { REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), + /* needs cleanup */ + REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), }; /* @@ -748,6 +751,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); static void io_ring_file_ref_flush(struct fixed_file_data *data); +static void io_cleanup_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -1235,6 +1239,9 @@ static void __io_free_req(struct io_kiocb *req) { __io_req_aux_free(req); + if (req->flags & REQ_F_NEED_CLEANUP) + io_cleanup_req(req); + if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; @@ -2128,6 +2135,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, req->io->rw.iov = req->io->rw.fast_iov; memcpy(req->io->rw.iov, fast_iov, sizeof(struct iovec) * iter->nr_segs); + } else { + req->flags |= REQ_F_NEED_CLEANUP; } } @@ -2238,6 +2247,7 @@ copy_iov: } out_free: kfree(iovec); + req->flags &= ~REQ_F_NEED_CLEANUP; return ret; } @@ -2342,6 +2352,7 @@ copy_iov: } } out_free: + req->flags &= ~REQ_F_NEED_CLEANUP; kfree(iovec); return ret; } @@ -2948,6 +2959,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; + int ret; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -2957,8 +2969,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; io->msg.iov = io->msg.fast_iov; - return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; #else return -EOPNOTSUPP; #endif @@ -3016,6 +3031,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, kfree(kmsg->iov); return -ENOMEM; } + req->flags |= REQ_F_NEED_CLEANUP; memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); return -EAGAIN; } @@ -3025,6 +3041,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); + req->flags &= ~REQ_F_NEED_CLEANUP; io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); @@ -3092,6 +3109,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; + int ret; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -3101,8 +3119,11 @@ static int io_recvmsg_prep(struct io_kiocb *req, return 0; io->msg.iov = io->msg.fast_iov; - return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.uaddr, &io->msg.iov); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; #else return -EOPNOTSUPP; #endif @@ -3163,6 +3184,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, return -ENOMEM; } memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->flags |= REQ_F_NEED_CLEANUP; return -EAGAIN; } if (ret == -ERESTARTSYS) @@ -3171,6 +3193,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); + req->flags &= ~REQ_F_NEED_CLEANUP; io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); @@ -4181,6 +4204,30 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } +static void io_cleanup_req(struct io_kiocb *req) +{ + struct io_async_ctx *io = req->io; + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.iov != io->rw.fast_iov) + kfree(io->rw.iov); + break; + case IORING_OP_SENDMSG: + case IORING_OP_RECVMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + break; + } + + req->flags &= ~REQ_F_NEED_CLEANUP; +} + static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt, bool force_nonblock) { -- cgit v1.2.3 From 8fef80bf56a49c60b457dedb99fd6c5279a5dbe1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Feb 2020 23:59:53 +0300 Subject: io_uring: add cleanup for openat()/statx() openat() and statx() may have allocated ->open.filename, which should be be put. Add cleanup handlers for them. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5353e96029c7..e6829d1bf4b4 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2537,6 +2537,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2575,6 +2576,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2606,6 +2608,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, } err: putname(req->open.filename); + req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); @@ -2765,6 +2768,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2802,6 +2806,7 @@ retry: ret = cp_statx(&stat, ctx->buffer); err: putname(ctx->filename); + req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); @@ -4223,6 +4228,11 @@ static void io_cleanup_req(struct io_kiocb *req) if (io->msg.iov != io->msg.fast_iov) kfree(io->msg.iov); break; + case IORING_OP_OPENAT: + case IORING_OP_OPENAT2: + case IORING_OP_STATX: + putname(req->open.filename); + break; } req->flags &= ~REQ_F_NEED_CLEANUP; -- cgit v1.2.3 From faac996ccd5da95bc56b91aa80f2643c2d0a1c56 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 7 Feb 2020 15:45:22 -0700 Subject: io_uring: retry raw bdev writes if we hit -EOPNOTSUPP For non-blocking issue, we set IOCB_NOWAIT in the kiocb. However, on a raw block device, this yields an -EOPNOTSUPP return, as non-blocking writes aren't supported. Turn this -EOPNOTSUPP into -EAGAIN, so we retry from blocking context with IOCB_NOWAIT cleared. Cc: stable@vger.kernel.org # 5.5 Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index e6829d1bf4b4..1a3ca6577a10 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2340,6 +2340,12 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + /* + * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. + */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { -- cgit v1.2.3 From 9392a27d88b9707145d713654eb26f0c29789e50 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Feb 2020 21:42:51 -0700 Subject: io-wq: add support for inheriting ->fs Some work items need this for relative path lookup, make it available like the other inherited credentials/mm/etc. Cc: stable@vger.kernel.org # 5.3+ Signed-off-by: Jens Axboe --- fs/io-wq.c | 8 ++++++++ fs/io-wq.h | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index cb60a42b9fdf..7ac4a8876a50 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "io-wq.h" @@ -59,6 +60,7 @@ struct io_worker { const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; + struct fs_struct *restore_fs; }; #if BITS_PER_LONG == 64 @@ -151,6 +153,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) task_unlock(current); } + if (current->fs != worker->restore_fs) + current->fs = worker->restore_fs; + /* * If we have an active mm, we need to drop the wq lock before unusing * it. If we do, return true and let the caller retry the idle loop. @@ -311,6 +316,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; + worker->restore_fs = current->fs; io_wqe_inc_running(wqe, worker); } @@ -481,6 +487,8 @@ next: current->files = work->files; task_unlock(current); } + if (work->fs && current->fs != work->fs) + current->fs = work->fs; if (work->mm != worker->mm) io_wq_switch_mm(worker, work); if (worker->cur_creds != work->creds) diff --git a/fs/io-wq.h b/fs/io-wq.h index 50b3378febf2..f152ba677d8f 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -74,6 +74,7 @@ struct io_wq_work { struct files_struct *files; struct mm_struct *mm; const struct cred *creds; + struct fs_struct *fs; unsigned flags; }; @@ -81,10 +82,11 @@ struct io_wq_work { do { \ (work)->list.next = NULL; \ (work)->func = _func; \ - (work)->flags = 0; \ (work)->files = NULL; \ (work)->mm = NULL; \ (work)->creds = NULL; \ + (work)->fs = NULL; \ + (work)->flags = 0; \ } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); -- cgit v1.2.3 From ff002b30181d30cdfbca316dadd099c3ca0d739c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 7 Feb 2020 16:05:21 -0700 Subject: io_uring: grab ->fs as part of async preparation This passes it in to io-wq, so it assumes the right fs_struct when executing async work that may need to do lookups. Cc: stable@vger.kernel.org # 5.3+ Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1a3ca6577a10..2a7bb178986e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -75,6 +75,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -611,6 +612,8 @@ struct io_op_def { unsigned not_supported : 1; /* needs file table */ unsigned file_table : 1; + /* needs ->fs */ + unsigned needs_fs : 1; }; static const struct io_op_def io_op_defs[] = { @@ -653,12 +656,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .needs_fs = 1, }, [IORING_OP_RECVMSG] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .needs_fs = 1, }, [IORING_OP_TIMEOUT] = { .async_ctx = 1, @@ -689,6 +694,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .fd_non_neg = 1, .file_table = 1, + .needs_fs = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, @@ -702,6 +708,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .fd_non_neg = 1, + .needs_fs = 1, }, [IORING_OP_READ] = { .needs_mm = 1, @@ -733,6 +740,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .fd_non_neg = 1, .file_table = 1, + .needs_fs = 1, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -907,6 +915,16 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, } if (!req->work.creds) req->work.creds = get_current_cred(); + if (!req->work.fs && def->needs_fs) { + spin_lock(¤t->fs->lock); + if (!current->fs->in_exec) { + req->work.fs = current->fs; + req->work.fs->users++; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(¤t->fs->lock); + } } static inline void io_req_work_drop_env(struct io_kiocb *req) @@ -919,6 +937,16 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) put_cred(req->work.creds); req->work.creds = NULL; } + if (req->work.fs) { + struct fs_struct *fs = req->work.fs; + + spin_lock(&req->work.fs->lock); + if (--fs->users) + fs = NULL; + spin_unlock(&req->work.fs->lock); + if (fs) + free_fs_struct(fs); + } } static inline bool io_prep_async_work(struct io_kiocb *req, -- cgit v1.2.3 From 0b5faf6ba7fb78bb1fe7336d23ea1978386a6c3a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Feb 2020 21:42:51 -0700 Subject: io_uring: allow AT_FDCWD for non-file openat/openat2/statx Don't just check for dirfd == -1, we should allow AT_FDCWD as well for relative lookups. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a7bb178986e..e6247b94c29d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4543,7 +4543,7 @@ static int io_req_needs_file(struct io_kiocb *req, int fd) { if (!io_op_defs[req->opcode].needs_file) return 0; - if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg) return 0; return 1; } -- cgit v1.2.3 From a93b33312f63ef6d5997f45d6fdf4de84c5396cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 8 Feb 2020 14:04:34 +0300 Subject: io_uring: fix async close() with f_op->flush() First, io_close() misses filp_close() and io_cqring_add_event(), when f_op->flush is defined. That's because in this case it will io_queue_async_work() itself not grabbing files, so the corresponding chunk in io_close_finish() won't be executed. Second, when submitted through io_wq_submit_work(), it will do filp_close() and *_add_event() twice: first inline in io_close(), and the second one in call to io_close_finish() from io_close(). The second one will also fire, because it was submitted async through generic path, and so have grabbed files. And the last nice thing is to remove this weird pilgrimage with checking work/old_work and casting it to nxt. Just use a helper instead. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e6247b94c29d..759301bdb19b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2870,24 +2870,25 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +/* only called when __close_fd_get_file() is done */ +static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) +{ + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + fput(req->close.put_file); + io_put_req_find_next(req, nxt); +} + static void io_close_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - /* Invoked with files, we need to do the close */ - if (req->work.files) { - int ret; - - ret = filp_close(req->close.put_file, req->work.files); - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - } - - fput(req->close.put_file); - - io_put_req_find_next(req, &nxt); + __io_close_finish(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); } @@ -2910,22 +2911,8 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, * No ->flush(), safely close from here and just punt the * fput() to async context. */ - ret = filp_close(req->close.put_file, current->files); - - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - - if (io_wq_current_is_worker()) { - struct io_wq_work *old_work, *work; - - old_work = work = &req->work; - io_close_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); - return 0; - } - + __io_close_finish(req, nxt); + return 0; eagain: req->work.func = io_close_finish; /* -- cgit v1.2.3 From 5f798beaf35d79355cbf18019c1993a84475a2c3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 8 Feb 2020 13:28:02 +0300 Subject: io_uring: fix double prep iovec leak Requests may be prepared multiple times with ->io allocated (i.e. async prepared). Preparation functions don't handle it and forget about previously allocated resources. This may happen in case of: - spurious defer_check - non-head (i.e. async prepared) request executed in sync (via nxt). Make the handlers check, whether they already allocated resources, which is true IFF REQ_F_NEED_CLEANUP is set. Cc: stable@vger.kernel.org # 5.5 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 759301bdb19b..097701782339 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2205,7 +2205,8 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - if (!req->io) + /* either don't need iovec imported or already have it */ + if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; io = req->io; @@ -2293,7 +2294,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - if (!req->io) + /* either don't need iovec imported or already have it */ + if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; io = req->io; @@ -2993,6 +2995,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!io || req->opcode == IORING_OP_SEND) return 0; + /* iovec is already imported */ + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; io->msg.iov = io->msg.fast_iov; ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, @@ -3143,6 +3148,9 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (!io || req->opcode == IORING_OP_RECV) return 0; + /* iovec is already imported */ + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; io->msg.iov = io->msg.fast_iov; ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, -- cgit v1.2.3 From 0bdbdd08a8f991bdaee54465a168c0795ea5d28b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 8 Feb 2020 13:28:03 +0300 Subject: io_uring: fix openat/statx's filename leak As in the previous patch, make openat*_prep() and statx_prep() handle double preparation to avoid resource leakage. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 097701782339..24ebd5714bf9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2560,6 +2560,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (sqe->flags & IOSQE_FIXED_FILE) return -EBADF; + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; req->open.dfd = READ_ONCE(sqe->fd); req->open.how.mode = READ_ONCE(sqe->len); @@ -2588,6 +2590,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (sqe->flags & IOSQE_FIXED_FILE) return -EBADF; + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; req->open.dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -2787,6 +2791,8 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (sqe->flags & IOSQE_FIXED_FILE) return -EBADF; + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; req->open.dfd = READ_ONCE(sqe->fd); req->open.mask = READ_ONCE(sqe->len); -- cgit v1.2.3 From e383e871ab54f073c2a798a9e0bde7f1d0528de8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Jan 2020 20:55:24 +0100 Subject: ARM: npcm: Bring back GPIOLIB support The CONFIG_ARCH_REQUIRE_GPIOLIB is gone since commit 65053e1a7743 ("gpio: delete ARCH_[WANTS_OPTIONAL|REQUIRE]_GPIOLIB") and all platforms should explicitly select GPIOLIB to have it. Link: https://lore.kernel.org/r/20200130195525.4525-1-krzk@kernel.org Cc: Fixes: 65053e1a7743 ("gpio: delete ARCH_[WANTS_OPTIONAL|REQUIRE]_GPIOLIB") Signed-off-by: Krzysztof Kozlowski Signed-off-by: Olof Johansson --- arch/arm/mach-npcm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-npcm/Kconfig b/arch/arm/mach-npcm/Kconfig index 880bc2a5cada..7f7002dc2b21 100644 --- a/arch/arm/mach-npcm/Kconfig +++ b/arch/arm/mach-npcm/Kconfig @@ -11,7 +11,7 @@ config ARCH_NPCM7XX depends on ARCH_MULTI_V7 select PINCTRL_NPCM7XX select NPCM7XX_TIMER - select ARCH_REQUIRE_GPIOLIB + select GPIOLIB select CACHE_L2X0 select ARM_GIC select HAVE_ARM_TWD if SMP -- cgit v1.2.3 From 3508aae9b5618aca727f07c183e25d09033a5b66 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Jan 2020 20:55:25 +0100 Subject: ARM: configs: Cleanup old Kconfig options CONFIG_MMC_BLOCK_BOUNCE is gone since commit c3dccb74be28 ("mmc: core: Delete bounce buffer Kconfig option"). CONFIG_LBDAF is gone since commit 72deb455b5ec ("block: remove CONFIG_LBDAF"). CONFIG_IOSCHED_DEADLINE and CONFIG_IOSCHED_CFQ are gone since commit f382fb0bcef4 ("block: remove legacy IO schedulers"). The IOSCHED_DEADLINE was replaced by MQ_IOSCHED_DEADLINE and it will be now enabled by default (along with MQ_IOSCHED_KYBER). The IOSCHED_BFQ seems to replace IOSCHED_CFQ so select it in configs previously choosing the latter. CONFIG_CROSS_COMPILE is gone since commit f1089c92da79 ("kbuild: remove CONFIG_CROSS_COMPILE support"). Link: https://lore.kernel.org/r/20200130195525.4525-2-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Acked-by: Arnd Bergmann Signed-off-by: Olof Johansson --- arch/arm/configs/am200epdkit_defconfig | 2 -- arch/arm/configs/axm55xx_defconfig | 1 - arch/arm/configs/clps711x_defconfig | 1 - arch/arm/configs/cns3420vb_defconfig | 2 +- arch/arm/configs/colibri_pxa300_defconfig | 1 - arch/arm/configs/collie_defconfig | 2 -- arch/arm/configs/davinci_all_defconfig | 2 -- arch/arm/configs/efm32_defconfig | 2 -- arch/arm/configs/ep93xx_defconfig | 1 - arch/arm/configs/eseries_pxa_defconfig | 2 -- arch/arm/configs/ezx_defconfig | 1 - arch/arm/configs/h3600_defconfig | 2 -- arch/arm/configs/h5000_defconfig | 1 - arch/arm/configs/imote2_defconfig | 1 - arch/arm/configs/imx_v4_v5_defconfig | 2 -- arch/arm/configs/lpc18xx_defconfig | 4 ---- arch/arm/configs/magician_defconfig | 2 -- arch/arm/configs/moxart_defconfig | 1 - arch/arm/configs/mxs_defconfig | 2 -- arch/arm/configs/omap1_defconfig | 2 -- arch/arm/configs/palmz72_defconfig | 2 -- arch/arm/configs/pcm027_defconfig | 2 -- arch/arm/configs/pleb_defconfig | 2 -- arch/arm/configs/realview_defconfig | 1 - arch/arm/configs/sama5_defconfig | 3 --- arch/arm/configs/stm32_defconfig | 2 -- arch/arm/configs/u300_defconfig | 2 -- arch/arm/configs/vexpress_defconfig | 2 -- arch/arm/configs/viper_defconfig | 1 - arch/arm/configs/zeus_defconfig | 2 -- arch/arm/configs/zx_defconfig | 1 - 31 files changed, 1 insertion(+), 53 deletions(-) diff --git a/arch/arm/configs/am200epdkit_defconfig b/arch/arm/configs/am200epdkit_defconfig index 622436f44783..f56ac394caf1 100644 --- a/arch/arm/configs/am200epdkit_defconfig +++ b/arch/arm/configs/am200epdkit_defconfig @@ -11,8 +11,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_ARCH_GUMSTIX=y CONFIG_PCCARD=y diff --git a/arch/arm/configs/axm55xx_defconfig b/arch/arm/configs/axm55xx_defconfig index f53634af014b..6ea7dafa4c9e 100644 --- a/arch/arm/configs/axm55xx_defconfig +++ b/arch/arm/configs/axm55xx_defconfig @@ -25,7 +25,6 @@ CONFIG_EMBEDDED=y CONFIG_PROFILING=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -# CONFIG_IOSCHED_DEADLINE is not set CONFIG_ARCH_AXXIA=y CONFIG_GPIO_PCA953X=y CONFIG_ARM_LPAE=y diff --git a/arch/arm/configs/clps711x_defconfig b/arch/arm/configs/clps711x_defconfig index c255dab36bde..63a153f5cf68 100644 --- a/arch/arm/configs/clps711x_defconfig +++ b/arch/arm/configs/clps711x_defconfig @@ -7,7 +7,6 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y CONFIG_JUMP_LABEL=y CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_CLPS711X=y CONFIG_ARCH_AUTCPU12=y CONFIG_ARCH_CDB89712=y diff --git a/arch/arm/configs/cns3420vb_defconfig b/arch/arm/configs/cns3420vb_defconfig index 89df0a55a065..66a80b46038d 100644 --- a/arch/arm/configs/cns3420vb_defconfig +++ b/arch/arm/configs/cns3420vb_defconfig @@ -17,7 +17,7 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y # CONFIG_BLK_DEV_BSG is not set -CONFIG_IOSCHED_CFQ=m +CONFIG_IOSCHED_BFQ=m CONFIG_ARCH_MULTI_V6=y #CONFIG_ARCH_MULTI_V7 is not set CONFIG_ARCH_CNS3XXX=y diff --git a/arch/arm/configs/colibri_pxa300_defconfig b/arch/arm/configs/colibri_pxa300_defconfig index 446134c70a33..0dae3b185284 100644 --- a/arch/arm/configs/colibri_pxa300_defconfig +++ b/arch/arm/configs/colibri_pxa300_defconfig @@ -43,7 +43,6 @@ CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_STORAGE=y CONFIG_MMC=y -# CONFIG_MMC_BLOCK_BOUNCE is not set CONFIG_MMC_PXA=y CONFIG_EXT3_FS=y CONFIG_NFS_FS=y diff --git a/arch/arm/configs/collie_defconfig b/arch/arm/configs/collie_defconfig index e6df11e906ba..36384fd575f8 100644 --- a/arch/arm/configs/collie_defconfig +++ b/arch/arm/configs/collie_defconfig @@ -7,8 +7,6 @@ CONFIG_EXPERT=y # CONFIG_BASE_FULL is not set # CONFIG_EPOLL is not set CONFIG_SLOB=y -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_SA1100=y CONFIG_SA1100_COLLIE=y CONFIG_PCCARD=y diff --git a/arch/arm/configs/davinci_all_defconfig b/arch/arm/configs/davinci_all_defconfig index 231f8973bbb2..b5ba8d731a25 100644 --- a/arch/arm/configs/davinci_all_defconfig +++ b/arch/arm/configs/davinci_all_defconfig @@ -15,8 +15,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_MULTIPLATFORM=y CONFIG_ARCH_MULTI_V7=n CONFIG_ARCH_MULTI_V5=y diff --git a/arch/arm/configs/efm32_defconfig b/arch/arm/configs/efm32_defconfig index 10ea92513a69..46213f0530c4 100644 --- a/arch/arm/configs/efm32_defconfig +++ b/arch/arm/configs/efm32_defconfig @@ -12,8 +12,6 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_SLUB_DEBUG is not set # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set # CONFIG_MMU is not set CONFIG_ARM_SINGLE_ARMV7M=y CONFIG_ARCH_EFM32=y diff --git a/arch/arm/configs/ep93xx_defconfig b/arch/arm/configs/ep93xx_defconfig index ef2d2a820c30..cd16fb6eb8e6 100644 --- a/arch/arm/configs/ep93xx_defconfig +++ b/arch/arm/configs/ep93xx_defconfig @@ -11,7 +11,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_EP93XX=y CONFIG_CRUNCH=y CONFIG_MACH_ADSSPHERE=y diff --git a/arch/arm/configs/eseries_pxa_defconfig b/arch/arm/configs/eseries_pxa_defconfig index 56452fa03d56..046f4dc2e18e 100644 --- a/arch/arm/configs/eseries_pxa_defconfig +++ b/arch/arm/configs/eseries_pxa_defconfig @@ -9,8 +9,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_ARCH_PXA_ESERIES=y # CONFIG_ARM_THUMB is not set diff --git a/arch/arm/configs/ezx_defconfig b/arch/arm/configs/ezx_defconfig index 4e28771beecd..bd7b7f945e01 100644 --- a/arch/arm/configs/ezx_defconfig +++ b/arch/arm/configs/ezx_defconfig @@ -14,7 +14,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_PXA_EZX=y CONFIG_NO_HZ=y diff --git a/arch/arm/configs/h3600_defconfig b/arch/arm/configs/h3600_defconfig index 4d91e41cb628..c02b3e409610 100644 --- a/arch/arm/configs/h3600_defconfig +++ b/arch/arm/configs/h3600_defconfig @@ -5,8 +5,6 @@ CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y CONFIG_MODULES=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_SA1100=y CONFIG_SA1100_H3600=y CONFIG_PCCARD=y diff --git a/arch/arm/configs/h5000_defconfig b/arch/arm/configs/h5000_defconfig index 3946c6087327..f5a338fefda8 100644 --- a/arch/arm/configs/h5000_defconfig +++ b/arch/arm/configs/h5000_defconfig @@ -10,7 +10,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_MACH_H5000=y CONFIG_AEABI=y diff --git a/arch/arm/configs/imote2_defconfig b/arch/arm/configs/imote2_defconfig index 770469f61c3e..05c5515fa871 100644 --- a/arch/arm/configs/imote2_defconfig +++ b/arch/arm/configs/imote2_defconfig @@ -13,7 +13,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_MACH_INTELMOTE2=y CONFIG_NO_HZ=y diff --git a/arch/arm/configs/imx_v4_v5_defconfig b/arch/arm/configs/imx_v4_v5_defconfig index 2b2d617e279d..3df90fc38398 100644 --- a/arch/arm/configs/imx_v4_v5_defconfig +++ b/arch/arm/configs/imx_v4_v5_defconfig @@ -32,8 +32,6 @@ CONFIG_KPROBES=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm/configs/lpc18xx_defconfig b/arch/arm/configs/lpc18xx_defconfig index e518168a0627..be882ea0eee4 100644 --- a/arch/arm/configs/lpc18xx_defconfig +++ b/arch/arm/configs/lpc18xx_defconfig @@ -1,4 +1,3 @@ -CONFIG_CROSS_COMPILE="arm-linux-gnueabihf-" CONFIG_HIGH_RES_TIMERS=y CONFIG_PREEMPT=y CONFIG_BLK_DEV_INITRD=y @@ -28,10 +27,7 @@ CONFIG_FLASH_SIZE=0x00080000 CONFIG_ZBOOT_ROM_TEXT=0x0 CONFIG_ZBOOT_ROM_BSS=0x0 CONFIG_ARM_APPENDED_DTB=y -# CONFIG_LBDAF is not set # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_BINFMT_FLAT=y CONFIG_BINFMT_ZFLAT=y CONFIG_BINFMT_SHARED_FLAT=y diff --git a/arch/arm/configs/magician_defconfig b/arch/arm/configs/magician_defconfig index e6486c959220..d2e684f6565a 100644 --- a/arch/arm/configs/magician_defconfig +++ b/arch/arm/configs/magician_defconfig @@ -9,8 +9,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_MACH_H4700=y CONFIG_MACH_MAGICIAN=y diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig index 45d27190c9c9..6834e97af348 100644 --- a/arch/arm/configs/moxart_defconfig +++ b/arch/arm/configs/moxart_defconfig @@ -15,7 +15,6 @@ CONFIG_EMBEDDED=y # CONFIG_SLUB_DEBUG is not set # CONFIG_COMPAT_BRK is not set # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set CONFIG_ARCH_MULTI_V4=y # CONFIG_ARCH_MULTI_V7 is not set CONFIG_ARCH_MOXART=y diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index 2773899c21b3..a9c6f32a9b1c 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -25,8 +25,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_BLK_DEV_INTEGRITY=y -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm/configs/omap1_defconfig b/arch/arm/configs/omap1_defconfig index 0c43c589f191..3b6e7452609b 100644 --- a/arch/arm/configs/omap1_defconfig +++ b/arch/arm/configs/omap1_defconfig @@ -18,8 +18,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_OMAP=y CONFIG_ARCH_OMAP1=y CONFIG_OMAP_RESET_CLOCKS=y diff --git a/arch/arm/configs/palmz72_defconfig b/arch/arm/configs/palmz72_defconfig index 4a3fd82c2a0c..b47c8abe85bc 100644 --- a/arch/arm/configs/palmz72_defconfig +++ b/arch/arm/configs/palmz72_defconfig @@ -7,8 +7,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_ARCH_PXA_PALM=y # CONFIG_MACH_PALMTX is not set diff --git a/arch/arm/configs/pcm027_defconfig b/arch/arm/configs/pcm027_defconfig index a8c53228b0c1..e97a158081fc 100644 --- a/arch/arm/configs/pcm027_defconfig +++ b/arch/arm/configs/pcm027_defconfig @@ -13,8 +13,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_MACH_PCM027=y CONFIG_MACH_PCM990_BASEBOARD=y diff --git a/arch/arm/configs/pleb_defconfig b/arch/arm/configs/pleb_defconfig index f0541b060cfa..2170148b975c 100644 --- a/arch/arm/configs/pleb_defconfig +++ b/arch/arm/configs/pleb_defconfig @@ -6,8 +6,6 @@ CONFIG_EXPERT=y # CONFIG_HOTPLUG is not set # CONFIG_SHMEM is not set CONFIG_MODULES=y -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_SA1100=y CONFIG_SA1100_PLEB=y CONFIG_ZBOOT_ROM_TEXT=0x0 diff --git a/arch/arm/configs/realview_defconfig b/arch/arm/configs/realview_defconfig index 8a056cc0c1ec..70e2c74a9f32 100644 --- a/arch/arm/configs/realview_defconfig +++ b/arch/arm/configs/realview_defconfig @@ -8,7 +8,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_MULTI_V6=y CONFIG_ARCH_REALVIEW=y CONFIG_MACH_REALVIEW_EB=y diff --git a/arch/arm/configs/sama5_defconfig b/arch/arm/configs/sama5_defconfig index 27f6135c4ee7..bab7861443dc 100644 --- a/arch/arm/configs/sama5_defconfig +++ b/arch/arm/configs/sama5_defconfig @@ -14,8 +14,6 @@ CONFIG_MODULE_FORCE_LOAD=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_AT91=y CONFIG_SOC_SAMA5D2=y CONFIG_SOC_SAMA5D3=y @@ -182,7 +180,6 @@ CONFIG_USB_GADGET=y CONFIG_USB_ATMEL_USBA=y CONFIG_USB_G_SERIAL=y CONFIG_MMC=y -# CONFIG_MMC_BLOCK_BOUNCE is not set CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_SDHCI_OF_AT91=y diff --git a/arch/arm/configs/stm32_defconfig b/arch/arm/configs/stm32_defconfig index 152321d2893e..551db328009d 100644 --- a/arch/arm/configs/stm32_defconfig +++ b/arch/arm/configs/stm32_defconfig @@ -14,8 +14,6 @@ CONFIG_EMBEDDED=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_SLUB_DEBUG is not set # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set # CONFIG_MMU is not set CONFIG_ARCH_STM32=y CONFIG_CPU_V7M_NUM_IRQ=240 diff --git a/arch/arm/configs/u300_defconfig b/arch/arm/configs/u300_defconfig index 8223397db047..543f07338100 100644 --- a/arch/arm/configs/u300_defconfig +++ b/arch/arm/configs/u300_defconfig @@ -11,7 +11,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set CONFIG_PARTITION_ADVANCED=y -# CONFIG_IOSCHED_CFQ is not set # CONFIG_ARCH_MULTI_V7 is not set CONFIG_ARCH_U300=y CONFIG_MACH_U300_SPIDUMMY=y @@ -46,7 +45,6 @@ CONFIG_FB=y CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_USB_SUPPORT is not set CONFIG_MMC=y -# CONFIG_MMC_BLOCK_BOUNCE is not set CONFIG_MMC_ARMMMCI=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set diff --git a/arch/arm/configs/vexpress_defconfig b/arch/arm/configs/vexpress_defconfig index 25753552277a..c01baf7d6e37 100644 --- a/arch/arm/configs/vexpress_defconfig +++ b/arch/arm/configs/vexpress_defconfig @@ -15,8 +15,6 @@ CONFIG_OPROFILE=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_DEADLINE is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_VEXPRESS=y CONFIG_ARCH_VEXPRESS_DCSCB=y CONFIG_ARCH_VEXPRESS_TC2_PM=y diff --git a/arch/arm/configs/viper_defconfig b/arch/arm/configs/viper_defconfig index 2ff16168d9c2..989599ce5300 100644 --- a/arch/arm/configs/viper_defconfig +++ b/arch/arm/configs/viper_defconfig @@ -9,7 +9,6 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_ARCH_VIPER=y CONFIG_IWMMXT=y diff --git a/arch/arm/configs/zeus_defconfig b/arch/arm/configs/zeus_defconfig index aa3023c9a011..d3b98c4d225b 100644 --- a/arch/arm/configs/zeus_defconfig +++ b/arch/arm/configs/zeus_defconfig @@ -4,7 +4,6 @@ CONFIG_LOG_BUF_SHIFT=13 CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_PXA=y CONFIG_MACH_ARCOM_ZEUS=y CONFIG_PCCARD=m @@ -137,7 +136,6 @@ CONFIG_USB_MASS_STORAGE=m CONFIG_USB_G_SERIAL=m CONFIG_USB_G_PRINTER=m CONFIG_MMC=y -# CONFIG_MMC_BLOCK_BOUNCE is not set CONFIG_MMC_PXA=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=m diff --git a/arch/arm/configs/zx_defconfig b/arch/arm/configs/zx_defconfig index 4d2ef785ed34..a046a492bfa7 100644 --- a/arch/arm/configs/zx_defconfig +++ b/arch/arm/configs/zx_defconfig @@ -16,7 +16,6 @@ CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y CONFIG_SLAB=y # CONFIG_BLK_DEV_BSG is not set -# CONFIG_IOSCHED_CFQ is not set CONFIG_ARCH_ZX=y CONFIG_SOC_ZX296702=y # CONFIG_SWP_EMULATE is not set -- cgit v1.2.3 From 6f4261fa86dfe08c34ad99eba66368f43e9dd4c3 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Tue, 14 Jan 2020 17:23:05 -0800 Subject: drm/i915/dsi: Lookup the i2c bus from ACPI NS only if CONFIG_ACPI=y (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Perform the i2c bus/adapter lookup from ACPI Namespace only if ACPI is enabled in the kernel config. If ACPI is not enabled or if the lookup fails, we'll fallback to using the VBT for identifying the i2c bus. v2: Add fixes tag (Jani) Fixes: 8cbf89db2941 ("drm/i915/dsi: Parse the I2C element from the VBT MIPI sequence block (v3)") Cc: Hans de Goede Cc: Nabendu Maiti Cc: Matt Roper Cc: Bob Paauwe Cc: Ville Syrjälä Cc: Jani Nikula Cc: Zhang Xiaoxu Reported-by: Hulk Robot Signed-off-by: Vivek Kasireddy Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200115012305.27395-1-vivek.kasireddy@intel.com (cherry picked from commit 960287ca58fd549af9826ff1cb735fe17d031486) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dsi_vbt.c | 47 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c index 89fb0d90b694..6ec35d975bd7 100644 --- a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c +++ b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c @@ -384,6 +384,7 @@ static const u8 *mipi_exec_gpio(struct intel_dsi *intel_dsi, const u8 *data) return data; } +#ifdef CONFIG_ACPI static int i2c_adapter_lookup(struct acpi_resource *ares, void *data) { struct i2c_adapter_lookup *lookup = data; @@ -413,14 +414,41 @@ static int i2c_adapter_lookup(struct acpi_resource *ares, void *data) return 1; } -static const u8 *mipi_exec_i2c(struct intel_dsi *intel_dsi, const u8 *data) +static void i2c_acpi_find_adapter(struct intel_dsi *intel_dsi, + const u16 slave_addr) { struct drm_device *drm_dev = intel_dsi->base.base.dev; struct device *dev = &drm_dev->pdev->dev; - struct i2c_adapter *adapter; struct acpi_device *acpi_dev; struct list_head resource_list; struct i2c_adapter_lookup lookup; + + acpi_dev = ACPI_COMPANION(dev); + if (acpi_dev) { + memset(&lookup, 0, sizeof(lookup)); + lookup.slave_addr = slave_addr; + lookup.intel_dsi = intel_dsi; + lookup.dev_handle = acpi_device_handle(acpi_dev); + + INIT_LIST_HEAD(&resource_list); + acpi_dev_get_resources(acpi_dev, &resource_list, + i2c_adapter_lookup, + &lookup); + acpi_dev_free_resource_list(&resource_list); + } +} +#else +static inline void i2c_acpi_find_adapter(struct intel_dsi *intel_dsi, + const u16 slave_addr) +{ +} +#endif + +static const u8 *mipi_exec_i2c(struct intel_dsi *intel_dsi, const u8 *data) +{ + struct drm_device *drm_dev = intel_dsi->base.base.dev; + struct device *dev = &drm_dev->pdev->dev; + struct i2c_adapter *adapter; struct i2c_msg msg; int ret; u8 vbt_i2c_bus_num = *(data + 2); @@ -431,20 +459,7 @@ static const u8 *mipi_exec_i2c(struct intel_dsi *intel_dsi, const u8 *data) if (intel_dsi->i2c_bus_num < 0) { intel_dsi->i2c_bus_num = vbt_i2c_bus_num; - - acpi_dev = ACPI_COMPANION(dev); - if (acpi_dev) { - memset(&lookup, 0, sizeof(lookup)); - lookup.slave_addr = slave_addr; - lookup.intel_dsi = intel_dsi; - lookup.dev_handle = acpi_device_handle(acpi_dev); - - INIT_LIST_HEAD(&resource_list); - acpi_dev_get_resources(acpi_dev, &resource_list, - i2c_adapter_lookup, - &lookup); - acpi_dev_free_resource_list(&resource_list); - } + i2c_acpi_find_adapter(intel_dsi, slave_addr); } adapter = i2c_get_adapter(intel_dsi->i2c_bus_num); -- cgit v1.2.3 From 0887aa8744aea22c10cd4c36746596d67fa8da98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Wed, 15 Jan 2020 21:08:09 +0200 Subject: drm/i915: Fix post-fastset modeset check for port sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-fastset "does anyone still need a full modeset?" for port sync looks busted. The outer loop bails out of a full modeset is still needed by the current crtc, and then we skip forcing a full modeset on the related crtcs. That's totally the opposite of what we want. The MST path has the logic mostly the other way around so it looks correct. To fix the port sync case let's follow the MST logic for both. So, if the current crtc already needs a modeset we do nothing. otherwise we check if any of the related crtcs needs a modeset, and if so we force a full modeset for the current crtc. And while at let's change the else if to a plain if to so we don't have needless coupling between the MST and port sync checks. Cc: José Roberto de Souza Cc: Manasi Navare Fixes: 05a8e45136ca ("drm/i915/display: Use external dependency loop for port sync") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20200115190813.17971-1-ville.syrjala@linux.intel.com Reviewed-by: José Roberto de Souza (cherry picked from commit d0eed1545fe75f115a548691a008e94b0e7abc45) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display.c | 43 +++++++++++----------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 19ea842cfd84..a410a213bd30 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -14476,37 +14476,23 @@ static int intel_atomic_check_crtcs(struct intel_atomic_state *state) return 0; } -static bool intel_cpu_transcoder_needs_modeset(struct intel_atomic_state *state, - enum transcoder transcoder) +static bool intel_cpu_transcoders_need_modeset(struct intel_atomic_state *state, + u8 transcoders) { - struct intel_crtc_state *new_crtc_state; + const struct intel_crtc_state *new_crtc_state; struct intel_crtc *crtc; int i; - for_each_new_intel_crtc_in_state(state, crtc, new_crtc_state, i) - if (new_crtc_state->cpu_transcoder == transcoder) - return needs_modeset(new_crtc_state); + for_each_new_intel_crtc_in_state(state, crtc, new_crtc_state, i) { + if (new_crtc_state->hw.enable && + transcoders & BIT(new_crtc_state->cpu_transcoder) && + needs_modeset(new_crtc_state)) + return true; + } return false; } -static void -intel_modeset_synced_crtcs(struct intel_atomic_state *state, - u8 transcoders) -{ - struct intel_crtc_state *new_crtc_state; - struct intel_crtc *crtc; - int i; - - for_each_new_intel_crtc_in_state(state, crtc, - new_crtc_state, i) { - if (transcoders & BIT(new_crtc_state->cpu_transcoder)) { - new_crtc_state->uapi.mode_changed = true; - new_crtc_state->update_pipe = false; - } - } -} - static int intel_modeset_all_tiles(struct intel_atomic_state *state, int tile_grp_id) { @@ -14662,15 +14648,20 @@ static int intel_atomic_check(struct drm_device *dev, if (intel_dp_mst_is_slave_trans(new_crtc_state)) { enum transcoder master = new_crtc_state->mst_master_transcoder; - if (intel_cpu_transcoder_needs_modeset(state, master)) { + if (intel_cpu_transcoders_need_modeset(state, BIT(master))) { new_crtc_state->uapi.mode_changed = true; new_crtc_state->update_pipe = false; } - } else if (is_trans_port_sync_mode(new_crtc_state)) { + } + + if (is_trans_port_sync_mode(new_crtc_state)) { u8 trans = new_crtc_state->sync_mode_slaves_mask | BIT(new_crtc_state->master_transcoder); - intel_modeset_synced_crtcs(state, trans); + if (intel_cpu_transcoders_need_modeset(state, trans)) { + new_crtc_state->uapi.mode_changed = true; + new_crtc_state->update_pipe = false; + } } } -- cgit v1.2.3 From 1788fdf14e518e363bae9d18345d93102f4ee5ad Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Fri, 17 Jan 2020 16:58:48 -0800 Subject: drm/i915/dsi: Ensure that the ACPI adapter lookup overrides the bus num Remove the i2c_bus_num >= 0 check from the adapter lookup function as this would prevent ACPI bus number override. This check was mainly there to return early if the bus number has already been found but we anyway return in the next line if the slave address does not match. Fixes: 8cbf89db2941 ("drm/i915/dsi: Parse the I2C element from the VBT MIPI sequence block (v3)") Cc: Hans de Goede Cc: Nabendu Maiti Cc: Matt Roper Cc: Bob Paauwe Cc: Jani Nikula Signed-off-by: Vivek Kasireddy Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200118005848.20382-1-vivek.kasireddy@intel.com (cherry picked from commit de409661c4c90d63cfc64579edbad0a6b10bd50d) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dsi_vbt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c index 6ec35d975bd7..04f953ba8f00 100644 --- a/drivers/gpu/drm/i915/display/intel_dsi_vbt.c +++ b/drivers/gpu/drm/i915/display/intel_dsi_vbt.c @@ -394,8 +394,7 @@ static int i2c_adapter_lookup(struct acpi_resource *ares, void *data) acpi_handle adapter_handle; acpi_status status; - if (intel_dsi->i2c_bus_num >= 0 || - !i2c_acpi_get_i2c_resource(ares, &sb)) + if (!i2c_acpi_get_i2c_resource(ares, &sb)) return 1; if (lookup->slave_addr != sb->slave_address) -- cgit v1.2.3 From e73c1486e4c867865fff1cfa0f0315a107ff4c21 Mon Sep 17 00:00:00 2001 From: Vandita Kulkarni Date: Fri, 24 Jan 2020 18:28:29 +0530 Subject: drm/i915/bios: Fix the timing parameters Fix htotal and vtotal parameters derived from DTD block of VBT. The values miss the back porch. Fixes: 33ef6d4fd8df ("drm/i915/vbt: Handle generic DTD block") Signed-off-by: Vandita Kulkarni Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200124125829.16973-1-vandita.kulkarni@intel.com (cherry picked from commit ad278f358446707d03a1fe89f880e6ac80ca06cd) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_bios.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_bios.c b/drivers/gpu/drm/i915/display/intel_bios.c index 8beac06e3f10..ef4017a1baba 100644 --- a/drivers/gpu/drm/i915/display/intel_bios.c +++ b/drivers/gpu/drm/i915/display/intel_bios.c @@ -357,14 +357,16 @@ parse_generic_dtd(struct drm_i915_private *dev_priv, panel_fixed_mode->hdisplay + dtd->hfront_porch; panel_fixed_mode->hsync_end = panel_fixed_mode->hsync_start + dtd->hsync; - panel_fixed_mode->htotal = panel_fixed_mode->hsync_end; + panel_fixed_mode->htotal = + panel_fixed_mode->hdisplay + dtd->hblank; panel_fixed_mode->vdisplay = dtd->vactive; panel_fixed_mode->vsync_start = panel_fixed_mode->vdisplay + dtd->vfront_porch; panel_fixed_mode->vsync_end = panel_fixed_mode->vsync_start + dtd->vsync; - panel_fixed_mode->vtotal = panel_fixed_mode->vsync_end; + panel_fixed_mode->vtotal = + panel_fixed_mode->vdisplay + dtd->vblank; panel_fixed_mode->clock = dtd->pixel_clock; panel_fixed_mode->width_mm = dtd->width_mm; -- cgit v1.2.3 From 00bcda13dcbf6bf7fa6f2a5886dd555362de8cfa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 8 Feb 2020 19:13:32 -0700 Subject: io-wq: make io_wqe_cancel_work() take a match handler We want to use the cancel functionality for canceling based on not just the work itself. Instead of matching on the work address manually, allow a match handler to tell us if we found the right work item or not. No functional changes in this patch. Signed-off-by: Jens Axboe --- fs/io-wq.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 7ac4a8876a50..df78de33ff84 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -937,17 +937,19 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return ret; } +struct work_match { + bool (*fn)(struct io_wq_work *, void *data); + void *data; +}; + static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { - struct io_wq_work *work = data; + struct work_match *match = data; unsigned long flags; bool ret = false; - if (worker->cur_work != work) - return false; - spin_lock_irqsave(&worker->lock, flags); - if (worker->cur_work == work && + if (match->fn(worker->cur_work, match->data) && !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { send_sig(SIGINT, worker->task, 1); ret = true; @@ -958,15 +960,13 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, - struct io_wq_work *cwork) + struct work_match *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; - cwork->flags |= IO_WQ_WORK_CANCEL; - /* * First check pending list, if we're lucky we can just remove it * from there. CANCEL_OK means that the work is returned as-new, @@ -976,7 +976,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); - if (work == cwork) { + if (match->fn(work, match->data)) { wq_node_del(&wqe->work_list, node, prev); found = true; break; @@ -997,20 +997,31 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, * completion will run normally in this case. */ rcu_read_lock(); - found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork); + found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); rcu_read_unlock(); return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; } +static bool io_wq_work_match(struct io_wq_work *work, void *data) +{ + return work == data; +} + enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { + struct work_match match = { + .fn = io_wq_work_match, + .data = cwork + }; enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; int node; + cwork->flags |= IO_WQ_WORK_CANCEL; + for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - ret = io_wqe_cancel_work(wqe, cwork); + ret = io_wqe_cancel_work(wqe, &match); if (ret != IO_WQ_CANCEL_NOTFOUND) break; } -- cgit v1.2.3 From 36282881a795cbf717aca79392ae9cdf0fef59c9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 8 Feb 2020 19:16:39 -0700 Subject: io-wq: add io_wq_cancel_pid() to cancel based on a specific pid Add a helper that allows the caller to cancel work based on what mm it belongs to. This allows io_uring to cancel work from a given task or thread when it exits. Signed-off-by: Jens Axboe --- fs/io-wq.c | 29 +++++++++++++++++++++++++++++ fs/io-wq.h | 2 ++ 2 files changed, 31 insertions(+) diff --git a/fs/io-wq.c b/fs/io-wq.c index df78de33ff84..182aa17dc2ca 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1029,6 +1029,35 @@ enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) return ret; } +static bool io_wq_pid_match(struct io_wq_work *work, void *data) +{ + pid_t pid = (pid_t) (unsigned long) data; + + if (work) + return work->task_pid == pid; + return false; +} + +enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) +{ + struct work_match match = { + .fn = io_wq_pid_match, + .data = (void *) (unsigned long) pid + }; + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + int node; + + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + ret = io_wqe_cancel_work(wqe, &match); + if (ret != IO_WQ_CANCEL_NOTFOUND) + break; + } + + return ret; +} + struct io_wq_flush_data { struct io_wq_work work; struct completion done; diff --git a/fs/io-wq.h b/fs/io-wq.h index f152ba677d8f..ccc7d84af57d 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -76,6 +76,7 @@ struct io_wq_work { const struct cred *creds; struct fs_struct *fs; unsigned flags; + pid_t task_pid; }; #define INIT_IO_WORK(work, _func) \ @@ -109,6 +110,7 @@ void io_wq_flush(struct io_wq *wq); void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); +enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); -- cgit v1.2.3 From 6ab231448fdc5e37c15a94a4700fca11e80007f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 8 Feb 2020 20:23:59 -0700 Subject: io_uring: cancel pending async work if task exits Normally we cancel all work we track, but for untracked work we could leave the async worker behind until that work completes. This is totally fine, but does leave resources pending after the task is gone until that work completes. Cancel work that this task queued up when it goes away. Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 24ebd5714bf9..971d51c50151 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -925,6 +925,8 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, } spin_unlock(¤t->fs->lock); } + if (!req->work.task_pid) + req->work.task_pid = task_pid_vnr(current); } static inline void io_req_work_drop_env(struct io_kiocb *req) @@ -6474,6 +6476,13 @@ static int io_uring_flush(struct file *file, void *data) struct io_ring_ctx *ctx = file->private_data; io_uring_cancel_files(ctx, data); + + /* + * If the task is going away, cancel work it may have pending + */ + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); + return 0; } -- cgit v1.2.3 From c2cebbc4a593bd2ee72d46a8439dcbca512b5507 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Fri, 17 Jan 2020 15:34:36 +0800 Subject: drm/i915: Fix i915_error_state_store error defination Since commit 742379c0c4001 ("drm/i915: Start chopping up the GPU error capture"), function 'i915_error_state_store' was defined and used with only one parameter. But if no 'CONFIG_DRM_I915_CAPTURE_ERROR', this function was defined with two parameter. This may lead compile error. This patch fix it. Fixes: 742379c0c400 ("drm/i915: Start chopping up the GPU error capture") Reported-by: Hulk Robot Signed-off-by: Zhang Xiaoxu Reviewed-by: Andi Shyti Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20200117073436.6507-1-zhangxiaoxu5@huawei.com (cherry picked from commit 04062c58faafddf62006c6f8e5077dc050e8207e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gpu_error.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 9109004956bd..41c1475e1500 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -314,8 +314,7 @@ i915_vma_capture_finish(struct intel_gt_coredump *gt, } static inline void -i915_error_state_store(struct drm_i915_private *i915, - struct i915_gpu_coredump *error) +i915_error_state_store(struct i915_gpu_coredump *error) { } -- cgit v1.2.3 From b537916ca5107c3a8714b8ab3099c0ec205aec12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 9 Feb 2020 11:29:15 -0700 Subject: io_uring: retain sockaddr_storage across send/recvmsg async punt Jonas reports that he sometimes sees -97/-22 error returns from sendmsg, if it gets punted async. This is due to not retaining the sockaddr_storage between calls. Include that in the state we copy when going async. Cc: stable@vger.kernel.org # 5.3+ Reported-by: Jonas Bonn Tested-by: Jonas Bonn Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 971d51c50151..6d4e20d59729 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -442,6 +442,7 @@ struct io_async_msghdr { struct iovec *iov; struct sockaddr __user *uaddr; struct msghdr msg; + struct sockaddr_storage addr; }; struct io_async_rw { @@ -3032,12 +3033,11 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { struct io_async_ctx io; - struct sockaddr_storage addr; unsigned flags; if (req->io) { kmsg = &req->io->msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &req->io->msg.addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -3046,7 +3046,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, struct io_sr_msg *sr = &req->sr_msg; kmsg = &io.msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &io.msg.addr; io.msg.iov = io.msg.fast_iov; ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, @@ -3185,12 +3185,11 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { struct io_async_ctx io; - struct sockaddr_storage addr; unsigned flags; if (req->io) { kmsg = &req->io->msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &req->io->msg.addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -3199,7 +3198,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, struct io_sr_msg *sr = &req->sr_msg; kmsg = &io.msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &io.msg.addr; io.msg.iov = io.msg.fast_iov; ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, -- cgit v1.2.3 From c216f12bed33f779b974cb2d69206d6202bde572 Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Wed, 22 Jan 2020 20:10:24 +0000 Subject: drm/i915/gvt: fix high-order allocation failure on late load If the module happens to be loaded later at runtime there is a chance memory is already fragmented enough to fail allocation of firmware blob storage and consequently GVT init. Since it doesn't seem to be necessary to have the blob contiguous, use vmalloc() instead to avoid the issue. Reviewed-by: Zhenyu Wang Signed-off-by: Igor Druzhinin Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/1579723824-25711-1-git-send-email-igor.druzhinin@citrix.com --- drivers/gpu/drm/i915/gvt/firmware.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/firmware.c b/drivers/gpu/drm/i915/gvt/firmware.c index 049775e8e350..b0c1fda32977 100644 --- a/drivers/gpu/drm/i915/gvt/firmware.c +++ b/drivers/gpu/drm/i915/gvt/firmware.c @@ -146,7 +146,7 @@ void intel_gvt_free_firmware(struct intel_gvt *gvt) clean_firmware_sysfs(gvt); kfree(gvt->firmware.cfg_space); - kfree(gvt->firmware.mmio); + vfree(gvt->firmware.mmio); } static int verify_firmware(struct intel_gvt *gvt, @@ -229,7 +229,7 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt) firmware->cfg_space = mem; - mem = kmalloc(info->mmio_size, GFP_KERNEL); + mem = vmalloc(info->mmio_size); if (!mem) { kfree(path); kfree(firmware->cfg_space); -- cgit v1.2.3 From 0e9d7bb293f3f9c3ee376b126141407efb265f31 Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Mon, 3 Feb 2020 15:07:01 +0000 Subject: drm/i915/gvt: more locking for ppgtt mm LRU list When the lock was introduced in commit 72aabfb862e40 ("drm/i915/gvt: Add mutual lock for ppgtt mm LRU list") one place got lost. Fixes: 72aabfb862e4 ("drm/i915/gvt: Add mutual lock for ppgtt mm LRU list") Signed-off-by: Igor Druzhinin Reviewed-by: Zhenyu Wang Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/1580742421-25194-1-git-send-email-igor.druzhinin@citrix.com --- drivers/gpu/drm/i915/gvt/gtt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 4b04af569c05..7dc7bb850d0a 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -1956,7 +1956,11 @@ void _intel_vgpu_mm_release(struct kref *mm_ref) if (mm->type == INTEL_GVT_MM_PPGTT) { list_del(&mm->ppgtt_mm.list); + + mutex_lock(&mm->vgpu->gvt->gtt.ppgtt_mm_lock); list_del(&mm->ppgtt_mm.lru_list); + mutex_unlock(&mm->vgpu->gvt->gtt.ppgtt_mm_lock); + invalidate_ppgtt_mm(mm); } else { vfree(mm->ggtt_mm.virtual_ggtt); -- cgit v1.2.3 From cf2b012c90e74e85d8aea7d67e48868069cfee0c Mon Sep 17 00:00:00 2001 From: Mike Jones Date: Tue, 28 Jan 2020 10:59:59 -0700 Subject: hwmon: (pmbus/ltc2978) Fix PMBus polling of MFR_COMMON definitions. Change 21537dc driver PMBus polling of MFR_COMMON from bits 5/4 to bits 6/5. This fixs a LTC297X family bug where polling always returns not busy even when the part is busy. This fixes a LTC388X and LTM467X bug where polling used PEND and NOT_IN_TRANS, and BUSY was not polled, which can lead to NACKing of commands. LTC388X and LTM467X modules now poll BUSY and PEND, increasing reliability by eliminating NACKing of commands. Signed-off-by: Mike Jones Link: https://lore.kernel.org/r/1580234400-2829-2-git-send-email-michael-a1.jones@analog.com Fixes: e04d1ce9bbb49 ("hwmon: (ltc2978) Add polling for chips requiring it") Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/ltc2978.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/pmbus/ltc2978.c b/drivers/hwmon/pmbus/ltc2978.c index f01f4887fb2e..a91ed01abb68 100644 --- a/drivers/hwmon/pmbus/ltc2978.c +++ b/drivers/hwmon/pmbus/ltc2978.c @@ -82,8 +82,8 @@ enum chips { ltc2974, ltc2975, ltc2977, ltc2978, ltc2980, ltc3880, ltc3882, #define LTC_POLL_TIMEOUT 100 /* in milli-seconds */ -#define LTC_NOT_BUSY BIT(5) -#define LTC_NOT_PENDING BIT(4) +#define LTC_NOT_BUSY BIT(6) +#define LTC_NOT_PENDING BIT(5) /* * LTC2978 clears peak data whenever the CLEAR_FAULTS command is executed, which -- cgit v1.2.3 From f166795871be4a6a679a5f61ac7130b3c0b21cab Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 24 Jan 2020 22:08:33 +0100 Subject: arm64: defconfig: Set bcm2835-dma as built-in With the introduction of 738987a1d6f1 ("mmc: bcm2835: Use dma_request_chan() instead dma_request_slave_channel()") sdhost-bcm2835 now waits for its DMA channel to be available when defined in the device-tree (it would previously default to PIO). Albeit the right behaviour, the MMC host is needed for booting. So this makes sure the DMA channel shows up in time. Fixes: 738987a1d6f1 ("mmc: bcm2835: Use dma_request_chan() instead dma_request_slave_channel()") Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Florian Fainelli --- arch/arm64/configs/defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 0f212889c931..b598bd7b7d62 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -681,7 +681,7 @@ CONFIG_RTC_DRV_SNVS=m CONFIG_RTC_DRV_IMX_SC=m CONFIG_RTC_DRV_XGENE=y CONFIG_DMADEVICES=y -CONFIG_DMA_BCM2835=m +CONFIG_DMA_BCM2835=y CONFIG_DMA_SUN6I=m CONFIG_FSL_EDMA=y CONFIG_IMX_SDMA=y -- cgit v1.2.3 From 415ae604d4ac0c0f4e1ecd79f42891b9ca69cf70 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 29 Dec 2019 11:17:07 +0000 Subject: ARM: sunxi: Enable CONFIG_SUN8I_THERMAL Many sunxi based board needs CONFIG_SUN8I_THERMAL for thermal support. Signed-off-by: Yangtao Li Signed-off-by: Maxime Ripard --- arch/arm/configs/sunxi_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/sunxi_defconfig b/arch/arm/configs/sunxi_defconfig index 3f5d727efc41..e9fb57374b9f 100644 --- a/arch/arm/configs/sunxi_defconfig +++ b/arch/arm/configs/sunxi_defconfig @@ -85,6 +85,7 @@ CONFIG_BATTERY_AXP20X=y CONFIG_AXP20X_POWER=y CONFIG_THERMAL=y CONFIG_CPU_THERMAL=y +CONFIG_SUN8I_THERMAL=y CONFIG_WATCHDOG=y CONFIG_SUNXI_WATCHDOG=y CONFIG_MFD_AC100=y -- cgit v1.2.3 From 4a453ccf87d507e8c9f156f95e708cec0e70ffed Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sun, 29 Dec 2019 11:17:06 +0000 Subject: arm64: defconfig: Enable CONFIG_SUN8I_THERMAL Many sunxi based board needs CONFIG_SUN8I_THERMAL for thermal support. Signed-off-by: Yangtao Li Signed-off-by: Maxime Ripard --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 0f212889c931..db873d8e03e9 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -452,6 +452,7 @@ CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y CONFIG_CPU_THERMAL=y CONFIG_THERMAL_EMULATION=y CONFIG_QORIQ_THERMAL=m +CONFIG_SUN8I_THERMAL=y CONFIG_ROCKCHIP_THERMAL=m CONFIG_RCAR_THERMAL=y CONFIG_RCAR_GEN3_THERMAL=y -- cgit v1.2.3 From 03c6bf4644287601bf10d0ed9f6137c1854d3e23 Mon Sep 17 00:00:00 2001 From: Jagan Teki Date: Tue, 31 Dec 2019 12:25:08 +0530 Subject: arm64: defconfig: Enable DRM_SUN6I_DSI Now, Allwiner MIPI-DSI support is available for ARM64 Allwinner SoC like A64. So, let's build it as a module. Signed-off-by: Jagan Teki Signed-off-by: Maxime Ripard --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index db873d8e03e9..4631a1190719 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -548,6 +548,7 @@ CONFIG_ROCKCHIP_DW_MIPI_DSI=y CONFIG_ROCKCHIP_INNO_HDMI=y CONFIG_DRM_RCAR_DU=m CONFIG_DRM_SUN4I=m +CONFIG_DRM_SUN6I_DSI=m CONFIG_DRM_SUN8I_DW_HDMI=m CONFIG_DRM_SUN8I_MIXER=m CONFIG_DRM_MSM=m -- cgit v1.2.3 From c664a4fa8f69308b8f624cff4fa1294e9aef880d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 29 Jan 2020 20:30:37 +0300 Subject: USB: serial: ir-usb: Silence harmless uninitialized variable warning The "actual_length" variable might be uninitialized on some failure paths. It's harmless but static analysis tools like Smatch complain and at runtime the UBSan tool will likely complain as well. Fixes: e7542bc382f8 ("USB: serial: ir-usb: make set_termios synchronous") Signed-off-by: Dan Carpenter Signed-off-by: Johan Hovold --- drivers/usb/serial/ir-usb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c index 79d0586e2b33..172261a908d8 100644 --- a/drivers/usb/serial/ir-usb.c +++ b/drivers/usb/serial/ir-usb.c @@ -448,7 +448,7 @@ static void ir_set_termios(struct tty_struct *tty, usb_sndbulkpipe(udev, port->bulk_out_endpointAddress), transfer_buffer, 1, &actual_length, 5000); if (ret || actual_length != 1) { - if (actual_length != 1) + if (!ret) ret = -EIO; dev_err(&port->dev, "failed to change line speed: %d\n", ret); } -- cgit v1.2.3 From 7c3d02285ad558691f27fde760bcd841baa27eab Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 6 Feb 2020 12:18:19 +0100 Subject: USB: serial: ch341: fix receiver regression While assumed not to make a difference, not using the factor-2 prescaler makes the receiver more susceptible to errors. Specifically, there have been reports of problems with devices that cannot generate a 115200 rate with a smaller error than 2.1% (e.g. 117647 bps). But this can also be reproduced with a low-speed RS232 tranceiver at 115200 when the input rate matches the nominal rate. So whenever possible, enable the factor-2 prescaler and halve the divisor in order to use settings closer to that of the previous algorithm. Fixes: 35714565089e ("USB: serial: ch341: reimplement line-speed handling") Cc: stable # 5.5 Reported-by: Jakub Nantl Tested-by: Jakub Nantl Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold --- drivers/usb/serial/ch341.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c index d3f420f3a083..c5ecdcd51ffc 100644 --- a/drivers/usb/serial/ch341.c +++ b/drivers/usb/serial/ch341.c @@ -205,6 +205,16 @@ static int ch341_get_divisor(speed_t speed) 16 * speed - 16 * CH341_CLKRATE / (clk_div * (div + 1))) div++; + /* + * Prefer lower base clock (fact = 0) if even divisor. + * + * Note that this makes the receiver more tolerant to errors. + */ + if (fact == 1 && div % 2 == 0) { + div /= 2; + fact = 0; + } + return (0x100 - div) << 8 | fact << 2 | ps; } -- cgit v1.2.3 From 8a6483ac634acda3f599f50082c652d2d37199c7 Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Mon, 9 Dec 2019 10:27:07 +0200 Subject: drm/bridge: tc358767: fix poll timeouts Link training fails with: Link training timeout waiting for LT_LOOPDONE! main link enable error: -110 This is caused by too tight timeouts, which were changed recently in aa92213f388b ("drm/bridge: tc358767: Simplify polling in tc_link_training()"). With a quick glance, the commit does not change the timeouts. However, the method of delaying/sleeping is different, and as the timeout in the previous implementation was not explicit, the new version in practice has much tighter timeout. The same change was made to other parts in the driver, but the link training timeout is the only one I have seen causing issues. Nevertheless, 1 us sleep is not very sane, and the timeouts look pretty tight, so lets fix all the timeouts. One exception was the aux busy poll, where the poll sleep was much longer than necessary (or optimal). I measured the times on my setup, and now the sleep times are set to such values that they result in multiple loops, but not too many (say, 5-10 loops). The timeouts were all increased to 100ms, which should be more than enough for all of these, but in case of bad errors, shouldn't stop the driver as multi-second timeouts could do. Signed-off-by: Tomi Valkeinen Fixes: aa92213f388b ("drm/bridge: tc358767: Simplify polling in tc_link_training()") Tested-by: Andrey Smirnov Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20191209082707.24531-1-tomi.valkeinen@ti.com --- drivers/gpu/drm/bridge/tc358767.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/bridge/tc358767.c b/drivers/gpu/drm/bridge/tc358767.c index 8029478ffebb..b0b0ccbb059d 100644 --- a/drivers/gpu/drm/bridge/tc358767.c +++ b/drivers/gpu/drm/bridge/tc358767.c @@ -297,7 +297,7 @@ static inline int tc_poll_timeout(struct tc_data *tc, unsigned int addr, static int tc_aux_wait_busy(struct tc_data *tc) { - return tc_poll_timeout(tc, DP0_AUXSTATUS, AUX_BUSY, 0, 1000, 100000); + return tc_poll_timeout(tc, DP0_AUXSTATUS, AUX_BUSY, 0, 100, 100000); } static int tc_aux_write_data(struct tc_data *tc, const void *data, @@ -640,7 +640,7 @@ static int tc_aux_link_setup(struct tc_data *tc) if (ret) goto err; - ret = tc_poll_timeout(tc, DP_PHY_CTRL, PHY_RDY, PHY_RDY, 1, 1000); + ret = tc_poll_timeout(tc, DP_PHY_CTRL, PHY_RDY, PHY_RDY, 100, 100000); if (ret == -ETIMEDOUT) { dev_err(tc->dev, "Timeout waiting for PHY to become ready"); return ret; @@ -876,7 +876,7 @@ static int tc_wait_link_training(struct tc_data *tc) int ret; ret = tc_poll_timeout(tc, DP0_LTSTAT, LT_LOOPDONE, - LT_LOOPDONE, 1, 1000); + LT_LOOPDONE, 500, 100000); if (ret) { dev_err(tc->dev, "Link training timeout waiting for LT_LOOPDONE!\n"); return ret; @@ -949,7 +949,7 @@ static int tc_main_link_enable(struct tc_data *tc) dp_phy_ctrl &= ~(DP_PHY_RST | PHY_M1_RST | PHY_M0_RST); ret = regmap_write(tc->regmap, DP_PHY_CTRL, dp_phy_ctrl); - ret = tc_poll_timeout(tc, DP_PHY_CTRL, PHY_RDY, PHY_RDY, 1, 1000); + ret = tc_poll_timeout(tc, DP_PHY_CTRL, PHY_RDY, PHY_RDY, 500, 100000); if (ret) { dev_err(dev, "timeout waiting for phy become ready"); return ret; -- cgit v1.2.3 From 48bc281e4bf049abd3bb98371209315651bf4a14 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 15 Jan 2020 13:56:53 +0100 Subject: drm/bridge: ti-tfp410: Update drm_connector_init_with_ddc() error message The code was changed to call drm_connector_init_with_ddc() instead of drm_connector_init(), but the corresponding error message was not updated. Fixes: cfb444552926989f ("drm/bridge: ti-tfp410: Provide ddc symlink in connector sysfs directory") Signed-off-by: Geert Uytterhoeven Reviewed-by: Neil Armstrong Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20200115125653.5519-1-geert+renesas@glider.be --- drivers/gpu/drm/bridge/ti-tfp410.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/ti-tfp410.c b/drivers/gpu/drm/bridge/ti-tfp410.c index 6f6d6d1e60ae..f195a4732e0b 100644 --- a/drivers/gpu/drm/bridge/ti-tfp410.c +++ b/drivers/gpu/drm/bridge/ti-tfp410.c @@ -140,7 +140,8 @@ static int tfp410_attach(struct drm_bridge *bridge) dvi->connector_type, dvi->ddc); if (ret) { - dev_err(dvi->dev, "drm_connector_init() failed: %d\n", ret); + dev_err(dvi->dev, "drm_connector_init_with_ddc() failed: %d\n", + ret); return ret; } -- cgit v1.2.3 From 2437fd7baf299c7b8a39fa3e727755e84ee7c4ea Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Mon, 10 Feb 2020 16:11:09 +0800 Subject: tipc: make three functions static Fix the following sparse warning: net/tipc/node.c:281:6: warning: symbol 'tipc_node_free' was not declared. Should it be static? net/tipc/node.c:2801:5: warning: symbol '__tipc_nl_node_set_key' was not declared. Should it be static? net/tipc/node.c:2878:5: warning: symbol '__tipc_nl_node_flush_key' was not declared. Should it be static? Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Fixes: e1f32190cf7d ("tipc: add support for AEAD key setting via netlink") Signed-off-by: Chen Wandun Signed-off-by: David S. Miller --- net/tipc/node.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 99b28b69fc17..0c88778c88b5 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -278,7 +278,7 @@ struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) } #endif -void tipc_node_free(struct rcu_head *rp) +static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); @@ -2798,7 +2798,7 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) return 0; } -int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); @@ -2875,7 +2875,8 @@ int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) return err; } -int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +static int __tipc_nl_node_flush_key(struct sk_buff *skb, + struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); -- cgit v1.2.3 From 5609e2bbefed38df060c2472af9cf0c469b8997f Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Mon, 10 Feb 2020 16:27:59 +0800 Subject: mptcp: make the symbol 'mptcp_sk_clone_lock' static Fix the following sparse warning: net/mptcp/protocol.c:646:13: warning: symbol 'mptcp_sk_clone_lock' was not declared. Should it be static? Fixes: b0519de8b3f1 ("mptcp: fix use-after-free for ipv6") Signed-off-by: Chen Wandun Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 73780b4cb108..030dee668e0a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -643,7 +643,7 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) } #endif -struct sock *mptcp_sk_clone_lock(const struct sock *sk) +static struct sock *mptcp_sk_clone_lock(const struct sock *sk) { struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); -- cgit v1.2.3 From 5391a87751a164b3194864126f3b016038abc9fe Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Mon, 10 Feb 2020 15:35:44 +0700 Subject: tipc: fix successful connect() but timed out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 9546a0b7ce00 ("tipc: fix wrong connect() return code"), we fixed the issue with the 'connect()' that returns zero even though the connecting has failed by waiting for the connection to be 'ESTABLISHED' really. However, the approach has one drawback in conjunction with our 'lightweight' connection setup mechanism that the following scenario can happen: (server) (client) +- accept()| | wait_for_conn() | | |connect() -------+ | |<-------[SYN]---------| > sleeping | | *CONNECTING | |--------->*ESTABLISHED | | |--------[ACK]-------->*ESTABLISHED > wakeup() send()|--------[DATA]------->|\ > wakeup() send()|--------[DATA]------->| | > wakeup() . . . . |-> recvq . . . . . | . send()|--------[DATA]------->|/ > wakeup() close()|--------[FIN]-------->*DISCONNECTING | *DISCONNECTING | | | ~~~~~~~~~~~~~~~~~~> schedule() | wait again . . | ETIMEDOUT Upon the receipt of the server 'ACK', the client becomes 'ESTABLISHED' and the 'wait_for_conn()' process is woken up but not run. Meanwhile, the server starts to send a number of data following by a 'close()' shortly without waiting any response from the client, which then forces the client socket to be 'DISCONNECTING' immediately. When the wait process is switched to be running, it continues to wait until the timer expires because of the unexpected socket state. The client 'connect()' will finally get ‘-ETIMEDOUT’ and force to release the socket whereas there remains the messages in its receive queue. Obviously the issue would not happen if the server had some delay prior to its 'close()' (or the number of 'DATA' messages is large enough), but any kind of delay would make the connection setup/shutdown "heavy". We solve this by simply allowing the 'connect()' returns zero in this particular case. The socket is already 'DISCONNECTING', so any further write will get '-EPIPE' but the socket is still able to read the messages existing in its receive queue. Note: This solution doesn't break the previous one as it deals with a different situation that the socket state is 'DISCONNECTING' but has no error (i.e. sk->sk_err = 0). Fixes: 9546a0b7ce00 ("tipc: fix wrong connect() return code") Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f9b4fb92c0b1..693e8902161e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2441,6 +2441,8 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) return -ETIMEDOUT; if (signal_pending(current)) return sock_intr_errno(*timeo_p); + if (sk->sk_state == TIPC_DISCONNECTING) + break; add_wait_queue(sk_sleep(sk), &wait); done = sk_wait_event(sk, timeo_p, tipc_sk_connected(sk), -- cgit v1.2.3 From e7598fac323aad0e502415edeffd567315994dd6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 10 Feb 2020 10:36:56 +0100 Subject: iommu/vt-d: Fix compile warning from intel-svm.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intel_svm_is_pasid_valid() needs to be marked inline, otherwise it causes the compile warning below: CC [M] drivers/dma/idxd/cdev.o In file included from drivers/dma/idxd/cdev.c:9:0: ./include/linux/intel-svm.h:125:12: warning: ‘intel_svm_is_pasid_valid’ defined but not used [-Wunused-function] static int intel_svm_is_pasid_valid(struct device *dev, int pasid) ^~~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Borislav Petkov Fixes: 15060aba71711 ('iommu/vt-d: Helper function to query if a pasid has any active users') Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 94f047a8a845..d7c403d0dd27 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -122,7 +122,7 @@ static inline int intel_svm_unbind_mm(struct device *dev, int pasid) BUG(); } -static int intel_svm_is_pasid_valid(struct device *dev, int pasid) +static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid) { return -EINVAL; } -- cgit v1.2.3 From d99f88761ba0d135677afe546ffdd26c58e5644d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 3 Feb 2020 14:03:37 +0100 Subject: Revert "gpiolib: Remove duplicated function gpio_do_set_config()" This reverts commit d18fddff061d2796525e6d4a958cb3d30aed8efd. This patch came on top of another patch that introduced a regression. Revert it before addressing the culprit. Signed-off-by: Bartosz Golaszewski Tested-by: Guenter Roeck --- drivers/gpio/gpiolib.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 753283486037..a3e4f5f5871f 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3035,8 +3035,8 @@ EXPORT_SYMBOL_GPL(gpiochip_free_own_desc); * rely on gpio_request() having been called beforehand. */ -static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, - enum pin_config_param mode) +static int gpio_do_set_config(struct gpio_chip *gc, unsigned int offset, + enum pin_config_param mode) { if (!gc->set_config) return -ENOTSUPP; @@ -3044,6 +3044,25 @@ static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, return gc->set_config(gc, offset, mode); } +static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, + enum pin_config_param mode) +{ + unsigned arg; + + switch (mode) { + case PIN_CONFIG_BIAS_DISABLE: + case PIN_CONFIG_BIAS_PULL_DOWN: + case PIN_CONFIG_BIAS_PULL_UP: + arg = 1; + break; + + default: + arg = 0; + } + + return gpio_do_set_config(gc, offset, mode); +} + static int gpio_set_bias(struct gpio_chip *chip, struct gpio_desc *desc) { int bias = 0; @@ -3277,7 +3296,7 @@ int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) chip = desc->gdev->chip; config = pinconf_to_config_packed(PIN_CONFIG_INPUT_DEBOUNCE, debounce); - return gpio_set_config(chip, gpio_chip_hwgpio(desc), config); + return gpio_do_set_config(chip, gpio_chip_hwgpio(desc), config); } EXPORT_SYMBOL_GPL(gpiod_set_debounce); @@ -3311,7 +3330,7 @@ int gpiod_set_transitory(struct gpio_desc *desc, bool transitory) packed = pinconf_to_config_packed(PIN_CONFIG_PERSIST_STATE, !transitory); gpio = gpio_chip_hwgpio(desc); - rc = gpio_set_config(chip, gpio, packed); + rc = gpio_do_set_config(chip, gpio, packed); if (rc == -ENOTSUPP) { dev_dbg(&desc->gdev->dev, "Persistence not supported for GPIO %d\n", gpio); -- cgit v1.2.3 From 91b4ea5fc57c6a0a1beea7056dc2f83e2ec6968c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 3 Feb 2020 14:03:47 +0100 Subject: Revert "gpiolib: remove set but not used variable 'config'" This reverts commit e5e42ad224a040f93bf112e96f82b3a0ed97ffab. This patch came on top of another patch that introduced a regression. Revert it before addressing the culprit. Signed-off-by: Bartosz Golaszewski Tested-by: Guenter Roeck --- drivers/gpio/gpiolib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index a3e4f5f5871f..c317567eeaa0 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3047,6 +3047,7 @@ static int gpio_do_set_config(struct gpio_chip *gc, unsigned int offset, static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, enum pin_config_param mode) { + unsigned long config; unsigned arg; switch (mode) { @@ -3060,6 +3061,7 @@ static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, arg = 0; } + config = PIN_CONF_PACKED(mode, arg); return gpio_do_set_config(gc, offset, mode); } -- cgit v1.2.3 From 62adc6f33d6f3c0f4831389dec0f82ea6e9a489f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 3 Feb 2020 14:16:16 +0100 Subject: gpiolib: fix gpio_do_set_config() Commit d90f36851d65 ("gpiolib: have a single place of calling set_config()") introduced a regression where we don't pass the right variable as argument to the set_config() callback of gpio driver from gpio_set_config(). After reverting two additional patches that came on top of it - this addresses the issue by changing the type of the last argument of gpio_do_set_config() to unsigned long and making sure the packed config variable is actually used in gpio_set_config(). Fixes: d90f36851d65 ("gpiolib: have a single place of calling set_config()") Signed-off-by: Bartosz Golaszewski Tested-by: Guenter Roeck --- drivers/gpio/gpiolib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index c317567eeaa0..2b3e665a36e0 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3036,12 +3036,12 @@ EXPORT_SYMBOL_GPL(gpiochip_free_own_desc); */ static int gpio_do_set_config(struct gpio_chip *gc, unsigned int offset, - enum pin_config_param mode) + unsigned long config) { if (!gc->set_config) return -ENOTSUPP; - return gc->set_config(gc, offset, mode); + return gc->set_config(gc, offset, config); } static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, @@ -3062,7 +3062,7 @@ static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, } config = PIN_CONF_PACKED(mode, arg); - return gpio_do_set_config(gc, offset, mode); + return gpio_do_set_config(gc, offset, config); } static int gpio_set_bias(struct gpio_chip *chip, struct gpio_desc *desc) -- cgit v1.2.3 From e4f41de77f96e199151117646ac6df4f18301c79 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 Jan 2020 18:03:25 +0200 Subject: MAINTAINERS: Sort entries in database for GPIO Run parse-maintainers.pl and choose GPIO records. Fix them accordingly. Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 38fe2f3f7b6f..a0d86490c2c6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2796,11 +2796,11 @@ F: drivers/block/aoe/ ATHEROS 71XX/9XXX GPIO DRIVER M: Alban Bedel +S: Maintained W: https://github.com/AlbanBedel/linux T: git git://github.com/AlbanBedel/linux -S: Maintained -F: drivers/gpio/gpio-ath79.c F: Documentation/devicetree/bindings/gpio/gpio-ath79.txt +F: drivers/gpio/gpio-ath79.c ATHEROS 71XX/9XXX USB PHY DRIVER M: Alban Bedel @@ -3422,8 +3422,8 @@ BROADCOM BRCMSTB GPIO DRIVER M: Gregory Fong L: bcm-kernel-feedback-list@broadcom.com S: Supported -F: drivers/gpio/gpio-brcmstb.c F: Documentation/devicetree/bindings/gpio/brcm,brcmstb-gpio.txt +F: drivers/gpio/gpio-brcmstb.c BROADCOM BRCMSTB I2C DRIVER M: Kamal Dasu @@ -3481,8 +3481,8 @@ BROADCOM KONA GPIO DRIVER M: Ray Jui L: bcm-kernel-feedback-list@broadcom.com S: Supported -F: drivers/gpio/gpio-bcm-kona.c F: Documentation/devicetree/bindings/gpio/brcm,kona-gpio.txt +F: drivers/gpio/gpio-bcm-kona.c BROADCOM NETXTREME-E ROCE DRIVER M: Selvin Xavier @@ -3597,8 +3597,8 @@ F: sound/pci/bt87x.c BT8XXGPIO DRIVER M: Michael Buesch -W: http://bu3sch.de/btgpio.php S: Maintained +W: http://bu3sch.de/btgpio.php F: drivers/gpio/gpio-bt8xx.c BTRFS FILE SYSTEM @@ -7143,18 +7143,18 @@ GPIO SUBSYSTEM M: Linus Walleij M: Bartosz Golaszewski L: linux-gpio@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio.git +F: Documentation/ABI/obsolete/sysfs-gpio +F: Documentation/ABI/testing/gpio-cdev +F: Documentation/admin-guide/gpio/ F: Documentation/devicetree/bindings/gpio/ F: Documentation/driver-api/gpio/ -F: Documentation/admin-guide/gpio/ -F: Documentation/ABI/testing/gpio-cdev -F: Documentation/ABI/obsolete/sysfs-gpio F: drivers/gpio/ +F: include/asm-generic/gpio.h F: include/linux/gpio/ F: include/linux/gpio.h F: include/linux/of_gpio.h -F: include/asm-generic/gpio.h F: include/uapi/linux/gpio.h F: tools/gpio/ @@ -8055,8 +8055,8 @@ F: drivers/scsi/ips.* ICH LPC AND GPIO DRIVER M: Peter Tyser S: Maintained -F: drivers/mfd/lpc_ich.c F: drivers/gpio/gpio-ich.c +F: drivers/mfd/lpc_ich.c ICY I2C DRIVER M: Max Staudt @@ -16075,8 +16075,8 @@ F: Documentation/devicetree/bindings/reset/snps,axs10x-reset.txt SYNOPSYS CREG GPIO DRIVER M: Eugeniy Paltsev S: Maintained -F: drivers/gpio/gpio-creg-snps.c F: Documentation/devicetree/bindings/gpio/snps,creg-gpio.txt +F: drivers/gpio/gpio-creg-snps.c SYNOPSYS DESIGNWARE 8250 UART DRIVER R: Andy Shevchenko @@ -16087,8 +16087,8 @@ SYNOPSYS DESIGNWARE APB GPIO DRIVER M: Hoan Tran L: linux-gpio@vger.kernel.org S: Maintained -F: drivers/gpio/gpio-dwapb.c F: Documentation/devicetree/bindings/gpio/snps-dwapb-gpio.txt +F: drivers/gpio/gpio-dwapb.c SYNOPSYS DESIGNWARE AXI DMAC DRIVER M: Eugeniy Paltsev @@ -18414,8 +18414,8 @@ M: Nandor Han M: Semi Malinen L: linux-gpio@vger.kernel.org S: Maintained -F: drivers/gpio/gpio-xra1403.c F: Documentation/devicetree/bindings/gpio/gpio-xra1403.txt +F: drivers/gpio/gpio-xra1403.c XTENSA XTFPGA PLATFORM SUPPORT M: Max Filippov -- cgit v1.2.3 From 4d0cabbb8b6a6cb010c19e8a3da2da5e0aaf5ef0 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Wed, 29 Jan 2020 20:30:21 +0800 Subject: gpio: bd71828: Remove unneeded defines for GPIO_LINE_DIRECTION_IN/OUT They are defined in gpio/driver.h now. Signed-off-by: Axel Lin Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-bd71828.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/gpio/gpio-bd71828.c b/drivers/gpio/gpio-bd71828.c index 04aade9e0a4d..3dbbc638e9a9 100644 --- a/drivers/gpio/gpio-bd71828.c +++ b/drivers/gpio/gpio-bd71828.c @@ -10,16 +10,6 @@ #define GPIO_OUT_REG(off) (BD71828_REG_GPIO_CTRL1 + (off)) #define HALL_GPIO_OFFSET 3 -/* - * These defines can be removed when - * "gpio: Add definition for GPIO direction" - * (9208b1e77d6e8e9776f34f46ef4079ecac9c3c25 in GPIO tree) gets merged, - */ -#ifndef GPIO_LINE_DIRECTION_IN - #define GPIO_LINE_DIRECTION_IN 1 - #define GPIO_LINE_DIRECTION_OUT 0 -#endif - struct bd71828_gpio { struct rohm_regmap_dev chip; struct gpio_chip gpio; -- cgit v1.2.3 From 8131b73b22c25e57e926874bb284f61cc3f8ac5e Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 20 Jan 2020 18:46:26 +0800 Subject: gpiolib: remove unnecessary argument from set_config call Remove unnecessary argument when setting PIN_CONFIG_BIAS_DISABLE. No argument is expected by pinctrl, so removing it should be harmless. Fixes: 2148ad7790ea ("gpiolib: add support for disabling line bias") Signed-off-by: Kent Gibson Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 2b3e665a36e0..4d0106ceeba7 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3051,7 +3051,6 @@ static int gpio_set_config(struct gpio_chip *gc, unsigned int offset, unsigned arg; switch (mode) { - case PIN_CONFIG_BIAS_DISABLE: case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_BIAS_PULL_UP: arg = 1; -- cgit v1.2.3 From 52262ee567ad14c9606be25f3caddcefa3c514e4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 28 Jan 2020 15:40:06 +0000 Subject: sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression The following XFS commit: 8ab39f11d974 ("xfs: prevent CIL push holdoff in log recovery") changed the logic from using bound workqueues to using unbound workqueues. Functionally this makes sense but it was observed at the time that the dbench performance dropped quite a lot and CPU migrations were increased. The current pattern of the task migration is straight-forward. With XFS, an IO issuer delegates work to xlog_cil_push_work ()on an unbound kworker. This runs on a nearby CPU and on completion, dbench wakes up on its old CPU as it is still idle and no migration occurs. dbench then queues the real IO on the blk_mq_requeue_work() work item which runs on a bound kworker which is forced to run on the same CPU as dbench. When IO completes, the bound kworker wakes dbench but as the kworker is a bound but, real task, the CPU is not considered idle and dbench gets migrated by select_idle_sibling() to a new CPU. dbench may ping-pong between two CPUs for a while but ultimately it starts a round-robin of all CPUs sharing the same LLC. High-frequency migration on each IO completion has poor performance overall. It has negative implications both in commication costs and power management. mpstat confirmed that at low thread counts that all CPUs sharing an LLC has low level of activity. Note that even if the CIL patch was reverted, there still would be migrations but the impact is less noticeable. It turns out that individually the scheduler, XFS, blk-mq and workqueues all made sensible decisions but in combination, the overall effect was sub-optimal. This patch special cases the IO issue/completion pattern and allows a bound kworker waker and a task wakee to stack on the same CPU if there is a strong chance they are directly related. The expectation is that the kworker is likely going back to sleep shortly. This is not guaranteed as the IO could be queued asynchronously but there is a very strong relationship between the task and kworker in this case that would justify stacking on the same CPU instead of migrating. There should be few concerns about kworker starvation given that the special casing is only when the kworker is the waker. DBench on XFS MMTests config: io-dbench4-async modified to run on a fresh XFS filesystem UMA machine with 8 cores sharing LLC 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack Amean 1 22.63 ( 0.00%) 20.54 * 9.23%* Amean 2 25.56 ( 0.00%) 23.40 * 8.44%* Amean 4 28.63 ( 0.00%) 27.85 * 2.70%* Amean 8 37.66 ( 0.00%) 37.68 ( -0.05%) Amean 64 469.47 ( 0.00%) 468.26 ( 0.26%) Stddev 1 1.00 ( 0.00%) 0.72 ( 28.12%) Stddev 2 1.62 ( 0.00%) 1.97 ( -21.54%) Stddev 4 2.53 ( 0.00%) 3.58 ( -41.19%) Stddev 8 5.30 ( 0.00%) 5.20 ( 1.92%) Stddev 64 86.36 ( 0.00%) 94.53 ( -9.46%) NUMA machine, 48 CPUs total, 24 CPUs share cache 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Amean 1 58.69 ( 0.00%) 30.21 * 48.53%* Amean 2 60.90 ( 0.00%) 35.29 * 42.05%* Amean 4 66.77 ( 0.00%) 46.55 * 30.28%* Amean 8 81.41 ( 0.00%) 68.46 * 15.91%* Amean 16 113.29 ( 0.00%) 107.79 * 4.85%* Amean 32 199.10 ( 0.00%) 198.22 * 0.44%* Amean 64 478.99 ( 0.00%) 477.06 * 0.40%* Amean 128 1345.26 ( 0.00%) 1372.64 * -2.04%* Stddev 1 2.64 ( 0.00%) 4.17 ( -58.08%) Stddev 2 4.35 ( 0.00%) 5.38 ( -23.73%) Stddev 4 6.77 ( 0.00%) 6.56 ( 3.00%) Stddev 8 11.61 ( 0.00%) 10.91 ( 6.04%) Stddev 16 18.63 ( 0.00%) 19.19 ( -3.01%) Stddev 32 38.71 ( 0.00%) 38.30 ( 1.06%) Stddev 64 100.28 ( 0.00%) 91.24 ( 9.02%) Stddev 128 186.87 ( 0.00%) 160.34 ( 14.20%) Dbench has been modified to report the time to complete a single "load file". This is a more meaningful metric for dbench that a throughput metric as the benchmark makes many different system calls that are not throughput-related Patch shows a 9.23% and 48.53% reduction in the time to process a load file with the difference partially explained by the number of CPUs sharing a LLC. In a separate run, task migrations were almost eliminated by the patch for low client counts. In case people have issue with the metric used for the benchmark, this is a comparison of the throughputs as reported by dbench on the NUMA machine. dbench4 Throughput (misleading but traditional) 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Hmean 1 321.41 ( 0.00%) 617.82 * 92.22%* Hmean 2 622.87 ( 0.00%) 1066.80 * 71.27%* Hmean 4 1134.56 ( 0.00%) 1623.74 * 43.12%* Hmean 8 1869.96 ( 0.00%) 2212.67 * 18.33%* Hmean 16 2673.11 ( 0.00%) 2806.13 * 4.98%* Hmean 32 3032.74 ( 0.00%) 3039.54 ( 0.22%) Hmean 64 2514.25 ( 0.00%) 2498.96 * -0.61%* Hmean 128 1778.49 ( 0.00%) 1746.05 * -1.82%* Note that this is somewhat specific to XFS and ext4 shows no performance difference as it does not rely on kworkers in the same way. No major problem was observed running other workloads on different machines although not all tests have completed yet. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200128154006.GD3466@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ----------- kernel/sched/fair.c | 14 ++++++++++++++ kernel/sched/sched.h | 13 +++++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89e54f3ed571..1a9983da4408 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1447,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25dffc03f0f6..94c3b8469cf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5912,6 +5912,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { + return prev; + } + /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a88dc8ad11b..5876e6ba5903 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq, { } #endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif -- cgit v1.2.3 From 5cb7a1113f94cec20ff16d3981b94b7fdd8d73fa Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 27 Jan 2020 15:19:35 +0530 Subject: arm64: Drop do_el0_ia_bp_hardening() & do_sp_pc_abort() declarations There is a redundant do_sp_pc_abort() declaration in exceptions.h which can be removed. Also do_el0_ia_bp_hardening() as been already been dropped with the commit bfe298745afc ("arm64: entry-common: don't touch daif before bp-hardening") and hence does not need a declaration any more. This should not introduce any functional change. Cc: Catalin Marinas Cc: Will Deacon Cc: James Morse Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Acked-by: Mark Rutland Signed-off-by: Anshuman Khandual Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index b87c6e276ab1..7a6e81ca23a8 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -33,7 +33,6 @@ static inline u32 disr_to_esr(u64 disr) asmlinkage void enter_from_user_mode(void); void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); -void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr); void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, @@ -47,7 +46,4 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr); void do_cp15instr(unsigned int esr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); -void do_el0_ia_bp_hardening(unsigned long addr, unsigned int esr, - struct pt_regs *regs); - #endif /* __ASM_EXCEPTION_H */ -- cgit v1.2.3 From 2c614c1194f2803750c14b751871bd168dcc8054 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 24 Jan 2020 16:51:27 +0100 Subject: arm64: use shared sysctl constants Use shared sysctl variables for zero and one constants, as in commit eec4844fae7c ("proc/sysctl: add shared variables for range check") Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI") Signed-off-by: Matteo Croce Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index bbb0f0c145f6..a480b6760808 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -608,8 +608,6 @@ long get_tagged_addr_ctrl(void) * only prevents the tagged address ABI enabling via prctl() and does not * disable it for tasks that already opted in to the relaxed ABI. */ -static int zero; -static int one = 1; static struct ctl_table tagged_addr_sysctl_table[] = { { @@ -618,8 +616,8 @@ static struct ctl_table tagged_addr_sysctl_table[] = { .data = &tagged_addr_disabled, .maxlen = sizeof(int), .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; -- cgit v1.2.3 From fca3d33d8ad61eb53eca3ee4cac476d1e31b9008 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 6 Feb 2020 10:42:58 +0000 Subject: arm64: ssbs: Fix context-switch when SSBS is present on all CPUs When all CPUs in the system implement the SSBS extension, the SSBS field in PSTATE is the definitive indication of the mitigation state. Further, when the CPUs implement the SSBS manipulation instructions (advertised to userspace via an HWCAP), EL0 can toggle the SSBS field directly and so we cannot rely on any shadow state such as TIF_SSBD at all. Avoid forcing the SSBS field in context-switch on such a system, and simply rely on the PSTATE register instead. Cc: Cc: Catalin Marinas Cc: Srinivas Ramana Fixes: cbdf8a189a66 ("arm64: Force SSBS on context switch") Reviewed-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a480b6760808..00626057a384 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -466,6 +466,13 @@ static void ssbs_thread_switch(struct task_struct *next) if (unlikely(next->flags & PF_KTHREAD)) return; + /* + * If all CPUs implement the SSBS extension, then we just need to + * context-switch the PSTATE field. + */ + if (cpu_have_feature(cpu_feature(SSBS))) + return; + /* If the mitigation is enabled, then we leave SSBS clear. */ if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) || test_tsk_thread_flag(next, TIF_SSBD)) -- cgit v1.2.3 From 345d52c184dc7de98cff63f1bfa6f90e9db19809 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 23 Jan 2020 15:20:51 -0500 Subject: arm64/spinlock: fix a -Wunused-function warning The commit f5bfdc8e3947 ("locking/osq: Use optimized spinning loop for arm64") introduced a warning from Clang because vcpu_is_preempted() is compiled away, kernel/locking/osq_lock.c:25:19: warning: unused function 'node_cpu' [-Wunused-function] static inline int node_cpu(struct optimistic_spin_node *node) ^ 1 warning generated. Fix it by converting vcpu_is_preempted() to a static inline function. Fixes: f5bfdc8e3947 ("locking/osq: Use optimized spinning loop for arm64") Acked-by: Waiman Long Signed-off-by: Qian Cai Signed-off-by: Will Deacon --- arch/arm64/include/asm/spinlock.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index 102404dc1e13..9083d6992603 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -18,6 +18,10 @@ * See: * https://lore.kernel.org/lkml/20200110100612.GC2827@hirez.programming.kicks-ass.net */ -#define vcpu_is_preempted(cpu) false +#define vcpu_is_preempted vcpu_is_preempted +static inline bool vcpu_is_preempted(int cpu) +{ + return false; +} #endif /* __ASM_SPINLOCK_H */ -- cgit v1.2.3 From aab73d278d49c718b722ff5052e16c9cddf144d4 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 31 Jan 2020 12:08:31 +0100 Subject: s390/pkey: fix missing length of protected key on return The pkey ioctl call PKEY_SEC2PROTK updates a struct pkey_protkey on return. The protected key is stored in, the protected key type is stored in but the len information was not updated. This patch now fixes this and so the len field gets an update to refrect the actual size of the protected key value returned. Fixes: efc598e6c8a9 ("s390/zcrypt: move cca misc functions to new code file") Cc: Stable Signed-off-by: Harald Freudenberger Reported-by: Christian Rund Suggested-by: Ingo Franzki Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/pkey_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c index 71dae64ba994..2f33c5fcf676 100644 --- a/drivers/s390/crypto/pkey_api.c +++ b/drivers/s390/crypto/pkey_api.c @@ -994,7 +994,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd, return -EFAULT; rc = cca_sec2protkey(ksp.cardnr, ksp.domain, ksp.seckey.seckey, ksp.protkey.protkey, - NULL, &ksp.protkey.type); + &ksp.protkey.len, &ksp.protkey.type); DEBUG_DBG("%s cca_sec2protkey()=%d\n", __func__, rc); if (rc) break; -- cgit v1.2.3 From fcd98d4002539f1e381916fc1b6648938c1eac76 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Fri, 20 Dec 2019 16:02:54 +0100 Subject: s390/zcrypt: fix card and queue total counter wrap The internal statistic counters for the total number of requests processed per card and per queue used integers. So they do wrap after a rather huge amount of crypto requests processed. This patch introduces uint64 counters which should hold much longer but still may wrap. The sysfs attributes request_count for card and queue also used only %ld and now display the counter value with %llu. This is not a security relevant fix. The int overflow which happened is not in any way exploitable as a security breach. Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/ap_bus.h | 4 ++-- drivers/s390/crypto/ap_card.c | 8 ++++---- drivers/s390/crypto/ap_queue.c | 6 +++--- drivers/s390/crypto/zcrypt_api.c | 16 +++++++++------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index bb35ba4a8d24..4348fdff1c61 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -162,7 +162,7 @@ struct ap_card { unsigned int functions; /* AP device function bitfield. */ int queue_depth; /* AP queue depth.*/ int id; /* AP card number. */ - atomic_t total_request_count; /* # requests ever for this AP device.*/ + atomic64_t total_request_count; /* # requests ever for this AP device.*/ }; #define to_ap_card(x) container_of((x), struct ap_card, ap_dev.device) @@ -179,7 +179,7 @@ struct ap_queue { enum ap_state state; /* State of the AP device. */ int pendingq_count; /* # requests on pendingq list. */ int requestq_count; /* # requests on requestq list. */ - int total_request_count; /* # requests ever for this AP device.*/ + u64 total_request_count; /* # requests ever for this AP device.*/ int request_timeout; /* Request timeout in jiffies. */ struct timer_list timeout; /* Timer for request timeouts. */ struct list_head pendingq; /* List of message sent to AP queue. */ diff --git a/drivers/s390/crypto/ap_card.c b/drivers/s390/crypto/ap_card.c index 63b4cc6cd7e5..e85bfca1ed16 100644 --- a/drivers/s390/crypto/ap_card.c +++ b/drivers/s390/crypto/ap_card.c @@ -63,13 +63,13 @@ static ssize_t request_count_show(struct device *dev, char *buf) { struct ap_card *ac = to_ap_card(dev); - unsigned int req_cnt; + u64 req_cnt; req_cnt = 0; spin_lock_bh(&ap_list_lock); - req_cnt = atomic_read(&ac->total_request_count); + req_cnt = atomic64_read(&ac->total_request_count); spin_unlock_bh(&ap_list_lock); - return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt); + return snprintf(buf, PAGE_SIZE, "%llu\n", req_cnt); } static ssize_t request_count_store(struct device *dev, @@ -83,7 +83,7 @@ static ssize_t request_count_store(struct device *dev, for_each_ap_queue(aq, ac) aq->total_request_count = 0; spin_unlock_bh(&ap_list_lock); - atomic_set(&ac->total_request_count, 0); + atomic64_set(&ac->total_request_count, 0); return count; } diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 37c3bdc3642d..a317ab484932 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -479,12 +479,12 @@ static ssize_t request_count_show(struct device *dev, char *buf) { struct ap_queue *aq = to_ap_queue(dev); - unsigned int req_cnt; + u64 req_cnt; spin_lock_bh(&aq->lock); req_cnt = aq->total_request_count; spin_unlock_bh(&aq->lock); - return snprintf(buf, PAGE_SIZE, "%d\n", req_cnt); + return snprintf(buf, PAGE_SIZE, "%llu\n", req_cnt); } static ssize_t request_count_store(struct device *dev, @@ -676,7 +676,7 @@ void ap_queue_message(struct ap_queue *aq, struct ap_message *ap_msg) list_add_tail(&ap_msg->list, &aq->requestq); aq->requestq_count++; aq->total_request_count++; - atomic_inc(&aq->card->total_request_count); + atomic64_inc(&aq->card->total_request_count); /* Send/receive as many request from the queue as possible. */ ap_wait(ap_sm_event_loop(aq, AP_EVENT_POLL)); spin_unlock_bh(&aq->lock); diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index a42257d6c79e..56a405dce8bc 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -606,8 +606,8 @@ static inline bool zcrypt_card_compare(struct zcrypt_card *zc, weight += atomic_read(&zc->load); pref_weight += atomic_read(&pref_zc->load); if (weight == pref_weight) - return atomic_read(&zc->card->total_request_count) > - atomic_read(&pref_zc->card->total_request_count); + return atomic64_read(&zc->card->total_request_count) > + atomic64_read(&pref_zc->card->total_request_count); return weight > pref_weight; } @@ -1226,11 +1226,12 @@ static void zcrypt_qdepth_mask(char qdepth[], size_t max_adapters) spin_unlock(&zcrypt_list_lock); } -static void zcrypt_perdev_reqcnt(int reqcnt[], size_t max_adapters) +static void zcrypt_perdev_reqcnt(u32 reqcnt[], size_t max_adapters) { struct zcrypt_card *zc; struct zcrypt_queue *zq; int card; + u64 cnt; memset(reqcnt, 0, sizeof(int) * max_adapters); spin_lock(&zcrypt_list_lock); @@ -1242,8 +1243,9 @@ static void zcrypt_perdev_reqcnt(int reqcnt[], size_t max_adapters) || card >= max_adapters) continue; spin_lock(&zq->queue->lock); - reqcnt[card] = zq->queue->total_request_count; + cnt = zq->queue->total_request_count; spin_unlock(&zq->queue->lock); + reqcnt[card] = (cnt < UINT_MAX) ? (u32) cnt : UINT_MAX; } } local_bh_enable(); @@ -1421,9 +1423,9 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, return 0; } case ZCRYPT_PERDEV_REQCNT: { - int *reqcnt; + u32 *reqcnt; - reqcnt = kcalloc(AP_DEVICES, sizeof(int), GFP_KERNEL); + reqcnt = kcalloc(AP_DEVICES, sizeof(u32), GFP_KERNEL); if (!reqcnt) return -ENOMEM; zcrypt_perdev_reqcnt(reqcnt, AP_DEVICES); @@ -1480,7 +1482,7 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd, } case Z90STAT_PERDEV_REQCNT: { /* the old ioctl supports only 64 adapters */ - int reqcnt[MAX_ZDEV_CARDIDS]; + u32 reqcnt[MAX_ZDEV_CARDIDS]; zcrypt_perdev_reqcnt(reqcnt, MAX_ZDEV_CARDIDS); if (copy_to_user((int __user *) arg, reqcnt, sizeof(reqcnt))) -- cgit v1.2.3 From 0b6f499022b6a87d04f56edd2bf863ea76923206 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 25 Nov 2019 14:18:29 +0100 Subject: s390/qdio: simplify ACK tracking Current code uses a 'polling' flag to keep track of whether an Input Queue has any ACKed SBALs. QEBSM devices might have multiple ACKed SBALs, and those are tracked separately with 'ack_count'. By also setting ack_count for non-QEBSM devices (to a fixed value of 1), we can use 'ack_count != 0' as replacement for the polling flag. Signed-off-by: Julian Wiedmann Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio.h | 4 +--- drivers/s390/cio/qdio_debug.c | 5 ++--- drivers/s390/cio/qdio_main.c | 29 +++++++++++------------------ 3 files changed, 14 insertions(+), 24 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index 4b0798472643..ff74eb5fce50 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -182,11 +182,9 @@ enum qdio_queue_irq_states { }; struct qdio_input_q { - /* input buffer acknowledgement flag */ - int polling; /* first ACK'ed buffer */ int ack_start; - /* how much sbals are acknowledged with qebsm */ + /* how many SBALs are acknowledged */ int ack_count; /* last time of noticing incoming data */ u64 timestamp; diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c index 35410e6eda2e..9c0370b27426 100644 --- a/drivers/s390/cio/qdio_debug.c +++ b/drivers/s390/cio/qdio_debug.c @@ -124,9 +124,8 @@ static int qstat_show(struct seq_file *m, void *v) seq_printf(m, "nr_used: %d ftc: %d\n", atomic_read(&q->nr_buf_used), q->first_to_check); if (q->is_input_q) { - seq_printf(m, "polling: %d ack start: %d ack count: %d\n", - q->u.in.polling, q->u.in.ack_start, - q->u.in.ack_count); + seq_printf(m, "ack start: %d ack count: %d\n", + q->u.in.ack_start, q->u.in.ack_count); seq_printf(m, "DSCI: %x IRQs disabled: %u\n", *(u8 *)q->irq_ptr->dsci, test_bit(QDIO_QUEUE_IRQS_DISABLED, diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index f8b897b7e78b..3475317c42e5 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -393,19 +393,15 @@ int debug_get_buf_state(struct qdio_q *q, unsigned int bufnr, static inline void qdio_stop_polling(struct qdio_q *q) { - if (!q->u.in.polling) + if (!q->u.in.ack_count) return; - q->u.in.polling = 0; qperf_inc(q, stop_polling); /* show the card that we are not polling anymore */ - if (is_qebsm(q)) { - set_buf_states(q, q->u.in.ack_start, SLSB_P_INPUT_NOT_INIT, - q->u.in.ack_count); - q->u.in.ack_count = 0; - } else - set_buf_state(q, q->u.in.ack_start, SLSB_P_INPUT_NOT_INIT); + set_buf_states(q, q->u.in.ack_start, SLSB_P_INPUT_NOT_INIT, + q->u.in.ack_count); + q->u.in.ack_count = 0; } static inline void account_sbals(struct qdio_q *q, unsigned int count) @@ -451,8 +447,7 @@ static inline void inbound_primed(struct qdio_q *q, unsigned int start, /* for QEBSM the ACK was already set by EQBS */ if (is_qebsm(q)) { - if (!q->u.in.polling) { - q->u.in.polling = 1; + if (!q->u.in.ack_count) { q->u.in.ack_count = count; q->u.in.ack_start = start; return; @@ -471,12 +466,12 @@ static inline void inbound_primed(struct qdio_q *q, unsigned int start, * or by the next inbound run. */ new = add_buf(start, count - 1); - if (q->u.in.polling) { + if (q->u.in.ack_count) { /* reset the previous ACK but first set the new one */ set_buf_state(q, new, SLSB_P_INPUT_ACK); set_buf_state(q, q->u.in.ack_start, SLSB_P_INPUT_NOT_INIT); } else { - q->u.in.polling = 1; + q->u.in.ack_count = 1; set_buf_state(q, new, SLSB_P_INPUT_ACK); } @@ -1479,13 +1474,12 @@ static int handle_inbound(struct qdio_q *q, unsigned int callflags, qperf_inc(q, inbound_call); - if (!q->u.in.polling) + if (!q->u.in.ack_count) goto set; /* protect against stop polling setting an ACK for an emptied slsb */ if (count == QDIO_MAX_BUFFERS_PER_Q) { /* overwriting everything, just delete polling status */ - q->u.in.polling = 0; q->u.in.ack_count = 0; goto set; } else if (buf_in_between(q->u.in.ack_start, bufnr, count)) { @@ -1495,15 +1489,14 @@ static int handle_inbound(struct qdio_q *q, unsigned int callflags, diff = sub_buf(diff, q->u.in.ack_start); q->u.in.ack_count -= diff; if (q->u.in.ack_count <= 0) { - q->u.in.polling = 0; q->u.in.ack_count = 0; goto set; } q->u.in.ack_start = add_buf(q->u.in.ack_start, diff); + } else { + /* the only ACK will be deleted */ + q->u.in.ack_count = 0; } - else - /* the only ACK will be deleted, so stop polling */ - q->u.in.polling = 0; } set: -- cgit v1.2.3 From c3afa804c58e5c30ac63858b527fffadc88bce82 Mon Sep 17 00:00:00 2001 From: Paul Thomas Date: Sat, 25 Jan 2020 17:14:10 -0500 Subject: gpio: xilinx: Fix bug where the wrong GPIO register is written to Care is taken with "index", however with the current version the actual xgpio_writereg is using index for data but xgpio_regoffset(chip, i) for the offset. And since i is already incremented it is incorrect. This patch fixes it so that index is used for the offset too. Cc: stable@vger.kernel.org Signed-off-by: Paul Thomas Link: https://lore.kernel.org/r/20200125221410.8022-1-pthomas8589@gmail.com Signed-off-by: Linus Walleij --- drivers/gpio/gpio-xilinx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-xilinx.c b/drivers/gpio/gpio-xilinx.c index a9748b5198e6..67f9f82e0db0 100644 --- a/drivers/gpio/gpio-xilinx.c +++ b/drivers/gpio/gpio-xilinx.c @@ -147,9 +147,10 @@ static void xgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, for (i = 0; i < gc->ngpio; i++) { if (*mask == 0) break; + /* Once finished with an index write it out to the register */ if (index != xgpio_index(chip, i)) { xgpio_writereg(chip->regs + XGPIO_DATA_OFFSET + - xgpio_regoffset(chip, i), + index * XGPIO_CHANNEL_OFFSET, chip->gpio_state[index]); spin_unlock_irqrestore(&chip->gpio_lock[index], flags); index = xgpio_index(chip, i); @@ -165,7 +166,7 @@ static void xgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, } xgpio_writereg(chip->regs + XGPIO_DATA_OFFSET + - xgpio_regoffset(chip, i), chip->gpio_state[index]); + index * XGPIO_CHANNEL_OFFSET, chip->gpio_state[index]); spin_unlock_irqrestore(&chip->gpio_lock[index], flags); } -- cgit v1.2.3 From 9437bfda00f3b26eb5f475737ddaaf4dc07fee4f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Jan 2020 15:05:45 +0200 Subject: ASoC: atmel: fix atmel_ssc_set_audio link failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ssc audio driver can call into both pdc and dma backends. With the latest rework, the logic to do this in a safe way avoiding link errors was removed, bringing back link errors that were fixed long ago in commit 061981ff8cc8 ("ASoC: atmel: properly select dma driver state") such as sound/soc/atmel/atmel_ssc_dai.o: In function `atmel_ssc_set_audio': atmel_ssc_dai.c:(.text+0xac): undefined reference to `atmel_pcm_pdc_platform_register' Fix it this time using Makefile hacks and a comment to prevent this from accidentally getting removed again rather than Kconfig hacks. Fixes: 18291410557f ("ASoC: atmel: enable SOC_SSC_PDC and SOC_SSC_DMA in Kconfig") Signed-off-by: Arnd Bergmann Signed-off-by: Codrin Ciubotariu Link: https://lore.kernel.org/r/20200130130545.31148-1-codrin.ciubotariu@microchip.com Reviewed-by: Michał Mirosław Signed-off-by: Mark Brown --- sound/soc/atmel/Kconfig | 4 ++-- sound/soc/atmel/Makefile | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/soc/atmel/Kconfig b/sound/soc/atmel/Kconfig index d1dc8e6366dc..71f2d42188c4 100644 --- a/sound/soc/atmel/Kconfig +++ b/sound/soc/atmel/Kconfig @@ -10,11 +10,11 @@ config SND_ATMEL_SOC if SND_ATMEL_SOC config SND_ATMEL_SOC_PDC - tristate + bool depends on HAS_DMA config SND_ATMEL_SOC_DMA - tristate + bool select SND_SOC_GENERIC_DMAENGINE_PCM config SND_ATMEL_SOC_SSC diff --git a/sound/soc/atmel/Makefile b/sound/soc/atmel/Makefile index 1f6890ed3738..c7d2989791be 100644 --- a/sound/soc/atmel/Makefile +++ b/sound/soc/atmel/Makefile @@ -6,8 +6,14 @@ snd-soc-atmel_ssc_dai-objs := atmel_ssc_dai.o snd-soc-atmel-i2s-objs := atmel-i2s.o snd-soc-mchp-i2s-mcc-objs := mchp-i2s-mcc.o -obj-$(CONFIG_SND_ATMEL_SOC_PDC) += snd-soc-atmel-pcm-pdc.o -obj-$(CONFIG_SND_ATMEL_SOC_DMA) += snd-soc-atmel-pcm-dma.o +# pdc and dma need to both be built-in if any user of +# ssc is built-in. +ifdef CONFIG_SND_ATMEL_SOC_PDC +obj-$(CONFIG_SND_ATMEL_SOC_SSC) += snd-soc-atmel-pcm-pdc.o +endif +ifdef CONFIG_SND_ATMEL_SOC_DMA +obj-$(CONFIG_SND_ATMEL_SOC_SSC) += snd-soc-atmel-pcm-dma.o +endif obj-$(CONFIG_SND_ATMEL_SOC_SSC) += snd-soc-atmel_ssc_dai.o obj-$(CONFIG_SND_ATMEL_SOC_I2S) += snd-soc-atmel-i2s.o obj-$(CONFIG_SND_MCHP_SOC_I2S_MCC) += snd-soc-mchp-i2s-mcc.o -- cgit v1.2.3 From 2d5a2f913b658a7ae984773a63318ed4daadf4af Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 21 Jan 2020 10:37:48 -0800 Subject: spmi: pmic-arb: Set lockdep class for hierarchical irq domains I see the following lockdep splat in the qcom pinctrl driver when attempting to suspend the device. WARNING: possible recursive locking detected 5.4.11 #3 Tainted: G W -------------------------------------------- cat/3074 is trying to acquire lock: ffffff81f49804c0 (&irq_desc_lock_class){-.-.}, at: __irq_get_desc_lock+0x64/0x94 but task is already holding lock: ffffff81f1cc10c0 (&irq_desc_lock_class){-.-.}, at: __irq_get_desc_lock+0x64/0x94 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&irq_desc_lock_class); lock(&irq_desc_lock_class); *** DEADLOCK *** May be due to missing lock nesting notation 6 locks held by cat/3074: #0: ffffff81f01d9420 (sb_writers#7){.+.+}, at: vfs_write+0xd0/0x1a4 #1: ffffff81bd7d2080 (&of->mutex){+.+.}, at: kernfs_fop_write+0x12c/0x1fc #2: ffffff81f4c322f0 (kn->count#337){.+.+}, at: kernfs_fop_write+0x134/0x1fc #3: ffffffe411a41d60 (system_transition_mutex){+.+.}, at: pm_suspend+0x108/0x348 #4: ffffff81f1c5e970 (&dev->mutex){....}, at: __device_suspend+0x168/0x41c #5: ffffff81f1cc10c0 (&irq_desc_lock_class){-.-.}, at: __irq_get_desc_lock+0x64/0x94 stack backtrace: CPU: 5 PID: 3074 Comm: cat Tainted: G W 5.4.11 #3 Hardware name: Google Cheza (rev3+) (DT) Call trace: dump_backtrace+0x0/0x174 show_stack+0x20/0x2c dump_stack+0xc8/0x124 __lock_acquire+0x460/0x2388 lock_acquire+0x1cc/0x210 _raw_spin_lock_irqsave+0x64/0x80 __irq_get_desc_lock+0x64/0x94 irq_set_irq_wake+0x40/0x144 qpnpint_irq_set_wake+0x28/0x34 set_irq_wake_real+0x40/0x5c irq_set_irq_wake+0x70/0x144 pm8941_pwrkey_suspend+0x34/0x44 platform_pm_suspend+0x34/0x60 dpm_run_callback+0x64/0xcc __device_suspend+0x310/0x41c dpm_suspend+0xf8/0x298 dpm_suspend_start+0x84/0xb4 suspend_devices_and_enter+0xbc/0x620 pm_suspend+0x210/0x348 state_store+0xb0/0x108 kobj_attr_store+0x14/0x24 sysfs_kf_write+0x4c/0x64 kernfs_fop_write+0x15c/0x1fc __vfs_write+0x54/0x18c vfs_write+0xe4/0x1a4 ksys_write+0x7c/0xe4 __arm64_sys_write+0x20/0x2c el0_svc_common+0xa8/0x160 el0_svc_handler+0x7c/0x98 el0_svc+0x8/0xc Set a lockdep class when we map the irq so that irq_set_wake() doesn't warn about a lockdep bug that doesn't exist. Fixes: 12a9eeaebba3 ("spmi: pmic-arb: convert to v2 irq interfaces to support hierarchical IRQ chips") Cc: Douglas Anderson Cc: Brian Masney Cc: Lina Iyer Cc: Maulik Shah Cc: Bjorn Andersson Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20200121183748.68662-1-swboyd@chromium.org Signed-off-by: Linus Walleij --- drivers/spmi/spmi-pmic-arb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/spmi/spmi-pmic-arb.c b/drivers/spmi/spmi-pmic-arb.c index 97acc2ba2912..de844b412110 100644 --- a/drivers/spmi/spmi-pmic-arb.c +++ b/drivers/spmi/spmi-pmic-arb.c @@ -731,6 +731,7 @@ static int qpnpint_irq_domain_translate(struct irq_domain *d, return 0; } +static struct lock_class_key qpnpint_irq_lock_class, qpnpint_irq_request_class; static void qpnpint_irq_domain_map(struct spmi_pmic_arb *pmic_arb, struct irq_domain *domain, unsigned int virq, @@ -746,6 +747,9 @@ static void qpnpint_irq_domain_map(struct spmi_pmic_arb *pmic_arb, else handler = handle_level_irq; + + irq_set_lockdep_class(virq, &qpnpint_irq_lock_class, + &qpnpint_irq_request_class); irq_domain_set_info(domain, virq, hwirq, &pmic_arb_irqchip, pmic_arb, handler, NULL, NULL); } -- cgit v1.2.3 From 88a9c66d998b1d2dac412fcd458c5d17d70513c8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 14 Jan 2020 10:56:47 +0000 Subject: drm/i915/pmu: Correct the rc6 offset upon enabling The rc6 residency starts ticking from 0 from BIOS POST, but the kernel starts measuring the time from its boot. If we start measuruing I915_PMU_RC6_RESIDENCY while the GT is idle, we start our sampling from 0 and then upon first activity (park/unpark) add in all the rc6 residency since boot. After the first park with the sampler engaged, the sleep/active counters are aligned. v2: With a wakeref to be sure Closes: https://gitlab.freedesktop.org/drm/intel/issues/973 Fixes: df6a42053513 ("drm/i915/pmu: Ensure monotonic rc6") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200114105648.2172026-1-chris@chris-wilson.co.uk (cherry picked from commit f4e9894b6952a2819937f363cd42e7cd7894a1e4) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_pmu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 28a82c849bac..ec0299490dd4 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -637,8 +637,10 @@ static void i915_pmu_enable(struct perf_event *event) container_of(event->pmu, typeof(*i915), pmu.base); unsigned int bit = event_enabled_bit(event); struct i915_pmu *pmu = &i915->pmu; + intel_wakeref_t wakeref; unsigned long flags; + wakeref = intel_runtime_pm_get(&i915->runtime_pm); spin_lock_irqsave(&pmu->lock, flags); /* @@ -648,6 +650,14 @@ static void i915_pmu_enable(struct perf_event *event) BUILD_BUG_ON(ARRAY_SIZE(pmu->enable_count) != I915_PMU_MASK_BITS); GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count)); GEM_BUG_ON(pmu->enable_count[bit] == ~0); + + if (pmu->enable_count[bit] == 0 && + config_enabled_mask(I915_PMU_RC6_RESIDENCY) & BIT_ULL(bit)) { + pmu->sample[__I915_SAMPLE_RC6_LAST_REPORTED].cur = 0; + pmu->sample[__I915_SAMPLE_RC6].cur = __get_rc6(&i915->gt); + pmu->sleep_last = ktime_get(); + } + pmu->enable |= BIT_ULL(bit); pmu->enable_count[bit]++; @@ -688,6 +698,8 @@ static void i915_pmu_enable(struct perf_event *event) * an existing non-zero value. */ local64_set(&event->hw.prev_count, __i915_pmu_event_read(event)); + + intel_runtime_pm_put(&i915->runtime_pm, wakeref); } static void i915_pmu_disable(struct perf_event *event) -- cgit v1.2.3 From 01c1b2cbf05224495eec2cf54166934f123bad61 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 13 Jan 2020 15:45:55 +0000 Subject: drm/i915/gem: Take local vma references for the parser Take and hold a reference to each of the vma (and their objects) as we process them with the cmdparser. This stops them being freed during the work if the GEM execbuf is interrupted and the request we expected to keep the objects alive is incomplete. Fixes: 686c7c35abc2 ("drm/i915/gem: Asynchronous cmdparser") Closes: https://gitlab.freedesktop.org/drm/intel/issues/970 Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200113154555.1909639-1-chris@chris-wilson.co.uk (cherry picked from commit 36c8e356a76e147f0b631fd29838147c01b50d04) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 37 +++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index d5a0f5ae4a8b..60c984e10c4a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -1981,9 +1981,20 @@ static int __eb_parse(struct dma_fence_work *work) pw->trampoline); } +static void __eb_parse_release(struct dma_fence_work *work) +{ + struct eb_parse_work *pw = container_of(work, typeof(*pw), base); + + if (pw->trampoline) + i915_active_release(&pw->trampoline->active); + i915_active_release(&pw->shadow->active); + i915_active_release(&pw->batch->active); +} + static const struct dma_fence_work_ops eb_parse_ops = { .name = "eb_parse", .work = __eb_parse, + .release = __eb_parse_release, }; static int eb_parse_pipeline(struct i915_execbuffer *eb, @@ -1997,6 +2008,20 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb, if (!pw) return -ENOMEM; + err = i915_active_acquire(&eb->batch->active); + if (err) + goto err_free; + + err = i915_active_acquire(&shadow->active); + if (err) + goto err_batch; + + if (trampoline) { + err = i915_active_acquire(&trampoline->active); + if (err) + goto err_shadow; + } + dma_fence_work_init(&pw->base, &eb_parse_ops); pw->engine = eb->engine; @@ -2006,7 +2031,9 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb, pw->shadow = shadow; pw->trampoline = trampoline; - dma_resv_lock(pw->batch->resv, NULL); + err = dma_resv_lock_interruptible(pw->batch->resv, NULL); + if (err) + goto err_trampoline; err = dma_resv_reserve_shared(pw->batch->resv, 1); if (err) @@ -2034,6 +2061,14 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb, err_batch_unlock: dma_resv_unlock(pw->batch->resv); +err_trampoline: + if (trampoline) + i915_active_release(&trampoline->active); +err_shadow: + i915_active_release(&shadow->active); +err_batch: + i915_active_release(&eb->batch->active); +err_free: kfree(pw); return err; } -- cgit v1.2.3 From c631cc8f11246f1a8075a203d55ff282ee9416db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 10 Jan 2020 20:32:23 +0200 Subject: drm/i915: Make a copy of the ggtt view for slave plane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit intel_prepare_plane_fb() will always pin plane_state->hw.fb whenever it is present. We copy that from the master plane to the slave plane, but we fail to copy the corresponding ggtt view. Thus when it comes time to pin the slave plane's fb we use some stale ggtt view left over from the last time the plane was used as a non-slave plane. If that previous use involved 90/270 degree rotation or remapping we'll try to shuffle the pages of the new fb around accordingingly. However the new fb may be backed by a bo with less pages than what the ggtt view rotation/remapped info requires, and so we we trip a GEM_BUG(). Steps to reproduce on icl: 1. plane 1: whatever plane 6: largish !NV12 fb + 90 degree rotation 2. plane 1: smallish NV12 fb plane 6: make invisible so it gets slaved to plane 1 3. GEM_BUG() Cc: Chris Wilson Cc: Maarten Lankhorst Closes: https://gitlab.freedesktop.org/drm/intel/issues/951 Fixes: 1f594b209fe1 ("drm/i915: Remove special case slave handling during hw programming, v3.") Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20200110183228.8199-1-ville.syrjala@linux.intel.com Reviewed-by: Chris Wilson (cherry picked from commit 103605e0d1e77cfb5d0f5a9e8aba7d97f1b49339) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index a410a213bd30..064dd99bbc49 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -12366,6 +12366,7 @@ static int icl_check_nv12_planes(struct intel_crtc_state *crtc_state) /* Copy parameters to slave plane */ linked_state->ctl = plane_state->ctl | PLANE_CTL_YUV420_Y_PLANE; linked_state->color_ctl = plane_state->color_ctl; + linked_state->view = plane_state->view; memcpy(linked_state->color_plane, plane_state->color_plane, sizeof(linked_state->color_plane)); -- cgit v1.2.3 From 1fdea0cb0dba0d42ffcfb619b349c1a2afa2492e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 14 Jan 2020 16:00:30 +0000 Subject: drm/i915/selftests: Add a mock i915_vma to the mock_ring Add a i915_vma to the mock_engine/mock_ring so that the core code can always assume the presence of ring->vma. Fixes: 8ccfc20a7d56 ("drm/i915/gt: Mark ring->vma as active while pinned") Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200114160030.2468927-1-chris@chris-wilson.co.uk (cherry picked from commit b63b4feaef7363d2cf46dd76bb6e87e060b2b0de) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/mock_engine.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c index a560b7eee2cd..f2806381733f 100644 --- a/drivers/gpu/drm/i915/gt/mock_engine.c +++ b/drivers/gpu/drm/i915/gt/mock_engine.c @@ -59,11 +59,26 @@ static struct intel_ring *mock_ring(struct intel_engine_cs *engine) ring->vaddr = (void *)(ring + 1); atomic_set(&ring->pin_count, 1); + ring->vma = i915_vma_alloc(); + if (!ring->vma) { + kfree(ring); + return NULL; + } + i915_active_init(&ring->vma->active, NULL, NULL); + intel_ring_update_space(ring); return ring; } +static void mock_ring_free(struct intel_ring *ring) +{ + i915_active_fini(&ring->vma->active); + i915_vma_free(ring->vma); + + kfree(ring); +} + static struct i915_request *first_request(struct mock_engine *engine) { return list_first_entry_or_null(&engine->hw_queue, @@ -121,7 +136,7 @@ static void mock_context_destroy(struct kref *ref) GEM_BUG_ON(intel_context_is_pinned(ce)); if (test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { - kfree(ce->ring); + mock_ring_free(ce->ring); mock_timeline_unpin(ce->timeline); } -- cgit v1.2.3 From 1b5af53781654706e1a4ee479274f8a3e3f74c01 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 15 Jan 2020 12:25:09 +0000 Subject: drm/i915/gt: Use the BIT when checking the flags, not the index In converting over to using set_bit()/test_bit(), when manually inspecting the rq->fence.flags, we need to use BIT(). Fixes: e1c31fb5dde3 ("drm/i915: Merge i915_request.flags with i915_request.fence.flags") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Matthew Auld Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200115122509.2673075-1-chris@chris-wilson.co.uk (cherry picked from commit 72ff2b8d5f2dcb09bfa37b902c23311eec426496) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 0cf0f6fae675..5d8c1ebe0731 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1535,7 +1535,8 @@ static bool can_merge_rq(const struct i915_request *prev, return true; if (unlikely((prev->fence.flags ^ next->fence.flags) & - (I915_FENCE_FLAG_NOPREEMPT | I915_FENCE_FLAG_SENTINEL))) + (BIT(I915_FENCE_FLAG_NOPREEMPT) | + BIT(I915_FENCE_FLAG_SENTINEL)))) return false; if (!can_merge_ctx(prev->context, next->context)) -- cgit v1.2.3 From a924eae75106258c0797706a0578c5af499c9ff5 Mon Sep 17 00:00:00 2001 From: Yash Shah Date: Tue, 28 Jan 2020 10:54:21 +0530 Subject: gpio: sifive: fix static checker warning Typcasting "irq_state" leads to the below static checker warning: The fix is to declare "irq_state" as unsigned long instead of u32. drivers/gpio/gpio-sifive.c:97 sifive_gpio_irq_enable() warn: passing casted pointer '&chip->irq_state' to 'assign_bit()' 32 vs 64. Fixes: 96868dce644d ("gpio/sifive: Add GPIO driver for SiFive SoCs") Reported-by: Dan Carpenter Signed-off-by: Yash Shah Link: https://lore.kernel.org/r/1580189061-14091-1-git-send-email-yash.shah@sifive.com Reviewed-by: Marc Zyngier Signed-off-by: Linus Walleij --- drivers/gpio/gpio-sifive.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 147a1bd04515..c54dd08f2cbf 100644 --- a/drivers/gpio/gpio-sifive.c +++ b/drivers/gpio/gpio-sifive.c @@ -35,7 +35,7 @@ struct sifive_gpio { void __iomem *base; struct gpio_chip gc; struct regmap *regs; - u32 irq_state; + unsigned long irq_state; unsigned int trigger[SIFIVE_GPIO_MAX]; unsigned int irq_parent[SIFIVE_GPIO_MAX]; }; @@ -94,7 +94,7 @@ static void sifive_gpio_irq_enable(struct irq_data *d) spin_unlock_irqrestore(&gc->bgpio_lock, flags); /* Enable interrupts */ - assign_bit(offset, (unsigned long *)&chip->irq_state, 1); + assign_bit(offset, &chip->irq_state, 1); sifive_gpio_set_ie(chip, offset); } @@ -104,7 +104,7 @@ static void sifive_gpio_irq_disable(struct irq_data *d) struct sifive_gpio *chip = gpiochip_get_data(gc); int offset = irqd_to_hwirq(d) % SIFIVE_GPIO_MAX; - assign_bit(offset, (unsigned long *)&chip->irq_state, 0); + assign_bit(offset, &chip->irq_state, 0); sifive_gpio_set_ie(chip, offset); irq_chip_disable_parent(d); } -- cgit v1.2.3 From 88bf54603f6f2c137dfee1abf6436ceac3528d2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Sat, 8 Feb 2020 15:50:36 +0100 Subject: qmi_wwan: re-add DW5821e pre-production variant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f25e1392fdb5 removed the support for the pre-production variant of the Dell DW5821e to avoid probing another USB interface unnecessarily. However, the pre-production samples are found in the wild, and this lack of support is causing problems for users of such samples. It is therefore necessary to support both variants. Matching on both interfaces 0 and 1 is not expected to cause any problem with either variant, as only the QMI function will be probed successfully on either. Interface 1 will be rejected based on the HID class for the production variant: T: Bus=01 Lev=03 Prnt=04 Port=00 Cnt=01 Dev#= 16 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 2 P: Vendor=413c ProdID=81d7 Rev=03.18 S: Manufacturer=DELL S: Product=DW5821e Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 6 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#= 1 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=00 Prot=00 Driver=usbhid I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver=option And interface 0 will be rejected based on too few endpoints for the pre-production variant: T: Bus=01 Lev=02 Prnt=02 Port=03 Cnt=03 Dev#= 7 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 2 P: Vendor=413c ProdID=81d7 Rev= 3.18 S: Manufacturer=DELL S: Product=DW5821e Snapdragon X20 LTE S: SerialNumber=0123456789ABCDEF C: #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=ff Driver= I: If#= 1 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=ff Driver=qmi_wwan I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 4 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=00 Prot=00 Driver=option Fixes: f25e1392fdb5 ("qmi_wwan: fix interface number for DW5821e production firmware") Link: https://whrl.pl/Rf0vNk Reported-by: Lars Melin Cc: Aleksander Morgado Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 9485c8d1de8a..839cef720cf6 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1363,6 +1363,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x413c, 0x81b6, 8)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81b6, 10)}, /* Dell Wireless 5811e */ {QMI_FIXED_INTF(0x413c, 0x81d7, 0)}, /* Dell Wireless 5821e */ + {QMI_FIXED_INTF(0x413c, 0x81d7, 1)}, /* Dell Wireless 5821e preproduction config */ {QMI_FIXED_INTF(0x413c, 0x81e0, 0)}, /* Dell Wireless 5821e with eSIM support*/ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x03f0, 0x9d1d, 1)}, /* HP lt4120 Snapdragon X5 LTE */ -- cgit v1.2.3 From 5d1fbdf238b5175579a34ac1c21cd172b65e9ca7 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 8 Feb 2020 16:54:32 +0100 Subject: net: dsa: mv88e6xxx: Prevent truncation of longer interrupt names When adding support for unique interrupt names, after testing on a few devices, it was assumed 32 characters would be sufficient. This assumption turned out to be incorrect, ZII RDU2 for example uses a device base name of mv88e6xxx-30be0000.ethernet-1:0, leaving no space for post fixes such as -g1-atu-prob and -watchdog. The names then become identical, defeating the point of the patch. Increase the length of the string to 64 charactoes. Reported-by: Chris Healy Fixes: 3095383a8ab4 ("net: dsa: mv88e6xxx: Unique IRQ name") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index f332cb4b2fbf..79cad5e751c6 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -236,7 +236,7 @@ struct mv88e6xxx_port { bool mirror_ingress; bool mirror_egress; unsigned int serdes_irq; - char serdes_irq_name[32]; + char serdes_irq_name[64]; }; struct mv88e6xxx_chip { @@ -293,16 +293,16 @@ struct mv88e6xxx_chip { struct mv88e6xxx_irq g1_irq; struct mv88e6xxx_irq g2_irq; int irq; - char irq_name[32]; + char irq_name[64]; int device_irq; - char device_irq_name[32]; + char device_irq_name[64]; int watchdog_irq; - char watchdog_irq_name[32]; + char watchdog_irq_name[64]; int atu_prob_irq; - char atu_prob_irq_name[32]; + char atu_prob_irq_name[64]; int vtu_prob_irq; - char vtu_prob_irq_name[32]; + char vtu_prob_irq_name[64]; struct kthread_worker *kworker; struct kthread_delayed_work irq_poll_work; -- cgit v1.2.3 From 00516d13d4cfa56ce39da144db2dbf08b09b9357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Sat, 8 Feb 2020 16:55:04 +0100 Subject: qmi_wwan: unconditionally reject 2 ep interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have been using the fact that the QMI and DIAG functions usually are the only ones with class/subclass/protocol being ff/ff/ff on Quectel modems. This has allowed us to match the QMI function without knowing the exact interface number, which can vary depending on firmware configuration. The ability to silently reject the DIAG function, which is usually handled by the option driver, is important for this method to work. This is done based on the knowledge that it has exactly 2 bulk endpoints. QMI function control interfaces will have either 3 or 1 endpoint. This rule is universal so the quirk condition can be removed. The fixed layouts known from the Gobi1k and Gobi2k modems have been gradually replaced by more dynamic layouts, and many vendors now use configurable layouts without changing device IDs. Renaming the class/subclass/protocol matching macro makes it more obvious that this is now not Quectel specific anymore. Cc: Kristian Evensen Cc: Aleksander Morgado Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 839cef720cf6..3b7a3b8a5e06 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -61,7 +61,6 @@ enum qmi_wwan_flags { enum qmi_wwan_quirks { QMI_WWAN_QUIRK_DTR = 1 << 0, /* needs "set DTR" request */ - QMI_WWAN_QUIRK_QUECTEL_DYNCFG = 1 << 1, /* check num. endpoints */ }; struct qmimux_hdr { @@ -916,16 +915,6 @@ static const struct driver_info qmi_wwan_info_quirk_dtr = { .data = QMI_WWAN_QUIRK_DTR, }; -static const struct driver_info qmi_wwan_info_quirk_quectel_dyncfg = { - .description = "WWAN/QMI device", - .flags = FLAG_WWAN | FLAG_SEND_ZLP, - .bind = qmi_wwan_bind, - .unbind = qmi_wwan_unbind, - .manage_power = qmi_wwan_manage_power, - .rx_fixup = qmi_wwan_rx_fixup, - .data = QMI_WWAN_QUIRK_DTR | QMI_WWAN_QUIRK_QUECTEL_DYNCFG, -}; - #define HUAWEI_VENDOR_ID 0x12D1 /* map QMI/wwan function by a fixed interface number */ @@ -946,14 +935,18 @@ static const struct driver_info qmi_wwan_info_quirk_quectel_dyncfg = { #define QMI_GOBI_DEVICE(vend, prod) \ QMI_FIXED_INTF(vend, prod, 0) -/* Quectel does not use fixed interface numbers on at least some of their - * devices. We need to check the number of endpoints to ensure that we bind to - * the correct interface. +/* Many devices have QMI and DIAG functions which are distinguishable + * from other vendor specific functions by class, subclass and + * protocol all being 0xff. The DIAG function has exactly 2 endpoints + * and is silently rejected when probed. + * + * This makes it possible to match dynamically numbered QMI functions + * as seen on e.g. many Quectel modems. */ -#define QMI_QUIRK_QUECTEL_DYNCFG(vend, prod) \ +#define QMI_MATCH_FF_FF_FF(vend, prod) \ USB_DEVICE_AND_INTERFACE_INFO(vend, prod, USB_CLASS_VENDOR_SPEC, \ USB_SUBCLASS_VENDOR_SPEC, 0xff), \ - .driver_info = (unsigned long)&qmi_wwan_info_quirk_quectel_dyncfg + .driver_info = (unsigned long)&qmi_wwan_info_quirk_dtr static const struct usb_device_id products[] = { /* 1. CDC ECM like devices match on the control interface */ @@ -1059,10 +1052,10 @@ static const struct usb_device_id products[] = { USB_DEVICE_AND_INTERFACE_INFO(0x03f0, 0x581d, USB_CLASS_VENDOR_SPEC, 1, 7), .driver_info = (unsigned long)&qmi_wwan_info, }, - {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0125)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ - {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0306)}, /* Quectel EP06/EG06/EM06 */ - {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0512)}, /* Quectel EG12/EM12 */ - {QMI_QUIRK_QUECTEL_DYNCFG(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */ + {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0125)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */ + {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0306)}, /* Quectel EP06/EG06/EM06 */ + {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0512)}, /* Quectel EG12/EM12 */ + {QMI_MATCH_FF_FF_FF(0x2c7c, 0x0800)}, /* Quectel RM500Q-GL */ /* 3. Combined interface devices matching on interface number */ {QMI_FIXED_INTF(0x0408, 0xea42, 4)}, /* Yota / Megafon M100-1 */ @@ -1455,7 +1448,6 @@ static int qmi_wwan_probe(struct usb_interface *intf, { struct usb_device_id *id = (struct usb_device_id *)prod; struct usb_interface_descriptor *desc = &intf->cur_altsetting->desc; - const struct driver_info *info; /* Workaround to enable dynamic IDs. This disables usbnet * blacklisting functionality. Which, if required, can be @@ -1491,12 +1483,8 @@ static int qmi_wwan_probe(struct usb_interface *intf, * different. Ignore the current interface if the number of endpoints * equals the number for the diag interface (two). */ - info = (void *)id->driver_info; - - if (info->data & QMI_WWAN_QUIRK_QUECTEL_DYNCFG) { - if (desc->bNumEndpoints == 2) - return -ENODEV; - } + if (desc->bNumEndpoints == 2) + return -ENODEV; return usbnet_probe(intf, id); } -- cgit v1.2.3 From 43bcb1c0507858cdc95e425017dcc33f8105df39 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 6 Feb 2020 22:02:21 +0200 Subject: ALSA: hda: do not override bus codec_mask in link_get() snd_hdac_ext_bus_link_get() does not work correctly in case there are multiple codecs on the bus. It unconditionally resets the bus->codec_mask value. As per documentation in hdaudio.h and existing use in client code, this field should be used to store bit flag of detected codecs on the bus. By overwriting value of the codec_mask, information on all detected codecs is lost. No current user of hdac is impacted, but use of bus->codec_mask is planned in future patches for SOF. Signed-off-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/20200206200223.7715-1-kai.vehmanen@linux.intel.com Signed-off-by: Mark Brown --- sound/hda/ext/hdac_ext_controller.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sound/hda/ext/hdac_ext_controller.c b/sound/hda/ext/hdac_ext_controller.c index cfab60d88c92..09ff209df4a3 100644 --- a/sound/hda/ext/hdac_ext_controller.c +++ b/sound/hda/ext/hdac_ext_controller.c @@ -254,6 +254,7 @@ EXPORT_SYMBOL_GPL(snd_hdac_ext_bus_link_power_down_all); int snd_hdac_ext_bus_link_get(struct hdac_bus *bus, struct hdac_ext_link *link) { + unsigned long codec_mask; int ret = 0; mutex_lock(&bus->lock); @@ -280,9 +281,11 @@ int snd_hdac_ext_bus_link_get(struct hdac_bus *bus, * HDA spec section 4.3 - Codec Discovery */ udelay(521); - bus->codec_mask = snd_hdac_chip_readw(bus, STATESTS); - dev_dbg(bus->dev, "codec_mask = 0x%lx\n", bus->codec_mask); - snd_hdac_chip_writew(bus, STATESTS, bus->codec_mask); + codec_mask = snd_hdac_chip_readw(bus, STATESTS); + dev_dbg(bus->dev, "codec_mask = 0x%lx\n", codec_mask); + snd_hdac_chip_writew(bus, STATESTS, codec_mask); + if (!bus->codec_mask) + bus->codec_mask = codec_mask; } mutex_unlock(&bus->lock); -- cgit v1.2.3 From 816938b272b0ac0203e25ce50483bd284ea4a2db Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 6 Feb 2020 22:02:22 +0200 Subject: ASoC: SOF: Intel: hda: fix ordering bug in resume flow When HDA controller is resumed from suspend, i915 HDMI/DP codec requires that following order of actions is kept: - i915 display power up and configuration of link params - hda link reset and setup Current SOF HDA code delegates display codec power control to the codec driver. This works most of the time, but in runtime PM sequences, the above constraint may be violated. On platforms where BIOS values for HDA link parameters do not match hardware reset defaults, this may lead to errors in HDA verb transactions after resume. Fix the issue by explicitly powering the display codec in the HDA controller resume/suspend calls, thus ensuring correct ordering. Special handling is needed for the D0i3 flow, where display power must be turned off even though DSP is left powered. Now that we have more invocations of the display power helper functions, the conditional checks surrounding each call have been moved inside hda_codec_i915_display_power(). The two special cases of display powering at initial probe are handled separately. The intent is to avoid powering the display whenever no display codecs are used. Note that early powering of display was removed in commit 687ae9e287b3 ("ASoC: intel: skl: Fix display power regression"). This change was also copied to the SOF driver. No failures have resulted as hardware default values for link parameters have worked out of the box. However with recent i915 driver changes like done in commit 87c1694533c9 ("drm/i915: save AUD_FREQ_CNTRL state at audio domain suspend"), this does not hold anymore and errors are hit. Cc: Takashi Iwai Signed-off-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/20200206200223.7715-2-kai.vehmanen@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda-codec.c | 12 ++++++++---- sound/soc/sof/intel/hda-dsp.c | 11 +++++++++++ sound/soc/sof/intel/hda.c | 2 +- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/sound/soc/sof/intel/hda-codec.c b/sound/soc/sof/intel/hda-codec.c index 9106ab8dac6f..ff45075ef720 100644 --- a/sound/soc/sof/intel/hda-codec.c +++ b/sound/soc/sof/intel/hda-codec.c @@ -174,8 +174,10 @@ void hda_codec_i915_display_power(struct snd_sof_dev *sdev, bool enable) { struct hdac_bus *bus = sof_to_bus(sdev); - dev_dbg(bus->dev, "Turning i915 HDAC power %d\n", enable); - snd_hdac_display_power(bus, HDA_CODEC_IDX_CONTROLLER, enable); + if (HDA_IDISP_CODEC(bus->codec_mask)) { + dev_dbg(bus->dev, "Turning i915 HDAC power %d\n", enable); + snd_hdac_display_power(bus, HDA_CODEC_IDX_CONTROLLER, enable); + } } EXPORT_SYMBOL_NS(hda_codec_i915_display_power, SND_SOC_SOF_HDA_AUDIO_CODEC_I915); @@ -189,7 +191,8 @@ int hda_codec_i915_init(struct snd_sof_dev *sdev) if (ret < 0) return ret; - hda_codec_i915_display_power(sdev, true); + /* codec_mask not yet known, power up for probe */ + snd_hdac_display_power(bus, HDA_CODEC_IDX_CONTROLLER, true); return 0; } @@ -200,7 +203,8 @@ int hda_codec_i915_exit(struct snd_sof_dev *sdev) struct hdac_bus *bus = sof_to_bus(sdev); int ret; - hda_codec_i915_display_power(sdev, false); + /* power down unconditionally */ + snd_hdac_display_power(bus, HDA_CODEC_IDX_CONTROLLER, false); ret = snd_hdac_i915_exit(bus); diff --git a/sound/soc/sof/intel/hda-dsp.c b/sound/soc/sof/intel/hda-dsp.c index 4a4d318f97ff..0848b79967a9 100644 --- a/sound/soc/sof/intel/hda-dsp.c +++ b/sound/soc/sof/intel/hda-dsp.c @@ -428,6 +428,9 @@ static int hda_suspend(struct snd_sof_dev *sdev, bool runtime_suspend) return ret; } + /* display codec can powered off after link reset */ + hda_codec_i915_display_power(sdev, false); + return 0; } @@ -439,6 +442,9 @@ static int hda_resume(struct snd_sof_dev *sdev, bool runtime_resume) #endif int ret; + /* display codec must be powered before link reset */ + hda_codec_i915_display_power(sdev, true); + /* * clear TCSEL to clear playback on some HD Audio * codecs. PCI TCSEL is defined in the Intel manuals. @@ -482,6 +488,8 @@ int hda_dsp_resume(struct snd_sof_dev *sdev) struct pci_dev *pci = to_pci_dev(sdev->dev); if (sdev->s0_suspend) { + hda_codec_i915_display_power(sdev, true); + /* restore L1SEN bit */ if (hda->l1_support_changed) snd_sof_dsp_update_bits(sdev, HDA_DSP_HDA_BAR, @@ -531,6 +539,9 @@ int hda_dsp_suspend(struct snd_sof_dev *sdev) int ret; if (sdev->s0_suspend) { + /* we can't keep a wakeref to display driver at suspend */ + hda_codec_i915_display_power(sdev, false); + /* enable L1SEN to make sure the system can enter S0Ix */ hda->l1_support_changed = snd_sof_dsp_update_bits(sdev, HDA_DSP_HDA_BAR, diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index 65b86dd044f1..8fddafb5c1d4 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -381,7 +381,7 @@ static int hda_init_caps(struct snd_sof_dev *sdev) hda_codec_probe_bus(sdev, hda_codec_use_common_hdmi); if (!HDA_IDISP_CODEC(bus->codec_mask)) - hda_codec_i915_display_power(sdev, false); + hda_codec_i915_exit(sdev); /* * we are done probing so decrement link counts -- cgit v1.2.3 From af7aae1b1f6306a1cda4da393e920a1334eaa3d4 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Thu, 6 Feb 2020 22:02:23 +0200 Subject: ASoC: SOF: Intel: hda: move i915 init earlier To be compliant with i915 display driver requirements, i915 power-up must be done before any HDA communication takes place, including parsing the bus capabilities. Otherwise the initial codec probe may fail. Move i915 initialization earlier in the SOF HDA sequence. This sequence is now aligned with the snd-hda-intel driver where the display_power() call is before snd_hdac_bus_parse_capabilities() and rest of the capability parsing. Also remove unnecessary ifdef around hda_codec_i915_init(). There's a dummy implementation provided if CONFIG_SND_SOC_SOF_HDA is not enabled. Signed-off-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Reviewed-by: Pierre-Louis Bossart Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/20200206200223.7715-3-kai.vehmanen@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index 8fddafb5c1d4..25946a1c2822 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -286,6 +286,13 @@ static int hda_init(struct snd_sof_dev *sdev) /* HDA base */ sdev->bar[HDA_DSP_HDA_BAR] = bus->remap_addr; + /* init i915 and HDMI codecs */ + ret = hda_codec_i915_init(sdev); + if (ret < 0) { + dev_err(sdev->dev, "error: init i915 and HDMI codec failed\n"); + return ret; + } + /* get controller capabilities */ ret = hda_dsp_ctrl_get_caps(sdev); if (ret < 0) @@ -353,15 +360,6 @@ static int hda_init_caps(struct snd_sof_dev *sdev) if (bus->ppcap) dev_dbg(sdev->dev, "PP capability, will probe DSP later.\n"); -#if IS_ENABLED(CONFIG_SND_SOC_SOF_HDA) - /* init i915 and HDMI codecs */ - ret = hda_codec_i915_init(sdev); - if (ret < 0) { - dev_err(sdev->dev, "error: init i915 and HDMI codec failed\n"); - return ret; - } -#endif - /* Init HDA controller after i915 init */ ret = hda_dsp_ctrl_init_chip(sdev, true); if (ret < 0) { @@ -611,6 +609,7 @@ free_streams: iounmap(sdev->bar[HDA_DSP_BAR]); hdac_bus_unmap: iounmap(bus->remap_addr); + hda_codec_i915_exit(sdev); err: return ret; } -- cgit v1.2.3 From 3f6166aaf19902f2f3124b5426405e292e8974dd Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 10 Feb 2020 10:38:14 +0100 Subject: cifs: fix mount option display for sec=krb5i Fix display for sec=krb5i which was wrongly interleaved by cruid, resulting in string "sec=krb5,cruid=<...>i" instead of "sec=krb5i,cruid=<...>". Fixes: 96281b9e46eb ("smb3: for kerberos mounts display the credential uid used") Signed-off-by: Petr Pavlu Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index febab27cd838..46ebaf3f0824 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -414,7 +414,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) seq_puts(s, "ntlm"); break; case Kerberos: - seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid)); + seq_puts(s, "krb5"); break; case RawNTLMSSP: seq_puts(s, "ntlmssp"); @@ -427,6 +427,10 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) if (ses->sign) seq_puts(s, "i"); + + if (ses->sectype == Kerberos) + seq_printf(s, ",cruid=%u", + from_kuid_munged(&init_user_ns, ses->cred_uid)); } static void -- cgit v1.2.3 From f148b9f402ef002b57bcff3964d45abc8ffb6c3f Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 10 Feb 2020 15:45:50 +0200 Subject: xhci: Force Maximum Packet size for Full-speed bulk devices to valid range. A Full-speed bulk USB audio device (DJ-Tech CTRL) with a invalid Maximum Packet Size of 4 causes a xHC "Parameter Error" at enumeration. This is because valid Maximum packet sizes for Full-speed bulk endpoints are 8, 16, 32 and 64 bytes. Hosts are not required to support other values than these. See usb 2 specs section 5.8.3 for details. The device starts working after forcing the maximum packet size to 8. This is most likely the case with other devices as well, so force the maximum packet size to a valid range. Cc: stable@vger.kernel.org Reported-by: Rene D Obermueller Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20200210134553.9144-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-mem.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 3b1388fa2f36..0e2701649369 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1475,9 +1475,15 @@ int xhci_endpoint_init(struct xhci_hcd *xhci, /* Allow 3 retries for everything but isoc, set CErr = 3 */ if (!usb_endpoint_xfer_isoc(&ep->desc)) err_count = 3; - /* Some devices get this wrong */ - if (usb_endpoint_xfer_bulk(&ep->desc) && udev->speed == USB_SPEED_HIGH) - max_packet = 512; + /* HS bulk max packet should be 512, FS bulk supports 8, 16, 32 or 64 */ + if (usb_endpoint_xfer_bulk(&ep->desc)) { + if (udev->speed == USB_SPEED_HIGH) + max_packet = 512; + if (udev->speed == USB_SPEED_FULL) { + max_packet = rounddown_pow_of_two(max_packet); + max_packet = clamp_val(max_packet, 8, 64); + } + } /* xHCI 1.0 and 1.1 indicates that ctrl ep avg TRB Length should be 8 */ if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version >= 0x100) avg_trb_len = 8; -- cgit v1.2.3 From fc57313d1017dd6b6f37a94e88daa8df54368ecc Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 10 Feb 2020 15:45:51 +0200 Subject: xhci: Fix memory leak when caching protocol extended capability PSI tables xhci driver assumed that xHC controllers have at most one custom supported speed table (PSI) for all usb 3.x ports. Memory was allocated for one PSI table under the xhci hub structure. Turns out this is not the case, some controllers have a separate "supported protocol capability" entry with a PSI table for each port. This means each usb3 roothub port can in theory support different custom speeds. To solve this, cache all supported protocol capabilities with their PSI tables in an array, and add pointers to the xhci port structure so that every port points to its capability entry in the array. When creating the SuperSpeedPlus USB Device Capability BOS descriptor for the xhci USB 3.1 roothub we for now will use only data from the first USB 3.1 capable protocol capability entry in the array. This could be improved later, this patch focuses resolving the memory leak. Reported-by: Paul Menzel Reported-by: Sajja Venkateswara Rao Fixes: 47189098f8be ("xhci: parse xhci protocol speed ID list for usb 3.1 usage") Cc: stable # v4.4+ Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20200210134553.9144-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 25 ++++++++++++------- drivers/usb/host/xhci-mem.c | 58 ++++++++++++++++++++++++++++----------------- drivers/usb/host/xhci.h | 14 ++++++++--- 3 files changed, 64 insertions(+), 33 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 7a3a29e5e9d2..af92b2576fe9 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -55,6 +55,7 @@ static u8 usb_bos_descriptor [] = { static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, u16 wLength) { + struct xhci_port_cap *port_cap = NULL; int i, ssa_count; u32 temp; u16 desc_size, ssp_cap_size, ssa_size = 0; @@ -64,16 +65,24 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, ssp_cap_size = sizeof(usb_bos_descriptor) - desc_size; /* does xhci support USB 3.1 Enhanced SuperSpeed */ - if (xhci->usb3_rhub.min_rev >= 0x01) { + for (i = 0; i < xhci->num_port_caps; i++) { + if (xhci->port_caps[i].maj_rev == 0x03 && + xhci->port_caps[i].min_rev >= 0x01) { + usb3_1 = true; + port_cap = &xhci->port_caps[i]; + break; + } + } + + if (usb3_1) { /* does xhci provide a PSI table for SSA speed attributes? */ - if (xhci->usb3_rhub.psi_count) { + if (port_cap->psi_count) { /* two SSA entries for each unique PSI ID, RX and TX */ - ssa_count = xhci->usb3_rhub.psi_uid_count * 2; + ssa_count = port_cap->psi_uid_count * 2; ssa_size = ssa_count * sizeof(u32); ssp_cap_size -= 16; /* skip copying the default SSA */ } desc_size += ssp_cap_size; - usb3_1 = true; } memcpy(buf, &usb_bos_descriptor, min(desc_size, wLength)); @@ -99,7 +108,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, } /* If PSI table exists, add the custom speed attributes from it */ - if (usb3_1 && xhci->usb3_rhub.psi_count) { + if (usb3_1 && port_cap->psi_count) { u32 ssp_cap_base, bm_attrib, psi, psi_mant, psi_exp; int offset; @@ -111,7 +120,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, /* attribute count SSAC bits 4:0 and ID count SSIC bits 8:5 */ bm_attrib = (ssa_count - 1) & 0x1f; - bm_attrib |= (xhci->usb3_rhub.psi_uid_count - 1) << 5; + bm_attrib |= (port_cap->psi_uid_count - 1) << 5; put_unaligned_le32(bm_attrib, &buf[ssp_cap_base + 4]); if (wLength < desc_size + ssa_size) @@ -124,8 +133,8 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, * USB 3.1 requires two SSA entries (RX and TX) for every link */ offset = desc_size; - for (i = 0; i < xhci->usb3_rhub.psi_count; i++) { - psi = xhci->usb3_rhub.psi[i]; + for (i = 0; i < port_cap->psi_count; i++) { + psi = port_cap->psi[i]; psi &= ~USB_SSP_SUBLINK_SPEED_RSVD; psi_exp = XHCI_EXT_PORT_PSIE(psi); psi_mant = XHCI_EXT_PORT_PSIM(psi); diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 0e2701649369..bd5b152df6c0 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1915,17 +1915,16 @@ no_bw: xhci->usb3_rhub.num_ports = 0; xhci->num_active_eps = 0; kfree(xhci->usb2_rhub.ports); - kfree(xhci->usb2_rhub.psi); kfree(xhci->usb3_rhub.ports); - kfree(xhci->usb3_rhub.psi); kfree(xhci->hw_ports); kfree(xhci->rh_bw); kfree(xhci->ext_caps); + for (i = 0; i < xhci->num_port_caps; i++) + kfree(xhci->port_caps[i].psi); + kfree(xhci->port_caps); xhci->usb2_rhub.ports = NULL; - xhci->usb2_rhub.psi = NULL; xhci->usb3_rhub.ports = NULL; - xhci->usb3_rhub.psi = NULL; xhci->hw_ports = NULL; xhci->rh_bw = NULL; xhci->ext_caps = NULL; @@ -2126,6 +2125,7 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, u8 major_revision, minor_revision; struct xhci_hub *rhub; struct device *dev = xhci_to_hcd(xhci)->self.sysdev; + struct xhci_port_cap *port_cap; temp = readl(addr); major_revision = XHCI_EXT_PORT_MAJOR(temp); @@ -2160,31 +2160,39 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, /* WTF? "Valid values are ‘1’ to MaxPorts" */ return; - rhub->psi_count = XHCI_EXT_PORT_PSIC(temp); - if (rhub->psi_count) { - rhub->psi = kcalloc_node(rhub->psi_count, sizeof(*rhub->psi), - GFP_KERNEL, dev_to_node(dev)); - if (!rhub->psi) - rhub->psi_count = 0; + port_cap = &xhci->port_caps[xhci->num_port_caps++]; + if (xhci->num_port_caps > max_caps) + return; + + port_cap->maj_rev = major_revision; + port_cap->min_rev = minor_revision; + port_cap->psi_count = XHCI_EXT_PORT_PSIC(temp); + + if (port_cap->psi_count) { + port_cap->psi = kcalloc_node(port_cap->psi_count, + sizeof(*port_cap->psi), + GFP_KERNEL, dev_to_node(dev)); + if (!port_cap->psi) + port_cap->psi_count = 0; - rhub->psi_uid_count++; - for (i = 0; i < rhub->psi_count; i++) { - rhub->psi[i] = readl(addr + 4 + i); + port_cap->psi_uid_count++; + for (i = 0; i < port_cap->psi_count; i++) { + port_cap->psi[i] = readl(addr + 4 + i); /* count unique ID values, two consecutive entries can * have the same ID if link is assymetric */ - if (i && (XHCI_EXT_PORT_PSIV(rhub->psi[i]) != - XHCI_EXT_PORT_PSIV(rhub->psi[i - 1]))) - rhub->psi_uid_count++; + if (i && (XHCI_EXT_PORT_PSIV(port_cap->psi[i]) != + XHCI_EXT_PORT_PSIV(port_cap->psi[i - 1]))) + port_cap->psi_uid_count++; xhci_dbg(xhci, "PSIV:%d PSIE:%d PLT:%d PFD:%d LP:%d PSIM:%d\n", - XHCI_EXT_PORT_PSIV(rhub->psi[i]), - XHCI_EXT_PORT_PSIE(rhub->psi[i]), - XHCI_EXT_PORT_PLT(rhub->psi[i]), - XHCI_EXT_PORT_PFD(rhub->psi[i]), - XHCI_EXT_PORT_LP(rhub->psi[i]), - XHCI_EXT_PORT_PSIM(rhub->psi[i])); + XHCI_EXT_PORT_PSIV(port_cap->psi[i]), + XHCI_EXT_PORT_PSIE(port_cap->psi[i]), + XHCI_EXT_PORT_PLT(port_cap->psi[i]), + XHCI_EXT_PORT_PFD(port_cap->psi[i]), + XHCI_EXT_PORT_LP(port_cap->psi[i]), + XHCI_EXT_PORT_PSIM(port_cap->psi[i])); } } /* cache usb2 port capabilities */ @@ -2219,6 +2227,7 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, continue; } hw_port->rhub = rhub; + hw_port->port_cap = port_cap; rhub->num_ports++; } /* FIXME: Should we disable ports not in the Extended Capabilities? */ @@ -2309,6 +2318,11 @@ static int xhci_setup_port_arrays(struct xhci_hcd *xhci, gfp_t flags) if (!xhci->ext_caps) return -ENOMEM; + xhci->port_caps = kcalloc_node(cap_count, sizeof(*xhci->port_caps), + flags, dev_to_node(dev)); + if (!xhci->port_caps) + return -ENOMEM; + offset = cap_start; while (offset) { diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 13d8838cd552..3ecee10fdcdc 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1702,12 +1702,20 @@ struct xhci_bus_state { * Intel Lynx Point LP xHCI host. */ #define XHCI_MAX_REXIT_TIMEOUT_MS 20 +struct xhci_port_cap { + u32 *psi; /* array of protocol speed ID entries */ + u8 psi_count; + u8 psi_uid_count; + u8 maj_rev; + u8 min_rev; +}; struct xhci_port { __le32 __iomem *addr; int hw_portnum; int hcd_portnum; struct xhci_hub *rhub; + struct xhci_port_cap *port_cap; }; struct xhci_hub { @@ -1719,9 +1727,6 @@ struct xhci_hub { /* supported prococol extended capabiliy values */ u8 maj_rev; u8 min_rev; - u32 *psi; /* array of protocol speed ID entries */ - u8 psi_count; - u8 psi_uid_count; }; /* There is one xhci_hcd structure per controller */ @@ -1880,6 +1885,9 @@ struct xhci_hcd { /* cached usb2 extened protocol capabilites */ u32 *ext_caps; unsigned int num_ext_caps; + /* cached extended protocol port capabilities */ + struct xhci_port_cap *port_caps; + unsigned int num_port_caps; /* Compliance Mode Recovery Data */ struct timer_list comp_mode_recovery_timer; u32 port_status_u0; -- cgit v1.2.3 From 024d411e9c5d49eb96c825af52a3ce2682895676 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 10 Feb 2020 15:45:52 +0200 Subject: xhci: fix runtime pm enabling for quirky Intel hosts Intel hosts that need the XHCI_PME_STUCK_QUIRK flag should enable runtime pm by calling xhci_pme_acpi_rtd3_enable() before usb_hcd_pci_probe() calls pci_dev_run_wake(). Otherwise usage count for the device won't be decreased, and runtime suspend is prevented. usb_hcd_pci_probe() only decreases the usage count if device can generate run-time wake-up events, i.e. when pci_dev_run_wake() returns true. This issue was exposed by pci_dev_run_wake() change in commit 8feaec33b986 ("PCI / PM: Always check PME wakeup capability for runtime wakeup support") and should be backported to kernels with that change Cc: # 4.13+ Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20200210134553.9144-4-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 4917c5b033fa..da7c2db41671 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -302,6 +302,9 @@ static int xhci_pci_setup(struct usb_hcd *hcd) if (!usb_hcd_is_primary_hcd(hcd)) return 0; + if (xhci->quirks & XHCI_PME_STUCK_QUIRK) + xhci_pme_acpi_rtd3_enable(pdev); + xhci_dbg(xhci, "Got SBRN %u\n", (unsigned int) xhci->sbrn); /* Find any debug ports */ @@ -359,9 +362,6 @@ static int xhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) HCC_MAX_PSA(xhci->hcc_params) >= 4) xhci->shared_hcd->can_do_streams = 1; - if (xhci->quirks & XHCI_PME_STUCK_QUIRK) - xhci_pme_acpi_rtd3_enable(dev); - /* USB-2 and USB-3 roothubs initialized, allow runtime pm suspend */ pm_runtime_put_noidle(&dev->dev); -- cgit v1.2.3 From a3ae87dce3a5abe0b57c811bab02b2564b574106 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 10 Feb 2020 15:45:53 +0200 Subject: xhci: apply XHCI_PME_STUCK_QUIRK to Intel Comet Lake platforms Intel Comet Lake based platform require the XHCI_PME_STUCK_QUIRK quirk as well. Without this xHC can not enter D3 in runtime suspend. Cc: stable@vger.kernel.org Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20200210134553.9144-5-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-pci.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index da7c2db41671..5e9b537df631 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -49,6 +49,7 @@ #define PCI_DEVICE_ID_INTEL_TITAN_RIDGE_4C_XHCI 0x15ec #define PCI_DEVICE_ID_INTEL_TITAN_RIDGE_DD_XHCI 0x15f0 #define PCI_DEVICE_ID_INTEL_ICE_LAKE_XHCI 0x8a13 +#define PCI_DEVICE_ID_INTEL_CML_XHCI 0xa3af #define PCI_DEVICE_ID_AMD_PROMONTORYA_4 0x43b9 #define PCI_DEVICE_ID_AMD_PROMONTORYA_3 0x43ba @@ -187,7 +188,8 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) pdev->device == PCI_DEVICE_ID_INTEL_BROXTON_M_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_BROXTON_B_XHCI || pdev->device == PCI_DEVICE_ID_INTEL_APL_XHCI || - pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI)) { + pdev->device == PCI_DEVICE_ID_INTEL_DNV_XHCI || + pdev->device == PCI_DEVICE_ID_INTEL_CML_XHCI)) { xhci->quirks |= XHCI_PME_STUCK_QUIRK; } if (pdev->vendor == PCI_VENDOR_ID_INTEL && -- cgit v1.2.3 From a1147b8281bda99bda99892233e1900329a9cbf1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Feb 2020 09:01:52 -0500 Subject: NFS: Fix up directory verifier races In order to avoid having our dentry revalidation race with an update of the directory on the server, we need to store the verifier before the RPC calls to LOOKUP and READDIR. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1320288ff9ec..b4e7558e42ab 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -155,6 +155,7 @@ typedef struct { loff_t current_index; decode_dirent_t decode; + unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; unsigned int cache_entry_index; @@ -353,6 +354,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); + desc->dir_verifier = nfs_save_change_attribute(inode); error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -455,13 +457,13 @@ void nfs_force_use_readdirplus(struct inode *dir) } static -void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, + unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; - struct inode *dir = d_inode(parent); struct inode *inode; int status; @@ -500,7 +502,7 @@ again: if (nfs_same_file(dentry, entry)) { if (!entry->fh->size) goto out; - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label); @@ -526,7 +528,7 @@ again: dput(dentry); dentry = alias; } - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); out: dput(dentry); } @@ -564,7 +566,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en count++; if (desc->plus) - nfs_prime_dcache(file_dentry(desc->file), entry); + nfs_prime_dcache(file_dentry(desc->file), entry, + desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); if (status != 0) @@ -1159,6 +1162,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, struct nfs_fh *fhandle; struct nfs_fattr *fattr; struct nfs4_label *label; + unsigned long dir_verifier; int ret; ret = -ENOMEM; @@ -1168,6 +1172,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, if (fhandle == NULL || fattr == NULL || IS_ERR(label)) goto out; + dir_verifier = nfs_save_change_attribute(dir); ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label); if (ret < 0) { switch (ret) { @@ -1188,7 +1193,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, goto out; nfs_setsecurity(inode, fattr, label); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); /* set a readdirplus hint that we had a cache miss */ nfs_force_use_readdirplus(dir); @@ -1415,6 +1420,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; struct nfs4_label *label = NULL; + unsigned long dir_verifier; int error; dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); @@ -1440,6 +1446,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(label)) goto out; + dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label); if (error == -ENOENT) @@ -1463,7 +1470,7 @@ no_entry: goto out_label; dentry = res; } - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); out_label: trace_nfs_lookup_exit(dir, dentry, flags, error); nfs4_label_free(label); -- cgit v1.2.3 From cf5b4059ba7197d6cef9c0e024979d178ed8c8ec Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Feb 2020 09:01:53 -0500 Subject: NFSv4: Fix races between open and dentry revalidation We want to make sure that we revalidate the dentry if and only if we've done an OPEN by filename. In order to avoid races with remote changes to the directory on the server, we want to save the verifier before calling OPEN. The exception is if the server returned a delegation with our OPEN, as we then know that the filename can't have changed on the server. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs4file.c | 1 - fs/nfs/nfs4proc.c | 18 ++++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index be4eb720d5b6..1297919e0fce 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -87,7 +87,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (inode != d_inode(dentry)) goto out_drop; - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_file_set_open_context(filp, ctx); nfs_fscache_open_file(inode, filp); err = 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 95d07a3dc5d1..6616a575711e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2974,10 +2974,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, struct dentry *dentry; struct nfs4_state *state; fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx); + struct inode *dir = d_inode(opendata->dir); + unsigned long dir_verifier; unsigned int seq; int ret; seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + dir_verifier = nfs_save_change_attribute(dir); ret = _nfs4_proc_open(opendata, ctx); if (ret != 0) @@ -3005,8 +3008,19 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, dput(ctx->dentry); ctx->dentry = dentry = alias; } - nfs_set_verifier(dentry, - nfs_save_change_attribute(d_inode(opendata->dir))); + } + + switch(opendata->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + if (!opendata->rpc_done) + break; + if (opendata->o_res.delegation_type != 0) + dir_verifier = nfs_save_change_attribute(dir); + nfs_set_verifier(dentry, dir_verifier); } /* Parse layoutget results before we check for access */ -- cgit v1.2.3 From 93134df520f23f4e9998c425b8987edca7016817 Mon Sep 17 00:00:00 2001 From: Malcolm Priestley Date: Tue, 4 Feb 2020 19:34:02 +0000 Subject: staging: vt6656: fix sign of rx_dbm to bb_pre_ed_rssi. bb_pre_ed_rssi is an u8 rx_dm always returns negative signed values add minus operator to always yield positive. fixes issue where rx sensitivity is always set to maximum because the unsigned numbers were always greater then 100. Fixes: 63b9907f58f1 ("staging: vt6656: mac80211 conversion: create rx function.") Cc: stable Signed-off-by: Malcolm Priestley Link: https://lore.kernel.org/r/aceac98c-6e69-3ce1-dfec-2bf27b980221@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/vt6656/dpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/vt6656/dpc.c b/drivers/staging/vt6656/dpc.c index 821aae8ca402..a0b60e7d1086 100644 --- a/drivers/staging/vt6656/dpc.c +++ b/drivers/staging/vt6656/dpc.c @@ -98,7 +98,7 @@ int vnt_rx_data(struct vnt_private *priv, struct vnt_rcb *ptr_rcb, vnt_rf_rssi_to_dbm(priv, tail->rssi, &rx_dbm); - priv->bb_pre_ed_rssi = (u8)rx_dbm + 1; + priv->bb_pre_ed_rssi = (u8)-rx_dbm + 1; priv->current_rssi = priv->bb_pre_ed_rssi; skb_pull(skb, sizeof(*head)); -- cgit v1.2.3 From 6d67b0290b4b84c477e6a2fc6e005e174d3c7786 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Jan 2020 15:56:16 -0800 Subject: staging: android: ashmem: Disallow ashmem memory from being remapped When ashmem file is mmapped, the resulting vma->vm_file points to the backing shmem file with the generic fops that do not check ashmem permissions like fops of ashmem do. If an mremap is done on the ashmem region, then the permission checks will be skipped. Fix that by disallowing mapping operation on the backing shmem file. Reported-by: Jann Horn Signed-off-by: Suren Baghdasaryan Cc: stable # 4.4,4.9,4.14,4.18,5.4 Signed-off-by: Todd Kjos Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20200127235616.48920-1-tkjos@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/ashmem.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 5891d0744a76..8044510d8ec6 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -351,8 +351,23 @@ static inline vm_flags_t calc_vm_may_flags(unsigned long prot) _calc_vm_trans(prot, PROT_EXEC, VM_MAYEXEC); } +static int ashmem_vmfile_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* do not allow to mmap ashmem backing shmem file directly */ + return -EPERM; +} + +static unsigned long +ashmem_vmfile_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} + static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) { + static struct file_operations vmfile_fops; struct ashmem_area *asma = file->private_data; int ret = 0; @@ -393,6 +408,19 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) } vmfile->f_mode |= FMODE_LSEEK; asma->file = vmfile; + /* + * override mmap operation of the vmfile so that it can't be + * remapped which would lead to creation of a new vma with no + * asma permission checks. Have to override get_unmapped_area + * as well to prevent VM_BUG_ON check for f_ops modification. + */ + if (!vmfile_fops.mmap) { + vmfile_fops = *vmfile->f_op; + vmfile_fops.mmap = ashmem_vmfile_mmap; + vmfile_fops.get_unmapped_area = + ashmem_vmfile_get_unmapped_area; + } + vmfile->f_op = &vmfile_fops; } get_file(asma->file); -- cgit v1.2.3 From 8ae9a588ca35eb9c32dc03299c5e1f4a1e9a9617 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 26 Jan 2020 22:05:49 +0000 Subject: staging: rtl8723bs: fix copy of overlapping memory Currently the rtw_sprintf prints the contents of thread_name onto thread_name and this can lead to a potential copy of a string over itself. Avoid this by printing the literal string RTWHALXT instread of the contents of thread_name. Addresses-Coverity: ("copy of overlapping memory") Fixes: 554c0a3abf21 ("staging: Add rtl8723bs sdio wifi driver") Signed-off-by: Colin Ian King Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20200126220549.9849-1-colin.king@canonical.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c index b44e902ed338..b6d56cfb0a19 100644 --- a/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c +++ b/drivers/staging/rtl8723bs/hal/rtl8723bs_xmit.c @@ -476,14 +476,13 @@ int rtl8723bs_xmit_thread(void *context) s32 ret; struct adapter *padapter; struct xmit_priv *pxmitpriv; - u8 thread_name[20] = "RTWHALXT"; - + u8 thread_name[20]; ret = _SUCCESS; padapter = context; pxmitpriv = &padapter->xmitpriv; - rtw_sprintf(thread_name, 20, "%s-"ADPT_FMT, thread_name, ADPT_ARG(padapter)); + rtw_sprintf(thread_name, 20, "RTWHALXT-" ADPT_FMT, ADPT_ARG(padapter)); thread_enter(thread_name); DBG_871X("start "FUNC_ADPT_FMT"\n", FUNC_ADPT_ARG(padapter)); -- cgit v1.2.3 From c3709b3285009e0c1448510b9460e96146cd5c9a Mon Sep 17 00:00:00 2001 From: Alistair Delva Date: Sun, 2 Feb 2020 20:22:54 -0800 Subject: staging: android: Delete the 'vsoc' driver The 'vsoc' driver was required for an early iteration of the Android 'cuttlefish' virtual platform, but this platform has been wholly converted to use virtio drivers instead. Delete this old driver. Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Greg Hartman Cc: kernel-team@android.com Cc: devel@driverdev.osuosl.org Signed-off-by: Alistair Delva Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20200203042254.80360-1-adelva@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/Kconfig | 8 - drivers/staging/android/Makefile | 1 - drivers/staging/android/TODO | 9 - drivers/staging/android/uapi/vsoc_shm.h | 295 -------- drivers/staging/android/vsoc.c | 1149 ------------------------------- 5 files changed, 1462 deletions(-) delete mode 100644 drivers/staging/android/uapi/vsoc_shm.h delete mode 100644 drivers/staging/android/vsoc.c diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig index d6d605d5cbde..8d8fd5c29349 100644 --- a/drivers/staging/android/Kconfig +++ b/drivers/staging/android/Kconfig @@ -14,14 +14,6 @@ config ASHMEM It is, in theory, a good memory allocator for low-memory devices, because it can discard shared memory units when under memory pressure. -config ANDROID_VSOC - tristate "Android Virtual SoC support" - depends on PCI_MSI - help - This option adds support for the Virtual SoC driver needed to boot - a 'cuttlefish' Android image inside QEmu. The driver interacts with - a QEmu ivshmem device. If built as a module, it will be called vsoc. - source "drivers/staging/android/ion/Kconfig" endif # if ANDROID diff --git a/drivers/staging/android/Makefile b/drivers/staging/android/Makefile index 14bd9c6ce10d..3b66cd0b0ec5 100644 --- a/drivers/staging/android/Makefile +++ b/drivers/staging/android/Makefile @@ -4,4 +4,3 @@ ccflags-y += -I$(src) # needed for trace events obj-y += ion/ obj-$(CONFIG_ASHMEM) += ashmem.o -obj-$(CONFIG_ANDROID_VSOC) += vsoc.o diff --git a/drivers/staging/android/TODO b/drivers/staging/android/TODO index 767dd98fd92d..80eccfaf6db5 100644 --- a/drivers/staging/android/TODO +++ b/drivers/staging/android/TODO @@ -9,14 +9,5 @@ ion/ - Split /dev/ion up into multiple nodes (e.g. /dev/ion/heap0) - Better test framework (integration with VGEM was suggested) -vsoc.c, uapi/vsoc_shm.h - - The current driver uses the same wait queue for all of the futexes in a - region. This will cause false wakeups in regions with a large number of - waiting threads. We should eventually use multiple queues and select the - queue based on the region. - - Add debugfs support for examining the permissions of regions. - - Remove VSOC_WAIT_FOR_INCOMING_INTERRUPT ioctl. This functionality has been - superseded by the futex and is there for legacy reasons. - Please send patches to Greg Kroah-Hartman and Cc: Arve Hjønnevåg and Riley Andrews diff --git a/drivers/staging/android/uapi/vsoc_shm.h b/drivers/staging/android/uapi/vsoc_shm.h deleted file mode 100644 index 6291fb24efb2..000000000000 --- a/drivers/staging/android/uapi/vsoc_shm.h +++ /dev/null @@ -1,295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2017 Google, Inc. - * - */ - -#ifndef _UAPI_LINUX_VSOC_SHM_H -#define _UAPI_LINUX_VSOC_SHM_H - -#include - -/** - * A permission is a token that permits a receiver to read and/or write an area - * of memory within a Vsoc region. - * - * An fd_scoped permission grants both read and write access, and can be - * attached to a file description (see open(2)). - * Ownership of the area can then be shared by passing a file descriptor - * among processes. - * - * begin_offset and end_offset define the area of memory that is controlled by - * the permission. owner_offset points to a word, also in shared memory, that - * controls ownership of the area. - * - * ownership of the region expires when the associated file description is - * released. - * - * At most one permission can be attached to each file description. - * - * This is useful when implementing HALs like gralloc that scope and pass - * ownership of shared resources via file descriptors. - * - * The caller is responsibe for doing any fencing. - * - * The calling process will normally identify a currently free area of - * memory. It will construct a proposed fd_scoped_permission_arg structure: - * - * begin_offset and end_offset describe the area being claimed - * - * owner_offset points to the location in shared memory that indicates the - * owner of the area. - * - * owned_value is the value that will be stored in owner_offset iff the - * permission can be granted. It must be different than VSOC_REGION_FREE. - * - * Two fd_scoped_permission structures are compatible if they vary only by - * their owned_value fields. - * - * The driver ensures that, for any group of simultaneous callers proposing - * compatible fd_scoped_permissions, it will accept exactly one of the - * propopsals. The other callers will get a failure with errno of EAGAIN. - * - * A process receiving a file descriptor can identify the region being - * granted using the VSOC_GET_FD_SCOPED_PERMISSION ioctl. - */ -struct fd_scoped_permission { - __u32 begin_offset; - __u32 end_offset; - __u32 owner_offset; - __u32 owned_value; -}; - -/* - * This value represents a free area of memory. The driver expects to see this - * value at owner_offset when creating a permission otherwise it will not do it, - * and will write this value back once the permission is no longer needed. - */ -#define VSOC_REGION_FREE ((__u32)0) - -/** - * ioctl argument for VSOC_CREATE_FD_SCOPE_PERMISSION - */ -struct fd_scoped_permission_arg { - struct fd_scoped_permission perm; - __s32 managed_region_fd; -}; - -#define VSOC_NODE_FREE ((__u32)0) - -/* - * Describes a signal table in shared memory. Each non-zero entry in the - * table indicates that the receiver should signal the futex at the given - * offset. Offsets are relative to the region, not the shared memory window. - * - * interrupt_signalled_offset is used to reliably signal interrupts across the - * vmm boundary. There are two roles: transmitter and receiver. For example, - * in the host_to_guest_signal_table the host is the transmitter and the - * guest is the receiver. The protocol is as follows: - * - * 1. The transmitter should convert the offset of the futex to an offset - * in the signal table [0, (1 << num_nodes_lg2)) - * The transmitter can choose any appropriate hashing algorithm, including - * hash = futex_offset & ((1 << num_nodes_lg2) - 1) - * - * 3. The transmitter should atomically compare and swap futex_offset with 0 - * at hash. There are 3 possible outcomes - * a. The swap fails because the futex_offset is already in the table. - * The transmitter should stop. - * b. Some other offset is in the table. This is a hash collision. The - * transmitter should move to another table slot and try again. One - * possible algorithm: - * hash = (hash + 1) & ((1 << num_nodes_lg2) - 1) - * c. The swap worked. Continue below. - * - * 3. The transmitter atomically swaps 1 with the value at the - * interrupt_signalled_offset. There are two outcomes: - * a. The prior value was 1. In this case an interrupt has already been - * posted. The transmitter is done. - * b. The prior value was 0, indicating that the receiver may be sleeping. - * The transmitter will issue an interrupt. - * - * 4. On waking the receiver immediately exchanges a 0 with the - * interrupt_signalled_offset. If it receives a 0 then this a spurious - * interrupt. That may occasionally happen in the current protocol, but - * should be rare. - * - * 5. The receiver scans the signal table by atomicaly exchanging 0 at each - * location. If a non-zero offset is returned from the exchange the - * receiver wakes all sleepers at the given offset: - * futex((int*)(region_base + old_value), FUTEX_WAKE, MAX_INT); - * - * 6. The receiver thread then does a conditional wait, waking immediately - * if the value at interrupt_signalled_offset is non-zero. This catches cases - * here additional signals were posted while the table was being scanned. - * On the guest the wait is handled via the VSOC_WAIT_FOR_INCOMING_INTERRUPT - * ioctl. - */ -struct vsoc_signal_table_layout { - /* log_2(Number of signal table entries) */ - __u32 num_nodes_lg2; - /* - * Offset to the first signal table entry relative to the start of the - * region - */ - __u32 futex_uaddr_table_offset; - /* - * Offset to an atomic_t / atomic uint32_t. A non-zero value indicates - * that one or more offsets are currently posted in the table. - * semi-unique access to an entry in the table - */ - __u32 interrupt_signalled_offset; -}; - -#define VSOC_REGION_WHOLE ((__s32)0) -#define VSOC_DEVICE_NAME_SZ 16 - -/** - * Each HAL would (usually) talk to a single device region - * Mulitple entities care about these regions: - * - The ivshmem_server will populate the regions in shared memory - * - The guest kernel will read the region, create minor device nodes, and - * allow interested parties to register for FUTEX_WAKE events in the region - * - HALs will access via the minor device nodes published by the guest kernel - * - Host side processes will access the region via the ivshmem_server: - * 1. Pass name to ivshmem_server at a UNIX socket - * 2. ivshmemserver will reply with 2 fds: - * - host->guest doorbell fd - * - guest->host doorbell fd - * - fd for the shared memory region - * - region offset - * 3. Start a futex receiver thread on the doorbell fd pointed at the - * signal_nodes - */ -struct vsoc_device_region { - __u16 current_version; - __u16 min_compatible_version; - __u32 region_begin_offset; - __u32 region_end_offset; - __u32 offset_of_region_data; - struct vsoc_signal_table_layout guest_to_host_signal_table; - struct vsoc_signal_table_layout host_to_guest_signal_table; - /* Name of the device. Must always be terminated with a '\0', so - * the longest supported device name is 15 characters. - */ - char device_name[VSOC_DEVICE_NAME_SZ]; - /* There are two ways that permissions to access regions are handled: - * - When subdivided_by is VSOC_REGION_WHOLE, any process that can - * open the device node for the region gains complete access to it. - * - When subdivided is set processes that open the region cannot - * access it. Access to a sub-region must be established by invoking - * the VSOC_CREATE_FD_SCOPE_PERMISSION ioctl on the region - * referenced in subdivided_by, providing a fileinstance - * (represented by a fd) opened on this region. - */ - __u32 managed_by; -}; - -/* - * The vsoc layout descriptor. - * The first 4K should be reserved for the shm header and region descriptors. - * The regions should be page aligned. - */ - -struct vsoc_shm_layout_descriptor { - __u16 major_version; - __u16 minor_version; - - /* size of the shm. This may be redundant but nice to have */ - __u32 size; - - /* number of shared memory regions */ - __u32 region_count; - - /* The offset to the start of region descriptors */ - __u32 vsoc_region_desc_offset; -}; - -/* - * This specifies the current version that should be stored in - * vsoc_shm_layout_descriptor.major_version and - * vsoc_shm_layout_descriptor.minor_version. - * It should be updated only if the vsoc_device_region and - * vsoc_shm_layout_descriptor structures have changed. - * Versioning within each region is transferred - * via the min_compatible_version and current_version fields in - * vsoc_device_region. The driver does not consult these fields: they are left - * for the HALs and host processes and will change independently of the layout - * version. - */ -#define CURRENT_VSOC_LAYOUT_MAJOR_VERSION 2 -#define CURRENT_VSOC_LAYOUT_MINOR_VERSION 0 - -#define VSOC_CREATE_FD_SCOPED_PERMISSION \ - _IOW(0xF5, 0, struct fd_scoped_permission) -#define VSOC_GET_FD_SCOPED_PERMISSION _IOR(0xF5, 1, struct fd_scoped_permission) - -/* - * This is used to signal the host to scan the guest_to_host_signal_table - * for new futexes to wake. This sends an interrupt if one is not already - * in flight. - */ -#define VSOC_MAYBE_SEND_INTERRUPT_TO_HOST _IO(0xF5, 2) - -/* - * When this returns the guest will scan host_to_guest_signal_table to - * check for new futexes to wake. - */ -/* TODO(ghartman): Consider moving this to the bottom half */ -#define VSOC_WAIT_FOR_INCOMING_INTERRUPT _IO(0xF5, 3) - -/* - * Guest HALs will use this to retrieve the region description after - * opening their device node. - */ -#define VSOC_DESCRIBE_REGION _IOR(0xF5, 4, struct vsoc_device_region) - -/* - * Wake any threads that may be waiting for a host interrupt on this region. - * This is mostly used during shutdown. - */ -#define VSOC_SELF_INTERRUPT _IO(0xF5, 5) - -/* - * This is used to signal the host to scan the guest_to_host_signal_table - * for new futexes to wake. This sends an interrupt unconditionally. - */ -#define VSOC_SEND_INTERRUPT_TO_HOST _IO(0xF5, 6) - -enum wait_types { - VSOC_WAIT_UNDEFINED = 0, - VSOC_WAIT_IF_EQUAL = 1, - VSOC_WAIT_IF_EQUAL_TIMEOUT = 2 -}; - -/* - * Wait for a condition to be true - * - * Note, this is sized and aligned so the 32 bit and 64 bit layouts are - * identical. - */ -struct vsoc_cond_wait { - /* Input: Offset of the 32 bit word to check */ - __u32 offset; - /* Input: Value that will be compared with the offset */ - __u32 value; - /* Monotonic time to wake at in seconds */ - __u64 wake_time_sec; - /* Input: Monotonic time to wait in nanoseconds */ - __u32 wake_time_nsec; - /* Input: Type of wait */ - __u32 wait_type; - /* Output: Number of times the thread woke before returning. */ - __u32 wakes; - /* Ensure that we're 8-byte aligned and 8 byte length for 32/64 bit - * compatibility. - */ - __u32 reserved_1; -}; - -#define VSOC_COND_WAIT _IOWR(0xF5, 7, struct vsoc_cond_wait) - -/* Wake any local threads waiting at the offset given in arg */ -#define VSOC_COND_WAKE _IO(0xF5, 8) - -#endif /* _UAPI_LINUX_VSOC_SHM_H */ diff --git a/drivers/staging/android/vsoc.c b/drivers/staging/android/vsoc.c deleted file mode 100644 index 1240bb0317d9..000000000000 --- a/drivers/staging/android/vsoc.c +++ /dev/null @@ -1,1149 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * drivers/android/staging/vsoc.c - * - * Android Virtual System on a Chip (VSoC) driver - * - * Copyright (C) 2017 Google, Inc. - * - * Author: ghartman@google.com - * - * Based on drivers/char/kvm_ivshmem.c - driver for KVM Inter-VM shared memory - * Copyright 2009 Cam Macdonell - * - * Based on cirrusfb.c and 8139cp.c: - * Copyright 1999-2001 Jeff Garzik - * Copyright 2001-2004 Jeff Garzik - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "uapi/vsoc_shm.h" - -#define VSOC_DEV_NAME "vsoc" - -/* - * Description of the ivshmem-doorbell PCI device used by QEmu. These - * constants follow docs/specs/ivshmem-spec.txt, which can be found in - * the QEmu repository. This was last reconciled with the version that - * came out with 2.8 - */ - -/* - * These constants are determined KVM Inter-VM shared memory device - * register offsets - */ -enum { - INTR_MASK = 0x00, /* Interrupt Mask */ - INTR_STATUS = 0x04, /* Interrupt Status */ - IV_POSITION = 0x08, /* VM ID */ - DOORBELL = 0x0c, /* Doorbell */ -}; - -static const int REGISTER_BAR; /* Equal to 0 */ -static const int MAX_REGISTER_BAR_LEN = 0x100; -/* - * The MSI-x BAR is not used directly. - * - * static const int MSI_X_BAR = 1; - */ -static const int SHARED_MEMORY_BAR = 2; - -struct vsoc_region_data { - char name[VSOC_DEVICE_NAME_SZ + 1]; - wait_queue_head_t interrupt_wait_queue; - /* TODO(b/73664181): Use multiple futex wait queues */ - wait_queue_head_t futex_wait_queue; - /* Flag indicating that an interrupt has been signalled by the host. */ - atomic_t *incoming_signalled; - /* Flag indicating the guest has signalled the host. */ - atomic_t *outgoing_signalled; - bool irq_requested; - bool device_created; -}; - -struct vsoc_device { - /* Kernel virtual address of REGISTER_BAR. */ - void __iomem *regs; - /* Physical address of SHARED_MEMORY_BAR. */ - phys_addr_t shm_phys_start; - /* Kernel virtual address of SHARED_MEMORY_BAR. */ - void __iomem *kernel_mapped_shm; - /* Size of the entire shared memory window in bytes. */ - size_t shm_size; - /* - * Pointer to the virtual address of the shared memory layout structure. - * This is probably identical to kernel_mapped_shm, but saving this - * here saves a lot of annoying casts. - */ - struct vsoc_shm_layout_descriptor *layout; - /* - * Points to a table of region descriptors in the kernel's virtual - * address space. Calculated from - * vsoc_shm_layout_descriptor.vsoc_region_desc_offset - */ - struct vsoc_device_region *regions; - /* Head of a list of permissions that have been granted. */ - struct list_head permissions; - struct pci_dev *dev; - /* Per-region (and therefore per-interrupt) information. */ - struct vsoc_region_data *regions_data; - /* - * Table of msi-x entries. This has to be separated from struct - * vsoc_region_data because the kernel deals with them as an array. - */ - struct msix_entry *msix_entries; - /* Mutex that protectes the permission list */ - struct mutex mtx; - /* Major number assigned by the kernel */ - int major; - /* Character device assigned by the kernel */ - struct cdev cdev; - /* Device class assigned by the kernel */ - struct class *class; - /* - * Flags that indicate what we've initialized. These are used to do an - * orderly cleanup of the device. - */ - bool enabled_device; - bool requested_regions; - bool cdev_added; - bool class_added; - bool msix_enabled; -}; - -static struct vsoc_device vsoc_dev; - -/* - * TODO(ghartman): Add a /sys filesystem entry that summarizes the permissions. - */ - -struct fd_scoped_permission_node { - struct fd_scoped_permission permission; - struct list_head list; -}; - -struct vsoc_private_data { - struct fd_scoped_permission_node *fd_scoped_permission_node; -}; - -static long vsoc_ioctl(struct file *, unsigned int, unsigned long); -static int vsoc_mmap(struct file *, struct vm_area_struct *); -static int vsoc_open(struct inode *, struct file *); -static int vsoc_release(struct inode *, struct file *); -static ssize_t vsoc_read(struct file *, char __user *, size_t, loff_t *); -static ssize_t vsoc_write(struct file *, const char __user *, size_t, loff_t *); -static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin); -static int -do_create_fd_scoped_permission(struct vsoc_device_region *region_p, - struct fd_scoped_permission_node *np, - struct fd_scoped_permission_arg __user *arg); -static void -do_destroy_fd_scoped_permission(struct vsoc_device_region *owner_region_p, - struct fd_scoped_permission *perm); -static long do_vsoc_describe_region(struct file *, - struct vsoc_device_region __user *); -static ssize_t vsoc_get_area(struct file *filp, __u32 *perm_off); - -/** - * Validate arguments on entry points to the driver. - */ -inline int vsoc_validate_inode(struct inode *inode) -{ - if (iminor(inode) >= vsoc_dev.layout->region_count) { - dev_err(&vsoc_dev.dev->dev, - "describe_region: invalid region %d\n", iminor(inode)); - return -ENODEV; - } - return 0; -} - -inline int vsoc_validate_filep(struct file *filp) -{ - int ret = vsoc_validate_inode(file_inode(filp)); - - if (ret) - return ret; - if (!filp->private_data) { - dev_err(&vsoc_dev.dev->dev, - "No private data on fd, region %d\n", - iminor(file_inode(filp))); - return -EBADFD; - } - return 0; -} - -/* Converts from shared memory offset to virtual address */ -static inline void *shm_off_to_virtual_addr(__u32 offset) -{ - return (void __force *)vsoc_dev.kernel_mapped_shm + offset; -} - -/* Converts from shared memory offset to physical address */ -static inline phys_addr_t shm_off_to_phys_addr(__u32 offset) -{ - return vsoc_dev.shm_phys_start + offset; -} - -/** - * Convenience functions to obtain the region from the inode or file. - * Dangerous to call before validating the inode/file. - */ -static -inline struct vsoc_device_region *vsoc_region_from_inode(struct inode *inode) -{ - return &vsoc_dev.regions[iminor(inode)]; -} - -static -inline struct vsoc_device_region *vsoc_region_from_filep(struct file *inode) -{ - return vsoc_region_from_inode(file_inode(inode)); -} - -static inline uint32_t vsoc_device_region_size(struct vsoc_device_region *r) -{ - return r->region_end_offset - r->region_begin_offset; -} - -static const struct file_operations vsoc_ops = { - .owner = THIS_MODULE, - .open = vsoc_open, - .mmap = vsoc_mmap, - .read = vsoc_read, - .unlocked_ioctl = vsoc_ioctl, - .compat_ioctl = vsoc_ioctl, - .write = vsoc_write, - .llseek = vsoc_lseek, - .release = vsoc_release, -}; - -static struct pci_device_id vsoc_id_table[] = { - {0x1af4, 0x1110, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - {0}, -}; - -MODULE_DEVICE_TABLE(pci, vsoc_id_table); - -static void vsoc_remove_device(struct pci_dev *pdev); -static int vsoc_probe_device(struct pci_dev *pdev, - const struct pci_device_id *ent); - -static struct pci_driver vsoc_pci_driver = { - .name = "vsoc", - .id_table = vsoc_id_table, - .probe = vsoc_probe_device, - .remove = vsoc_remove_device, -}; - -static int -do_create_fd_scoped_permission(struct vsoc_device_region *region_p, - struct fd_scoped_permission_node *np, - struct fd_scoped_permission_arg __user *arg) -{ - struct file *managed_filp; - s32 managed_fd; - atomic_t *owner_ptr = NULL; - struct vsoc_device_region *managed_region_p; - - if (copy_from_user(&np->permission, - &arg->perm, sizeof(np->permission)) || - copy_from_user(&managed_fd, - &arg->managed_region_fd, sizeof(managed_fd))) { - return -EFAULT; - } - managed_filp = fdget(managed_fd).file; - /* Check that it's a valid fd, */ - if (!managed_filp || vsoc_validate_filep(managed_filp)) - return -EPERM; - /* EEXIST if the given fd already has a permission. */ - if (((struct vsoc_private_data *)managed_filp->private_data)-> - fd_scoped_permission_node) - return -EEXIST; - managed_region_p = vsoc_region_from_filep(managed_filp); - /* Check that the provided region is managed by this one */ - if (&vsoc_dev.regions[managed_region_p->managed_by] != region_p) - return -EPERM; - /* The area must be well formed and have non-zero size */ - if (np->permission.begin_offset >= np->permission.end_offset) - return -EINVAL; - /* The area must fit in the memory window */ - if (np->permission.end_offset > - vsoc_device_region_size(managed_region_p)) - return -ERANGE; - /* The area must be in the region data section */ - if (np->permission.begin_offset < - managed_region_p->offset_of_region_data) - return -ERANGE; - /* The area must be page aligned */ - if (!PAGE_ALIGNED(np->permission.begin_offset) || - !PAGE_ALIGNED(np->permission.end_offset)) - return -EINVAL; - /* Owner offset must be naturally aligned in the window */ - if (np->permission.owner_offset & - (sizeof(np->permission.owner_offset) - 1)) - return -EINVAL; - /* The owner flag must reside in the owner memory */ - if (np->permission.owner_offset + sizeof(np->permission.owner_offset) > - vsoc_device_region_size(region_p)) - return -ERANGE; - /* The owner flag must reside in the data section */ - if (np->permission.owner_offset < region_p->offset_of_region_data) - return -EINVAL; - /* The owner value must change to claim the memory */ - if (np->permission.owned_value == VSOC_REGION_FREE) - return -EINVAL; - owner_ptr = - (atomic_t *)shm_off_to_virtual_addr(region_p->region_begin_offset + - np->permission.owner_offset); - /* We've already verified that this is in the shared memory window, so - * it should be safe to write to this address. - */ - if (atomic_cmpxchg(owner_ptr, - VSOC_REGION_FREE, - np->permission.owned_value) != VSOC_REGION_FREE) { - return -EBUSY; - } - ((struct vsoc_private_data *)managed_filp->private_data)-> - fd_scoped_permission_node = np; - /* The file offset needs to be adjusted if the calling - * process did any read/write operations on the fd - * before creating the permission. - */ - if (managed_filp->f_pos) { - if (managed_filp->f_pos > np->permission.end_offset) { - /* If the offset is beyond the permission end, set it - * to the end. - */ - managed_filp->f_pos = np->permission.end_offset; - } else { - /* If the offset is within the permission interval - * keep it there otherwise reset it to zero. - */ - if (managed_filp->f_pos < np->permission.begin_offset) { - managed_filp->f_pos = 0; - } else { - managed_filp->f_pos -= - np->permission.begin_offset; - } - } - } - return 0; -} - -static void -do_destroy_fd_scoped_permission_node(struct vsoc_device_region *owner_region_p, - struct fd_scoped_permission_node *node) -{ - if (node) { - do_destroy_fd_scoped_permission(owner_region_p, - &node->permission); - mutex_lock(&vsoc_dev.mtx); - list_del(&node->list); - mutex_unlock(&vsoc_dev.mtx); - kfree(node); - } -} - -static void -do_destroy_fd_scoped_permission(struct vsoc_device_region *owner_region_p, - struct fd_scoped_permission *perm) -{ - atomic_t *owner_ptr = NULL; - int prev = 0; - - if (!perm) - return; - owner_ptr = (atomic_t *)shm_off_to_virtual_addr - (owner_region_p->region_begin_offset + perm->owner_offset); - prev = atomic_xchg(owner_ptr, VSOC_REGION_FREE); - if (prev != perm->owned_value) - dev_err(&vsoc_dev.dev->dev, - "%x-%x: owner (%s) %x: expected to be %x was %x", - perm->begin_offset, perm->end_offset, - owner_region_p->device_name, perm->owner_offset, - perm->owned_value, prev); -} - -static long do_vsoc_describe_region(struct file *filp, - struct vsoc_device_region __user *dest) -{ - struct vsoc_device_region *region_p; - int retval = vsoc_validate_filep(filp); - - if (retval) - return retval; - region_p = vsoc_region_from_filep(filp); - if (copy_to_user(dest, region_p, sizeof(*region_p))) - return -EFAULT; - return 0; -} - -/** - * Implements the inner logic of cond_wait. Copies to and from userspace are - * done in the helper function below. - */ -static int handle_vsoc_cond_wait(struct file *filp, struct vsoc_cond_wait *arg) -{ - DEFINE_WAIT(wait); - u32 region_number = iminor(file_inode(filp)); - struct vsoc_region_data *data = vsoc_dev.regions_data + region_number; - struct hrtimer_sleeper timeout, *to = NULL; - int ret = 0; - struct vsoc_device_region *region_p = vsoc_region_from_filep(filp); - atomic_t *address = NULL; - ktime_t wake_time; - - /* Ensure that the offset is aligned */ - if (arg->offset & (sizeof(uint32_t) - 1)) - return -EADDRNOTAVAIL; - /* Ensure that the offset is within shared memory */ - if (((uint64_t)arg->offset) + region_p->region_begin_offset + - sizeof(uint32_t) > region_p->region_end_offset) - return -E2BIG; - address = shm_off_to_virtual_addr(region_p->region_begin_offset + - arg->offset); - - /* Ensure that the type of wait is valid */ - switch (arg->wait_type) { - case VSOC_WAIT_IF_EQUAL: - break; - case VSOC_WAIT_IF_EQUAL_TIMEOUT: - to = &timeout; - break; - default: - return -EINVAL; - } - - if (to) { - /* Copy the user-supplied timesec into the kernel structure. - * We do things this way to flatten differences between 32 bit - * and 64 bit timespecs. - */ - if (arg->wake_time_nsec >= NSEC_PER_SEC) - return -EINVAL; - wake_time = ktime_set(arg->wake_time_sec, arg->wake_time_nsec); - - hrtimer_init_sleeper_on_stack(to, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_set_expires_range_ns(&to->timer, wake_time, - current->timer_slack_ns); - } - - while (1) { - prepare_to_wait(&data->futex_wait_queue, &wait, - TASK_INTERRUPTIBLE); - /* - * Check the sentinel value after prepare_to_wait. If the value - * changes after this check the writer will call signal, - * changing the task state from INTERRUPTIBLE to RUNNING. That - * will ensure that schedule() will eventually schedule this - * task. - */ - if (atomic_read(address) != arg->value) { - ret = 0; - break; - } - if (to) { - hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); - if (likely(to->task)) - freezable_schedule(); - hrtimer_cancel(&to->timer); - if (!to->task) { - ret = -ETIMEDOUT; - break; - } - } else { - freezable_schedule(); - } - /* Count the number of times that we woke up. This is useful - * for unit testing. - */ - ++arg->wakes; - if (signal_pending(current)) { - ret = -EINTR; - break; - } - } - finish_wait(&data->futex_wait_queue, &wait); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret; -} - -/** - * Handles the details of copying from/to userspace to ensure that the copies - * happen on all of the return paths of cond_wait. - */ -static int do_vsoc_cond_wait(struct file *filp, - struct vsoc_cond_wait __user *untrusted_in) -{ - struct vsoc_cond_wait arg; - int rval = 0; - - if (copy_from_user(&arg, untrusted_in, sizeof(arg))) - return -EFAULT; - /* wakes is an out parameter. Initialize it to something sensible. */ - arg.wakes = 0; - rval = handle_vsoc_cond_wait(filp, &arg); - if (copy_to_user(untrusted_in, &arg, sizeof(arg))) - return -EFAULT; - return rval; -} - -static int do_vsoc_cond_wake(struct file *filp, uint32_t offset) -{ - struct vsoc_device_region *region_p = vsoc_region_from_filep(filp); - u32 region_number = iminor(file_inode(filp)); - struct vsoc_region_data *data = vsoc_dev.regions_data + region_number; - /* Ensure that the offset is aligned */ - if (offset & (sizeof(uint32_t) - 1)) - return -EADDRNOTAVAIL; - /* Ensure that the offset is within shared memory */ - if (((uint64_t)offset) + region_p->region_begin_offset + - sizeof(uint32_t) > region_p->region_end_offset) - return -E2BIG; - /* - * TODO(b/73664181): Use multiple futex wait queues. - * We need to wake every sleeper when the condition changes. Typically - * only a single thread will be waiting on the condition, but there - * are exceptions. The worst case is about 10 threads. - */ - wake_up_interruptible_all(&data->futex_wait_queue); - return 0; -} - -static long vsoc_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - int rv = 0; - struct vsoc_device_region *region_p; - u32 reg_num; - struct vsoc_region_data *reg_data; - int retval = vsoc_validate_filep(filp); - - if (retval) - return retval; - region_p = vsoc_region_from_filep(filp); - reg_num = iminor(file_inode(filp)); - reg_data = vsoc_dev.regions_data + reg_num; - switch (cmd) { - case VSOC_CREATE_FD_SCOPED_PERMISSION: - { - struct fd_scoped_permission_node *node = NULL; - - node = kzalloc(sizeof(*node), GFP_KERNEL); - /* We can't allocate memory for the permission */ - if (!node) - return -ENOMEM; - INIT_LIST_HEAD(&node->list); - rv = do_create_fd_scoped_permission - (region_p, - node, - (struct fd_scoped_permission_arg __user *)arg); - if (!rv) { - mutex_lock(&vsoc_dev.mtx); - list_add(&node->list, &vsoc_dev.permissions); - mutex_unlock(&vsoc_dev.mtx); - } else { - kfree(node); - return rv; - } - } - break; - - case VSOC_GET_FD_SCOPED_PERMISSION: - { - struct fd_scoped_permission_node *node = - ((struct vsoc_private_data *)filp->private_data)-> - fd_scoped_permission_node; - if (!node) - return -ENOENT; - if (copy_to_user - ((struct fd_scoped_permission __user *)arg, - &node->permission, sizeof(node->permission))) - return -EFAULT; - } - break; - - case VSOC_MAYBE_SEND_INTERRUPT_TO_HOST: - if (!atomic_xchg(reg_data->outgoing_signalled, 1)) { - writel(reg_num, vsoc_dev.regs + DOORBELL); - return 0; - } else { - return -EBUSY; - } - break; - - case VSOC_SEND_INTERRUPT_TO_HOST: - writel(reg_num, vsoc_dev.regs + DOORBELL); - return 0; - case VSOC_WAIT_FOR_INCOMING_INTERRUPT: - wait_event_interruptible - (reg_data->interrupt_wait_queue, - (atomic_read(reg_data->incoming_signalled) != 0)); - break; - - case VSOC_DESCRIBE_REGION: - return do_vsoc_describe_region - (filp, - (struct vsoc_device_region __user *)arg); - - case VSOC_SELF_INTERRUPT: - atomic_set(reg_data->incoming_signalled, 1); - wake_up_interruptible(®_data->interrupt_wait_queue); - break; - - case VSOC_COND_WAIT: - return do_vsoc_cond_wait(filp, - (struct vsoc_cond_wait __user *)arg); - case VSOC_COND_WAKE: - return do_vsoc_cond_wake(filp, arg); - - default: - return -EINVAL; - } - return 0; -} - -static ssize_t vsoc_read(struct file *filp, char __user *buffer, size_t len, - loff_t *poffset) -{ - __u32 area_off; - const void *area_p; - ssize_t area_len; - int retval = vsoc_validate_filep(filp); - - if (retval) - return retval; - area_len = vsoc_get_area(filp, &area_off); - area_p = shm_off_to_virtual_addr(area_off); - area_p += *poffset; - area_len -= *poffset; - if (area_len <= 0) - return 0; - if (area_len < len) - len = area_len; - if (copy_to_user(buffer, area_p, len)) - return -EFAULT; - *poffset += len; - return len; -} - -static loff_t vsoc_lseek(struct file *filp, loff_t offset, int origin) -{ - ssize_t area_len = 0; - int retval = vsoc_validate_filep(filp); - - if (retval) - return retval; - area_len = vsoc_get_area(filp, NULL); - switch (origin) { - case SEEK_SET: - break; - - case SEEK_CUR: - if (offset > 0 && offset + filp->f_pos < 0) - return -EOVERFLOW; - offset += filp->f_pos; - break; - - case SEEK_END: - if (offset > 0 && offset + area_len < 0) - return -EOVERFLOW; - offset += area_len; - break; - - case SEEK_DATA: - if (offset >= area_len) - return -EINVAL; - if (offset < 0) - offset = 0; - break; - - case SEEK_HOLE: - /* Next hole is always the end of the region, unless offset is - * beyond that - */ - if (offset < area_len) - offset = area_len; - break; - - default: - return -EINVAL; - } - - if (offset < 0 || offset > area_len) - return -EINVAL; - filp->f_pos = offset; - - return offset; -} - -static ssize_t vsoc_write(struct file *filp, const char __user *buffer, - size_t len, loff_t *poffset) -{ - __u32 area_off; - void *area_p; - ssize_t area_len; - int retval = vsoc_validate_filep(filp); - - if (retval) - return retval; - area_len = vsoc_get_area(filp, &area_off); - area_p = shm_off_to_virtual_addr(area_off); - area_p += *poffset; - area_len -= *poffset; - if (area_len <= 0) - return 0; - if (area_len < len) - len = area_len; - if (copy_from_user(area_p, buffer, len)) - return -EFAULT; - *poffset += len; - return len; -} - -static irqreturn_t vsoc_interrupt(int irq, void *region_data_v) -{ - struct vsoc_region_data *region_data = - (struct vsoc_region_data *)region_data_v; - int reg_num = region_data - vsoc_dev.regions_data; - - if (unlikely(!region_data)) - return IRQ_NONE; - - if (unlikely(reg_num < 0 || - reg_num >= vsoc_dev.layout->region_count)) { - dev_err(&vsoc_dev.dev->dev, - "invalid irq @%p reg_num=0x%04x\n", - region_data, reg_num); - return IRQ_NONE; - } - if (unlikely(vsoc_dev.regions_data + reg_num != region_data)) { - dev_err(&vsoc_dev.dev->dev, - "irq not aligned @%p reg_num=0x%04x\n", - region_data, reg_num); - return IRQ_NONE; - } - wake_up_interruptible(®ion_data->interrupt_wait_queue); - return IRQ_HANDLED; -} - -static int vsoc_probe_device(struct pci_dev *pdev, - const struct pci_device_id *ent) -{ - int result; - int i; - resource_size_t reg_size; - dev_t devt; - - vsoc_dev.dev = pdev; - result = pci_enable_device(pdev); - if (result) { - dev_err(&pdev->dev, - "pci_enable_device failed %s: error %d\n", - pci_name(pdev), result); - return result; - } - vsoc_dev.enabled_device = true; - result = pci_request_regions(pdev, "vsoc"); - if (result < 0) { - dev_err(&pdev->dev, "pci_request_regions failed\n"); - vsoc_remove_device(pdev); - return -EBUSY; - } - vsoc_dev.requested_regions = true; - /* Set up the control registers in BAR 0 */ - reg_size = pci_resource_len(pdev, REGISTER_BAR); - if (reg_size > MAX_REGISTER_BAR_LEN) - vsoc_dev.regs = - pci_iomap(pdev, REGISTER_BAR, MAX_REGISTER_BAR_LEN); - else - vsoc_dev.regs = pci_iomap(pdev, REGISTER_BAR, reg_size); - - if (!vsoc_dev.regs) { - dev_err(&pdev->dev, - "cannot map registers of size %zu\n", - (size_t)reg_size); - vsoc_remove_device(pdev); - return -EBUSY; - } - - /* Map the shared memory in BAR 2 */ - vsoc_dev.shm_phys_start = pci_resource_start(pdev, SHARED_MEMORY_BAR); - vsoc_dev.shm_size = pci_resource_len(pdev, SHARED_MEMORY_BAR); - - dev_info(&pdev->dev, "shared memory @ DMA %pa size=0x%zx\n", - &vsoc_dev.shm_phys_start, vsoc_dev.shm_size); - vsoc_dev.kernel_mapped_shm = pci_iomap_wc(pdev, SHARED_MEMORY_BAR, 0); - if (!vsoc_dev.kernel_mapped_shm) { - dev_err(&vsoc_dev.dev->dev, "cannot iomap region\n"); - vsoc_remove_device(pdev); - return -EBUSY; - } - - vsoc_dev.layout = (struct vsoc_shm_layout_descriptor __force *) - vsoc_dev.kernel_mapped_shm; - dev_info(&pdev->dev, "major_version: %d\n", - vsoc_dev.layout->major_version); - dev_info(&pdev->dev, "minor_version: %d\n", - vsoc_dev.layout->minor_version); - dev_info(&pdev->dev, "size: 0x%x\n", vsoc_dev.layout->size); - dev_info(&pdev->dev, "regions: %d\n", vsoc_dev.layout->region_count); - if (vsoc_dev.layout->major_version != - CURRENT_VSOC_LAYOUT_MAJOR_VERSION) { - dev_err(&vsoc_dev.dev->dev, - "driver supports only major_version %d\n", - CURRENT_VSOC_LAYOUT_MAJOR_VERSION); - vsoc_remove_device(pdev); - return -EBUSY; - } - result = alloc_chrdev_region(&devt, 0, vsoc_dev.layout->region_count, - VSOC_DEV_NAME); - if (result) { - dev_err(&vsoc_dev.dev->dev, "alloc_chrdev_region failed\n"); - vsoc_remove_device(pdev); - return -EBUSY; - } - vsoc_dev.major = MAJOR(devt); - cdev_init(&vsoc_dev.cdev, &vsoc_ops); - vsoc_dev.cdev.owner = THIS_MODULE; - result = cdev_add(&vsoc_dev.cdev, devt, vsoc_dev.layout->region_count); - if (result) { - dev_err(&vsoc_dev.dev->dev, "cdev_add error\n"); - vsoc_remove_device(pdev); - return -EBUSY; - } - vsoc_dev.cdev_added = true; - vsoc_dev.class = class_create(THIS_MODULE, VSOC_DEV_NAME); - if (IS_ERR(vsoc_dev.class)) { - dev_err(&vsoc_dev.dev->dev, "class_create failed\n"); - vsoc_remove_device(pdev); - return PTR_ERR(vsoc_dev.class); - } - vsoc_dev.class_added = true; - vsoc_dev.regions = (struct vsoc_device_region __force *) - ((void *)vsoc_dev.layout + - vsoc_dev.layout->vsoc_region_desc_offset); - vsoc_dev.msix_entries = - kcalloc(vsoc_dev.layout->region_count, - sizeof(vsoc_dev.msix_entries[0]), GFP_KERNEL); - if (!vsoc_dev.msix_entries) { - dev_err(&vsoc_dev.dev->dev, - "unable to allocate msix_entries\n"); - vsoc_remove_device(pdev); - return -ENOSPC; - } - vsoc_dev.regions_data = - kcalloc(vsoc_dev.layout->region_count, - sizeof(vsoc_dev.regions_data[0]), GFP_KERNEL); - if (!vsoc_dev.regions_data) { - dev_err(&vsoc_dev.dev->dev, - "unable to allocate regions' data\n"); - vsoc_remove_device(pdev); - return -ENOSPC; - } - for (i = 0; i < vsoc_dev.layout->region_count; ++i) - vsoc_dev.msix_entries[i].entry = i; - - result = pci_enable_msix_exact(vsoc_dev.dev, vsoc_dev.msix_entries, - vsoc_dev.layout->region_count); - if (result) { - dev_info(&pdev->dev, "pci_enable_msix failed: %d\n", result); - vsoc_remove_device(pdev); - return -ENOSPC; - } - /* Check that all regions are well formed */ - for (i = 0; i < vsoc_dev.layout->region_count; ++i) { - const struct vsoc_device_region *region = vsoc_dev.regions + i; - - if (!PAGE_ALIGNED(region->region_begin_offset) || - !PAGE_ALIGNED(region->region_end_offset)) { - dev_err(&vsoc_dev.dev->dev, - "region %d not aligned (%x:%x)", i, - region->region_begin_offset, - region->region_end_offset); - vsoc_remove_device(pdev); - return -EFAULT; - } - if (region->region_begin_offset >= region->region_end_offset || - region->region_end_offset > vsoc_dev.shm_size) { - dev_err(&vsoc_dev.dev->dev, - "region %d offsets are wrong: %x %x %zx", - i, region->region_begin_offset, - region->region_end_offset, vsoc_dev.shm_size); - vsoc_remove_device(pdev); - return -EFAULT; - } - if (region->managed_by >= vsoc_dev.layout->region_count) { - dev_err(&vsoc_dev.dev->dev, - "region %d has invalid owner: %u", - i, region->managed_by); - vsoc_remove_device(pdev); - return -EFAULT; - } - } - vsoc_dev.msix_enabled = true; - for (i = 0; i < vsoc_dev.layout->region_count; ++i) { - const struct vsoc_device_region *region = vsoc_dev.regions + i; - size_t name_sz = sizeof(vsoc_dev.regions_data[i].name) - 1; - const struct vsoc_signal_table_layout *h_to_g_signal_table = - ®ion->host_to_guest_signal_table; - const struct vsoc_signal_table_layout *g_to_h_signal_table = - ®ion->guest_to_host_signal_table; - - vsoc_dev.regions_data[i].name[name_sz] = '\0'; - memcpy(vsoc_dev.regions_data[i].name, region->device_name, - name_sz); - dev_info(&pdev->dev, "region %d name=%s\n", - i, vsoc_dev.regions_data[i].name); - init_waitqueue_head - (&vsoc_dev.regions_data[i].interrupt_wait_queue); - init_waitqueue_head(&vsoc_dev.regions_data[i].futex_wait_queue); - vsoc_dev.regions_data[i].incoming_signalled = - shm_off_to_virtual_addr(region->region_begin_offset) + - h_to_g_signal_table->interrupt_signalled_offset; - vsoc_dev.regions_data[i].outgoing_signalled = - shm_off_to_virtual_addr(region->region_begin_offset) + - g_to_h_signal_table->interrupt_signalled_offset; - result = request_irq(vsoc_dev.msix_entries[i].vector, - vsoc_interrupt, 0, - vsoc_dev.regions_data[i].name, - vsoc_dev.regions_data + i); - if (result) { - dev_info(&pdev->dev, - "request_irq failed irq=%d vector=%d\n", - i, vsoc_dev.msix_entries[i].vector); - vsoc_remove_device(pdev); - return -ENOSPC; - } - vsoc_dev.regions_data[i].irq_requested = true; - if (!device_create(vsoc_dev.class, NULL, - MKDEV(vsoc_dev.major, i), - NULL, vsoc_dev.regions_data[i].name)) { - dev_err(&vsoc_dev.dev->dev, "device_create failed\n"); - vsoc_remove_device(pdev); - return -EBUSY; - } - vsoc_dev.regions_data[i].device_created = true; - } - return 0; -} - -/* - * This should undo all of the allocations in the probe function in reverse - * order. - * - * Notes: - * - * The device may have been partially initialized, so double check - * that the allocations happened. - * - * This function may be called multiple times, so mark resources as freed - * as they are deallocated. - */ -static void vsoc_remove_device(struct pci_dev *pdev) -{ - int i; - /* - * pdev is the first thing to be set on probe and the last thing - * to be cleared here. If it's NULL then there is no cleanup. - */ - if (!pdev || !vsoc_dev.dev) - return; - dev_info(&pdev->dev, "remove_device\n"); - if (vsoc_dev.regions_data) { - for (i = 0; i < vsoc_dev.layout->region_count; ++i) { - if (vsoc_dev.regions_data[i].device_created) { - device_destroy(vsoc_dev.class, - MKDEV(vsoc_dev.major, i)); - vsoc_dev.regions_data[i].device_created = false; - } - if (vsoc_dev.regions_data[i].irq_requested) - free_irq(vsoc_dev.msix_entries[i].vector, NULL); - vsoc_dev.regions_data[i].irq_requested = false; - } - kfree(vsoc_dev.regions_data); - vsoc_dev.regions_data = NULL; - } - if (vsoc_dev.msix_enabled) { - pci_disable_msix(pdev); - vsoc_dev.msix_enabled = false; - } - kfree(vsoc_dev.msix_entries); - vsoc_dev.msix_entries = NULL; - vsoc_dev.regions = NULL; - if (vsoc_dev.class_added) { - class_destroy(vsoc_dev.class); - vsoc_dev.class_added = false; - } - if (vsoc_dev.cdev_added) { - cdev_del(&vsoc_dev.cdev); - vsoc_dev.cdev_added = false; - } - if (vsoc_dev.major && vsoc_dev.layout) { - unregister_chrdev_region(MKDEV(vsoc_dev.major, 0), - vsoc_dev.layout->region_count); - vsoc_dev.major = 0; - } - vsoc_dev.layout = NULL; - if (vsoc_dev.kernel_mapped_shm) { - pci_iounmap(pdev, vsoc_dev.kernel_mapped_shm); - vsoc_dev.kernel_mapped_shm = NULL; - } - if (vsoc_dev.regs) { - pci_iounmap(pdev, vsoc_dev.regs); - vsoc_dev.regs = NULL; - } - if (vsoc_dev.requested_regions) { - pci_release_regions(pdev); - vsoc_dev.requested_regions = false; - } - if (vsoc_dev.enabled_device) { - pci_disable_device(pdev); - vsoc_dev.enabled_device = false; - } - /* Do this last: it indicates that the device is not initialized. */ - vsoc_dev.dev = NULL; -} - -static void __exit vsoc_cleanup_module(void) -{ - vsoc_remove_device(vsoc_dev.dev); - pci_unregister_driver(&vsoc_pci_driver); -} - -static int __init vsoc_init_module(void) -{ - int err = -ENOMEM; - - INIT_LIST_HEAD(&vsoc_dev.permissions); - mutex_init(&vsoc_dev.mtx); - - err = pci_register_driver(&vsoc_pci_driver); - if (err < 0) - return err; - return 0; -} - -static int vsoc_open(struct inode *inode, struct file *filp) -{ - /* Can't use vsoc_validate_filep because filp is still incomplete */ - int ret = vsoc_validate_inode(inode); - - if (ret) - return ret; - filp->private_data = - kzalloc(sizeof(struct vsoc_private_data), GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - return 0; -} - -static int vsoc_release(struct inode *inode, struct file *filp) -{ - struct vsoc_private_data *private_data = NULL; - struct fd_scoped_permission_node *node = NULL; - struct vsoc_device_region *owner_region_p = NULL; - int retval = vsoc_validate_filep(filp); - - if (retval) - return retval; - private_data = (struct vsoc_private_data *)filp->private_data; - if (!private_data) - return 0; - - node = private_data->fd_scoped_permission_node; - if (node) { - owner_region_p = vsoc_region_from_inode(inode); - if (owner_region_p->managed_by != VSOC_REGION_WHOLE) { - owner_region_p = - &vsoc_dev.regions[owner_region_p->managed_by]; - } - do_destroy_fd_scoped_permission_node(owner_region_p, node); - private_data->fd_scoped_permission_node = NULL; - } - kfree(private_data); - filp->private_data = NULL; - - return 0; -} - -/* - * Returns the device relative offset and length of the area specified by the - * fd scoped permission. If there is no fd scoped permission set, a default - * permission covering the entire region is assumed, unless the region is owned - * by another one, in which case the default is a permission with zero size. - */ -static ssize_t vsoc_get_area(struct file *filp, __u32 *area_offset) -{ - __u32 off = 0; - ssize_t length = 0; - struct vsoc_device_region *region_p; - struct fd_scoped_permission *perm; - - region_p = vsoc_region_from_filep(filp); - off = region_p->region_begin_offset; - perm = &((struct vsoc_private_data *)filp->private_data)-> - fd_scoped_permission_node->permission; - if (perm) { - off += perm->begin_offset; - length = perm->end_offset - perm->begin_offset; - } else if (region_p->managed_by == VSOC_REGION_WHOLE) { - /* No permission set and the regions is not owned by another, - * default to full region access. - */ - length = vsoc_device_region_size(region_p); - } else { - /* return zero length, access is denied. */ - length = 0; - } - if (area_offset) - *area_offset = off; - return length; -} - -static int vsoc_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long len = vma->vm_end - vma->vm_start; - __u32 area_off; - phys_addr_t mem_off; - ssize_t area_len; - int retval = vsoc_validate_filep(filp); - - if (retval) - return retval; - area_len = vsoc_get_area(filp, &area_off); - /* Add the requested offset */ - area_off += (vma->vm_pgoff << PAGE_SHIFT); - area_len -= (vma->vm_pgoff << PAGE_SHIFT); - if (area_len < len) - return -EINVAL; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - mem_off = shm_off_to_phys_addr(area_off); - if (io_remap_pfn_range(vma, vma->vm_start, mem_off >> PAGE_SHIFT, - len, vma->vm_page_prot)) - return -EAGAIN; - return 0; -} - -module_init(vsoc_init_module); -module_exit(vsoc_cleanup_module); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Greg Hartman "); -MODULE_DESCRIPTION("VSoC interpretation of QEmu's ivshmem device"); -MODULE_VERSION("1.0"); -- cgit v1.2.3 From b7db58105b80fa9232719c8329b995b3addfab55 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 5 Feb 2020 15:32:17 +0300 Subject: staging: greybus: use after free in gb_audio_manager_remove_all() When we call kobject_put() and it's the last reference to the kobject then it calls gb_audio_module_release() and frees module. We dereference "module" on the next line which is a use after free. Fixes: c77f85bbc91a ("greybus: audio: Fix incorrect counting of 'ida'") Signed-off-by: Dan Carpenter Acked-by: Viresh Kumar Reviewed-by: Vaibhav Agarwal Link: https://lore.kernel.org/r/20200205123217.jreendkyxulqsool@kili.mountain Signed-off-by: Greg Kroah-Hartman --- drivers/staging/greybus/audio_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/greybus/audio_manager.c b/drivers/staging/greybus/audio_manager.c index 9b19ea9d3fa1..9a3f7c034ab4 100644 --- a/drivers/staging/greybus/audio_manager.c +++ b/drivers/staging/greybus/audio_manager.c @@ -92,8 +92,8 @@ void gb_audio_manager_remove_all(void) list_for_each_entry_safe(module, next, &modules_list, list) { list_del(&module->list); - kobject_put(&module->kobj); ida_simple_remove(&module_id, module->id); + kobject_put(&module->kobj); } is_empty = list_empty(&modules_list); -- cgit v1.2.3 From 17d25ae7f10e247bcea1f66268f746576cf9c86d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 7 Feb 2020 22:55:01 +0900 Subject: tools/bootconfig: Fix wrong __VA_ARGS__ usage Since printk() wrapper macro uses __VA_ARGS__ without "##" prefix, it causes a build error if there is no variable arguments (e.g. only fmt is specified.) To fix this error, use ##__VA_ARGS__ instead of __VAR_ARGS__. Link: http://lkml.kernel.org/r/158108370130.2758.10893830923800978011.stgit@devnote2 Fixes: 950313ebf79c ("tools: bootconfig: Add bootconfig command") Reported-by: Michael Ellerman Tested-by: Michael Ellerman Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/include/linux/printk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bootconfig/include/linux/printk.h b/tools/bootconfig/include/linux/printk.h index 017bcd6912a5..e978a63d3222 100644 --- a/tools/bootconfig/include/linux/printk.h +++ b/tools/bootconfig/include/linux/printk.h @@ -7,7 +7,7 @@ /* controllable printf */ extern int pr_output; #define printk(fmt, ...) \ - (pr_output ? printf(fmt, __VA_ARGS__) : 0) + (pr_output ? printf(fmt, ##__VA_ARGS__) : 0) #define pr_err printk #define pr_warn printk -- cgit v1.2.3 From 26445f98ead38b90423a2deffce2caa3b97a3d78 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 6 Feb 2020 20:14:53 +0900 Subject: bootconfig: Remove unneeded CONFIG_LIBXBC Since there is no user except CONFIG_BOOT_CONFIG and no plan to use it from other functions, CONFIG_LIBXBC can be removed and we can use CONFIG_BOOT_CONFIG directly. Link: http://lkml.kernel.org/r/158098769281.939.16293492056419481105.stgit@devnote2 Suggested-by: Geert Uytterhoeven Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/Kconfig | 1 - lib/Kconfig | 3 --- lib/Makefile | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 9506299a53e3..4a672c6629d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1218,7 +1218,6 @@ endif config BOOT_CONFIG bool "Boot config support" depends on BLK_DEV_INITRD - select LIBXBC default y help Extra boot config allows system admin to pass a config file as diff --git a/lib/Kconfig b/lib/Kconfig index 10012b646009..6e790dc55c5b 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -566,9 +566,6 @@ config DIMLIB config LIBFDT bool -config LIBXBC - bool - config OID_REGISTRY tristate help diff --git a/lib/Makefile b/lib/Makefile index 75a64d2552a2..74c1223828c1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -228,7 +228,7 @@ $(foreach file, $(libfdt_files), \ $(eval CFLAGS_$(file) = -I $(srctree)/scripts/dtc/libfdt)) lib-$(CONFIG_LIBFDT) += $(libfdt_files) -lib-$(CONFIG_LIBXBC) += bootconfig.o +lib-$(CONFIG_BOOT_CONFIG) += bootconfig.o obj-$(CONFIG_RBTREE_TEST) += rbtree_test.o obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o -- cgit v1.2.3 From 10f129cb59cf6b4aee419ce4b1325fc532d975fb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 5 Feb 2020 16:34:04 -0600 Subject: tracing/kprobe: Fix uninitialized variable bug There is a potential execution path in which variable *ret* is returned without being properly initialized, previously. Fix this by initializing variable *ret* to 0. Link: http://lkml.kernel.org/r/20200205223404.GA3379@embeddedor Addresses-Coverity-ID: 1491142 ("Uninitialized scalar variable") Fixes: 2a588dd1d5d6 ("tracing: Add kprobe event command generation functions") Reviewed-by: Tom Zanussi Acked-by: Masami Hiramatsu Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 51efc790aea8..21bafd48f2ac 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1012,7 +1012,7 @@ int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...) { struct dynevent_arg arg; va_list args; - int ret; + int ret = 0; if (cmd->type != DYNEVENT_TYPE_KPROBE) return -EINVAL; -- cgit v1.2.3 From f61872bb58a1cd8f0422aab1940eeee8be579d38 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Feb 2020 19:07:37 -0500 Subject: bootconfig: Use parse_args() to find bootconfig and '--' The current implementation does a naive search of "bootconfig" on the kernel command line. But this could find "bootconfig" that is part of another option in quotes (although highly unlikely). But it also needs to find '--' on the kernel command line to know if it should append a '--' or not when a bootconfig in the initrd file has an "init" section. The check uses the naive strstr() to find to see if it exists. But this can return a false positive if it exists in an option and then the "init" section in the initrd will not be appended properly. Using parse_args() to find both of these will solve both of these problems. Link: https://lore.kernel.org/r/202002070954.C18E7F58B@keescook Fixes: 7495e0926fdf3 ("bootconfig: Only load bootconfig if "bootconfig" is on the kernel cmdline") Fixes: 1319916209ce8 ("bootconfig: init: Allow admin to use bootconfig for init command line") Reported-by: Kees Cook Reviewed-by: Kees Cook Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/init/main.c b/init/main.c index 491f1cdb3105..59248717c925 100644 --- a/init/main.c +++ b/init/main.c @@ -142,6 +142,15 @@ static char *extra_command_line; /* Extra init arguments */ static char *extra_init_args; +#ifdef CONFIG_BOOT_CONFIG +/* Is bootconfig on command line? */ +static bool bootconfig_found; +static bool initargs_found; +#else +# define bootconfig_found false +# define initargs_found false +#endif + static char *execute_command; static char *ramdisk_execute_command; @@ -336,17 +345,30 @@ u32 boot_config_checksum(unsigned char *p, u32 size) return ret; } +static int __init bootconfig_params(char *param, char *val, + const char *unused, void *arg) +{ + if (strcmp(param, "bootconfig") == 0) { + bootconfig_found = true; + } else if (strcmp(param, "--") == 0) { + initargs_found = true; + } + return 0; +} + static void __init setup_boot_config(const char *cmdline) { + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; u32 size, csum; char *data, *copy; - const char *p; u32 *hdr; int ret; - p = strstr(cmdline, "bootconfig"); - if (!p || (p != cmdline && !isspace(*(p-1))) || - (p[10] && !isspace(p[10]))) + strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, + bootconfig_params); + + if (!bootconfig_found) return; if (!initrd_end) @@ -563,11 +585,12 @@ static void __init setup_command_line(char *command_line) * to init. */ len = strlen(saved_command_line); - if (!strstr(boot_command_line, " -- ")) { + if (initargs_found) { + saved_command_line[len++] = ' '; + } else { strcpy(saved_command_line + len, " -- "); len += 4; - } else - saved_command_line[len++] = ' '; + } strcpy(saved_command_line + len, extra_init_args); } -- cgit v1.2.3 From fbd1ec000213c8b457dd4fb15b6de9ba02ec5482 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Sun, 9 Feb 2020 14:42:36 -0800 Subject: Input: ili210x - fix return value of is_visible function The is_visible function expects the permissions associated with an attribute of the sysfs group or 0 if an attribute is not visible. Change the code to return the attribute permissions when the attribute should be visible which resolves the warning: Attribute calibrate: Invalid permissions 01 Fixes: cc12ba1872c6 ("Input: ili210x - optionally show calibrate sysfs attribute") Signed-off-by: Luca Weiss Reviewed-by: Sven Van Asbroeck Link: https://lore.kernel.org/r/20200209145628.649409-1-luca@z3ntu.xyz Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ili210x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c index 4a17096e83e1..84bf51d79888 100644 --- a/drivers/input/touchscreen/ili210x.c +++ b/drivers/input/touchscreen/ili210x.c @@ -321,7 +321,7 @@ static umode_t ili210x_calibrate_visible(struct kobject *kobj, struct i2c_client *client = to_i2c_client(dev); struct ili210x *priv = i2c_get_clientdata(client); - return priv->chip->has_calibrate_reg; + return priv->chip->has_calibrate_reg ? attr->mode : 0; } static const struct attribute_group ili210x_attr_group = { -- cgit v1.2.3 From d0c5e7d4f5e5b76eeb53d098157d5b1f62ebb407 Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Sun, 9 Feb 2020 14:43:30 -0800 Subject: Input: ili210x - add ili2120 support This adds support for the Ilitek ili2120 touchscreen found in the Fairphone 2 smartphone. Signed-off-by: Luca Weiss Link: https://lore.kernel.org/r/20200209151904.661210-1-luca@z3ntu.xyz Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/ilitek,ili2xxx.txt | 3 +- drivers/input/touchscreen/ili210x.c | 32 ++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/input/ilitek,ili2xxx.txt b/Documentation/devicetree/bindings/input/ilitek,ili2xxx.txt index dc194b2c151a..cdcaa3f52d25 100644 --- a/Documentation/devicetree/bindings/input/ilitek,ili2xxx.txt +++ b/Documentation/devicetree/bindings/input/ilitek,ili2xxx.txt @@ -1,9 +1,10 @@ -Ilitek ILI210x/ILI2117/ILI251x touchscreen controller +Ilitek ILI210x/ILI2117/ILI2120/ILI251x touchscreen controller Required properties: - compatible: ilitek,ili210x for ILI210x ilitek,ili2117 for ILI2117 + ilitek,ili2120 for ILI2120 ilitek,ili251x for ILI251x - reg: The I2C address of the device diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c index 84bf51d79888..199cf3daec10 100644 --- a/drivers/input/touchscreen/ili210x.c +++ b/drivers/input/touchscreen/ili210x.c @@ -167,6 +167,36 @@ static const struct ili2xxx_chip ili211x_chip = { .resolution = 2048, }; +static bool ili212x_touchdata_to_coords(const u8 *touchdata, + unsigned int finger, + unsigned int *x, unsigned int *y) +{ + u16 val; + + val = get_unaligned_be16(touchdata + 3 + (finger * 5) + 0); + if (!(val & BIT(15))) /* Touch indication */ + return false; + + *x = val & 0x3fff; + *y = get_unaligned_be16(touchdata + 3 + (finger * 5) + 2); + + return true; +} + +static bool ili212x_check_continue_polling(const u8 *data, bool touch) +{ + return touch; +} + +static const struct ili2xxx_chip ili212x_chip = { + .read_reg = ili210x_read_reg, + .get_touch_data = ili210x_read_touch_data, + .parse_touch_data = ili212x_touchdata_to_coords, + .continue_polling = ili212x_check_continue_polling, + .max_touches = 10, + .has_calibrate_reg = true, +}; + static int ili251x_read_reg(struct i2c_client *client, u8 reg, void *buf, size_t len) { @@ -447,6 +477,7 @@ static int ili210x_i2c_probe(struct i2c_client *client, static const struct i2c_device_id ili210x_i2c_id[] = { { "ili210x", (long)&ili210x_chip }, { "ili2117", (long)&ili211x_chip }, + { "ili2120", (long)&ili212x_chip }, { "ili251x", (long)&ili251x_chip }, { } }; @@ -455,6 +486,7 @@ MODULE_DEVICE_TABLE(i2c, ili210x_i2c_id); static const struct of_device_id ili210x_dt_ids[] = { { .compatible = "ilitek,ili210x", .data = &ili210x_chip }, { .compatible = "ilitek,ili2117", .data = &ili211x_chip }, + { .compatible = "ilitek,ili2120", .data = &ili212x_chip }, { .compatible = "ilitek,ili251x", .data = &ili251x_chip }, { } }; -- cgit v1.2.3 From 557d0841bc73fbd0da643b6647781bb1f790a84b Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 10 Feb 2020 09:57:23 -0800 Subject: Input: psmouse - switch to using i2c_new_scanned_device() Move from the deprecated i2c_new_probed_device() to the new i2c_new_scanned_device(). Make use of the new ERRPTR if suitable. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20200210165902.5250-1-wsa+renesas@sang-engineering.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/psmouse-smbus.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/input/mouse/psmouse-smbus.c b/drivers/input/mouse/psmouse-smbus.c index 027efdd2b2ad..a472489ccbad 100644 --- a/drivers/input/mouse/psmouse-smbus.c +++ b/drivers/input/mouse/psmouse-smbus.c @@ -190,6 +190,7 @@ static int psmouse_smbus_create_companion(struct device *dev, void *data) struct psmouse_smbus_dev *smbdev = data; unsigned short addr_list[] = { smbdev->board.addr, I2C_CLIENT_END }; struct i2c_adapter *adapter; + struct i2c_client *client; adapter = i2c_verify_adapter(dev); if (!adapter) @@ -198,12 +199,13 @@ static int psmouse_smbus_create_companion(struct device *dev, void *data) if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_HOST_NOTIFY)) return 0; - smbdev->client = i2c_new_probed_device(adapter, &smbdev->board, - addr_list, NULL); - if (!smbdev->client) + client = i2c_new_scanned_device(adapter, &smbdev->board, + addr_list, NULL); + if (IS_ERR(client)) return 0; /* We have our(?) device, stop iterating i2c bus. */ + smbdev->client = client; return 1; } -- cgit v1.2.3 From 0ca2c0319a7bce0e152b51b866979d62dc261e48 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 11 Feb 2020 00:50:17 +0800 Subject: perf/smmuv3: Use platform_get_irq_optional() for wired interrupt Even though a SMMUv3 PMCG implementation may use an MSI as the form of interrupt source, the kernel would still complain that it does not find the wired (GSIV) interrupt in this case: root@(none)$ dmesg | grep arm-smmu-v3-pmcg | grep "not found" [ 59.237219] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.8.auto: IRQ index 0 not found [ 59.322841] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.9.auto: IRQ index 0 not found [ 59.422155] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.10.auto: IRQ index 0 not found [ 59.539014] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.11.auto: IRQ index 0 not found [ 59.640329] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.12.auto: IRQ index 0 not found [ 59.743112] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.13.auto: IRQ index 0 not found [ 59.880577] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.14.auto: IRQ index 0 not found [ 60.017528] arm-smmu-v3-pmcg arm-smmu-v3-pmcg.15.auto: IRQ index 0 not found Use platform_get_irq_optional() to silence the warning. If neither interrupt source is found, then the driver will still warn that IRQ setup errored and the probe will fail. Reviewed-by: Robin Murphy Signed-off-by: John Garry Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index d704eccc548f..f01a57e5a5f3 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -771,7 +771,7 @@ static int smmu_pmu_probe(struct platform_device *pdev) smmu_pmu->reloc_base = smmu_pmu->reg_base; } - irq = platform_get_irq(pdev, 0); + irq = platform_get_irq_optional(pdev, 0); if (irq > 0) smmu_pmu->irq = irq; -- cgit v1.2.3 From 499c405b2b80bb3a04425ba3541d20305e014d3e Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 10 Feb 2020 12:02:30 -0600 Subject: staging: rtl8188eu: Fix potential security hole In routine rtw_hostapd_ioctl(), the user-controlled p->length is assumed to be at least the size of struct ieee_param size, but this assumption is never checked. This could result in out-of-bounds read/write on kernel heap in case a p->length less than the size of struct ieee_param is specified by the user. If p->length is allowed to be greater than the size of the struct, then a malicious user could be wasting kernel memory. Fixes commit a2c60d42d97c ("Add files for new driver - part 16"). Reported by: Pietro Oliva Cc: Pietro Oliva Cc: Stable Fixes: a2c60d42d97c ("staging: r8188eu: Add files for new driver - part 16") Signed-off-by: Larry Finger Link: https://lore.kernel.org/r/20200210180235.21691-2-Larry.Finger@lwfinger.net Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8188eu/os_dep/ioctl_linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c index 9b6ea86d1dcf..7d21f5799640 100644 --- a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c @@ -2796,7 +2796,7 @@ static int rtw_hostapd_ioctl(struct net_device *dev, struct iw_point *p) goto out; } - if (!p->pointer) { + if (!p->pointer || p->length != sizeof(struct ieee_param)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From ac33597c0c0d1d819dccfe001bcd0acef7107e7c Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 10 Feb 2020 12:02:31 -0600 Subject: staging: rtl8723bs: Fix potential security hole In routine rtw_hostapd_ioctl(), the user-controlled p->length is assumed to be at least the size of struct ieee_param size, but this assumption is never checked. This could result in out-of-bounds read/write on kernel heap in case a p->length less than the size of struct ieee_param is specified by the user. If p->length is allowed to be greater than the size of the struct, then a malicious user could be wasting kernel memory. Fixes commit 554c0a3abf216 ("0taging: Add rtl8723bs sdio wifi driver"). Reported by: Pietro Oliva Cc: Pietro Oliva Cc: Stable Fixes 554c0a3abf216 ("0taging: Add rtl8723bs sdio wifi driver"). Signed-off-by: Larry Finger Link: https://lore.kernel.org/r/20200210180235.21691-3-Larry.Finger@lwfinger.net Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/os_dep/ioctl_linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c index db6528a01229..3128766dd50e 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c @@ -4207,7 +4207,7 @@ static int rtw_hostapd_ioctl(struct net_device *dev, struct iw_point *p) /* if (p->length < sizeof(struct ieee_param) || !p->pointer) { */ - if (!p->pointer) { + if (!p->pointer || p->length != sizeof(*param)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From 4ddf8ab8d15ddbc52eefb44eb64e38466ce1f70f Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 10 Feb 2020 12:02:32 -0600 Subject: staging: rtl8188eu: Fix potential overuse of kernel memory In routine wpa_supplicant_ioctl(), the user-controlled p->length is checked to be at least the size of struct ieee_param size, but the code does not detect the case where p->length is greater than the size of the struct, thus a malicious user could be wasting kernel memory. Fixes commit a2c60d42d97c ("Add files for new driver - part 16"). Reported by: Pietro Oliva Cc: Pietro Oliva Cc: Stable Fixes commit a2c60d42d97c ("Add files for new driver - part 16"). Signed-off-by: Larry Finger Link: https://lore.kernel.org/r/20200210180235.21691-4-Larry.Finger@lwfinger.net Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8188eu/os_dep/ioctl_linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c index 7d21f5799640..acca3ae8b254 100644 --- a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c @@ -2009,7 +2009,7 @@ static int wpa_supplicant_ioctl(struct net_device *dev, struct iw_point *p) struct ieee_param *param; uint ret = 0; - if (p->length < sizeof(struct ieee_param) || !p->pointer) { + if (!p->pointer || p->length != sizeof(struct ieee_param)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From 23954cb078febfc63a755301fe77e06bccdb4d2a Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 10 Feb 2020 12:02:33 -0600 Subject: staging: rtl8723bs: Fix potential overuse of kernel memory In routine wpa_supplicant_ioctl(), the user-controlled p->length is checked to be at least the size of struct ieee_param size, but the code does not detect the case where p->length is greater than the size of the struct, thus a malicious user could be wasting kernel memory. Fixes commit 554c0a3abf216 ("staging: Add rtl8723bs sdio wifi driver"). Reported by: Pietro Oliva Cc: Pietro Oliva Cc: Stable Fixes: 554c0a3abf216 ("staging: Add rtl8723bs sdio wifi driver"). Signed-off-by: Larry Finger Link: https://lore.kernel.org/r/20200210180235.21691-5-Larry.Finger@lwfinger.net Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/os_dep/ioctl_linux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c index 3128766dd50e..2ac0d84f090e 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c @@ -3373,7 +3373,7 @@ static int wpa_supplicant_ioctl(struct net_device *dev, struct iw_point *p) /* down(&ieee->wx_sem); */ - if (p->length < sizeof(struct ieee_param) || !p->pointer) { + if (!p->pointer || p->length != sizeof(struct ieee_param)) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From e40c6d0f8763fe67585227d4afc97171db861b3b Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 10 Feb 2020 12:02:34 -0600 Subject: staging: rtl8188eu: Remove some unneeded goto statements In routines rtw_hostapd_ioctl() and wpa_supplicant_ioctl(), several error conditions involve setting a variable indicating the error, followed by a goto. The code following the target of that goto merely returns the value. It is simpler, therefore to return the error value immediately, and eliminate the got target. Signed-off-by: Larry Finger Cc: Pietro Oliva Link: https://lore.kernel.org/r/20200210180235.21691-6-Larry.Finger@lwfinger.net Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8188eu/os_dep/ioctl_linux.c | 40 ++++++++------------------ 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c index acca3ae8b254..ba53959e1303 100644 --- a/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8188eu/os_dep/ioctl_linux.c @@ -2009,21 +2009,16 @@ static int wpa_supplicant_ioctl(struct net_device *dev, struct iw_point *p) struct ieee_param *param; uint ret = 0; - if (!p->pointer || p->length != sizeof(struct ieee_param)) { - ret = -EINVAL; - goto out; - } + if (!p->pointer || p->length != sizeof(struct ieee_param)) + return -EINVAL; param = (struct ieee_param *)rtw_malloc(p->length); - if (!param) { - ret = -ENOMEM; - goto out; - } + if (!param) + return -ENOMEM; if (copy_from_user(param, p->pointer, p->length)) { kfree(param); - ret = -EFAULT; - goto out; + return -EFAULT; } switch (param->cmd) { @@ -2054,9 +2049,6 @@ static int wpa_supplicant_ioctl(struct net_device *dev, struct iw_point *p) ret = -EFAULT; kfree(param); - -out: - return ret; } @@ -2791,26 +2783,19 @@ static int rtw_hostapd_ioctl(struct net_device *dev, struct iw_point *p) * so, we just check hw_init_completed */ - if (!padapter->hw_init_completed) { - ret = -EPERM; - goto out; - } + if (!padapter->hw_init_completed) + return -EPERM; - if (!p->pointer || p->length != sizeof(struct ieee_param)) { - ret = -EINVAL; - goto out; - } + if (!p->pointer || p->length != sizeof(struct ieee_param)) + return -EINVAL; param = (struct ieee_param *)rtw_malloc(p->length); - if (!param) { - ret = -ENOMEM; - goto out; - } + if (!param) + return -ENOMEM; if (copy_from_user(param, p->pointer, p->length)) { kfree(param); - ret = -EFAULT; - goto out; + return -EFAULT; } switch (param->cmd) { @@ -2865,7 +2850,6 @@ static int rtw_hostapd_ioctl(struct net_device *dev, struct iw_point *p) if (ret == 0 && copy_to_user(p->pointer, param, p->length)) ret = -EFAULT; kfree(param); -out: return ret; } #endif -- cgit v1.2.3 From 9a4556bd8f23209c29f152e6a930b6a893b0fc81 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Mon, 10 Feb 2020 12:02:35 -0600 Subject: staging: rtl8723bs: Remove unneeded goto statements In routines rtw_hostapd_ioctl() and wpa_supplicant_ioctl(), several error conditions involve setting a variable indicating the error, followed by a goto. The code following the target of that goto merely returns the value. It is simpler, therefore to return the error value immediately, and eliminate the got target. Signed-off-by: Larry Finger Cc: Pietro Oliva Link: https://lore.kernel.org/r/20200210180235.21691-7-Larry.Finger@lwfinger.net Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8723bs/os_dep/ioctl_linux.c | 47 +++++++------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c index 2ac0d84f090e..9b9038e7deb1 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_linux.c @@ -3373,21 +3373,16 @@ static int wpa_supplicant_ioctl(struct net_device *dev, struct iw_point *p) /* down(&ieee->wx_sem); */ - if (!p->pointer || p->length != sizeof(struct ieee_param)) { - ret = -EINVAL; - goto out; - } + if (!p->pointer || p->length != sizeof(struct ieee_param)) + return -EINVAL; param = rtw_malloc(p->length); - if (param == NULL) { - ret = -ENOMEM; - goto out; - } + if (param == NULL) + return -ENOMEM; if (copy_from_user(param, p->pointer, p->length)) { kfree(param); - ret = -EFAULT; - goto out; + return -EFAULT; } switch (param->cmd) { @@ -3421,12 +3416,8 @@ static int wpa_supplicant_ioctl(struct net_device *dev, struct iw_point *p) kfree(param); -out: - /* up(&ieee->wx_sem); */ - return ret; - } static int rtw_set_encryption(struct net_device *dev, struct ieee_param *param, u32 param_len) @@ -4200,28 +4191,19 @@ static int rtw_hostapd_ioctl(struct net_device *dev, struct iw_point *p) * so, we just check hw_init_completed */ - if (!padapter->hw_init_completed) { - ret = -EPERM; - goto out; - } - + if (!padapter->hw_init_completed) + return -EPERM; - /* if (p->length < sizeof(struct ieee_param) || !p->pointer) { */ - if (!p->pointer || p->length != sizeof(*param)) { - ret = -EINVAL; - goto out; - } + if (!p->pointer || p->length != sizeof(*param)) + return -EINVAL; param = rtw_malloc(p->length); - if (param == NULL) { - ret = -ENOMEM; - goto out; - } + if (param == NULL) + return -ENOMEM; if (copy_from_user(param, p->pointer, p->length)) { kfree(param); - ret = -EFAULT; - goto out; + return -EFAULT; } /* DBG_871X("%s, cmd =%d\n", __func__, param->cmd); */ @@ -4321,13 +4303,8 @@ static int rtw_hostapd_ioctl(struct net_device *dev, struct iw_point *p) if (ret == 0 && copy_to_user(p->pointer, param, p->length)) ret = -EFAULT; - kfree(param); - -out: - return ret; - } static int rtw_wx_set_priv(struct net_device *dev, -- cgit v1.2.3 From 1208f9e1d758c991b0a46a1bd60c616b906bbe27 Mon Sep 17 00:00:00 2001 From: Hardik Gajjar Date: Thu, 6 Feb 2020 12:49:23 +0100 Subject: USB: hub: Fix the broken detection of USB3 device in SMSC hub Renesas R-Car H3ULCB + Kingfisher Infotainment Board is either not able to detect the USB3.0 mass storage devices or is detecting those as USB2.0 high speed devices. The explanation given by Renesas is that, due to a HW issue, the XHCI driver does not wake up after going to sleep on connecting a USB3.0 device. In order to mitigate that, disable the auto-suspend feature specifically for SMSC hubs from hub_probe() function, as a quirk. Renesas Kingfisher Infotainment Board has two USB3.0 ports (CN2) which are connected via USB5534B 4-port SuperSpeed/Hi-Speed, low-power, configurable hub controller. [1] SanDisk USB 3.0 device detected as USB-2.0 before the patch [ 74.036390] usb 5-1.1: new high-speed USB device number 4 using xhci-hcd [ 74.061598] usb 5-1.1: New USB device found, idVendor=0781, idProduct=5581, bcdDevice= 1.00 [ 74.069976] usb 5-1.1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 74.077303] usb 5-1.1: Product: Ultra [ 74.080980] usb 5-1.1: Manufacturer: SanDisk [ 74.085263] usb 5-1.1: SerialNumber: 4C530001110208116550 [2] SanDisk USB 3.0 device detected as USB-3.0 after the patch [ 34.565078] usb 6-1.1: new SuperSpeed Gen 1 USB device number 3 using xhci-hcd [ 34.588719] usb 6-1.1: New USB device found, idVendor=0781, idProduct=5581, bcdDevice= 1.00 [ 34.597098] usb 6-1.1: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 34.604430] usb 6-1.1: Product: Ultra [ 34.608110] usb 6-1.1: Manufacturer: SanDisk [ 34.612397] usb 6-1.1: SerialNumber: 4C530001110208116550 Suggested-by: Alan Stern Signed-off-by: Hardik Gajjar Acked-by: Alan Stern Tested-by: Eugeniu Rosca Cc: stable Link: https://lore.kernel.org/r/1580989763-32291-1-git-send-email-hgajjar@de.adit-jv.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 15 +++++++++++++++ drivers/usb/core/hub.h | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 3405b146edc9..de94fa4a4ca7 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -38,7 +38,9 @@ #include "otg_whitelist.h" #define USB_VENDOR_GENESYS_LOGIC 0x05e3 +#define USB_VENDOR_SMSC 0x0424 #define HUB_QUIRK_CHECK_PORT_AUTOSUSPEND 0x01 +#define HUB_QUIRK_DISABLE_AUTOSUSPEND 0x02 #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ @@ -1731,6 +1733,10 @@ static void hub_disconnect(struct usb_interface *intf) kfree(hub->buffer); pm_suspend_ignore_children(&intf->dev, false); + + if (hub->quirk_disable_autosuspend) + usb_autopm_put_interface(intf); + kref_put(&hub->kref, hub_release); } @@ -1863,6 +1869,11 @@ static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id) if (id->driver_info & HUB_QUIRK_CHECK_PORT_AUTOSUSPEND) hub->quirk_check_port_auto_suspend = 1; + if (id->driver_info & HUB_QUIRK_DISABLE_AUTOSUSPEND) { + hub->quirk_disable_autosuspend = 1; + usb_autopm_get_interface(intf); + } + if (hub_configure(hub, &desc->endpoint[0].desc) >= 0) return 0; @@ -5599,6 +5610,10 @@ out_hdev_lock: } static const struct usb_device_id hub_id_table[] = { + { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_INT_CLASS, + .idVendor = USB_VENDOR_SMSC, + .bInterfaceClass = USB_CLASS_HUB, + .driver_info = HUB_QUIRK_DISABLE_AUTOSUSPEND}, { .match_flags = USB_DEVICE_ID_MATCH_VENDOR | USB_DEVICE_ID_MATCH_INT_CLASS, .idVendor = USB_VENDOR_GENESYS_LOGIC, diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h index a9e24e4b8df1..a97dd1ba964e 100644 --- a/drivers/usb/core/hub.h +++ b/drivers/usb/core/hub.h @@ -61,6 +61,7 @@ struct usb_hub { unsigned quiescing:1; unsigned disconnected:1; unsigned in_reset:1; + unsigned quirk_disable_autosuspend:1; unsigned quirk_check_port_auto_suspend:1; -- cgit v1.2.3 From dddb40e83038ec72533b1b5721831caf90224a09 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 Jan 2020 16:29:56 +0200 Subject: MAINTAINERS: Sort entries in database for USB TYPEC Run parse-maintainers.pl and choose USB TYPEC records. Fix them accordingly. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200128142956.39604-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 38fe2f3f7b6f..1277cf33d413 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17392,11 +17392,14 @@ F: drivers/usb/ F: include/linux/usb.h F: include/linux/usb/ -USB TYPEC PI3USB30532 MUX DRIVER -M: Hans de Goede +USB TYPEC BUS FOR ALTERNATE MODES +M: Heikki Krogerus L: linux-usb@vger.kernel.org S: Maintained -F: drivers/usb/typec/mux/pi3usb30532.c +F: Documentation/ABI/testing/sysfs-bus-typec +F: Documentation/driver-api/usb/typec_bus.rst +F: drivers/usb/typec/altmodes/ +F: include/linux/usb/typec_altmode.h USB TYPEC CLASS M: Heikki Krogerus @@ -17407,14 +17410,11 @@ F: Documentation/driver-api/usb/typec.rst F: drivers/usb/typec/ F: include/linux/usb/typec.h -USB TYPEC BUS FOR ALTERNATE MODES -M: Heikki Krogerus +USB TYPEC PI3USB30532 MUX DRIVER +M: Hans de Goede L: linux-usb@vger.kernel.org S: Maintained -F: Documentation/ABI/testing/sysfs-bus-typec -F: Documentation/driver-api/usb/typec_bus.rst -F: drivers/usb/typec/altmodes/ -F: include/linux/usb/typec_altmode.h +F: drivers/usb/typec/mux/pi3usb30532.c USB TYPEC PORT CONTROLLER DRIVERS M: Guenter Roeck -- cgit v1.2.3 From 3e99862c05a9caa5a27969f41566b428696f5a9a Mon Sep 17 00:00:00 2001 From: EJ Hsu Date: Thu, 30 Jan 2020 01:25:06 -0800 Subject: usb: uas: fix a plug & unplug racing When a uas disk is plugged into an external hub, uas_probe() will be called by the hub thread to do the probe. It will first create a SCSI host and then do the scan for this host. During the scan, it will probe the LUN using SCSI INQUERY command which will be packed in the URB and submitted to uas disk. There might be a chance that this external hub with uas disk attached is unplugged during the scan. In this case, uas driver will fail to submit the URB (due to the NOTATTACHED state of uas device) and try to put this SCSI command back to request queue waiting for next chance to run. In normal case, this cycle will terminate when hub thread gets disconnection event and calls into uas_disconnect() accordingly. But in this case, uas_disconnect() will not be called because hub thread of external hub gets stuck waiting for the completion of this SCSI command. A deadlock happened. In this fix, uas will call scsi_scan_host() asynchronously to avoid the blocking of hub thread. Signed-off-by: EJ Hsu Acked-by: Oliver Neukum Cc: stable Link: https://lore.kernel.org/r/20200130092506.102760-1-ejh@nvidia.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/uas.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index 95bba3ba6ac6..3670fda02c34 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -45,6 +45,7 @@ struct uas_dev_info { struct scsi_cmnd *cmnd[MAX_CMNDS]; spinlock_t lock; struct work_struct work; + struct work_struct scan_work; /* for async scanning */ }; enum { @@ -114,6 +115,17 @@ out: spin_unlock_irqrestore(&devinfo->lock, flags); } +static void uas_scan_work(struct work_struct *work) +{ + struct uas_dev_info *devinfo = + container_of(work, struct uas_dev_info, scan_work); + struct Scsi_Host *shost = usb_get_intfdata(devinfo->intf); + + dev_dbg(&devinfo->intf->dev, "starting scan\n"); + scsi_scan_host(shost); + dev_dbg(&devinfo->intf->dev, "scan complete\n"); +} + static void uas_add_work(struct uas_cmd_info *cmdinfo) { struct scsi_pointer *scp = (void *)cmdinfo; @@ -982,6 +994,7 @@ static int uas_probe(struct usb_interface *intf, const struct usb_device_id *id) init_usb_anchor(&devinfo->data_urbs); spin_lock_init(&devinfo->lock); INIT_WORK(&devinfo->work, uas_do_work); + INIT_WORK(&devinfo->scan_work, uas_scan_work); result = uas_configure_endpoints(devinfo); if (result) @@ -998,7 +1011,9 @@ static int uas_probe(struct usb_interface *intf, const struct usb_device_id *id) if (result) goto free_streams; - scsi_scan_host(shost); + /* Submit the delayed_work for SCSI-device scanning */ + schedule_work(&devinfo->scan_work); + return result; free_streams: @@ -1166,6 +1181,12 @@ static void uas_disconnect(struct usb_interface *intf) usb_kill_anchored_urbs(&devinfo->data_urbs); uas_zap_pending(devinfo, DID_NO_CONNECT); + /* + * Prevent SCSI scanning (if it hasn't started yet) + * or wait for the SCSI-scanning routine to stop. + */ + cancel_work_sync(&devinfo->scan_work); + scsi_remove_host(shost); uas_free_streams(devinfo); scsi_host_put(shost); -- cgit v1.2.3 From ca4b43c14cd88d28cfc6467d2fa075aad6818f1d Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Sat, 1 Feb 2020 14:13:44 +0800 Subject: usb: charger: assign specific number for enum value To work properly on every architectures and compilers, the enum value needs to be specific numbers. Suggested-by: Greg KH Signed-off-by: Peter Chen Link: https://lore.kernel.org/r/1580537624-10179-1-git-send-email-peter.chen@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/charger.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/usb/charger.h b/include/uapi/linux/usb/charger.h index 5f72af35b3ed..ad22079125bf 100644 --- a/include/uapi/linux/usb/charger.h +++ b/include/uapi/linux/usb/charger.h @@ -14,18 +14,18 @@ * ACA (Accessory Charger Adapters) */ enum usb_charger_type { - UNKNOWN_TYPE, - SDP_TYPE, - DCP_TYPE, - CDP_TYPE, - ACA_TYPE, + UNKNOWN_TYPE = 0, + SDP_TYPE = 1, + DCP_TYPE = 2, + CDP_TYPE = 3, + ACA_TYPE = 4, }; /* USB charger state */ enum usb_charger_state { - USB_CHARGER_DEFAULT, - USB_CHARGER_PRESENT, - USB_CHARGER_ABSENT, + USB_CHARGER_DEFAULT = 0, + USB_CHARGER_PRESENT = 1, + USB_CHARGER_ABSENT = 2, }; #endif /* _UAPI__LINUX_USB_CHARGER_H */ -- cgit v1.2.3 From 8099f58f1ecddf4f374f4828a3dff8397c7cbd74 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 31 Jan 2020 10:39:26 -0500 Subject: USB: hub: Don't record a connect-change event during reset-resume Paul Zimmerman reports that his USB Bluetooth adapter sometimes crashes following system resume, when it receives a Get-Device-Descriptor request while it is busy doing something else. Such a request was added by commit a4f55d8b8c14 ("usb: hub: Check device descriptor before resusciation"). It gets sent when the hub driver's work thread checks whether a connect-change event on an enabled port really indicates a new device has been connected, as opposed to an old device momentarily disconnecting and then reconnecting (which can happen with xHCI host controllers, since they automatically enable connected ports). The same kind of thing occurs when a port's power session is lost during system suspend. When the system wakes up it sees a connect-change event on the port, and if the child device's persist_enabled flag was set then hub_activate() sets the device's reset_resume flag as well as the port's bit in hub->change_bits. The reset-resume code then takes responsibility for checking that the same device is still attached to the port, and it does this as part of the device's resume pathway. By the time the hub driver's work thread starts up again, the device has already been fully reinitialized and is busy doing its own thing. There's no need for the work thread to do the same check a second time, and in fact this unnecessary check is what caused the problem that Paul observed. Note that performing the unnecessary check is not actually a bug. Devices are supposed to be able to send descriptors back to the host even when they are busy doing something else. The underlying cause of Paul's problem lies in his Bluetooth adapter. Nevertheless, we shouldn't perform the same check twice in a row -- and as a nice side benefit, removing the extra check allows the Bluetooth adapter to work more reliably. The work thread performs its check when it sees that the port's bit is set in hub->change_bits. In this situation that bit is interpreted as though a connect-change event had occurred on the port _after_ the reset-resume, which is not what actually happened. One possible fix would be to make the reset-resume code clear the port's bit in hub->change_bits. But it seems simpler to just avoid setting the bit during hub_activate() in the first place. That's what this patch does. (Proving that the patch is correct when CONFIG_PM is disabled requires a little thought. In that setting hub_activate() will be called only for initialization and resets, since there won't be any resumes or reset-resumes. During initialization and hub resets the hub doesn't have any child devices, and so this code path never gets executed.) Reported-and-tested-by: Paul Zimmerman Signed-off-by: Alan Stern Link: https://marc.info/?t=157949360700001&r=1&w=2 CC: David Heinzelmann CC: Link: https://lore.kernel.org/r/Pine.LNX.4.44L0.2001311037460.1577-100000@iolanthe.rowland.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index de94fa4a4ca7..1d212f82c69b 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1219,11 +1219,6 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) #ifdef CONFIG_PM udev->reset_resume = 1; #endif - /* Don't set the change_bits when the device - * was powered off. - */ - if (test_bit(port1, hub->power_bits)) - set_bit(port1, hub->change_bits); } else { /* The power session is gone; tell hub_wq */ -- cgit v1.2.3 From a4a601948fc8d0a9661b83df03f5ee11e903efe6 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 3 Feb 2020 01:42:59 +0300 Subject: usb: phy: tegra: Add clarifying comments about the shared registers Tools like Coccinelle may erroneously recommend to use the devm_platform_ioremap_resource() API for the registers mapping because these tools are not aware about the implementation details of the driver. Let's add a clarifying comments to the code, which should help to stop future attempts to break the driver. Signed-off-by: Dmitry Osipenko Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20200202224259.29187-1-digetx@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/phy/phy-tegra-usb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/usb/phy/phy-tegra-usb.c b/drivers/usb/phy/phy-tegra-usb.c index 037e8eee737d..6153cc35aba0 100644 --- a/drivers/usb/phy/phy-tegra-usb.c +++ b/drivers/usb/phy/phy-tegra-usb.c @@ -969,6 +969,10 @@ static int utmi_phy_probe(struct tegra_usb_phy *tegra_phy, return -ENXIO; } + /* + * Note that UTMI pad registers are shared by all PHYs, therefore + * devm_platform_ioremap_resource() can't be used here. + */ tegra_phy->pad_regs = devm_ioremap(&pdev->dev, res->start, resource_size(res)); if (!tegra_phy->pad_regs) { @@ -1087,6 +1091,10 @@ static int tegra_usb_phy_probe(struct platform_device *pdev) return -ENXIO; } + /* + * Note that PHY and USB controller are using shared registers, + * therefore devm_platform_ioremap_resource() can't be used here. + */ tegra_phy->regs = devm_ioremap(&pdev->dev, res->start, resource_size(res)); if (!tegra_phy->regs) { -- cgit v1.2.3 From b32196e35bd7bbc8038db1aba1fbf022dc469b6a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Feb 2020 09:51:39 +0000 Subject: usb: dwc3: debug: fix string position formatting mixup with ret and len Currently the string formatting is mixing up the offset of ret and len. Re-work the code to use just len, remove ret and use scnprintf instead of snprintf and len position accumulation where required. Remove the -ve return check since scnprintf never returns a failure -ve size. Also break overly long lines to clean up checkpatch warnings. Addresses-Coverity: ("Unused value") Fixes: 1381a5113caf ("usb: dwc3: debug: purge usage of strcat") Signed-off-by: Colin Ian King Reviewed-by: Dan Carpenter Cc: stable Link: https://lore.kernel.org/r/20200210095139.328711-1-colin.king@canonical.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/debug.h | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h index e56beb9d1e36..4a13ceaf4093 100644 --- a/drivers/usb/dwc3/debug.h +++ b/drivers/usb/dwc3/debug.h @@ -256,86 +256,77 @@ static inline const char *dwc3_ep_event_string(char *str, size_t size, u8 epnum = event->endpoint_number; size_t len; int status; - int ret; - ret = snprintf(str, size, "ep%d%s: ", epnum >> 1, + len = scnprintf(str, size, "ep%d%s: ", epnum >> 1, (epnum & 1) ? "in" : "out"); - if (ret < 0) - return "UNKNOWN"; status = event->status; switch (event->endpoint_event) { case DWC3_DEPEVT_XFERCOMPLETE: - len = strlen(str); - snprintf(str + len, size - len, "Transfer Complete (%c%c%c)", + len += scnprintf(str + len, size - len, + "Transfer Complete (%c%c%c)", status & DEPEVT_STATUS_SHORT ? 'S' : 's', status & DEPEVT_STATUS_IOC ? 'I' : 'i', status & DEPEVT_STATUS_LST ? 'L' : 'l'); - len = strlen(str); - if (epnum <= 1) - snprintf(str + len, size - len, " [%s]", + scnprintf(str + len, size - len, " [%s]", dwc3_ep0_state_string(ep0state)); break; case DWC3_DEPEVT_XFERINPROGRESS: - len = strlen(str); - - snprintf(str + len, size - len, "Transfer In Progress [%d] (%c%c%c)", + scnprintf(str + len, size - len, + "Transfer In Progress [%d] (%c%c%c)", event->parameters, status & DEPEVT_STATUS_SHORT ? 'S' : 's', status & DEPEVT_STATUS_IOC ? 'I' : 'i', status & DEPEVT_STATUS_LST ? 'M' : 'm'); break; case DWC3_DEPEVT_XFERNOTREADY: - len = strlen(str); - - snprintf(str + len, size - len, "Transfer Not Ready [%d]%s", + len += scnprintf(str + len, size - len, + "Transfer Not Ready [%d]%s", event->parameters, status & DEPEVT_STATUS_TRANSFER_ACTIVE ? " (Active)" : " (Not Active)"); - len = strlen(str); - /* Control Endpoints */ if (epnum <= 1) { int phase = DEPEVT_STATUS_CONTROL_PHASE(event->status); switch (phase) { case DEPEVT_STATUS_CONTROL_DATA: - snprintf(str + ret, size - ret, + scnprintf(str + len, size - len, " [Data Phase]"); break; case DEPEVT_STATUS_CONTROL_STATUS: - snprintf(str + ret, size - ret, + scnprintf(str + len, size - len, " [Status Phase]"); } } break; case DWC3_DEPEVT_RXTXFIFOEVT: - snprintf(str + ret, size - ret, "FIFO"); + scnprintf(str + len, size - len, "FIFO"); break; case DWC3_DEPEVT_STREAMEVT: status = event->status; switch (status) { case DEPEVT_STREAMEVT_FOUND: - snprintf(str + ret, size - ret, " Stream %d Found", + scnprintf(str + len, size - len, " Stream %d Found", event->parameters); break; case DEPEVT_STREAMEVT_NOTFOUND: default: - snprintf(str + ret, size - ret, " Stream Not Found"); + scnprintf(str + len, size - len, " Stream Not Found"); break; } break; case DWC3_DEPEVT_EPCMDCMPLT: - snprintf(str + ret, size - ret, "Endpoint Command Complete"); + scnprintf(str + len, size - len, "Endpoint Command Complete"); break; default: - snprintf(str, size, "UNKNOWN"); + scnprintf(str + len, size - len, "UNKNOWN"); } return str; -- cgit v1.2.3 From 73f8bda9b5dc1c69df2bc55c0cbb24461a6391a9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 3 Feb 2020 16:38:28 +0100 Subject: USB: core: add endpoint-blacklist quirk Add a new device quirk that can be used to blacklist endpoints. Since commit 3e4f8e21c4f2 ("USB: core: fix check for duplicate endpoints") USB core ignores any duplicate endpoints found during descriptor parsing. In order to handle devices where the first interfaces with duplicate endpoints are the ones that should have their endpoints ignored, we need to add a blacklist. Tested-by: edes Cc: stable Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20200203153830.26394-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 11 +++++++++++ drivers/usb/core/quirks.c | 32 ++++++++++++++++++++++++++++++++ drivers/usb/core/usb.h | 3 +++ include/linux/usb/quirks.h | 3 +++ 4 files changed, 49 insertions(+) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 26bc05e48d8a..7df22bcefa9d 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -256,6 +256,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, struct usb_host_interface *ifp, int num_ep, unsigned char *buffer, int size) { + struct usb_device *udev = to_usb_device(ddev); unsigned char *buffer0 = buffer; struct usb_endpoint_descriptor *d; struct usb_host_endpoint *endpoint; @@ -297,6 +298,16 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, goto skip_to_next_endpoint_or_interface_descriptor; } + /* Ignore blacklisted endpoints */ + if (udev->quirks & USB_QUIRK_ENDPOINT_BLACKLIST) { + if (usb_endpoint_is_blacklisted(udev, ifp, d)) { + dev_warn(ddev, "config %d interface %d altsetting %d has a blacklisted endpoint with address 0x%X, skipping\n", + cfgno, inum, asnum, + d->bEndpointAddress); + goto skip_to_next_endpoint_or_interface_descriptor; + } + } + endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints]; ++ifp->desc.bNumEndpoints; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 6b6413073584..56c8dffaf5f5 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -472,6 +472,38 @@ static const struct usb_device_id usb_amd_resume_quirk_list[] = { { } /* terminating entry must be last */ }; +/* + * Entries for blacklisted endpoints that should be ignored when parsing + * configuration descriptors. + * + * Matched for devices with USB_QUIRK_ENDPOINT_BLACKLIST. + */ +static const struct usb_device_id usb_endpoint_blacklist[] = { + { } +}; + +bool usb_endpoint_is_blacklisted(struct usb_device *udev, + struct usb_host_interface *intf, + struct usb_endpoint_descriptor *epd) +{ + const struct usb_device_id *id; + unsigned int address; + + for (id = usb_endpoint_blacklist; id->match_flags; ++id) { + if (!usb_match_device(udev, id)) + continue; + + if (!usb_match_one_id_intf(udev, intf, id)) + continue; + + address = id->driver_info; + if (address == epd->bEndpointAddress) + return true; + } + + return false; +} + static bool usb_match_any_interface(struct usb_device *udev, const struct usb_device_id *id) { diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index cf4783cf661a..3ad0ee57e859 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -37,6 +37,9 @@ extern void usb_authorize_interface(struct usb_interface *); extern void usb_detect_quirks(struct usb_device *udev); extern void usb_detect_interface_quirks(struct usb_device *udev); extern void usb_release_quirk_list(void); +extern bool usb_endpoint_is_blacklisted(struct usb_device *udev, + struct usb_host_interface *intf, + struct usb_endpoint_descriptor *epd); extern int usb_remove_device(struct usb_device *udev); extern int usb_get_device_descriptor(struct usb_device *dev, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index a1be64c9940f..22c1f579afe3 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -69,4 +69,7 @@ /* Hub needs extra delay after resetting its port. */ #define USB_QUIRK_HUB_SLOW_RESET BIT(14) +/* device has blacklisted endpoints */ +#define USB_QUIRK_ENDPOINT_BLACKLIST BIT(15) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From bdd1b147b8026df0e4260b387026b251d888ed01 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 3 Feb 2020 16:38:29 +0100 Subject: USB: quirks: blacklist duplicate ep on Sound Devices USBPre2 This device has a broken vendor-specific altsetting for interface 1, where endpoint 0x85 is declared as an isochronous endpoint despite being used by interface 2 for audio capture. Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 239 Miscellaneous Device bDeviceSubClass 2 bDeviceProtocol 1 Interface Association bMaxPacketSize0 64 idVendor 0x0926 idProduct 0x0202 bcdDevice 1.00 iManufacturer 1 Sound Devices iProduct 2 USBPre2 iSerial 3 [...] bNumConfigurations 1 [...] Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 3 bNumEndpoints 2 bInterfaceClass 255 Vendor Specific Class bInterfaceSubClass 0 bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x85 EP 5 IN bmAttributes 5 Transfer Type Isochronous Synch Type Asynchronous Usage Type Data wMaxPacketSize 0x0126 1x 294 bytes bInterval 1 [...] Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 2 bAlternateSetting 1 bNumEndpoints 1 bInterfaceClass 1 Audio bInterfaceSubClass 2 Streaming bInterfaceProtocol 0 iInterface 0 AudioStreaming Interface Descriptor: bLength 7 bDescriptorType 36 bDescriptorSubtype 1 (AS_GENERAL) bTerminalLink 4 bDelay 1 frames wFormatTag 0x0001 PCM AudioStreaming Interface Descriptor: bLength 26 bDescriptorType 36 bDescriptorSubtype 2 (FORMAT_TYPE) bFormatType 1 (FORMAT_TYPE_I) bNrChannels 2 bSubframeSize 2 bBitResolution 16 bSamFreqType 6 Discrete tSamFreq[ 0] 8000 tSamFreq[ 1] 16000 tSamFreq[ 2] 24000 tSamFreq[ 3] 32000 tSamFreq[ 4] 44100 tSamFreq[ 5] 48000 Endpoint Descriptor: bLength 9 bDescriptorType 5 bEndpointAddress 0x85 EP 5 IN bmAttributes 5 Transfer Type Isochronous Synch Type Asynchronous Usage Type Data wMaxPacketSize 0x0126 1x 294 bytes bInterval 4 bRefresh 0 bSynchAddress 0 AudioStreaming Endpoint Descriptor: bLength 7 bDescriptorType 37 bDescriptorSubtype 1 (EP_GENERAL) bmAttributes 0x01 Sampling Frequency bLockDelayUnits 2 Decoded PCM samples wLockDelay 0x0000 Since commit 3e4f8e21c4f2 ("USB: core: fix check for duplicate endpoints") USB core ignores any duplicate endpoints found during descriptor parsing, but in this case we need to ignore the first instance in order to avoid breaking the audio capture interface. Fixes: 3e4f8e21c4f2 ("USB: core: fix check for duplicate endpoints") Cc: stable Reported-by: edes Tested-by: edes Link: https://lore.kernel.org/r/20200201105829.5682c887@acme7.acmenet Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20200203153830.26394-3-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 56c8dffaf5f5..f27468966a3d 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -354,6 +354,10 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0904, 0x6103), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, + /* Sound Devices USBPre2 */ + { USB_DEVICE(0x0926, 0x0202), .driver_info = + USB_QUIRK_ENDPOINT_BLACKLIST }, + /* Keytouch QWERTY Panel keyboard */ { USB_DEVICE(0x0926, 0x3333), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, @@ -479,6 +483,7 @@ static const struct usb_device_id usb_amd_resume_quirk_list[] = { * Matched for devices with USB_QUIRK_ENDPOINT_BLACKLIST. */ static const struct usb_device_id usb_endpoint_blacklist[] = { + { USB_DEVICE_INTERFACE_NUMBER(0x0926, 0x0202, 1), .driver_info = 0x85 }, { } }; -- cgit v1.2.3 From 7f1b92a6a7f2b96a8647a488370b9a851433df77 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 3 Feb 2020 16:38:30 +0100 Subject: USB: core: clean up endpoint-descriptor parsing Use the new usb-device pointer instead of back-casting when accessing the struct usb_device when parsing endpoints. Note that this introduces two lines that are longer than 80 chars on purpose. Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20200203153830.26394-4-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 7df22bcefa9d..b7918f695434 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -322,7 +322,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, j = 255; if (usb_endpoint_xfer_int(d)) { i = 1; - switch (to_usb_device(ddev)->speed) { + switch (udev->speed) { case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: case USB_SPEED_HIGH: @@ -343,8 +343,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, /* * This quirk fixes bIntervals reported in ms. */ - if (to_usb_device(ddev)->quirks & - USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL) { + if (udev->quirks & USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL) { n = clamp(fls(d->bInterval) + 3, i, j); i = j = n; } @@ -352,8 +351,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, * This quirk fixes bIntervals reported in * linear microframes. */ - if (to_usb_device(ddev)->quirks & - USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL) { + if (udev->quirks & USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL) { n = clamp(fls(d->bInterval), i, j); i = j = n; } @@ -370,7 +368,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, } else if (usb_endpoint_xfer_isoc(d)) { i = 1; j = 16; - switch (to_usb_device(ddev)->speed) { + switch (udev->speed) { case USB_SPEED_HIGH: n = 7; /* 8 ms = 2^(7-1) uframes */ break; @@ -392,8 +390,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, * explicitly forbidden by the USB spec. In an attempt to make * them usable, we will try treating them as Interrupt endpoints. */ - if (to_usb_device(ddev)->speed == USB_SPEED_LOW && - usb_endpoint_xfer_bulk(d)) { + if (udev->speed == USB_SPEED_LOW && usb_endpoint_xfer_bulk(d)) { dev_warn(ddev, "config %d interface %d altsetting %d " "endpoint 0x%X is Bulk; changing to Interrupt\n", cfgno, inum, asnum, d->bEndpointAddress); @@ -417,7 +414,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, /* Find the highest legal maxpacket size for this endpoint */ i = 0; /* additional transactions per microframe */ - switch (to_usb_device(ddev)->speed) { + switch (udev->speed) { case USB_SPEED_LOW: maxpacket_maxes = low_speed_maxpacket_maxes; break; @@ -453,8 +450,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, * maxpacket sizes other than 512. High speed HCDs may not * be able to handle that particular bug, so let's warn... */ - if (to_usb_device(ddev)->speed == USB_SPEED_HIGH - && usb_endpoint_xfer_bulk(d)) { + if (udev->speed == USB_SPEED_HIGH && usb_endpoint_xfer_bulk(d)) { if (maxp != 512) dev_warn(ddev, "config %d interface %d altsetting %d " "bulk endpoint 0x%X has invalid maxpacket %d\n", @@ -463,7 +459,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, } /* Parse a possible SuperSpeed endpoint companion descriptor */ - if (to_usb_device(ddev)->speed >= USB_SPEED_SUPER) + if (udev->speed >= USB_SPEED_SUPER) usb_parse_ss_endpoint_companion(ddev, cfgno, inum, asnum, endpoint, buffer, size); -- cgit v1.2.3 From 80cc7bb6c104d733bff60ddda09f19139c61507c Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Fri, 7 Feb 2020 17:06:11 -0600 Subject: perf stat: Don't report a null stalled cycles per insn metric For data collected on machines with front end stalled cycles supported, such as found on modern AMD CPU families, commit 146540fb545b ("perf stat: Always separate stalled cycles per insn") introduces a new line in CSV output with a leading comma that upsets some automated scripts. Scripts have to use "-e ex_ret_instr" to work around this issue, after upgrading to a version of perf with that commit. We could add "if (have_frontend_stalled && !config->csv_sep)" to the not (total && avg) else clause, to emphasize that CSV users are usually scripts, and are written to do only what is needed, i.e., they wouldn't typically invoke "perf stat" without specifying an explicit event list. But - let alone CSV output - why should users now tolerate a constant 0-reporting extra line in regular terminal output?: BEFORE: $ sudo perf stat --all-cpus -einstructions,cycles -- sleep 1 Performance counter stats for 'system wide': 181,110,981 instructions # 0.58 insn per cycle # 0.00 stalled cycles per insn 309,876,469 cycles 1.002202582 seconds time elapsed The user would not like to see the now permanent: "0.00 stalled cycles per insn" line fixture, as it gives no useful information. So this patch removes the printing of the zeroed stalled cycles line altogether, almost reverting the very original commit fb4605ba47e7 ("perf stat: Check for frontend stalled for metrics"), which seems like it was written to normalize --metric-only column output of common Intel machines at the time: modern Intel machines have ceased to support the genericised frontend stalled metrics AFAICT. AFTER: $ sudo perf stat --all-cpus -einstructions,cycles -- sleep 1 Performance counter stats for 'system wide': 244,071,432 instructions # 0.69 insn per cycle 355,353,490 cycles 1.001862516 seconds time elapsed Output behaviour when stalled cycles is indeed measured is not affected (BEFORE == AFTER): $ sudo perf stat --all-cpus -einstructions,cycles,stalled-cycles-frontend -- sleep 1 Performance counter stats for 'system wide': 247,227,799 instructions # 0.63 insn per cycle # 0.26 stalled cycles per insn 394,745,636 cycles 63,194,485 stalled-cycles-frontend # 16.01% frontend cycles idle 1.002079770 seconds time elapsed Fixes: 146540fb545b ("perf stat: Always separate stalled cycles per insn") Signed-off-by: Kim Phillips Acked-by: Andi Kleen Acked-by: Jiri Olsa Acked-by: Song Liu Cc: Alexander Shishkin Cc: Cong Wang Cc: Davidlohr Bueso Cc: Jin Yao Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200207230613.26709-1-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 2c41d47f6f83..90d23cc3c8d4 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -18,7 +18,6 @@ * AGGR_NONE: Use matching CPU * AGGR_THREAD: Not supported? */ -static bool have_frontend_stalled; struct runtime_stat rt_stat; struct stats walltime_nsecs_stats; @@ -144,7 +143,6 @@ void runtime_stat__exit(struct runtime_stat *st) void perf_stat__init_shadow_stats(void) { - have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend"); runtime_stat__init(&rt_stat); } @@ -853,10 +851,6 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, "%7.2f ", "stalled cycles per insn", ratio); - } else if (have_frontend_stalled) { - out->new_line(config, ctxp); - print_metric(config, ctxp, NULL, "%7.2f ", - "stalled cycles per insn", 0); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0) -- cgit v1.2.3 From 0e71459afcbbf69e92a65085c45515d3f3f02c31 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Fri, 7 Feb 2020 17:06:12 -0600 Subject: perf symbols: Update the list of kernel idle symbols The "acpi_idle_do_entry", "acpi_processor_ffh_cstate_enter", and "idle_cpu" symbols appear in 'perf top' output, at least on AMD systems. Add them to perf's idle_symbols list, so they don't dominate 'perf top' output. Signed-off-by: Kim Phillips Acked-by: Jiri Olsa Acked-by: Song Liu Cc: Alexander Shishkin Cc: Andi Kleen Cc: Cong Wang Cc: Davidlohr Bueso Cc: Jin Yao Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200207230613.26709-2-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 3b379b1296f1..f3120c4f47ad 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -635,9 +635,12 @@ out: static bool symbol__is_idle(const char *name) { const char * const idle_symbols[] = { + "acpi_idle_do_entry", + "acpi_processor_ffh_cstate_enter", "arch_cpu_idle", "cpu_idle", "cpu_startup_entry", + "idle_cpu", "intel_idle", "default_idle", "native_safe_halt", -- cgit v1.2.3 From bc5f15be2c814ca1ff6bb4e62d5b275a8c88cbb1 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Mon, 10 Feb 2020 10:31:47 -0600 Subject: perf symbols: Convert symbol__is_idle() to use strlist Use the more optimized strlist implementation to do the idle function lookup. Signed-off-by: Kim Phillips Acked-by: Song Liu Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: Cong Wang Cc: Davidlohr Bueso Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200210163147.25358-1-kim.phillips@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index f3120c4f47ad..1077013d8ce2 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -654,13 +654,17 @@ static bool symbol__is_idle(const char *name) NULL }; int i; + static struct strlist *idle_symbols_list; - for (i = 0; idle_symbols[i]; i++) { - if (!strcmp(idle_symbols[i], name)) - return true; - } + if (idle_symbols_list) + return strlist__has_entry(idle_symbols_list, name); - return false; + idle_symbols_list = strlist__new(NULL, NULL); + + for (i = 0; idle_symbols[i]; i++) + strlist__add(idle_symbols_list, idle_symbols[i]); + + return strlist__has_entry(idle_symbols_list, name); } static int map__process_kallsym_symbol(void *arg, const char *name, -- cgit v1.2.3 From 9d1b38958b077f6c8d4bd196a115b643d7bd6717 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 11 Feb 2020 01:18:52 +0900 Subject: scripts/kallsyms: fix memory corruption caused by write over-run memcpy() writes one more byte than allocated. Fixes: 8d60526999aa ("scripts/kallsyms: change table to store (strcut sym_entry *)") Reported-by: youling257 Reported-by: Pavel Machek Signed-off-by: Masahiro Yamada Tested-by: Pavel Machek --- scripts/kallsyms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index a566d8201b56..0133dfaaf352 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -210,7 +210,7 @@ static struct sym_entry *read_symbol(FILE *in) len = strlen(name) + 1; - sym = malloc(sizeof(*sym) + len); + sym = malloc(sizeof(*sym) + len + 1); if (!sym) { fprintf(stderr, "kallsyms failure: " "unable to allocate required amount of memory\n"); @@ -219,7 +219,7 @@ static struct sym_entry *read_symbol(FILE *in) sym->addr = addr; sym->len = len; sym->sym[0] = type; - memcpy(sym_name(sym), name, len); + strcpy(sym_name(sym), name); sym->percpu_absolute = 0; return sym; -- cgit v1.2.3 From 083bc0e1ce91e2a65a1aecdf8d2c5045e5d46c55 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 11 Feb 2020 05:06:34 +0900 Subject: kbuild: fix mismatch between .version and include/generated/compile.h Since commit 56d589361572 ("kbuild: do not create orphan built-in.a or obj-y objects"), scripts/link-vmlinux.sh does nothing when descending into init/. Once the version number becomes out of sync between .version and include/generated/compile.h, it is not self-healing. [How to reproduce] $ echo 100 > .version $ make You will see the number in the .version is always bigger than that in compile.h by one. After this, every time you run 'make', the vmlinux is re-linked even when none of source files is updated. Fixes: 56d589361572 ("kbuild: do not create orphan built-in.a or obj-y objects") Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 1919c311c149..dd484e92752e 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -239,7 +239,7 @@ else fi; # final build of init/ -${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init +${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init need-builtin=1 #link vmlinux.o info LD vmlinux.o -- cgit v1.2.3 From 87c5cbf71ecbb9e289d60a2df22eb686c70bf196 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Fri, 7 Feb 2020 11:53:35 +0200 Subject: serial: ar933x_uart: set UART_CS_{RX,TX}_READY_ORIDE On AR934x this UART is usually not initialized by the bootloader as it is only used as a secondary serial port while the primary UART is a newly introduced NS16550-compatible. In order to make use of the ar933x-uart on AR934x without RTS/CTS hardware flow control, one needs to set the UART_CS_{RX,TX}_READY_ORIDE bits as other than on AR933x where this UART is used as primary/console, the bootloader on AR934x typically doesn't set those bits. Setting them explicitely on AR933x should not do any harm, so just set them unconditionally. Tested-by: Chuanhong Guo Signed-off-by: Daniel Golle Link: https://lore.kernel.org/r/20200207095335.GA179836@makrotopia.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/ar933x_uart.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c index 3bdd56a1021b..ea12f10610b6 100644 --- a/drivers/tty/serial/ar933x_uart.c +++ b/drivers/tty/serial/ar933x_uart.c @@ -286,6 +286,10 @@ static void ar933x_uart_set_termios(struct uart_port *port, ar933x_uart_rmw_set(up, AR933X_UART_CS_REG, AR933X_UART_CS_HOST_INT_EN); + /* enable RX and TX ready overide */ + ar933x_uart_rmw_set(up, AR933X_UART_CS_REG, + AR933X_UART_CS_TX_READY_ORIDE | AR933X_UART_CS_RX_READY_ORIDE); + /* reenable the UART */ ar933x_uart_rmw(up, AR933X_UART_CS_REG, AR933X_UART_CS_IF_MODE_M << AR933X_UART_CS_IF_MODE_S, @@ -418,6 +422,10 @@ static int ar933x_uart_startup(struct uart_port *port) ar933x_uart_rmw_set(up, AR933X_UART_CS_REG, AR933X_UART_CS_HOST_INT_EN); + /* enable RX and TX ready overide */ + ar933x_uart_rmw_set(up, AR933X_UART_CS_REG, + AR933X_UART_CS_TX_READY_ORIDE | AR933X_UART_CS_RX_READY_ORIDE); + /* Enable RX interrupts */ up->ier = AR933X_UART_INT_RX_VALID; ar933x_uart_write(up, AR933X_UART_INT_EN_REG, up->ier); -- cgit v1.2.3 From 0c5aae59270fb1f827acce182786094c9ccf598e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 10 Feb 2020 15:57:30 +0100 Subject: serdev: ttyport: restore client ops on deregistration The serdev tty-port controller driver should reset the tty-port client operations also on deregistration to avoid a NULL-pointer dereference in case the port is later re-registered as a normal tty device. Note that this can only happen with tty drivers such as 8250 which have statically allocated port structures that can end up being reused and where a later registration would not register a serdev controller (e.g. due to registration errors or if the devicetree has been changed in between). Specifically, this can be an issue for any statically defined ports that would be registered by 8250 core when an 8250 driver is being unbound. Fixes: bed35c6dfa6a ("serdev: add a tty port controller driver") Cc: stable # 4.11 Reported-by: Loic Poulain Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20200210145730.22762-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/serdev-ttyport.c | 6 ++---- drivers/tty/tty_port.c | 5 +++-- include/linux/tty.h | 2 ++ 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c index d1cdd2ab8b4c..d367803e2044 100644 --- a/drivers/tty/serdev/serdev-ttyport.c +++ b/drivers/tty/serdev/serdev-ttyport.c @@ -265,7 +265,6 @@ struct device *serdev_tty_port_register(struct tty_port *port, struct device *parent, struct tty_driver *drv, int idx) { - const struct tty_port_client_operations *old_ops; struct serdev_controller *ctrl; struct serport *serport; int ret; @@ -284,7 +283,6 @@ struct device *serdev_tty_port_register(struct tty_port *port, ctrl->ops = &ctrl_ops; - old_ops = port->client_ops; port->client_ops = &client_ops; port->client_data = ctrl; @@ -297,7 +295,7 @@ struct device *serdev_tty_port_register(struct tty_port *port, err_reset_data: port->client_data = NULL; - port->client_ops = old_ops; + port->client_ops = &tty_port_default_client_ops; serdev_controller_put(ctrl); return ERR_PTR(ret); @@ -312,8 +310,8 @@ int serdev_tty_port_unregister(struct tty_port *port) return -ENODEV; serdev_controller_remove(ctrl); - port->client_ops = NULL; port->client_data = NULL; + port->client_ops = &tty_port_default_client_ops; serdev_controller_put(ctrl); return 0; diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 044c3cbdcfa4..ea80bf872f54 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -52,10 +52,11 @@ static void tty_port_default_wakeup(struct tty_port *port) } } -static const struct tty_port_client_operations default_client_ops = { +const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .write_wakeup = tty_port_default_wakeup, }; +EXPORT_SYMBOL_GPL(tty_port_default_client_ops); void tty_port_init(struct tty_port *port) { @@ -68,7 +69,7 @@ void tty_port_init(struct tty_port *port) spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; - port->client_ops = &default_client_ops; + port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); diff --git a/include/linux/tty.h b/include/linux/tty.h index bfa4e2ee94a9..bd5fe0e907e8 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -225,6 +225,8 @@ struct tty_port_client_operations { void (*write_wakeup)(struct tty_port *port); }; +extern const struct tty_port_client_operations tty_port_default_client_ops; + struct tty_port { struct tty_bufhead buf; /* Locked internally */ struct tty_struct *tty; /* Back pointer */ -- cgit v1.2.3 From 04b5bfe3dc94e64d0590c54045815cb5183fb095 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Mon, 10 Feb 2020 16:20:53 +0100 Subject: tty/serial: atmel: manage shutdown in case of RS485 or ISO7816 mode In atmel_shutdown() we call atmel_stop_rx() and atmel_stop_tx() functions. Prevent the rx restart that is implemented in RS485 or ISO7816 modes when calling atmel_stop_tx() by using the atomic information tasklet_shutdown that is already in place for this purpose. Fixes: 98f2082c3ac4 ("tty/serial: atmel: enforce tasklet init and termination sequences") Signed-off-by: Nicolas Ferre Cc: stable Link: https://lore.kernel.org/r/20200210152053.8289-1-nicolas.ferre@microchip.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/atmel_serial.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index c15c398c88a9..a39c87a7c2e1 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -570,7 +570,8 @@ static void atmel_stop_tx(struct uart_port *port) atmel_uart_writel(port, ATMEL_US_IDR, atmel_port->tx_done_mask); if (atmel_uart_is_half_duplex(port)) - atmel_start_rx(port); + if (!atomic_read(&atmel_port->tasklet_shutdown)) + atmel_start_rx(port); } -- cgit v1.2.3 From 1f69a1273b3f204a9c00dc3bbdcc4afcd0787428 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sun, 9 Feb 2020 19:44:15 +0300 Subject: tty: serial: tegra: Handle RX transfer in PIO mode if DMA wasn't started It is possible to get an instant RX timeout or end-of-transfer interrupt before RX DMA was started, if transaction is less than 16 bytes. Transfer should be handled in PIO mode in this case because DMA can't handle it. This patch brings back the original behaviour of the driver that was changed by accident by a previous commit, it fixes occasional Bluetooth HW initialization failures which I started to notice recently. Fixes: d5e3fadb7012 ("tty: serial: tegra: Activate RX DMA transfer by request") Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20200209164415.9632-1-digetx@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial-tegra.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/tty/serial/serial-tegra.c b/drivers/tty/serial/serial-tegra.c index 33034b852a51..8de8bac9c6c7 100644 --- a/drivers/tty/serial/serial-tegra.c +++ b/drivers/tty/serial/serial-tegra.c @@ -692,11 +692,22 @@ static void tegra_uart_copy_rx_to_tty(struct tegra_uart_port *tup, count, DMA_TO_DEVICE); } +static void do_handle_rx_pio(struct tegra_uart_port *tup) +{ + struct tty_struct *tty = tty_port_tty_get(&tup->uport.state->port); + struct tty_port *port = &tup->uport.state->port; + + tegra_uart_handle_rx_pio(tup, port); + if (tty) { + tty_flip_buffer_push(port); + tty_kref_put(tty); + } +} + static void tegra_uart_rx_buffer_push(struct tegra_uart_port *tup, unsigned int residue) { struct tty_port *port = &tup->uport.state->port; - struct tty_struct *tty = tty_port_tty_get(port); unsigned int count; async_tx_ack(tup->rx_dma_desc); @@ -705,11 +716,7 @@ static void tegra_uart_rx_buffer_push(struct tegra_uart_port *tup, /* If we are here, DMA is stopped */ tegra_uart_copy_rx_to_tty(tup, port, count); - tegra_uart_handle_rx_pio(tup, port); - if (tty) { - tty_flip_buffer_push(port); - tty_kref_put(tty); - } + do_handle_rx_pio(tup); } static void tegra_uart_rx_dma_complete(void *args) @@ -749,8 +756,10 @@ static void tegra_uart_terminate_rx_dma(struct tegra_uart_port *tup) { struct dma_tx_state state; - if (!tup->rx_dma_active) + if (!tup->rx_dma_active) { + do_handle_rx_pio(tup); return; + } dmaengine_terminate_all(tup->rx_dma_chan); dmaengine_tx_status(tup->rx_dma_chan, tup->rx_cookie, &state); @@ -816,18 +825,6 @@ static void tegra_uart_handle_modem_signal_change(struct uart_port *u) uart_handle_cts_change(&tup->uport, msr & UART_MSR_CTS); } -static void do_handle_rx_pio(struct tegra_uart_port *tup) -{ - struct tty_struct *tty = tty_port_tty_get(&tup->uport.state->port); - struct tty_port *port = &tup->uport.state->port; - - tegra_uart_handle_rx_pio(tup, port); - if (tty) { - tty_flip_buffer_push(port); - tty_kref_put(tty); - } -} - static irqreturn_t tegra_uart_isr(int irq, void *data) { struct tegra_uart_port *tup = data; -- cgit v1.2.3 From ae7fce069bd7c8a54d920692f93f1d4eff2bff04 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Wed, 5 Feb 2020 00:16:27 +0000 Subject: Documentation/process: Add Arm contact for embargoed HW issues Adding myself to list after getting voluntold Cc: Catalin Marinas Signed-off-by: Grant Likely Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20200205001627.27356-1-grant.likely@arm.com Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 33edae654599..572da66be43c 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -244,7 +244,7 @@ disclosure of a particular issue, unless requested by a response team or by an involved disclosed party. The current ambassadors list: ============= ======================================================== - ARM + ARM Grant Likely AMD Tom Lendacky IBM Intel Tony Luck -- cgit v1.2.3 From 485d5b75980dd6c62e02522a9e8c09b5d5529e76 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 5 Feb 2020 12:25:51 +0000 Subject: embargoed-hardware-issues: drop Amazon contact as the email address now bounces Peter's email address bounces, so remove him as the contact for Amazon. Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200205122551.GA1185549@kroah.com Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 572da66be43c..e405b4659d84 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -260,7 +260,7 @@ an involved disclosed party. The current ambassadors list: Red Hat Josh Poimboeuf SUSE Jiri Kosina - Amazon Peter Bowen + Amazon Google Kees Cook ============= ======================================================== -- cgit v1.2.3 From 4bc4f8128c48511b93e7285979a46cf0682cbb2f Mon Sep 17 00:00:00 2001 From: James Morris Date: Thu, 6 Feb 2020 10:08:34 +1100 Subject: Documentation/process: Change Microsoft contact for embargoed hardware issues Update Microsoft contact from Sasha to James. Cc: Sasha Levin Signed-off-by: James Morris Link: https://lore.kernel.org/r/alpine.LRH.2.21.2002061006350.22130@namei.org Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index e405b4659d84..64f375c02358 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -250,7 +250,7 @@ an involved disclosed party. The current ambassadors list: Intel Tony Luck Qualcomm Trilok Soni - Microsoft Sasha Levin + Microsoft James Morris VMware Xen Andrew Cooper -- cgit v1.2.3 From 74835c7db0322b6eddf091b8b062f127b8999a0a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 6 Feb 2020 16:48:00 +0100 Subject: COPYING: state that all contributions really are covered by this file Explicitly state that all contributions to the kernel source tree really are covered under this COPYING file in case someone thought otherwise. Lawyers love to be pedantic, even more so than software engineers at times, and this sentence makes them sleep easier. Reviewed-by: Thomas Gleixner Acked-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200206154800.GA3754085@kroah.com Signed-off-by: Greg Kroah-Hartman --- COPYING | 2 ++ 1 file changed, 2 insertions(+) diff --git a/COPYING b/COPYING index da4cb28febe6..a635a38ef940 100644 --- a/COPYING +++ b/COPYING @@ -16,3 +16,5 @@ In addition, other licenses may also apply. Please see: Documentation/process/license-rules.rst for more details. + +All contributions to the Linux Kernel are subject to this COPYING file. -- cgit v1.2.3 From ea3d147a474cb522bfdfe68f1f2557750dcf41dd Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Fri, 31 Jan 2020 14:18:32 +1030 Subject: fsi: aspeed: add unspecified HAS_IOMEM dependency Currently CONFIG_FSI_MASTER_ASPEED=y implicitly depends on CONFIG_HAS_IOMEM=y; consequently, on architectures without IOMEM we get the following build error: ld: drivers/fsi/fsi-master-aspeed.o: in function `fsi_master_aspeed_probe': drivers/fsi/fsi-master-aspeed.c:436: undefined reference to `devm_ioremap_resource' Fix the build error by adding the unspecified dependency. Fixes: 606397d67f41 ("fsi: Add ast2600 master driver") Cc: stable@vger.kernel.org Reported-by: Brendan Higgins Signed-off-by: Brendan Higgins Reviewed-by: Joel Stanley Signed-off-by: Joel Stanley Link: https://lore.kernel.org/r/20200131034832.294268-1-joel@jms.id.au Signed-off-by: Greg Kroah-Hartman --- drivers/fsi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/fsi/Kconfig b/drivers/fsi/Kconfig index 92ce6d85802c..4cc0e630ab79 100644 --- a/drivers/fsi/Kconfig +++ b/drivers/fsi/Kconfig @@ -55,6 +55,7 @@ config FSI_MASTER_AST_CF config FSI_MASTER_ASPEED tristate "FSI ASPEED master" + depends on HAS_IOMEM help This option enables a FSI master that is present behind an OPB bridge in the AST2600. -- cgit v1.2.3 From 3f4ef485be9d54040b695f32ec76d0f1ea50bbf3 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 28 Jan 2020 12:50:33 -0500 Subject: vt: fix scrollback flushing on background consoles Commit a6dbe4427559 ("vt: perform safe console erase in the right order") provided fixes to an earlier commit by gathering all console scrollback flushing operations in a function of its own. This includes the invocation of vc_sw->con_switch() as previously done through a update_screen() call. That commit failed to carry over the con_is_visible() conditional though, as well as cursor handling, which caused problems when "\e[3J" was written to a background console. One could argue for preserving the call to update_screen(). However this does far more than we need, and it is best to remove scrollback assumptions from it. Instead let's gather the minimum needed to actually perform scrollback flushing properly in that one place. While at it, let's document the vc_sw->con_switch() side effect being relied upon. Signed-off-by: Nicolas Pitre Reported-and-tested-by: Lukas Wunner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/nycvar.YSQ.7.76.2001281205560.1655@knanqh.ubzr Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 35d21cdb60d0..0cfbb7182b5a 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -936,10 +936,21 @@ static void flush_scrollback(struct vc_data *vc) WARN_CONSOLE_UNLOCKED(); set_origin(vc); - if (vc->vc_sw->con_flush_scrollback) + if (vc->vc_sw->con_flush_scrollback) { vc->vc_sw->con_flush_scrollback(vc); - else + } else if (con_is_visible(vc)) { + /* + * When no con_flush_scrollback method is provided then the + * legacy way for flushing the scrollback buffer is to use + * a side effect of the con_switch method. We do it only on + * the foreground console as background consoles have no + * scrollback buffers in that case and we obviously don't + * want to switch to them. + */ + hide_cursor(vc); vc->vc_sw->con_switch(vc); + set_cursor(vc); + } } /* -- cgit v1.2.3 From a91e4f12ffc42fa019b5a66eaa8702f5e0f08a6a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 7 Feb 2020 23:28:17 +0900 Subject: bootconfig: Allocate xbc_nodes array dynamically To reduce the large static array from kernel data, allocate xbc_nodes array dynamically only if the kernel loads a bootconfig. Note that this also add dummy memblock.h for user-spacae bootconfig tool. Link: http://lkml.kernel.org/r/158108569699.3187.6512834527603883707.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- lib/bootconfig.c | 15 ++++++++++++--- tools/bootconfig/include/linux/memblock.h | 12 ++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 tools/bootconfig/include/linux/memblock.h diff --git a/lib/bootconfig.c b/lib/bootconfig.c index afb2e767e6fe..3ea601a2eba5 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -6,12 +6,13 @@ #define pr_fmt(fmt) "bootconfig: " fmt +#include #include #include #include #include +#include #include -#include #include /* @@ -23,7 +24,7 @@ * node (for array). */ -static struct xbc_node xbc_nodes[XBC_NODE_MAX] __initdata; +static struct xbc_node *xbc_nodes __initdata; static int xbc_node_num __initdata; static char *xbc_data __initdata; static size_t xbc_data_size __initdata; @@ -719,7 +720,8 @@ void __init xbc_destroy_all(void) xbc_data = NULL; xbc_data_size = 0; xbc_node_num = 0; - memset(xbc_nodes, 0, sizeof(xbc_nodes)); + memblock_free(__pa(xbc_nodes), sizeof(struct xbc_node) * XBC_NODE_MAX); + xbc_nodes = NULL; } /** @@ -748,6 +750,13 @@ int __init xbc_init(char *buf) return -ERANGE; } + xbc_nodes = memblock_alloc(sizeof(struct xbc_node) * XBC_NODE_MAX, + SMP_CACHE_BYTES); + if (!xbc_nodes) { + pr_err("Failed to allocate memory for bootconfig nodes.\n"); + return -ENOMEM; + } + memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); xbc_data = buf; xbc_data_size = ret + 1; last_parent = NULL; diff --git a/tools/bootconfig/include/linux/memblock.h b/tools/bootconfig/include/linux/memblock.h new file mode 100644 index 000000000000..7862f217d85d --- /dev/null +++ b/tools/bootconfig/include/linux/memblock.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XBC_LINUX_MEMBLOCK_H +#define _XBC_LINUX_MEMBLOCK_H + +#include + +#define __pa(addr) (addr) +#define SMP_CACHE_BYTES 0 +#define memblock_alloc(size, align) malloc(size) +#define memblock_free(paddr, size) free(paddr) + +#endif -- cgit v1.2.3 From 973780011106c534d69c4d25fe0749bd3a5f0b53 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 9 Feb 2020 22:05:13 +0900 Subject: tools/bootconfig: Suppress non-error messages Suppress non-error messages when applying new bootconfig to initrd image. To enable it, replace printf for error message with pr_err() macro. This also adds a testcase for this fix. Link: http://lkml.kernel.org/r/158125351377.16911.13283712972275131160.stgit@devnote2 Reported-by: Michael Ellerman Tested-by: Michael Ellerman Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/main.c | 28 ++++++++++++++-------------- tools/bootconfig/test-bootconfig.sh | 9 +++++++++ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 47f488458328..e18eeb070562 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -140,7 +140,7 @@ int load_xbc_from_initrd(int fd, char **buf) return 0; if (lseek(fd, -8, SEEK_END) < 0) { - printf("Failed to lseek: %d\n", -errno); + pr_err("Failed to lseek: %d\n", -errno); return -errno; } @@ -155,7 +155,7 @@ int load_xbc_from_initrd(int fd, char **buf) return 0; if (lseek(fd, stat.st_size - 8 - size, SEEK_SET) < 0) { - printf("Failed to lseek: %d\n", -errno); + pr_err("Failed to lseek: %d\n", -errno); return -errno; } @@ -166,7 +166,7 @@ int load_xbc_from_initrd(int fd, char **buf) /* Wrong Checksum, maybe no boot config here */ rcsum = checksum((unsigned char *)*buf, size); if (csum != rcsum) { - printf("checksum error: %d != %d\n", csum, rcsum); + pr_err("checksum error: %d != %d\n", csum, rcsum); return 0; } @@ -185,13 +185,13 @@ int show_xbc(const char *path) fd = open(path, O_RDONLY); if (fd < 0) { - printf("Failed to open initrd %s: %d\n", path, fd); + pr_err("Failed to open initrd %s: %d\n", path, fd); return -errno; } ret = load_xbc_from_initrd(fd, &buf); if (ret < 0) - printf("Failed to load a boot config from initrd: %d\n", ret); + pr_err("Failed to load a boot config from initrd: %d\n", ret); else xbc_show_compact_tree(); @@ -209,7 +209,7 @@ int delete_xbc(const char *path) fd = open(path, O_RDWR); if (fd < 0) { - printf("Failed to open initrd %s: %d\n", path, fd); + pr_err("Failed to open initrd %s: %d\n", path, fd); return -errno; } @@ -222,7 +222,7 @@ int delete_xbc(const char *path) pr_output = 1; if (size < 0) { ret = size; - printf("Failed to load a boot config from initrd: %d\n", ret); + pr_err("Failed to load a boot config from initrd: %d\n", ret); } else if (size > 0) { ret = fstat(fd, &stat); if (!ret) @@ -245,7 +245,7 @@ int apply_xbc(const char *path, const char *xbc_path) ret = load_xbc_file(xbc_path, &buf); if (ret < 0) { - printf("Failed to load %s : %d\n", xbc_path, ret); + pr_err("Failed to load %s : %d\n", xbc_path, ret); return ret; } size = strlen(buf) + 1; @@ -262,7 +262,7 @@ int apply_xbc(const char *path, const char *xbc_path) /* Check the data format */ ret = xbc_init(buf); if (ret < 0) { - printf("Failed to parse %s: %d\n", xbc_path, ret); + pr_err("Failed to parse %s: %d\n", xbc_path, ret); free(data); free(buf); return ret; @@ -279,20 +279,20 @@ int apply_xbc(const char *path, const char *xbc_path) /* Remove old boot config if exists */ ret = delete_xbc(path); if (ret < 0) { - printf("Failed to delete previous boot config: %d\n", ret); + pr_err("Failed to delete previous boot config: %d\n", ret); return ret; } /* Apply new one */ fd = open(path, O_RDWR | O_APPEND); if (fd < 0) { - printf("Failed to open %s: %d\n", path, fd); + pr_err("Failed to open %s: %d\n", path, fd); return fd; } /* TODO: Ensure the @path is initramfs/initrd image */ ret = write(fd, data, size + 8); if (ret < 0) { - printf("Failed to apply a boot config: %d\n", ret); + pr_err("Failed to apply a boot config: %d\n", ret); return ret; } close(fd); @@ -334,12 +334,12 @@ int main(int argc, char **argv) } if (apply && delete) { - printf("Error: You can not specify both -a and -d at once.\n"); + pr_err("Error: You can not specify both -a and -d at once.\n"); return usage(); } if (optind >= argc) { - printf("Error: No initrd is specified.\n"); + pr_err("Error: No initrd is specified.\n"); return usage(); } diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh index 87725e8723f8..1de06de328e2 100755 --- a/tools/bootconfig/test-bootconfig.sh +++ b/tools/bootconfig/test-bootconfig.sh @@ -64,6 +64,15 @@ echo "File size check" new_size=$(stat -c %s $INITRD) xpass test $new_size -eq $initrd_size +echo "No error messge while applying" +OUTFILE=`mktemp tempout-XXXX` +dd if=/dev/zero of=$INITRD bs=4096 count=1 +printf " \0\0\0 \0\0\0" >> $INITRD +$BOOTCONF -a $TEMPCONF $INITRD > $OUTFILE 2>&1 +xfail grep -i "failed" $OUTFILE +xfail grep -i "error" $OUTFILE +rm $OUTFILE + echo "Max node number check" echo -n > $TEMPCONF -- cgit v1.2.3 From f2e97dc126b712c0d21219ed0c42710006c1cf52 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 9 Feb 2020 21:44:37 -0800 Subject: bpf: Selftests build error in sockmap_basic.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix following build error. We could push a tcp.h header into one of the include paths, but I think its easy enough to simply pull in the three defines we need here. If we end up using more of tcp.h at some point we can pull it in later. /home/john/git/bpf/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c: In function ‘connected_socket_v4’: /home/john/git/bpf/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11: error: ‘TCP_REPAIR_ON’ undeclared (first use in this function) repair = TCP_REPAIR_ON; ^ /home/john/git/bpf/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:20:11: note: each undeclared identifier is reported only once for each function it appears in /home/john/git/bpf/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c:29:11: error: ‘TCP_REPAIR_OFF_NO_WP’ undeclared (first use in this function) repair = TCP_REPAIR_OFF_NO_WP; Then with fix, $ ./test_progs -n 44 #44/1 sockmap create_update_free:OK #44/2 sockhash create_update_free:OK #44 sockmap_basic:OK Fixes: 5d3919a953c3c ("selftests/bpf: Test freeing sockmap/sockhash with a socket in it") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/158131347731.21414.12120493483848386652.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/prog_tests/sockmap_basic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 07f5b462c2ef..aa43e0bd210c 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -3,6 +3,11 @@ #include "test_progs.h" +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ + +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + static int connected_socket_v4(void) { struct sockaddr_in addr = { -- cgit v1.2.3 From 2bf0eb9b3b0d099b20b2c4736436b666d78b94d5 Mon Sep 17 00:00:00 2001 From: Hongbo Yao Date: Mon, 10 Feb 2020 09:14:41 +0800 Subject: bpf: Make btf_check_func_type_match() static Fix the following sparse warning: kernel/bpf/btf.c:4131:5: warning: symbol 'btf_check_func_type_match' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Hongbo Yao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200210011441.147102-1-yaohongbo@huawei.com --- kernel/bpf/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 805c43b083e9..787140095e58 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. */ -int btf_check_func_type_match(struct bpf_verifier_log *log, - struct btf *btf1, const struct btf_type *t1, - struct btf *btf2, const struct btf_type *t2) +static int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; -- cgit v1.2.3 From 95ba79e89c107851bad4492ca23e9b9c399b8592 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 30 Jan 2020 14:55:15 +0100 Subject: MAINTAINERS: remove unnecessary ':' characters Commit e567cb3fef30 ("MAINTAINERS: add an entry for kfifo") added a new entry to MAINTAINERS. Following the example of the previous entry on the list I added a trailing ':' character at the end of the title line. This however results in rather strange looking output from scripts/get_maintainer.pl: $ ./scripts/get_maintainer.pl ./0001-kfifo.patch Stefani Seibold (maintainer:KFIFO:) linux-kernel@vger.kernel.org (open list) It turns out there are more entries like this. Fix the entire file by removing all trailing colons. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20200130135515.30359-1-brgl@bgdev.pl Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 38fe2f3f7b6f..0e7e88879337 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3909,7 +3909,7 @@ S: Supported F: Documentation/filesystems/ceph.txt F: fs/ceph/ -CERTIFICATE HANDLING: +CERTIFICATE HANDLING M: David Howells M: David Woodhouse L: keyrings@vger.kernel.org @@ -3919,7 +3919,7 @@ F: certs/ F: scripts/sign-file.c F: scripts/extract-cert.c -CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: +CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM L: devel@driverdev.osuosl.org S: Obsolete F: drivers/staging/wusbcore/ @@ -7047,7 +7047,7 @@ L: kvm@vger.kernel.org S: Supported F: drivers/uio/uio_pci_generic.c -GENERIC VDSO LIBRARY: +GENERIC VDSO LIBRARY M: Andy Lutomirski M: Thomas Gleixner M: Vincenzo Frascino @@ -9278,7 +9278,7 @@ F: include/keys/trusted-type.h F: security/keys/trusted.c F: include/keys/trusted.h -KEYS/KEYRINGS: +KEYS/KEYRINGS M: David Howells M: Jarkko Sakkinen L: keyrings@vger.kernel.org @@ -11484,7 +11484,7 @@ F: drivers/scsi/mac_scsi.* F: drivers/scsi/sun3_scsi.* F: drivers/scsi/sun3_scsi_vme.c -NCSI LIBRARY: +NCSI LIBRARY M: Samuel Mendoza-Jonas S: Maintained F: net/ncsi/ @@ -13512,7 +13512,7 @@ L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/block/ps3vram.c -PSAMPLE PACKET SAMPLING SUPPORT: +PSAMPLE PACKET SAMPLING SUPPORT M: Yotam Gigi S: Maintained F: net/psample @@ -17080,7 +17080,7 @@ S: Maintained F: Documentation/admin-guide/ufs.rst F: fs/ufs/ -UHID USERSPACE HID IO DRIVER: +UHID USERSPACE HID IO DRIVER M: David Herrmann L: linux-input@vger.kernel.org S: Maintained @@ -17094,18 +17094,18 @@ S: Maintained F: drivers/usb/common/ulpi.c F: include/linux/ulpi/ -ULTRA-WIDEBAND (UWB) SUBSYSTEM: +ULTRA-WIDEBAND (UWB) SUBSYSTEM L: devel@driverdev.osuosl.org S: Obsolete F: drivers/staging/uwb/ -UNICODE SUBSYSTEM: +UNICODE SUBSYSTEM M: Gabriel Krisman Bertazi L: linux-fsdevel@vger.kernel.org S: Supported F: fs/unicode/ -UNICORE32 ARCHITECTURE: +UNICORE32 ARCHITECTURE M: Guan Xuetao W: http://mprc.pku.edu.cn/~guanxuetao/linux S: Maintained @@ -17791,7 +17791,7 @@ F: include/linux/vbox_utils.h F: include/uapi/linux/vbox*.h F: drivers/virt/vboxguest/ -VIRTUAL BOX SHARED FOLDER VFS DRIVER: +VIRTUAL BOX SHARED FOLDER VFS DRIVER M: Hans de Goede L: linux-fsdevel@vger.kernel.org S: Maintained -- cgit v1.2.3 From 7714d469dcba572bbfb9cc47217fed7e7ddeb051 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 Jan 2020 09:29:03 +0000 Subject: selftests: fix spelling mistaked "chaigned" -> "chained" There is a spelling mistake in a literal string, fix it. Signed-off-by: Colin Ian King Reviewed-by: Aleksa Sarai Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/resolve_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/openat2/resolve_test.c b/tools/testing/selftests/openat2/resolve_test.c index 7a94b1da8e7b..bbafad440893 100644 --- a/tools/testing/selftests/openat2/resolve_test.c +++ b/tools/testing/selftests/openat2/resolve_test.c @@ -230,7 +230,7 @@ void test_openat2_opath_tests(void) { .name = "[in_root] garbage link to /root", .path = "cheeky/garbageself", .how.resolve = RESOLVE_IN_ROOT, .out.path = "root", .pass = true }, - { .name = "[in_root] chainged garbage links to /root", + { .name = "[in_root] chained garbage links to /root", .path = "abscheeky/garbageself", .how.resolve = RESOLVE_IN_ROOT, .out.path = "root", .pass = true }, { .name = "[in_root] relative path to 'root'", -- cgit v1.2.3 From a098d9c82a0bb2f91e5cf6c780859bc00c15e1e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 30 Jan 2020 21:45:27 -0500 Subject: selftests/ftrace: Have pid filter test use instance flag While running the ftracetests, the pid filter test failed because the instance "foo" existed, and it was using it to rerun the test under a instance named foo. The collision caused the test to fail as the mkdir failed as the name already existed. As of commit b5b77be812de7 ("selftests: ftrace: Allow some tests to be run in a tracing instance") all a selftest needs to do to be tested in an instance is to set the "instance" flag. There's no reason a selftest needs to create an instance to run its test in an instance directly. Remove the open coded testing in an instance for the pid filter test and have it set the "instance" flag instead. Signed-off-by: Steven Rostedt (VMware) Acked-by: Masami Hiramatsu Signed-off-by: Shuah Khan --- tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc index 64cfcc75e3c1..f2ee1e889e13 100644 --- a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc @@ -1,6 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: ftrace - function pid filters +# flags: instance # Make sure that function pid matching filter works. # Also test it on an instance directory @@ -96,13 +97,6 @@ do_test() { } do_test - -mkdir instances/foo -cd instances/foo -do_test -cd ../../ -rmdir instances/foo - do_reset exit 0 -- cgit v1.2.3 From b32694cd0724d4ceca2c62cc7c3d3a8d1ffa11fc Mon Sep 17 00:00:00 2001 From: Nikita Sobolev Date: Fri, 31 Jan 2020 18:25:23 +0300 Subject: Kernel selftests: tpm2: check for tpm support tpm2 tests set fails if there is no /dev/tpm0 and /dev/tpmrm0 supported. Check if these files exist before run and mark test as skipped in case of absence. Signed-off-by: Nikita Sobolev Signed-off-by: Shuah Khan --- tools/testing/selftests/tpm2/test_smoke.sh | 13 +++++++++++-- tools/testing/selftests/tpm2/test_space.sh | 9 ++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh index 8155c2ea7ccb..b630c7b5950a 100755 --- a/tools/testing/selftests/tpm2/test_smoke.sh +++ b/tools/testing/selftests/tpm2/test_smoke.sh @@ -1,8 +1,17 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +self.flags = flags -python -m unittest -v tpm2_tests.SmokeTest -python -m unittest -v tpm2_tests.AsyncTest +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + + +if [ -f /dev/tpm0 ] ; then + python -m unittest -v tpm2_tests.SmokeTest + python -m unittest -v tpm2_tests.AsyncTest +else + exit $ksft_skip +fi CLEAR_CMD=$(which tpm2_clear) if [ -n $CLEAR_CMD ]; then diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh index a6f5e346635e..180b469c53b4 100755 --- a/tools/testing/selftests/tpm2/test_space.sh +++ b/tools/testing/selftests/tpm2/test_space.sh @@ -1,4 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) -python -m unittest -v tpm2_tests.SpaceTest +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ -f /dev/tpmrm0 ] ; then + python -m unittest -v tpm2_tests.SpaceTest +else + exit $ksft_skip +fi -- cgit v1.2.3 From 9d235a558c689b0ecdd23bbd8beb2e0584f619ed Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 6 Feb 2020 09:40:00 +0100 Subject: selftests: allow detection of build failures Commit 5f70bde26a48 ("selftests: fix build behaviour on targets' failures") added a logic to track failure of builds of individual targets. However, it does exactly the opposite of what a distro kernel needs: we create a RPM package with a selected set of selftests and we need the build to fail if build of any of the targets fail. Both use cases are valid. A distribution kernel is in control of what is included in the kernel and what is being built; any error needs to be flagged and acted upon. A CI system that tries to build as many tests as possible on the best effort basis is not really interested in a failure here and there. Support both use cases by introducing a FORCE_TARGETS variable. It is switched off by default to make life for CI systems easier, distributions can easily switch it on while building their packages. Reported-by: Yauheni Kaliuta Signed-off-by: Jiri Benc Reviewed-by: Cristian Marussi Tested-by: Cristian Marussi Signed-off-by: Shuah Khan --- tools/testing/selftests/Makefile | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 63430e2664c2..6ec503912bea 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -77,6 +77,12 @@ ifneq ($(SKIP_TARGETS),) override TARGETS := $(TMP) endif +# User can set FORCE_TARGETS to 1 to require all targets to be successfully +# built; make will fail if any of the targets cannot be built. If +# FORCE_TARGETS is not set (the default), make will succeed if at least one +# of the targets gets built. +FORCE_TARGETS ?= + # Clear LDFLAGS and MAKEFLAGS if called from main # Makefile to avoid test build failures when test # Makefile doesn't have explicit build rules. @@ -151,7 +157,8 @@ all: khdr for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ mkdir $$BUILD_TARGET -p; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET \ + $(if $(FORCE_TARGETS),|| exit); \ ret=$$((ret * $$?)); \ done; exit $$ret; @@ -205,7 +212,8 @@ ifdef INSTALL_PATH @ret=1; \ for TARGET in $(TARGETS); do \ BUILD_TARGET=$$BUILD/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install \ + $(if $(FORCE_TARGETS),|| exit); \ ret=$$((ret * $$?)); \ done; exit $$ret; -- cgit v1.2.3 From c363eb48ada5cf732b3f489fab799fc881097842 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 6 Feb 2020 09:40:52 +0100 Subject: selftests: fix too long argument With some shells, the command construed for install of bpf selftests becomes too large due to long list of files: make[1]: execvp: /bin/sh: Argument list too long make[1]: *** [../lib.mk:73: install] Error 127 Currently, each of the file lists is replicated three times in the command: in the shell 'if' condition, in the 'echo' and in the 'rsync'. Reduce that by one instance by using make conditionals and separate the echo and rsync into two shell commands. (One would be inclined to just remove the '@' at the beginning of the rsync command and let 'make' echo it by itself; unfortunately, it appears that the '@' in the front of mkdir silences output also for the following commands.) Also, separate handling of each of the lists to its own shell command. The semantics of the makefile is unchanged before and after the patch. The ability of individual test directories to override INSTALL_RULE is retained. Reported-by: Yauheni Kaliuta Tested-by: Yauheni Kaliuta Signed-off-by: Jiri Benc Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 1c8a1963d03f..3ed0134a764d 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -83,17 +83,20 @@ else $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS)) endif +define INSTALL_SINGLE_RULE + $(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH)) + $(if $(INSTALL_LIST),@echo rsync -a $(INSTALL_LIST) $(INSTALL_PATH)/) + $(if $(INSTALL_LIST),@rsync -a $(INSTALL_LIST) $(INSTALL_PATH)/) +endef + define INSTALL_RULE - @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \ - mkdir -p ${INSTALL_PATH}; \ - echo "rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/"; \ - rsync -a $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(INSTALL_PATH)/; \ - fi - @if [ "X$(TEST_GEN_PROGS)$(TEST_CUSTOM_PROGS)$(TEST_GEN_PROGS_EXTENDED)$(TEST_GEN_FILES)" != "X" ]; then \ - mkdir -p ${INSTALL_PATH}; \ - echo "rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/"; \ - rsync -a $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(INSTALL_PATH)/; \ - fi + $(eval INSTALL_LIST = $(TEST_PROGS)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(TEST_PROGS_EXTENDED)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(TEST_FILES)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(TEST_GEN_PROGS)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(TEST_CUSTOM_PROGS)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(TEST_GEN_PROGS_EXTENDED)) $(INSTALL_SINGLE_RULE) + $(eval INSTALL_LIST = $(TEST_GEN_FILES)) $(INSTALL_SINGLE_RULE) endef install: all -- cgit v1.2.3 From d090409abbdd1fcbdfd6ed66612390ba8c814749 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:48 -0600 Subject: tracing: Add missing nest end to synth_event_trace_start() error case If the ring_buffer reserve in synth_event_trace_start() fails, the matching ring_buffer_nest_end() should be called in the error code, since nothing else will ever call it in this case. Link: http://lkml.kernel.org/r/20abc444b3eeff76425f895815380abe7aa53ff8.1581374549.git.zanussi@kernel.org Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b3bcfd8c7332..a546ffa14785 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2043,6 +2043,7 @@ int synth_event_trace_start(struct trace_event_file *file, entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, sizeof(*entry) + fields_size); if (!entry) { + ring_buffer_nest_end(trace_state->buffer); ret = -EINVAL; goto out; } -- cgit v1.2.3 From 0c62f6cd9ed320cb0ca39e33addf3a3da51b7328 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:49 -0600 Subject: tracing: Don't return -EINVAL when tracing soft disabled synth events There's no reason to return -EINVAL when tracing a synthetic event if it's soft disabled - treat it the same as if it were hard disabled and return normally. Have synth_event_trace() and synth_event_trace_array() just return normally, and have synth_event_trace_start set the trace state to disabled and return. Link: http://lkml.kernel.org/r/df5d02a1625aff97c9866506c5bada6a069982ba.1581374549.git.zanussi@kernel.org Fixes: 8dcc53ad956d2 ("tracing: Add synth_event_trace() and related functions") Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a546ffa14785..99a02168599b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1828,7 +1828,8 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) * called directly by the user, we don't have that but we * still need to honor not logging when disabled. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) return 0; event = file->event_call->data; @@ -1836,9 +1837,6 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) if (n_vals != event->n_fields) return -EINVAL; - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - fields_size = event->n_u64 * sizeof(u64); /* @@ -1918,7 +1916,8 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, * called directly by the user, we don't have that but we * still need to honor not logging when disabled. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) return 0; event = file->event_call->data; @@ -1926,9 +1925,6 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, if (n_vals != event->n_fields) return -EINVAL; - if (trace_trigger_soft_disabled(file)) - return -EINVAL; - fields_size = event->n_u64 * sizeof(u64); /* @@ -2017,7 +2013,8 @@ int synth_event_trace_start(struct trace_event_file *file, * trace case, we save the enabed state upon start and just * ignore the following data calls. */ - if (!(file->flags & EVENT_FILE_FL_ENABLED)) { + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { trace_state->enabled = false; goto out; } @@ -2026,11 +2023,6 @@ int synth_event_trace_start(struct trace_event_file *file, trace_state->event = file->event_call->data; - if (trace_trigger_soft_disabled(file)) { - ret = -EINVAL; - goto out; - } - fields_size = trace_state->event->n_u64 * sizeof(u64); /* -- cgit v1.2.3 From 7276531d4036f5db2af15c8b6caa02e7741f5d80 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Feb 2020 17:06:50 -0600 Subject: tracing: Consolidate trace() functions Move the checking, buffer reserve and buffer commit code in synth_event_trace_start/end() into inline functions __synth_event_trace_start/end() so they can also be used by synth_event_trace() and synth_event_trace_array(), and then have all those functions use them. Also, change synth_event_trace_state.enabled to disabled so it only needs to be set if the event is disabled, which is not normally the case. Link: http://lkml.kernel.org/r/b1f3108d0f450e58192955a300e31d0405ab4149.1581374549.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- kernel/trace/trace_events_hist.c | 220 +++++++++++++++------------------------ 2 files changed, 87 insertions(+), 135 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 67f528ecb9e5..21098298b49b 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -424,7 +424,7 @@ struct synth_event_trace_state { struct synth_event *event; unsigned int cur_field; unsigned int n_u64; - bool enabled; + bool disabled; bool add_next; bool add_name; }; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 99a02168599b..65b54d6a1422 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1791,6 +1791,60 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen) } EXPORT_SYMBOL_GPL(synth_event_cmd_init); +static inline int +__synth_event_trace_start(struct trace_event_file *file, + struct synth_event_trace_state *trace_state) +{ + int entry_size, fields_size = 0; + int ret = 0; + + /* + * Normal event tracing doesn't get called at all unless the + * ENABLED bit is set (which attaches the probe thus allowing + * this code to be called, etc). Because this is called + * directly by the user, we don't have that but we still need + * to honor not logging when disabled. For the the iterated + * trace case, we save the enabed state upon start and just + * ignore the following data calls. + */ + if (!(file->flags & EVENT_FILE_FL_ENABLED) || + trace_trigger_soft_disabled(file)) { + trace_state->disabled = true; + ret = -ENOENT; + goto out; + } + + trace_state->event = file->event_call->data; + + fields_size = trace_state->event->n_u64 * sizeof(u64); + + /* + * Avoid ring buffer recursion detection, as this event + * is being performed within another event. + */ + trace_state->buffer = file->tr->array_buffer.buffer; + ring_buffer_nest_start(trace_state->buffer); + + entry_size = sizeof(*trace_state->entry) + fields_size; + trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer, + file, + entry_size); + if (!trace_state->entry) { + ring_buffer_nest_end(trace_state->buffer); + ret = -EINVAL; + } +out: + return ret; +} + +static inline void +__synth_event_trace_end(struct synth_event_trace_state *trace_state) +{ + trace_event_buffer_commit(&trace_state->fbuffer); + + ring_buffer_nest_end(trace_state->buffer); +} + /** * synth_event_trace - Trace a synthetic event * @file: The trace_event_file representing the synthetic event @@ -1812,69 +1866,38 @@ EXPORT_SYMBOL_GPL(synth_event_cmd_init); */ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; va_list args; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + int ret; - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ret = -EINVAL; - goto out; + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; } va_start(args, n_vals); - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { u64 val; val = va_arg(args, u64); - if (event->fields[i]->is_string) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)val; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = val; + state.entry->fields[n_u64] = val; n_u64++; } } va_end(args); - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -1901,62 +1924,31 @@ EXPORT_SYMBOL_GPL(synth_event_trace); int synth_event_trace_array(struct trace_event_file *file, u64 *vals, unsigned int n_vals) { - struct trace_event_buffer fbuffer; - struct synth_trace_event *entry; - struct trace_buffer *buffer; - struct synth_event *event; + struct synth_event_trace_state state; unsigned int i, n_u64; - int fields_size = 0; - int ret = 0; - - /* - * Normal event generation doesn't get called at all unless - * the ENABLED bit is set (which attaches the probe thus - * allowing this code to be called, etc). Because this is - * called directly by the user, we don't have that but we - * still need to honor not logging when disabled. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) - return 0; - - event = file->event_call->data; - - if (n_vals != event->n_fields) - return -EINVAL; - - fields_size = event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + int ret; - entry = trace_event_buffer_reserve(&fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ret = -EINVAL; - goto out; + ret = __synth_event_trace_start(file, &state); + if (ret) { + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ + return ret; } - for (i = 0, n_u64 = 0; i < event->n_fields; i++) { - if (event->fields[i]->is_string) { + for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) { + if (state.event->fields[i]->is_string) { char *str_val = (char *)(long)vals[i]; - char *str_field = (char *)&entry->fields[n_u64]; + char *str_field = (char *)&state.entry->fields[n_u64]; strscpy(str_field, str_val, STR_VAR_LEN_MAX); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } else { - entry->fields[n_u64] = vals[i]; + state.entry->fields[n_u64] = vals[i]; n_u64++; } } - trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); + __synth_event_trace_end(&state); return ret; } @@ -1993,55 +1985,17 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array); int synth_event_trace_start(struct trace_event_file *file, struct synth_event_trace_state *trace_state) { - struct synth_trace_event *entry; - int fields_size = 0; - int ret = 0; + int ret; - if (!trace_state) { - ret = -EINVAL; - goto out; - } + if (!trace_state) + return -EINVAL; memset(trace_state, '\0', sizeof(*trace_state)); - /* - * Normal event tracing doesn't get called at all unless the - * ENABLED bit is set (which attaches the probe thus allowing - * this code to be called, etc). Because this is called - * directly by the user, we don't have that but we still need - * to honor not logging when disabled. For the the iterated - * trace case, we save the enabed state upon start and just - * ignore the following data calls. - */ - if (!(file->flags & EVENT_FILE_FL_ENABLED) || - trace_trigger_soft_disabled(file)) { - trace_state->enabled = false; - goto out; - } - - trace_state->enabled = true; + ret = __synth_event_trace_start(file, trace_state); + if (ret == -ENOENT) + ret = 0; /* just disabled, not really an error */ - trace_state->event = file->event_call->data; - - fields_size = trace_state->event->n_u64 * sizeof(u64); - - /* - * Avoid ring buffer recursion detection, as this event - * is being performed within another event. - */ - trace_state->buffer = file->tr->array_buffer.buffer; - ring_buffer_nest_start(trace_state->buffer); - - entry = trace_event_buffer_reserve(&trace_state->fbuffer, file, - sizeof(*entry) + fields_size); - if (!entry) { - ring_buffer_nest_end(trace_state->buffer); - ret = -EINVAL; - goto out; - } - - trace_state->entry = entry; -out: return ret; } EXPORT_SYMBOL_GPL(synth_event_trace_start); @@ -2074,7 +2028,7 @@ static int __synth_event_add_val(const char *field_name, u64 val, trace_state->add_next = true; } - if (!trace_state->enabled) + if (trace_state->disabled) goto out; event = trace_state->event; @@ -2209,9 +2163,7 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state) if (!trace_state) return -EINVAL; - trace_event_buffer_commit(&trace_state->fbuffer); - - ring_buffer_nest_end(trace_state->buffer); + __synth_event_trace_end(trace_state); return 0; } -- cgit v1.2.3 From 5ee858975b13a9b40db00f456989a689fdbb296c Mon Sep 17 00:00:00 2001 From: Anurag Kumar Vulisha Date: Mon, 27 Jan 2020 19:30:46 +0000 Subject: usb: dwc3: gadget: Check for IOC/LST bit in TRB->ctrl fields The current code in dwc3_gadget_ep_reclaim_completed_trb() will check for IOC/LST bit in the event->status and returns if IOC/LST bit is set. This logic doesn't work if multiple TRBs are queued per request and the IOC/LST bit is set on the last TRB of that request. Consider an example where a queued request has multiple queued TRBs and IOC/LST bit is set only for the last TRB. In this case, the core generates XferComplete/XferInProgress events only for the last TRB (since IOC/LST are set only for the last TRB). As per the logic in dwc3_gadget_ep_reclaim_completed_trb() event->status is checked for IOC/LST bit and returns on the first TRB. This leaves the remaining TRBs left unhandled. Similarly, if the gadget function enqueues an unaligned request with sglist already in it, it should fail the same way, since we will append another TRB to something that already uses more than one TRB. To aviod this, this patch changes the code to check for IOC/LST bits in TRB->ctrl instead. At a practical level, this patch resolves USB transfer stalls seen with adb on dwc3 based HiKey960 after functionfs gadget added scatter-gather support around v4.20. Cc: Felipe Balbi Cc: Yang Fei Cc: Thinh Nguyen Cc: Tejas Joglekar Cc: Andrzej Pietrasiewicz Cc: Jack Pham Cc: Todd Kjos Cc: Greg KH Cc: Linux USB List Cc: stable Tested-by: Tejas Joglekar Reviewed-by: Thinh Nguyen Signed-off-by: Anurag Kumar Vulisha [jstultz: forward ported to mainline, reworded commit log, reworked to only check trb->ctrl as suggested by Felipe] Signed-off-by: John Stultz Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/gadget.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 1b8014ab0b25..1b7d2f9cb673 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2429,7 +2429,8 @@ static int dwc3_gadget_ep_reclaim_completed_trb(struct dwc3_ep *dep, if (event->status & DEPEVT_STATUS_SHORT && !chain) return 1; - if (event->status & DEPEVT_STATUS_IOC) + if ((trb->ctrl & DWC3_TRB_CTRL_IOC) || + (trb->ctrl & DWC3_TRB_CTRL_LST)) return 1; return 0; -- cgit v1.2.3 From 904967c60d87393a3708fed2324b684cdb79b1ee Mon Sep 17 00:00:00 2001 From: John Keeping Date: Fri, 17 Jan 2020 10:40:22 +0000 Subject: usb: gadget: u_audio: Fix high-speed max packet size Prior to commit eb9fecb9e69b ("usb: gadget: f_uac2: split out audio core") the maximum packet size was calculated only from the high-speed descriptor but now we use the largest of the full-speed and high-speed descriptors. This is correct, but the full-speed value is likely to be higher than that for high-speed and this leads to submitting requests for OUT transfers (received by the gadget) which are larger than the endpoint's maximum packet size. These are rightly rejected by the gadget core. config_ep_by_speed() already sets up the correct maximum packet size for the enumerated speed in the usb_ep structure, so we can simply use this instead of the overall value that has been used to allocate buffers for requests. Note that the minimum period for ALSA is still set from the largest value, and this is unavoidable because it's possible to open the audio device before the gadget has been enumerated. Tested-by: Pavel Hofman Signed-off-by: John Keeping Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/u_audio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/function/u_audio.c b/drivers/usb/gadget/function/u_audio.c index 6d956f190f5a..e6d32c536781 100644 --- a/drivers/usb/gadget/function/u_audio.c +++ b/drivers/usb/gadget/function/u_audio.c @@ -361,7 +361,7 @@ int u_audio_start_capture(struct g_audio *audio_dev) ep = audio_dev->out_ep; prm = &uac->c_prm; config_ep_by_speed(gadget, &audio_dev->func, ep); - req_len = prm->max_psize; + req_len = ep->maxpacket; prm->ep_enabled = true; usb_ep_enable(ep); @@ -379,7 +379,7 @@ int u_audio_start_capture(struct g_audio *audio_dev) req->context = &prm->ureq[i]; req->length = req_len; req->complete = u_audio_iso_complete; - req->buf = prm->rbuf + i * prm->max_psize; + req->buf = prm->rbuf + i * ep->maxpacket; } if (usb_ep_queue(ep, prm->ureq[i].req, GFP_ATOMIC)) @@ -430,9 +430,9 @@ int u_audio_start_playback(struct g_audio *audio_dev) uac->p_pktsize = min_t(unsigned int, uac->p_framesize * (params->p_srate / uac->p_interval), - prm->max_psize); + ep->maxpacket); - if (uac->p_pktsize < prm->max_psize) + if (uac->p_pktsize < ep->maxpacket) uac->p_pktsize_residue = uac->p_framesize * (params->p_srate % uac->p_interval); else @@ -457,7 +457,7 @@ int u_audio_start_playback(struct g_audio *audio_dev) req->context = &prm->ureq[i]; req->length = req_len; req->complete = u_audio_iso_complete; - req->buf = prm->rbuf + i * prm->max_psize; + req->buf = prm->rbuf + i * ep->maxpacket; } if (usb_ep_queue(ep, prm->ureq[i].req, GFP_ATOMIC)) -- cgit v1.2.3 From c724417baf162bd3e035659e22cdf990cfb0d917 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Thu, 30 Jan 2020 19:10:35 -0800 Subject: usb: gadget: composite: Fix bMaxPower for SuperSpeedPlus SuperSpeedPlus peripherals must report their bMaxPower of the configuration descriptor in units of 8mA as per the USB 3.2 specification. The current switch statement in encode_bMaxPower() only checks for USB_SPEED_SUPER but not USB_SPEED_SUPER_PLUS so the latter falls back to USB 2.0 encoding which uses 2mA units. Replace the switch with a simple if/else. Fixes: eae5820b852f ("usb: gadget: composite: Write SuperSpeedPlus config descriptors") Signed-off-by: Jack Pham Signed-off-by: Felipe Balbi --- drivers/usb/gadget/composite.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index 3b4f67000315..cd303a3ea680 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -437,12 +437,10 @@ static u8 encode_bMaxPower(enum usb_device_speed speed, val = CONFIG_USB_GADGET_VBUS_DRAW; if (!val) return 0; - switch (speed) { - case USB_SPEED_SUPER: - return DIV_ROUND_UP(val, 8); - default: + if (speed < USB_SPEED_SUPER) return DIV_ROUND_UP(val, 2); - } + else + return DIV_ROUND_UP(val, 8); } static int config_buf(struct usb_configuration *config, -- cgit v1.2.3 From a2035411fa1d1206cea7d5dfe833e78481844a76 Mon Sep 17 00:00:00 2001 From: Jack Pham Date: Thu, 30 Jan 2020 19:10:36 -0800 Subject: usb: gadget: composite: Support more than 500mA MaxPower USB 3.x SuperSpeed peripherals can draw up to 900mA of VBUS power when in configured state. However, if a configuration wanting to take advantage of this is added with MaxPower greater than 500 (currently possible if using a ConfigFS gadget) the composite driver fails to accommodate this for a couple reasons: - usb_gadget_vbus_draw() when called from set_config() and composite_resume() will be passed the MaxPower value without regard for the current connection speed, resulting in a violation for USB 2.0 since the max is 500mA. - the bMaxPower of the configuration descriptor would be incorrectly encoded, again if the connection speed is only at USB 2.0 or below, likely wrapping around U8_MAX since the 2mA multiplier corresponds to a maximum of 510mA. Fix these by adding checks against the current gadget->speed when the c->MaxPower value is used (set_config() and composite_resume()) and appropriately limit based on whether it is currently at a low-/full-/high- or super-speed connection. Because 900 is not divisible by 8, with the round-up division currently used in encode_bMaxPower() a MaxPower of 900mA will result in an encoded value of 0x71. When a host stack (including Linux and Windows) enumerates this on a single port root hub, it reads this value back and decodes (multiplies by 8) to get 904mA which is strictly greater than 900mA that is typically budgeted for that port, causing it to reject the configuration. Instead, we should be using the round-down behavior of normal integral division so that 900 / 8 -> 0x70 or 896mA to stay within range. And we might as well change it for the high/full/low case as well for consistency. N.B. USB 3.2 Gen N x 2 allows for up to 1500mA but there doesn't seem to be any any peripheral controller supported by Linux that does two lane operation, so for now keeping the clamp at 900 should be fine. Signed-off-by: Jack Pham Signed-off-by: Felipe Balbi --- drivers/usb/gadget/composite.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index cd303a3ea680..223f72d4d9ed 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -438,9 +438,13 @@ static u8 encode_bMaxPower(enum usb_device_speed speed, if (!val) return 0; if (speed < USB_SPEED_SUPER) - return DIV_ROUND_UP(val, 2); + return min(val, 500U) / 2; else - return DIV_ROUND_UP(val, 8); + /* + * USB 3.x supports up to 900mA, but since 900 isn't divisible + * by 8 the integral division will effectively cap to 896mA. + */ + return min(val, 900U) / 8; } static int config_buf(struct usb_configuration *config, @@ -852,6 +856,10 @@ static int set_config(struct usb_composite_dev *cdev, /* when we return, be sure our power usage is valid */ power = c->MaxPower ? c->MaxPower : CONFIG_USB_GADGET_VBUS_DRAW; + if (gadget->speed < USB_SPEED_SUPER) + power = min(power, 500U); + else + power = min(power, 900U); done: usb_gadget_vbus_draw(gadget, power); if (result >= 0 && cdev->delayed_status) @@ -2278,7 +2286,7 @@ void composite_resume(struct usb_gadget *gadget) { struct usb_composite_dev *cdev = get_gadget_data(gadget); struct usb_function *f; - u16 maxpower; + unsigned maxpower; /* REVISIT: should we have config level * suspend/resume callbacks? @@ -2292,10 +2300,14 @@ void composite_resume(struct usb_gadget *gadget) f->resume(f); } - maxpower = cdev->config->MaxPower; + maxpower = cdev->config->MaxPower ? + cdev->config->MaxPower : CONFIG_USB_GADGET_VBUS_DRAW; + if (gadget->speed < USB_SPEED_SUPER) + maxpower = min(maxpower, 500U); + else + maxpower = min(maxpower, 900U); - usb_gadget_vbus_draw(gadget, maxpower ? - maxpower : CONFIG_USB_GADGET_VBUS_DRAW); + usb_gadget_vbus_draw(gadget, maxpower); } cdev->suspended = 0; -- cgit v1.2.3 From 860ef6cd3f90b84a1832f8a6485c90c34d3b588b Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Tue, 21 Jan 2020 14:24:04 +0400 Subject: usb: dwc2: Fix in ISOC request length checking Moved ISOC request length checking from dwc2_hsotg_start_req() function to dwc2_hsotg_ep_queue(). Fixes: 4fca54aa58293 ("usb: gadget: s3c-hsotg: add multi count support") Signed-off-by: Minas Harutyunyan Signed-off-by: Felipe Balbi --- drivers/usb/dwc2/gadget.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 88f7d6d4ff2d..7b40cf5bdc2f 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -1083,11 +1083,6 @@ static void dwc2_hsotg_start_req(struct dwc2_hsotg *hsotg, else packets = 1; /* send one packet if length is zero. */ - if (hs_ep->isochronous && length > (hs_ep->mc * hs_ep->ep.maxpacket)) { - dev_err(hsotg->dev, "req length > maxpacket*mc\n"); - return; - } - if (dir_in && index != 0) if (hs_ep->isochronous) epsize = DXEPTSIZ_MC(packets); @@ -1391,6 +1386,13 @@ static int dwc2_hsotg_ep_queue(struct usb_ep *ep, struct usb_request *req, req->actual = 0; req->status = -EINPROGRESS; + /* Don't queue ISOC request if length greater than mps*mc */ + if (hs_ep->isochronous && + req->length > (hs_ep->mc * hs_ep->ep.maxpacket)) { + dev_err(hs->dev, "req length > maxpacket*mc\n"); + return -EINVAL; + } + /* In DDMA mode for ISOC's don't queue request if length greater * than descriptor limits. */ -- cgit v1.2.3 From 9a0d6f7c0a83844baae1d6d85482863d2bf3b7a7 Mon Sep 17 00:00:00 2001 From: Minas Harutyunyan Date: Tue, 21 Jan 2020 14:17:07 +0400 Subject: usb: dwc2: Fix SET/CLEAR_FEATURE and GET_STATUS flows SET/CLEAR_FEATURE for Remote Wakeup allowance not handled correctly. GET_STATUS handling provided not correct data on DATA Stage. Issue seen when gadget's dr_mode set to "otg" mode and connected to MacOS. Both are fixed and tested using USBCV Ch.9 tests. Signed-off-by: Minas Harutyunyan Fixes: fa389a6d7726 ("usb: dwc2: gadget: Add remote_wakeup_allowed flag") Tested-by: Jack Mitchell Cc: stable@vger.kernel.org Signed-off-by: Felipe Balbi --- drivers/usb/dwc2/gadget.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/usb/dwc2/gadget.c b/drivers/usb/dwc2/gadget.c index 7b40cf5bdc2f..92ed32ec1607 100644 --- a/drivers/usb/dwc2/gadget.c +++ b/drivers/usb/dwc2/gadget.c @@ -1634,6 +1634,7 @@ static int dwc2_hsotg_process_req_status(struct dwc2_hsotg *hsotg, struct dwc2_hsotg_ep *ep0 = hsotg->eps_out[0]; struct dwc2_hsotg_ep *ep; __le16 reply; + u16 status; int ret; dev_dbg(hsotg->dev, "%s: USB_REQ_GET_STATUS\n", __func__); @@ -1645,11 +1646,10 @@ static int dwc2_hsotg_process_req_status(struct dwc2_hsotg *hsotg, switch (ctrl->bRequestType & USB_RECIP_MASK) { case USB_RECIP_DEVICE: - /* - * bit 0 => self powered - * bit 1 => remote wakeup - */ - reply = cpu_to_le16(0); + status = 1 << USB_DEVICE_SELF_POWERED; + status |= hsotg->remote_wakeup_allowed << + USB_DEVICE_REMOTE_WAKEUP; + reply = cpu_to_le16(status); break; case USB_RECIP_INTERFACE: @@ -1760,7 +1760,10 @@ static int dwc2_hsotg_process_req_feature(struct dwc2_hsotg *hsotg, case USB_RECIP_DEVICE: switch (wValue) { case USB_DEVICE_REMOTE_WAKEUP: - hsotg->remote_wakeup_allowed = 1; + if (set) + hsotg->remote_wakeup_allowed = 1; + else + hsotg->remote_wakeup_allowed = 0; break; case USB_DEVICE_TEST_MODE: @@ -1770,16 +1773,17 @@ static int dwc2_hsotg_process_req_feature(struct dwc2_hsotg *hsotg, return -EINVAL; hsotg->test_mode = wIndex >> 8; - ret = dwc2_hsotg_send_reply(hsotg, ep0, NULL, 0); - if (ret) { - dev_err(hsotg->dev, - "%s: failed to send reply\n", __func__); - return ret; - } break; default: return -ENOENT; } + + ret = dwc2_hsotg_send_reply(hsotg, ep0, NULL, 0); + if (ret) { + dev_err(hsotg->dev, + "%s: failed to send reply\n", __func__); + return ret; + } break; case USB_RECIP_ENDPOINT: -- cgit v1.2.3 From 43d565727a3a6fd24e37c7c2116475106af71806 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Thu, 16 Jan 2020 15:29:01 +0200 Subject: usb: gadget: ffs: ffs_aio_cancel(): Save/restore IRQ flags ffs_aio_cancel() can be called from both interrupt and thread context. Make sure that the current IRQ state is saved and restored by using spin_{un,}lock_irq{save,restore}(). Otherwise undefined behavior might occur. Acked-by: Michal Nazarewicz Signed-off-by: Lars-Peter Clausen Signed-off-by: Alexandru Ardelean Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 6171d28331e6..571917677d35 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1162,18 +1162,19 @@ static int ffs_aio_cancel(struct kiocb *kiocb) { struct ffs_io_data *io_data = kiocb->private; struct ffs_epfile *epfile = kiocb->ki_filp->private_data; + unsigned long flags; int value; ENTER(); - spin_lock_irq(&epfile->ffs->eps_lock); + spin_lock_irqsave(&epfile->ffs->eps_lock, flags); if (likely(io_data && io_data->ep && io_data->req)) value = usb_ep_dequeue(io_data->ep, io_data->req); else value = -EINVAL; - spin_unlock_irq(&epfile->ffs->eps_lock); + spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags); return value; } -- cgit v1.2.3 From e4bfded56cf39b8d02733c1e6ef546b97961e18a Mon Sep 17 00:00:00 2001 From: Sergey Organov Date: Wed, 29 Jan 2020 14:21:46 +0300 Subject: usb: gadget: serial: fix Tx stall after buffer overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: application opens /dev/ttyGS0 and starts sending (writing) to it while either USB cable is not connected, or nobody listens on the other side of the cable. If driver circular buffer overflows before connection is established, no data will be written to the USB layer until/unless /dev/ttyGS0 is closed and re-opened again by the application (the latter besides having no means of being notified about the event of establishing of the connection.) Fix: on open and/or connect, kick Tx to flush circular buffer data to USB layer. Signed-off-by: Sergey Organov Reviewed-by: Michał Mirosław Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/u_serial.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c index f986e5c55974..8167d379e115 100644 --- a/drivers/usb/gadget/function/u_serial.c +++ b/drivers/usb/gadget/function/u_serial.c @@ -561,8 +561,10 @@ static int gs_start_io(struct gs_port *port) port->n_read = 0; started = gs_start_rx(port); - /* unblock any pending writes into our circular buffer */ if (started) { + gs_start_tx(port); + /* Unblock any pending writes into our circular buffer, in case + * we didn't in gs_start_tx() */ tty_wakeup(port->port.tty); } else { gs_free_requests(ep, head, &port->read_allocated); -- cgit v1.2.3 From 42cd5ffe46c1037d5d9a253c72e71a024a7bfbef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Feb 2020 09:51:39 +0000 Subject: usb: dwc3: debug: fix string position formatting mixup with ret and len Currently the string formatting is mixing up the offset of ret and len. Re-work the code to use just len, remove ret and use scnprintf instead of snprintf and len position accumulation where required. Remove the -ve return check since scnprintf never returns a failure -ve size. Also break overly long lines to clean up checkpatch warnings. Addresses-Coverity: ("Unused value") Fixes: 1381a5113caf ("usb: dwc3: debug: purge usage of strcat") Reviewed-by: Dan Carpenter Signed-off-by: Colin Ian King Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/debug.h | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h index e56beb9d1e36..4a13ceaf4093 100644 --- a/drivers/usb/dwc3/debug.h +++ b/drivers/usb/dwc3/debug.h @@ -256,86 +256,77 @@ static inline const char *dwc3_ep_event_string(char *str, size_t size, u8 epnum = event->endpoint_number; size_t len; int status; - int ret; - ret = snprintf(str, size, "ep%d%s: ", epnum >> 1, + len = scnprintf(str, size, "ep%d%s: ", epnum >> 1, (epnum & 1) ? "in" : "out"); - if (ret < 0) - return "UNKNOWN"; status = event->status; switch (event->endpoint_event) { case DWC3_DEPEVT_XFERCOMPLETE: - len = strlen(str); - snprintf(str + len, size - len, "Transfer Complete (%c%c%c)", + len += scnprintf(str + len, size - len, + "Transfer Complete (%c%c%c)", status & DEPEVT_STATUS_SHORT ? 'S' : 's', status & DEPEVT_STATUS_IOC ? 'I' : 'i', status & DEPEVT_STATUS_LST ? 'L' : 'l'); - len = strlen(str); - if (epnum <= 1) - snprintf(str + len, size - len, " [%s]", + scnprintf(str + len, size - len, " [%s]", dwc3_ep0_state_string(ep0state)); break; case DWC3_DEPEVT_XFERINPROGRESS: - len = strlen(str); - - snprintf(str + len, size - len, "Transfer In Progress [%d] (%c%c%c)", + scnprintf(str + len, size - len, + "Transfer In Progress [%d] (%c%c%c)", event->parameters, status & DEPEVT_STATUS_SHORT ? 'S' : 's', status & DEPEVT_STATUS_IOC ? 'I' : 'i', status & DEPEVT_STATUS_LST ? 'M' : 'm'); break; case DWC3_DEPEVT_XFERNOTREADY: - len = strlen(str); - - snprintf(str + len, size - len, "Transfer Not Ready [%d]%s", + len += scnprintf(str + len, size - len, + "Transfer Not Ready [%d]%s", event->parameters, status & DEPEVT_STATUS_TRANSFER_ACTIVE ? " (Active)" : " (Not Active)"); - len = strlen(str); - /* Control Endpoints */ if (epnum <= 1) { int phase = DEPEVT_STATUS_CONTROL_PHASE(event->status); switch (phase) { case DEPEVT_STATUS_CONTROL_DATA: - snprintf(str + ret, size - ret, + scnprintf(str + len, size - len, " [Data Phase]"); break; case DEPEVT_STATUS_CONTROL_STATUS: - snprintf(str + ret, size - ret, + scnprintf(str + len, size - len, " [Status Phase]"); } } break; case DWC3_DEPEVT_RXTXFIFOEVT: - snprintf(str + ret, size - ret, "FIFO"); + scnprintf(str + len, size - len, "FIFO"); break; case DWC3_DEPEVT_STREAMEVT: status = event->status; switch (status) { case DEPEVT_STREAMEVT_FOUND: - snprintf(str + ret, size - ret, " Stream %d Found", + scnprintf(str + len, size - len, " Stream %d Found", event->parameters); break; case DEPEVT_STREAMEVT_NOTFOUND: default: - snprintf(str + ret, size - ret, " Stream Not Found"); + scnprintf(str + len, size - len, " Stream Not Found"); break; } break; case DWC3_DEPEVT_EPCMDCMPLT: - snprintf(str + ret, size - ret, "Endpoint Command Complete"); + scnprintf(str + len, size - len, "Endpoint Command Complete"); break; default: - snprintf(str, size, "UNKNOWN"); + scnprintf(str + len, size - len, "UNKNOWN"); } return str; -- cgit v1.2.3 From f0ac20c3f6137910c8a927953e8a92f5b3716166 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2020 10:07:43 +0100 Subject: ACPI: EC: Fix flushing of pending work Commit 016b87ca5c8c ("ACPI: EC: Rework flushing of pending work") introduced a subtle bug into the flushing of pending EC work while suspended to idle, which may cause the EC driver to fail to re-enable the EC GPE after handling a non-wakeup event (like a battery status change event, for example). The problem is that the work item flushed by flush_scheduled_work() in __acpi_ec_flush_work() may disable the EC GPE and schedule another work item expected to re-enable it, but that new work item is not flushed, so __acpi_ec_flush_work() returns with the EC GPE disabled and the CPU running it goes into an idle state subsequently. If all of the other CPUs are in idle states at that point, the EC GPE won't be re-enabled until at least one CPU is woken up by another interrupt source, so system wakeup events that would normally come from the EC then don't work. This is reproducible on a Dell XPS13 9360 in my office which sometimes stops reacting to power button and lid events (triggered by the EC on that machine) after switching from AC power to battery power or vice versa while suspended to idle (each of those switches causes the EC GPE to trigger for several times in a row, but they are not system wakeup events). To avoid this problem, it is necessary to drain the workqueue entirely in __acpi_ec_flush_work(), but that cannot be done with respect to system_wq, because work items may be added to it from other places while __acpi_ec_flush_work() is running. For this reason, make the EC driver use a dedicated workqueue for EC events processing (let that workqueue be ordered so that EC events are processed sequentially) and use drain_workqueue() on it in __acpi_ec_flush_work(). Fixes: 016b87ca5c8c ("ACPI: EC: Rework flushing of pending work") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 08bc9751fe66..d1f1cf5d4bf0 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -179,6 +179,7 @@ EXPORT_SYMBOL(first_ec); static struct acpi_ec *boot_ec; static bool boot_ec_is_ecdt = false; +static struct workqueue_struct *ec_wq; static struct workqueue_struct *ec_query_wq; static int EC_FLAGS_QUERY_HANDSHAKE; /* Needs QR_EC issued when SCI_EVT set */ @@ -469,7 +470,7 @@ static void acpi_ec_submit_query(struct acpi_ec *ec) ec_dbg_evt("Command(%s) submitted/blocked", acpi_ec_cmd_string(ACPI_EC_COMMAND_QUERY)); ec->nr_pending_queries++; - schedule_work(&ec->work); + queue_work(ec_wq, &ec->work); } } @@ -535,7 +536,7 @@ static void acpi_ec_enable_event(struct acpi_ec *ec) #ifdef CONFIG_PM_SLEEP static void __acpi_ec_flush_work(void) { - flush_scheduled_work(); /* flush ec->work */ + drain_workqueue(ec_wq); /* flush ec->work */ flush_workqueue(ec_query_wq); /* flush queries */ } @@ -556,8 +557,8 @@ static void acpi_ec_disable_event(struct acpi_ec *ec) void acpi_ec_flush_work(void) { - /* Without ec_query_wq there is nothing to flush. */ - if (!ec_query_wq) + /* Without ec_wq there is nothing to flush. */ + if (!ec_wq) return; __acpi_ec_flush_work(); @@ -2107,25 +2108,33 @@ static struct acpi_driver acpi_ec_driver = { .drv.pm = &acpi_ec_pm, }; -static inline int acpi_ec_query_init(void) +static void acpi_ec_destroy_workqueues(void) { - if (!ec_query_wq) { - ec_query_wq = alloc_workqueue("kec_query", 0, - ec_max_queries); - if (!ec_query_wq) - return -ENODEV; + if (ec_wq) { + destroy_workqueue(ec_wq); + ec_wq = NULL; } - return 0; -} - -static inline void acpi_ec_query_exit(void) -{ if (ec_query_wq) { destroy_workqueue(ec_query_wq); ec_query_wq = NULL; } } +static int acpi_ec_init_workqueues(void) +{ + if (!ec_wq) + ec_wq = alloc_ordered_workqueue("kec", 0); + + if (!ec_query_wq) + ec_query_wq = alloc_workqueue("kec_query", 0, ec_max_queries); + + if (!ec_wq || !ec_query_wq) { + acpi_ec_destroy_workqueues(); + return -ENODEV; + } + return 0; +} + static const struct dmi_system_id acpi_ec_no_wakeup[] = { { .ident = "Thinkpad X1 Carbon 6th", @@ -2156,8 +2165,7 @@ int __init acpi_ec_init(void) int result; int ecdt_fail, dsdt_fail; - /* register workqueue for _Qxx evaluations */ - result = acpi_ec_query_init(); + result = acpi_ec_init_workqueues(); if (result) return result; @@ -2188,6 +2196,6 @@ static void __exit acpi_ec_exit(void) { acpi_bus_unregister_driver(&acpi_ec_driver); - acpi_ec_query_exit(); + acpi_ec_destroy_workqueues(); } #endif /* 0 */ -- cgit v1.2.3 From e3728b50cd9be7d4b1469447cdf1feb93e3b7adb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2020 10:11:02 +0100 Subject: ACPI: PM: s2idle: Avoid possible race related to the EC GPE It is theoretically possible for the ACPI EC GPE to be set after the s2idle_ops->wake() called from s2idle_loop() has returned and before the subsequent pm_wakeup_pending() check is carried out. If that happens, the resulting wakeup event will cause the system to resume even though it may be a spurious one. To avoid that race, first make the ->wake() callback in struct platform_s2idle_ops return a bool value indicating whether or not to let the system resume and rearrange s2idle_loop() to use that value instad of the direct pm_wakeup_pending() call if ->wake() is present. Next, rework acpi_s2idle_wake() to process EC events and check pm_wakeup_pending() before re-arming the SCI for system wakeup to prevent it from triggering prematurely and add comments to that function to explain the rationale for the new code flow. Fixes: 56b991849009 ("PM: sleep: Simplify suspend-to-idle control flow") Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 44 +++++++++++++++++++++++++++++++------------- include/linux/suspend.h | 2 +- kernel/power/suspend.c | 9 +++++---- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 439880629839..2c695b196cd2 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -990,21 +990,28 @@ static void acpi_s2idle_sync(void) acpi_os_wait_events_complete(); /* synchronize Notify handling */ } -static void acpi_s2idle_wake(void) +static bool acpi_s2idle_wake(void) { - /* - * If IRQD_WAKEUP_ARMED is set for the SCI at this point, the SCI has - * not triggered while suspended, so bail out. - */ - if (!acpi_sci_irq_valid() || - irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) - return; + if (!acpi_sci_irq_valid()) + return pm_wakeup_pending(); + + while (pm_wakeup_pending()) { + /* + * If IRQD_WAKEUP_ARMED is set for the SCI at this point, the + * SCI has not triggered while suspended, so bail out (the + * wakeup is pending anyway and the SCI is not the source of + * it). + */ + if (irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) + return true; + + /* + * If there are no EC events to process, the wakeup is regarded + * as a genuine one. + */ + if (!acpi_ec_dispatch_gpe()) + return true; - /* - * If there are EC events to process, the wakeup may be a spurious one - * coming from the EC. - */ - if (acpi_ec_dispatch_gpe()) { /* * Cancel the wakeup and process all pending events in case * there are any wakeup ones in there. @@ -1017,8 +1024,19 @@ static void acpi_s2idle_wake(void) acpi_s2idle_sync(); + /* + * The SCI is in the "suspended" state now and it cannot produce + * new wakeup events till the rearming below, so if any of them + * are pending here, they must be resulting from the processing + * of EC events above or coming from somewhere else. + */ + if (pm_wakeup_pending()) + return true; + rearm_wake_irq(acpi_sci_irq); } + + return false; } static void acpi_s2idle_restore_early(void) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 4a230c2f1c31..2b2055b035ee 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -191,7 +191,7 @@ struct platform_s2idle_ops { int (*begin)(void); int (*prepare)(void); int (*prepare_late)(void); - void (*wake)(void); + bool (*wake)(void); void (*restore_early)(void); void (*restore)(void); void (*end)(void); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2c47280fbfc7..8b1bb5ee7e5d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -131,11 +131,12 @@ static void s2idle_loop(void) * to avoid them upfront. */ for (;;) { - if (s2idle_ops && s2idle_ops->wake) - s2idle_ops->wake(); - - if (pm_wakeup_pending()) + if (s2idle_ops && s2idle_ops->wake) { + if (s2idle_ops->wake()) + break; + } else if (pm_wakeup_pending()) { break; + } pm_wakeup_clear(false); -- cgit v1.2.3 From 908087ffbe896c100ed73d5f0ce11a5b7264af4a Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 23 Dec 2019 17:51:48 +0200 Subject: habanalabs: halt the engines before hard-reset The driver must halt the engines before doing hard-reset, otherwise the device can go into undefined state. There is a place where the driver didn't do that and this patch fixes it. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/device.c | 1 + drivers/misc/habanalabs/goya/goya.c | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index b155e9549076..166883b64725 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -1189,6 +1189,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { dev_info(hdev->dev, "H/W state is dirty, must reset before initializing\n"); + hdev->asic_funcs->halt_engines(hdev, true); hdev->asic_funcs->hw_fini(hdev, true); } diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 7344e8a222ae..f24fe909b88d 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -895,6 +895,11 @@ void goya_init_dma_qmans(struct hl_device *hdev) */ static void goya_disable_external_queues(struct hl_device *hdev) { + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_DMA)) + return; + WREG32(mmDMA_QM_0_GLBL_CFG0, 0); WREG32(mmDMA_QM_1_GLBL_CFG0, 0); WREG32(mmDMA_QM_2_GLBL_CFG0, 0); @@ -956,6 +961,11 @@ static int goya_stop_external_queues(struct hl_device *hdev) { int rc, retval = 0; + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_DMA)) + return retval; + rc = goya_stop_queue(hdev, mmDMA_QM_0_GLBL_CFG1, mmDMA_QM_0_CP_STS, @@ -1744,9 +1754,18 @@ void goya_init_tpc_qmans(struct hl_device *hdev) */ static void goya_disable_internal_queues(struct hl_device *hdev) { + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_MME)) + goto disable_tpc; + WREG32(mmMME_QM_GLBL_CFG0, 0); WREG32(mmMME_CMDQ_GLBL_CFG0, 0); +disable_tpc: + if (!(goya->hw_cap_initialized & HW_CAP_TPC)) + return; + WREG32(mmTPC0_QM_GLBL_CFG0, 0); WREG32(mmTPC0_CMDQ_GLBL_CFG0, 0); @@ -1782,8 +1801,12 @@ static void goya_disable_internal_queues(struct hl_device *hdev) */ static int goya_stop_internal_queues(struct hl_device *hdev) { + struct goya_device *goya = hdev->asic_specific; int rc, retval = 0; + if (!(goya->hw_cap_initialized & HW_CAP_MME)) + goto stop_tpc; + /* * Each queue (QMAN) is a separate H/W logic. That means that each * QMAN can be stopped independently and failure to stop one does NOT @@ -1810,6 +1833,10 @@ static int goya_stop_internal_queues(struct hl_device *hdev) retval = -EIO; } +stop_tpc: + if (!(goya->hw_cap_initialized & HW_CAP_TPC)) + return retval; + rc = goya_stop_queue(hdev, mmTPC0_QM_GLBL_CFG1, mmTPC0_QM_CP_STS, @@ -1975,6 +2002,11 @@ static int goya_stop_internal_queues(struct hl_device *hdev) static void goya_dma_stall(struct hl_device *hdev) { + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_DMA)) + return; + WREG32(mmDMA_QM_0_GLBL_CFG1, 1 << DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT); WREG32(mmDMA_QM_1_GLBL_CFG1, 1 << DMA_QM_1_GLBL_CFG1_DMA_STOP_SHIFT); WREG32(mmDMA_QM_2_GLBL_CFG1, 1 << DMA_QM_2_GLBL_CFG1_DMA_STOP_SHIFT); @@ -1984,6 +2016,11 @@ static void goya_dma_stall(struct hl_device *hdev) static void goya_tpc_stall(struct hl_device *hdev) { + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_TPC)) + return; + WREG32(mmTPC0_CFG_TPC_STALL, 1 << TPC0_CFG_TPC_STALL_V_SHIFT); WREG32(mmTPC1_CFG_TPC_STALL, 1 << TPC1_CFG_TPC_STALL_V_SHIFT); WREG32(mmTPC2_CFG_TPC_STALL, 1 << TPC2_CFG_TPC_STALL_V_SHIFT); @@ -1996,6 +2033,11 @@ static void goya_tpc_stall(struct hl_device *hdev) static void goya_mme_stall(struct hl_device *hdev) { + struct goya_device *goya = hdev->asic_specific; + + if (!(goya->hw_cap_initialized & HW_CAP_MME)) + return; + WREG32(mmMME_STALL, 0xFFFFFFFF); } -- cgit v1.2.3 From a37e47192dfa98f79a0cd5ab991c224b5980c982 Mon Sep 17 00:00:00 2001 From: Omer Shpigelman Date: Sun, 5 Jan 2020 09:05:45 +0000 Subject: habanalabs: do not halt CoreSight during hard reset During hard reset we must not write to the device. Hence avoid halting CoreSight during user context close if it is done during hard reset. In addition, we must not re-enable clock gating afterwards as it was deliberately disabled in the beginning of the hard reset flow. Signed-off-by: Omer Shpigelman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/device.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 166883b64725..b680b0caa69b 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -598,7 +598,9 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable) goto out; } - hdev->asic_funcs->halt_coresight(hdev); + if (!hdev->hard_reset_pending) + hdev->asic_funcs->halt_coresight(hdev); + hdev->in_debug = 0; goto out; -- cgit v1.2.3 From cf01514c5c6efa2d521d35e68dff2e0674d08e91 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 23 Jan 2020 00:43:06 +0200 Subject: habanalabs: patched cb equals user cb in device memset During device memory memset, the driver allocates and use a CB (command buffer). To reuse existing code, it keeps a pointer to the CB in two variables, user_cb and patched_cb. Therefore, there is no need to "put" both the user_cb and patched_cb, as it will cause an underflow of the refcnt of the CB. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/goya/goya.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index f24fe909b88d..b8a8de24aaf7 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4690,8 +4690,6 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, rc = goya_send_job_on_qman0(hdev, job); - hl_cb_put(job->patched_cb); - hl_debugfs_remove_job(hdev, job); kfree(job); cb->cs_cnt--; -- cgit v1.2.3 From 74a44bed8d93782affb707a33469bda7052b4207 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 10 Feb 2020 19:21:01 +0000 Subject: arm64: Fix CONFIG_ARCH_RANDOM=n build The entire asm/archrandom.h header is generically included via linux/archrandom.h only when CONFIG_ARCH_RANDOM is already set, so the stub definitions of __arm64_rndr() and __early_cpu_has_rndr() are only visible to KASLR if it explicitly includes the arch-internal header. Acked-by: Mark Brown Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- arch/arm64/kernel/kaslr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 53b8a4ee64ff..91a83104c6e8 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From a754012b9f2323a5d640da7eb7b095ac3b8cd012 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 15 Jan 2020 17:58:29 +0000 Subject: drm/i915/execlists: Leave resetting ring to intel_ring We need to allow concurrent intel_context_unpin, which means avoiding doing destructive operations like intel_ring_reset(). This was already fixed for intel_ring_unpin() in commit 0725d9a31869 ("drm/i915/gt: Make intel_ring_unpin() safe for concurrent pint"), but I overlooked that execlists_context_unpin() also made the same mistake. Reported-by: Matthew Brost Fixes: 841350223816 ("drm/i915/gt: Drop mutex serialisation between context pin/unpin") References: 0725d9a31869 ("drm/i915/gt: Make intel_ring_unpin() safe for concurrent pint") Signed-off-by: Chris Wilson Cc: Matthew Brost Cc: Matthew Auld Cc: Tvrtko Ursulin Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200115175829.2761329-1-chris@chris-wilson.co.uk (cherry picked from commit f3c0efc9fe7a4e61544034f525348a3aa86ac5aa) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 5d8c1ebe0731..d879e5e926af 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2532,7 +2532,6 @@ static void execlists_context_unpin(struct intel_context *ce) ce->engine); i915_gem_object_unpin_map(ce->state->obj); - intel_ring_reset(ce->ring, ce->ring->tail); } static void -- cgit v1.2.3 From 07ccd6bdafa22aacc4f72b7eb14474d0b356e6c3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 20 Jan 2020 10:49:22 +0000 Subject: drm/i915/gem: Store mmap_offsets in an rbtree rather than a plain list Currently we create a new mmap_offset for every call to mmap_offset_ioctl. This exposes ourselves to an abusive client that may simply create new mmap_offsets ad infinitum, which will exhaust physical memory and the virtual address space. In addition to the exhaustion, a very long linear list of mmap_offsets causes other clients using the object to incur long list walks -- these long lists can also be generated by simply having many clients generate their own mmap_offset. However, we can simply use the drm_vma_node itself to manage the file association (allow/revoke) dropping our need to keep an mmo per-file. Then if we keep a small rbtree of per-type mmap_offsets, we can lookup duplicate requests quickly. Fixes: cc662126b413 ("drm/i915: Introduce DRM_I915_GEM_MMAP_OFFSET") Signed-off-by: Chris Wilson Cc: Abdiel Janulgue Reviewed-by: Abdiel Janulgue Link: https://patchwork.freedesktop.org/patch/msgid/20200120104924.4000706-3-chris@chris-wilson.co.uk (cherry picked from commit 7865559872074a9ab169c87915504661d630addf) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 90 ++++++++++++++++++++---- drivers/gpu/drm/i915/gem/i915_gem_object.c | 18 ++--- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 6 +- 3 files changed, 85 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index b9fdac2f9003..e9be2508c04f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -455,10 +455,11 @@ out: void i915_gem_object_release_mmap_offset(struct drm_i915_gem_object *obj) { - struct i915_mmap_offset *mmo; + struct i915_mmap_offset *mmo, *mn; spin_lock(&obj->mmo.lock); - list_for_each_entry(mmo, &obj->mmo.offsets, offset) { + rbtree_postorder_for_each_entry_safe(mmo, mn, + &obj->mmo.offsets, offset) { /* * vma_node_unmap for GTT mmaps handled already in * __i915_gem_object_release_mmap_gtt @@ -487,6 +488,67 @@ void i915_gem_object_release_mmap(struct drm_i915_gem_object *obj) i915_gem_object_release_mmap_offset(obj); } +static struct i915_mmap_offset * +lookup_mmo(struct drm_i915_gem_object *obj, + enum i915_mmap_type mmap_type) +{ + struct rb_node *rb; + + spin_lock(&obj->mmo.lock); + rb = obj->mmo.offsets.rb_node; + while (rb) { + struct i915_mmap_offset *mmo = + rb_entry(rb, typeof(*mmo), offset); + + if (mmo->mmap_type == mmap_type) { + spin_unlock(&obj->mmo.lock); + return mmo; + } + + if (mmo->mmap_type < mmap_type) + rb = rb->rb_right; + else + rb = rb->rb_left; + } + spin_unlock(&obj->mmo.lock); + + return NULL; +} + +static struct i915_mmap_offset * +insert_mmo(struct drm_i915_gem_object *obj, struct i915_mmap_offset *mmo) +{ + struct rb_node *rb, **p; + + spin_lock(&obj->mmo.lock); + rb = NULL; + p = &obj->mmo.offsets.rb_node; + while (*p) { + struct i915_mmap_offset *pos; + + rb = *p; + pos = rb_entry(rb, typeof(*pos), offset); + + if (pos->mmap_type == mmo->mmap_type) { + spin_unlock(&obj->mmo.lock); + drm_vma_offset_remove(obj->base.dev->vma_offset_manager, + &mmo->vma_node); + kfree(mmo); + return pos; + } + + if (pos->mmap_type < mmo->mmap_type) + p = &rb->rb_right; + else + p = &rb->rb_left; + } + rb_link_node(&mmo->offset, rb, p); + rb_insert_color(&mmo->offset, &obj->mmo.offsets); + spin_unlock(&obj->mmo.lock); + + return mmo; +} + static struct i915_mmap_offset * mmap_offset_attach(struct drm_i915_gem_object *obj, enum i915_mmap_type mmap_type, @@ -496,20 +558,22 @@ mmap_offset_attach(struct drm_i915_gem_object *obj, struct i915_mmap_offset *mmo; int err; + mmo = lookup_mmo(obj, mmap_type); + if (mmo) + goto out; + mmo = kmalloc(sizeof(*mmo), GFP_KERNEL); if (!mmo) return ERR_PTR(-ENOMEM); mmo->obj = obj; - mmo->dev = obj->base.dev; - mmo->file = file; mmo->mmap_type = mmap_type; drm_vma_node_reset(&mmo->vma_node); - err = drm_vma_offset_add(mmo->dev->vma_offset_manager, &mmo->vma_node, - obj->base.size / PAGE_SIZE); + err = drm_vma_offset_add(obj->base.dev->vma_offset_manager, + &mmo->vma_node, obj->base.size / PAGE_SIZE); if (likely(!err)) - goto out; + goto insert; /* Attempt to reap some mmap space from dead objects */ err = intel_gt_retire_requests_timeout(&i915->gt, MAX_SCHEDULE_TIMEOUT); @@ -517,19 +581,17 @@ mmap_offset_attach(struct drm_i915_gem_object *obj, goto err; i915_gem_drain_freed_objects(i915); - err = drm_vma_offset_add(mmo->dev->vma_offset_manager, &mmo->vma_node, - obj->base.size / PAGE_SIZE); + err = drm_vma_offset_add(obj->base.dev->vma_offset_manager, + &mmo->vma_node, obj->base.size / PAGE_SIZE); if (err) goto err; +insert: + mmo = insert_mmo(obj, mmo); + GEM_BUG_ON(lookup_mmo(obj, mmap_type) != mmo); out: if (file) drm_vma_node_allow(&mmo->vma_node, file); - - spin_lock(&obj->mmo.lock); - list_add(&mmo->offset, &obj->mmo.offsets); - spin_unlock(&obj->mmo.lock); - return mmo; err: diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index 46bacc82ddc4..35985218bd85 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -63,7 +63,7 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, INIT_LIST_HEAD(&obj->lut_list); spin_lock_init(&obj->mmo.lock); - INIT_LIST_HEAD(&obj->mmo.offsets); + obj->mmo.offsets = RB_ROOT; init_rcu_head(&obj->rcu); @@ -100,8 +100,8 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) { struct drm_i915_gem_object *obj = to_intel_bo(gem); struct drm_i915_file_private *fpriv = file->driver_priv; + struct i915_mmap_offset *mmo, *mn; struct i915_lut_handle *lut, *ln; - struct i915_mmap_offset *mmo; LIST_HEAD(close); i915_gem_object_lock(obj); @@ -117,14 +117,8 @@ void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) i915_gem_object_unlock(obj); spin_lock(&obj->mmo.lock); - list_for_each_entry(mmo, &obj->mmo.offsets, offset) { - if (mmo->file != file) - continue; - - spin_unlock(&obj->mmo.lock); + rbtree_postorder_for_each_entry_safe(mmo, mn, &obj->mmo.offsets, offset) drm_vma_node_revoke(&mmo->vma_node, file); - spin_lock(&obj->mmo.lock); - } spin_unlock(&obj->mmo.lock); list_for_each_entry_safe(lut, ln, &close, obj_link) { @@ -203,12 +197,14 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915, i915_gem_object_release_mmap(obj); - list_for_each_entry_safe(mmo, mn, &obj->mmo.offsets, offset) { + rbtree_postorder_for_each_entry_safe(mmo, mn, + &obj->mmo.offsets, + offset) { drm_vma_offset_remove(obj->base.dev->vma_offset_manager, &mmo->vma_node); kfree(mmo); } - INIT_LIST_HEAD(&obj->mmo.offsets); + obj->mmo.offsets = RB_ROOT; GEM_BUG_ON(atomic_read(&obj->bind_count)); GEM_BUG_ON(obj->userfault_count); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 88e268633fdc..f64ad77e6b1e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -71,13 +71,11 @@ enum i915_mmap_type { }; struct i915_mmap_offset { - struct drm_device *dev; struct drm_vma_offset_node vma_node; struct drm_i915_gem_object *obj; - struct drm_file *file; enum i915_mmap_type mmap_type; - struct list_head offset; + struct rb_node offset; }; struct drm_i915_gem_object { @@ -137,7 +135,7 @@ struct drm_i915_gem_object { struct { spinlock_t lock; /* Protects access to mmo offsets */ - struct list_head offsets; + struct rb_root offsets; } mmo; I915_SELFTEST_DECLARE(struct list_head st_link); -- cgit v1.2.3 From 1a9629d189f57670afd31f1aea4e59b7270d2d89 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 21 Jan 2020 13:21:07 +0000 Subject: drm/i915: Don't show the blank process name for internal/simulated errors For a simulated preemption reset, we don't populate the request and so do not fill in the guilty context name. [ 79.991294] i915 0000:00:02.0: GPU HANG: ecode 9:1:e757fefe, in [0] Just don't mention the empty string in the logs! Fixes: 742379c0c400 ("drm/i915: Start chopping up the GPU error capture") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200121132107.267709-1-chris@chris-wilson.co.uk (cherry picked from commit 29baf3ae8daa4c673de58106ff41c7236dff57f4) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 4c1836f0a991..594341e27a47 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1681,7 +1681,7 @@ static const char *error_msg(struct i915_gpu_coredump *error) "GPU HANG: ecode %d:%x:%08x", INTEL_GEN(error->i915), engines, generate_ecode(first)); - if (first) { + if (first && first->context.pid) { /* Just show the first executing process, more is confusing */ len += scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, -- cgit v1.2.3 From 051c89cf4ac487e795d87e6f3b9e0ff788da8fb4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Jan 2020 12:59:34 +0000 Subject: drm/i915/gem: Detect overflow in calculating dumb buffer size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To multiply 2 u32 numbers to generate a u64 in C requires a bit of forewarning for the compiler. Signed-off-by: Chris Wilson Cc: Ramalingam C Cc: Joonas Lahtinen Cc: stable@vger.kernel.org Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20200123125934.1401755-1-chris@chris-wilson.co.uk (cherry picked from commit 0f8f8a64300092852b9361cd835395ee71e6a7d6) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 94f993e4c12f..c2de2f45b459 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -265,7 +265,10 @@ i915_gem_dumb_create(struct drm_file *file, DRM_FORMAT_MOD_LINEAR)) args->pitch = ALIGN(args->pitch, 4096); - args->size = args->pitch * args->height; + if (args->pitch < args->width) + return -EINVAL; + + args->size = mul_u32_u32(args->pitch, args->height); mem_type = INTEL_MEMORY_SYSTEM; if (HAS_LMEM(to_i915(dev))) -- cgit v1.2.3 From e4edd4fcbf4daf9d4319bef0bfaf350cb672239a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Jan 2020 22:44:58 +0000 Subject: drm/i915: Check activity on i915_vma after confirming pin_count==0 Only assert that the i915_vma is now idle if and only if no other pins are present. If another user has the i915_vma pinned, they may submit more work to the i915_vma skipping the vm->mutex used to serialise the unbind. We need to wait again, if we want to continue and unbind this vma. However, if we own the i915_vma (we hold the vm->mutex for the unbind and the pin_count is 0), we can assert that the vma remains idle as we unbind. Fixes: 2850748ef876 ("drm/i915: Pull i915_vma_pin under the vm->mutex") Closes: https://gitlab.freedesktop.org/drm/intel/issues/530 Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200123224459.38128-1-chris@chris-wilson.co.uk (cherry picked from commit 60e94557fff1f5514c7fc4da7ddc2c7a13ffff26) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_vma.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 17d7c525ea5c..4ff380770b32 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -1202,16 +1202,26 @@ int __i915_vma_unbind(struct i915_vma *vma) if (ret) return ret; - GEM_BUG_ON(i915_vma_is_active(vma)); if (i915_vma_is_pinned(vma)) { vma_print_allocator(vma, "is pinned"); return -EAGAIN; } - GEM_BUG_ON(i915_vma_is_active(vma)); + /* + * After confirming that no one else is pinning this vma, wait for + * any laggards who may have crept in during the wait (through + * a residual pin skipping the vm->mutex) to complete. + */ + ret = i915_vma_sync(vma); + if (ret) + return ret; + if (!drm_mm_node_allocated(&vma->node)) return 0; + GEM_BUG_ON(i915_vma_is_pinned(vma)); + GEM_BUG_ON(i915_vma_is_active(vma)); + if (i915_vma_is_map_and_fenceable(vma)) { /* * Check that we have flushed all writes through the GGTT -- cgit v1.2.3 From 3d1e0b406de16508de96f4a07fc3f94cfc678372 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:04 +0100 Subject: netfilter: conntrack: remove two args from resolve_clash ctinfo is whats taken from the skb, i.e. ct = nf_ct_get(skb, &ctinfo). We do not pass 'ct' and instead re-fetch it from the skb. Just do the same for both netns and ctinfo. Also add a comment on what clash resolution is supposed to do. While at it, one indent level can be removed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 69 +++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 18 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d1305423640f..5e332b01f3c0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -894,31 +894,64 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } } -/* Resolve race on insertion if this protocol allows this. */ +/** + * nf_ct_resolve_clash - attempt to handle clash without packet drop + * + * @skb: skb that causes the clash + * @h: tuplehash of the clashing entry already in table + * + * A conntrack entry can be inserted to the connection tracking table + * if there is no existing entry with an identical tuple. + * + * If there is one, @skb (and the assocated, unconfirmed conntrack) has + * to be dropped. In case @skb is retransmitted, next conntrack lookup + * will find the already-existing entry. + * + * The major problem with such packet drop is the extra delay added by + * the packet loss -- it will take some time for a retransmit to occur + * (or the sender to time out when waiting for a reply). + * + * This function attempts to handle the situation without packet drop. + * + * If @skb has no NAT transformation or if the colliding entries are + * exactly the same, only the to-be-confirmed conntrack entry is discarded + * and @skb is associated with the conntrack entry already in the table. + * + * Returns NF_DROP if the clash could not be resolved. + */ static __cold noinline int -nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_tuple_hash *h) +nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); const struct nf_conntrack_l4proto *l4proto; - enum ip_conntrack_info oldinfo; - struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); + enum ip_conntrack_info ctinfo; + struct nf_conn *loser_ct; + struct net *net; + + loser_ct = nf_ct_get(skb, &ctinfo); l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); - if (l4proto->allow_clash && - !nf_ct_is_dying(ct) && - atomic_inc_not_zero(&ct->ct_general.use)) { - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { - nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_conntrack_put(&loser_ct->ct_general); - nf_ct_set(skb, ct, oldinfo); - return NF_ACCEPT; - } - nf_ct_put(ct); + if (!l4proto->allow_clash) + goto drop; + + if (nf_ct_is_dying(ct)) + goto drop; + + if (!atomic_inc_not_zero(&ct->ct_general.use)) + goto drop; + + if (((ct->status & IPS_NAT_DONE_MASK) == 0) || + nf_ct_match(ct, loser_ct)) { + nf_ct_acct_merge(ct, ctinfo, loser_ct); + nf_conntrack_put(&loser_ct->ct_general); + nf_ct_set(skb, ct, ctinfo); + return NF_ACCEPT; } + + nf_ct_put(ct); +drop: + net = nf_ct_net(loser_ct); NF_CT_STAT_INC(net, drop); return NF_DROP; } @@ -1036,7 +1069,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) out: nf_ct_add_to_dying_list(ct); - ret = nf_ct_resolve_clash(net, skb, ctinfo, h); + ret = nf_ct_resolve_clash(skb, h); dying: nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); -- cgit v1.2.3 From b1b32552c1d81f0cf6a8e79043a2a47e769ff071 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:05 +0100 Subject: netfilter: conntrack: place confirm-bit setting in a helper ... so it can be re-used from clash resolution in followup patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5e332b01f3c0..5fda5bd10160 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -894,6 +894,19 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } } +static void __nf_conntrack_insert_prepare(struct nf_conn *ct) +{ + struct nf_conn_tstamp *tstamp; + + atomic_inc(&ct->ct_general.use); + ct->status |= IPS_CONFIRMED; + + /* set conntrack timestamp, if enabled. */ + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_get_real_ns(); +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * @@ -965,7 +978,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; - struct nf_conn_tstamp *tstamp; struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; @@ -1042,13 +1054,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) setting time, otherwise we'd get timer wrap in weird delay cases. */ ct->timeout += nfct_time_stamp; - atomic_inc(&ct->ct_general.use); - ct->status |= IPS_CONFIRMED; - /* set conntrack timestamp, if enabled. */ - tstamp = nf_conn_tstamp_find(ct); - if (tstamp) - tstamp->start = ktime_get_real_ns(); + __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers -- cgit v1.2.3 From bb89abe52bf426f1f40850c441efc77426cc31e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:06 +0100 Subject: netfilter: conntrack: split resolve_clash function Followup patch will need a helper function with the 'clashing entries refer to the identical tuple in both directions' resolution logic. This patch will add another resolve_clash helper where loser_ct must not be added to the dying list because it will be inserted into the table. Therefore this also moves the stat counters and dying-list insertion of the losing ct. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 58 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5fda5bd10160..3f069eb0f0fc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -907,6 +907,39 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) tstamp->start = ktime_get_real_ns(); } +static int __nf_ct_resolve_clash(struct sk_buff *skb, + struct nf_conntrack_tuple_hash *h) +{ + /* This is the conntrack entry already in hashes that won race. */ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + enum ip_conntrack_info ctinfo; + struct nf_conn *loser_ct; + + loser_ct = nf_ct_get(skb, &ctinfo); + + if (nf_ct_is_dying(ct)) + return NF_DROP; + + if (!atomic_inc_not_zero(&ct->ct_general.use)) + return NF_DROP; + + if (((ct->status & IPS_NAT_DONE_MASK) == 0) || + nf_ct_match(ct, loser_ct)) { + struct net *net = nf_ct_net(ct); + + nf_ct_acct_merge(ct, ctinfo, loser_ct); + nf_ct_add_to_dying_list(loser_ct); + nf_conntrack_put(&loser_ct->ct_general); + nf_ct_set(skb, ct, ctinfo); + + NF_CT_STAT_INC(net, insert_failed); + return NF_ACCEPT; + } + + nf_ct_put(ct); + return NF_DROP; +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * @@ -941,31 +974,23 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) enum ip_conntrack_info ctinfo; struct nf_conn *loser_ct; struct net *net; + int ret; loser_ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(loser_ct); l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->allow_clash) goto drop; - if (nf_ct_is_dying(ct)) - goto drop; - - if (!atomic_inc_not_zero(&ct->ct_general.use)) - goto drop; - - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { - nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_conntrack_put(&loser_ct->ct_general); - nf_ct_set(skb, ct, ctinfo); - return NF_ACCEPT; - } + ret = __nf_ct_resolve_clash(skb, h); + if (ret == NF_ACCEPT) + return ret; - nf_ct_put(ct); drop: - net = nf_ct_net(loser_ct); + nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); + NF_CT_STAT_INC(net, insert_failed); return NF_DROP; } @@ -1034,6 +1059,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (unlikely(nf_ct_is_dying(ct))) { nf_ct_add_to_dying_list(ct); + NF_CT_STAT_INC(net, insert_failed); goto dying; } @@ -1075,11 +1101,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - nf_ct_add_to_dying_list(ct); ret = nf_ct_resolve_clash(skb, h); dying: nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); return ret; } -- cgit v1.2.3 From 74f73476c3755664504b7d266b5e8f91050ffed9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 11 Feb 2020 12:14:19 +0100 Subject: ALSA: usb-audio: Apply 48kHz fixed rate playback for Jabra Evolve 65 headset Jabra Evolve 65 headset appears as if supporting lower rates than 48kHz, but it actually doesn't work but with 48kHz for playback. This patch applies a workaround to enforce the 48kHz like LINE6 devices already did. The workaround is put in a unified helper function, set_fixed_rate(), to be called from both places now. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=206149 Link: https://lore.kernel.org/r/20200211111419.5895-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/format.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/sound/usb/format.c b/sound/usb/format.c index 9260136e4c9b..50cb183958bf 100644 --- a/sound/usb/format.c +++ b/sound/usb/format.c @@ -151,6 +151,19 @@ static u64 parse_audio_format_i_type(struct snd_usb_audio *chip, return pcm_formats; } +static int set_fixed_rate(struct audioformat *fp, int rate, int rate_bits) +{ + kfree(fp->rate_table); + fp->rate_table = kmalloc(sizeof(int), GFP_KERNEL); + if (!fp->rate_table) + return -ENOMEM; + fp->nr_rates = 1; + fp->rate_min = rate; + fp->rate_max = rate; + fp->rates = rate_bits; + fp->rate_table[0] = rate; + return 0; +} /* * parse the format descriptor and stores the possible sample rates @@ -223,6 +236,14 @@ static int parse_audio_format_rates_v1(struct snd_usb_audio *chip, struct audiof fp->rate_min = combine_triple(&fmt[offset + 1]); fp->rate_max = combine_triple(&fmt[offset + 4]); } + + /* Jabra Evolve 65 headset */ + if (chip->usb_id == USB_ID(0x0b0e, 0x030b)) { + /* only 48kHz for playback while keeping 16kHz for capture */ + if (fp->nr_rates != 1) + return set_fixed_rate(fp, 48000, SNDRV_PCM_RATE_48000); + } + return 0; } @@ -299,17 +320,7 @@ static int line6_parse_audio_format_rates_quirk(struct snd_usb_audio *chip, case USB_ID(0x0e41, 0x4248): /* Line6 Helix >= fw 2.82 */ case USB_ID(0x0e41, 0x4249): /* Line6 Helix Rack >= fw 2.82 */ case USB_ID(0x0e41, 0x424a): /* Line6 Helix LT >= fw 2.82 */ - /* supported rates: 48Khz */ - kfree(fp->rate_table); - fp->rate_table = kmalloc(sizeof(int), GFP_KERNEL); - if (!fp->rate_table) - return -ENOMEM; - fp->nr_rates = 1; - fp->rate_min = 48000; - fp->rate_max = 48000; - fp->rates = SNDRV_PCM_RATE_48000; - fp->rate_table[0] = 48000; - return 0; + return set_fixed_rate(fp, 48000, SNDRV_PCM_RATE_48000); } return -ENODEV; -- cgit v1.2.3 From be993e44badc448add6a18d6f12b20615692c4c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Feb 2020 12:57:36 +0100 Subject: arm/ftrace: Fix BE text poking The __patch_text() function already applies __opcode_to_mem_*(), so when __opcode_to_mem_*() is not the identity (BE*), it is applied twice, wrecking the instruction. Fixes: 42e51f187f86 ("arm/ftrace: Use __patch_text()") Reported-by: Dmitry Osipenko Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Dmitry Osipenko --- arch/arm/kernel/ftrace.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 2a5ff69c28e6..10499d44964a 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -78,13 +78,10 @@ static int ftrace_modify_code(unsigned long pc, unsigned long old, { unsigned long replaced; - if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) { + if (IS_ENABLED(CONFIG_THUMB2_KERNEL)) old = __opcode_to_mem_thumb32(old); - new = __opcode_to_mem_thumb32(new); - } else { + else old = __opcode_to_mem_arm(old); - new = __opcode_to_mem_arm(new); - } if (validate) { if (probe_kernel_read(&replaced, (void *)pc, MCOUNT_INSN_SIZE)) -- cgit v1.2.3 From 7a7a8f549ddd18126dfa3dedbe42d877614c7995 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Feb 2020 12:57:37 +0100 Subject: arm/patch: Fix !MMU compile Now that patch.o is unconditionally selected for ftrace, it can also get compiled for !MMU kernels. These (obviously) lack {set,clear}_fixmap() support. Also remove the superfluous __acquire/__release nonsense. Fixes: 42e51f187f86 ("arm/ftrace: Use __patch_text()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- arch/arm/kernel/patch.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index d0a05a3bdb96..e9e828b6bb30 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -16,10 +16,10 @@ struct patch { unsigned int insn; }; +#ifdef CONFIG_MMU static DEFINE_RAW_SPINLOCK(patch_lock); static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) - __acquires(&patch_lock) { unsigned int uintaddr = (uintptr_t) addr; bool module = !core_kernel_text(uintaddr); @@ -34,8 +34,6 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) if (flags) raw_spin_lock_irqsave(&patch_lock, *flags); - else - __acquire(&patch_lock); set_fixmap(fixmap, page_to_phys(page)); @@ -43,15 +41,19 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) } static void __kprobes patch_unmap(int fixmap, unsigned long *flags) - __releases(&patch_lock) { clear_fixmap(fixmap); if (flags) raw_spin_unlock_irqrestore(&patch_lock, *flags); - else - __release(&patch_lock); } +#else +static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) +{ + return addr; +} +static void __kprobes patch_unmap(int fixmap, unsigned long *flags) { } +#endif void __kprobes __patch_text_real(void *addr, unsigned int insn, bool remap) { @@ -64,8 +66,6 @@ void __kprobes __patch_text_real(void *addr, unsigned int insn, bool remap) if (remap) waddr = patch_map(addr, FIX_TEXT_POKE0, &flags); - else - __acquire(&patch_lock); if (thumb2 && __opcode_is_thumb16(insn)) { *(u16 *)waddr = __opcode_to_mem_thumb16(insn); @@ -102,8 +102,7 @@ void __kprobes __patch_text_real(void *addr, unsigned int insn, bool remap) if (waddr != addr) { flush_kernel_vmap_range(waddr, twopage ? size / 2 : size); patch_unmap(FIX_TEXT_POKE0, &flags); - } else - __release(&patch_lock); + } flush_icache_range((uintptr_t)(addr), (uintptr_t)(addr) + size); -- cgit v1.2.3 From 6fcca0fa48118e6d63733eb4644c6cd880c15b8f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 3 Feb 2020 13:22:16 -0800 Subject: sched/psi: Fix OOB write when writing 0 bytes to PSI files Issuing write() with count parameter set to 0 on any file under /proc/pressure/ will cause an OOB write because of the access to buf[buf_size-1] when NUL-termination is performed. Fix this by checking for buf_size to be non-zero. Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20200203212216.7076-1-surenb@google.com --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db7b50bba3f1..38ccd49b9bf6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; + if (!nbytes) + return -EINVAL; + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; -- cgit v1.2.3 From 4104a562e0ca62e971089db9d3c47794a0d7d4eb Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sat, 1 Feb 2020 18:28:03 +0530 Subject: sched/core: Annotate curr pointer in rq with __rcu This patch fixes the following sparse warnings in sched/core.c and sched/membarrier.c: kernel/sched/core.c:2372:27: error: incompatible types in comparison expression kernel/sched/core.c:4061:17: error: incompatible types in comparison expression kernel/sched/core.c:6067:9: error: incompatible types in comparison expression kernel/sched/membarrier.c:108:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:177:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:243:21: error: incompatible types in comparison expression Signed-off-by: Madhuparna Bhowmik Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200201125803.20245-1-madhuparnabhowmik10@gmail.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5876e6ba5903..9ea647835fd6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -896,7 +896,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr; + struct task_struct __rcu *curr; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; -- cgit v1.2.3 From e9f5490c3574b435ce7fe7a71724aa3866babc7f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 9 Feb 2020 19:29:12 -0800 Subject: sched/fair: Fix kernel-doc warning in attach_entity_load_avg() Fix kernel-doc warning in kernel/sched/fair.c, caused by a recent function parameter removal: ../kernel/sched/fair.c:3526: warning: Excess function parameter 'flags' description in 'attach_entity_load_avg' Fixes: a4f9a0e51bbf ("sched/fair: Remove redundant call to cpufreq_update_util()") Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/cbe964e4-6879-fd08-41c9-ef1917414af4@infradead.org --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94c3b8469cf6..3c8a379c357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. -- cgit v1.2.3 From eda23b387f6c4bb2971ac7e874a09913f533b22c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 Jan 2020 10:31:17 -0800 Subject: perf/x86/intel: Add Elkhart Lake support Elkhart Lake also uses Tremont CPU. From the perspective of Intel PMU, there is nothing changed compared with Jacobsville. Share the perf code with Jacobsville. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1580236279-35492-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3be51aa06e67..dff6623804c2 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4765,6 +4765,7 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_TREMONT_D: + case INTEL_FAM6_ATOM_TREMONT: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.2.3 From ecf71fbccb9ac5cb964eb7de59bb9da3755b7885 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 Jan 2020 10:31:18 -0800 Subject: perf/x86/cstate: Add Tremont support Tremont is Intel's successor to Goldmont Plus. From the perspective of Intel cstate residency counters, there is nothing changed compared with Goldmont Plus and Goldmont. Share glm_cstates with Goldmont Plus and Goldmont. Update the comments for Tremont. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1580236279-35492-2-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index e1daf4151e11..4814c964692c 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,17 +40,18 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT,GLM,CNL + * Available model: SLM,AMT,GLM,CNL,TNT * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM, - * CNL,KBL,CML + * CNL,KBL,CML,TNT * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, + * TNT * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 @@ -60,17 +61,18 @@ * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL + * KBL,CML,ICL,TGL,TNT * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL + * GLM,CNL,KBL,CML,ICL,TGL,TNT * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, + * TNT * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -87,7 +89,8 @@ * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, + * TNT * Scope: Package (physical package) * */ @@ -640,8 +643,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_D, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_TREMONT_D, glm_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_TREMONT, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates), -- cgit v1.2.3 From 0aa0e0d6b34b89649e6b5882a7e025a0eb9bd832 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 28 Jan 2020 10:31:19 -0800 Subject: perf/x86/msr: Add Tremont support Tremont is Intel's successor to Goldmont Plus. SMI_COUNT MSR is also supported. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/1580236279-35492-3-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/msr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 6f86650b3f77..a949f6f55991 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -75,8 +75,9 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_D: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + case INTEL_FAM6_ATOM_TREMONT_D: + case INTEL_FAM6_ATOM_TREMONT: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: -- cgit v1.2.3 From 25d387287cf0330abf2aad761ce6eee67326a355 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 21 Jan 2020 11:12:31 -0600 Subject: perf/x86/amd: Add missing L2 misses event spec to AMD Family 17h's event map Commit 3fe3331bb285 ("perf/x86/amd: Add event map for AMD Family 17h"), claimed L2 misses were unsupported, due to them not being found in its referenced documentation, whose link has now moved [1]. That old documentation listed PMCx064 unit mask bit 3 as: "LsRdBlkC: LS Read Block C S L X Change to X Miss." and bit 0 as: "IcFillMiss: IC Fill Miss" We now have new public documentation [2] with improved descriptions, that clearly indicate what events those unit mask bits represent: Bit 3 now clearly states: "LsRdBlkC: Data Cache Req Miss in L2 (all types)" and bit 0 is: "IcFillMiss: Instruction Cache Req Miss in L2." So we can now add support for L2 misses in perf's genericised events as PMCx064 with both the above unit masks. [1] The commit's original documentation reference, "Processor Programming Reference (PPR) for AMD Family 17h Model 01h, Revision B1 Processors", originally available here: https://www.amd.com/system/files/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf is now available here: https://developer.amd.com/wordpress/media/2017/11/54945_PPR_Family_17h_Models_00h-0Fh.pdf [2] "Processor Programming Reference (PPR) for Family 17h Model 31h, Revision B0 Processors", available here: https://developer.amd.com/wp-content/resources/55803_0.54-PUB.pdf Fixes: 3fe3331bb285 ("perf/x86/amd: Add event map for AMD Family 17h") Reported-by: Babu Moger Signed-off-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Babu Moger Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200121171232.28839-1-kim.phillips@amd.com --- arch/x86/events/amd/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 1f22b6bbda68..39eb276d0277 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -250,6 +250,7 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, -- cgit v1.2.3 From f861854e1b435b27197417f6f90d87188003cb24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 21 Jan 2020 11:01:25 -0800 Subject: perf/x86/intel: Fix inaccurate period in context switch for auto-reload Perf doesn't take the left period into account when auto-reload is enabled with fixed period sampling mode in context switch. Here is the MSR trace of the perf command as below. (The MSR trace is simplified from a ftrace log.) #perf record -e cycles:p -c 2000000 -- ./triad_loop //The MSR trace of task schedule out //perf disable all counters, disable PEBS, disable GP counter 0, //read GP counter 0, and re-enable all counters. //The counter 0 stops at 0xfffffff82840 write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 0 write_msr: MSR_P6_EVNTSEL0(186), value 40003003c rdpmc: 0, value fffffff82840 write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff //The MSR trace of the same task schedule in again //perf disable all counters, enable and set GP counter 0, //enable PEBS, and re-enable all counters. //0xffffffe17b80 (-2000000) is written to GP counter 0. write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 write_msr: MSR_IA32_PMC0(4c1), value ffffffe17b80 write_msr: MSR_P6_EVNTSEL0(186), value 40043003c write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 1 write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff When the same task schedule in again, the counter should starts from previous left. However, it starts from the fixed period -2000000 again. A special variant of intel_pmu_save_and_restart() is used for auto-reload, which doesn't update the hwc->period_left. When the monitored task schedules in again, perf doesn't know the left period. The fixed period is used, which is inaccurate. With auto-reload, the counter always has a negative counter value. So the left period is -value. Update the period_left in intel_pmu_save_and_restart_reload(). With the patch: //The MSR trace of task schedule out write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 0 write_msr: MSR_P6_EVNTSEL0(186), value 40003003c rdpmc: 0, value ffffffe25cbc write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff //The MSR trace of the same task schedule in again write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 write_msr: MSR_IA32_PMC0(4c1), value ffffffe25cbc write_msr: MSR_P6_EVNTSEL0(186), value 40043003c write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 1 write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff Fixes: d31fc13fdcb2 ("perf/x86/intel: Fix event update for auto-reload") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200121190125.3389-1-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4b94ae4ae369..dc43cc124e09 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1714,6 +1714,8 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) old = ((s64)(prev_raw_count << shift) >> shift); local64_add(new - old + count * period, &event->count); + local64_set(&hwc->period_left, -new); + perf_event_update_userpage(event); return 0; -- cgit v1.2.3 From da0f3e0201b87ee4bbd2175925dd57e1228c35fb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 10 Feb 2020 17:23:02 +0200 Subject: MAINTAINERS: Sort entries in database for THUNDERBOLT Run parse-maintainers.pl and choose THUNDERBOLT record. Fix it accordingly. Signed-off-by: Andy Shevchenko Signed-off-by: Mika Westerberg --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 38fe2f3f7b6f..d6e118a8f96e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16552,8 +16552,8 @@ M: Michael Jamet M: Mika Westerberg M: Yehezkel Bernat L: linux-usb@vger.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git F: Documentation/admin-guide/thunderbolt.rst F: drivers/thunderbolt/ F: include/linux/thunderbolt.h -- cgit v1.2.3 From 30744a68626db6a0029aca9c646831c869c16d83 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 10 Feb 2020 16:27:12 +0100 Subject: xsk: Publish global consumer pointers when NAPI is finished The commit 4b638f13bab4 ("xsk: Eliminate the RX batch size") introduced a much more lazy way of updating the global consumer pointers from the kernel side, by only doing so when running out of entries in the fill or Tx rings (the rings consumed by the kernel). This can result in a deadlock with the user application if the kernel requires more than one entry to proceed and the application cannot put these entries in the fill ring because the kernel has not updated the global consumer pointer since the ring is not empty. Fix this by publishing the local kernel side consumer pointer whenever we have completed Rx or Tx processing in the kernel. This way, user space will have an up-to-date view of the consumer pointers whenever it gets to execute in the one core case (application and driver on the same core), or after a certain number of packets have been processed in the two core case (application and driver on different cores). A side effect of this patch is that the one core case gets better performance, but the two core case gets worse. The reason that the one core case improves is that updating the global consumer pointer is relatively cheap since the application by definition is not running when the kernel is (they are on the same core) and it is beneficial for the application, once it gets to run, to have pointers that are as up to date as possible since it then can operate on more packets and buffers. In the two core case, the most important performance aspect is to minimize the number of accesses to the global pointers since they are shared between two cores and bounces between the caches of those cores. This patch results in more updates to global state, which means lower performance in the two core case. Fixes: 4b638f13bab4 ("xsk: Eliminate the RX batch size") Reported-by: Ryan Goodfellow Reported-by: Maxim Mikityanskiy Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Acked-by: Maxim Mikityanskiy Link: https://lore.kernel.org/bpf/1581348432-6747-1-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 2 ++ net/xdp/xsk_queue.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index df600487a68d..356f90e4522b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -217,6 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); + __xskq_cons_release(xs->umem->fq); sock_def_readable(&xs->sk); } @@ -304,6 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); } rcu_read_unlock(); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index bec2af11853a..89a01ac4e079 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -271,7 +271,8 @@ static inline void xskq_cons_release(struct xsk_queue *q) { /* To improve performance, only update local state here. * Reflect this to global state when we get new entries - * from the ring in xskq_cons_get_entries(). + * from the ring in xskq_cons_get_entries() and whenever + * Rx or Tx processing are completed in the NAPI loop. */ q->cached_cons++; } -- cgit v1.2.3 From ef8c9809acb0805c991bba8bdd4749fc46d44a98 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Sat, 18 Jan 2020 15:41:20 -0500 Subject: drm/msm/mdp5: rate limit pp done timeout warnings Add rate limiting of the 'pp done time out' warnings since these warnings can quickly fill the dmesg buffer. Signed-off-by: Brian Masney Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c index 05cc04f729d6..e1cc541e0ef2 100644 --- a/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c +++ b/drivers/gpu/drm/msm/disp/mdp5/mdp5_crtc.c @@ -1109,8 +1109,8 @@ static void mdp5_crtc_wait_for_pp_done(struct drm_crtc *crtc) ret = wait_for_completion_timeout(&mdp5_crtc->pp_completion, msecs_to_jiffies(50)); if (ret == 0) - dev_warn(dev->dev, "pp done time out, lm=%d\n", - mdp5_cstate->pipeline.mixer->lm); + dev_warn_ratelimited(dev->dev, "pp done time out, lm=%d\n", + mdp5_cstate->pipeline.mixer->lm); } static void mdp5_crtc_wait_for_flush_done(struct drm_crtc *crtc) -- cgit v1.2.3 From e4f9bbe9f8beab9a1ce460e7e194595b76868595 Mon Sep 17 00:00:00 2001 From: Kalyan Thota Date: Thu, 23 Jan 2020 15:47:55 +0530 Subject: msm:disp:dpu1: add UBWC support for display on SC7180 Add UBWC global configuration for display on SC7180 target. Signed-off-by: Kalyan Thota Tested-by: Douglas Anderson Fixes: 73bfb790ac78 ("msm:disp:dpu1: setup display datapath for SC7180 target") Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c | 58 +++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c index 29705e773a4b..80d3cfc14007 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_mdss.c @@ -12,6 +12,7 @@ #define to_dpu_mdss(x) container_of(x, struct dpu_mdss, base) +#define HW_REV 0x0 #define HW_INTR_STATUS 0x0010 /* Max BW defined in KBps */ @@ -22,6 +23,17 @@ struct dpu_irq_controller { struct irq_domain *domain; }; +struct dpu_hw_cfg { + u32 val; + u32 offset; +}; + +struct dpu_mdss_hw_init_handler { + u32 hw_rev; + u32 hw_reg_count; + struct dpu_hw_cfg* hw_cfg; +}; + struct dpu_mdss { struct msm_mdss base; void __iomem *mmio; @@ -32,6 +44,44 @@ struct dpu_mdss { u32 num_paths; }; +static struct dpu_hw_cfg hw_cfg[] = { + { + /* UBWC global settings */ + .val = 0x1E, + .offset = 0x144, + } +}; + +static struct dpu_mdss_hw_init_handler cfg_handler[] = { + { .hw_rev = DPU_HW_VER_620, + .hw_reg_count = ARRAY_SIZE(hw_cfg), + .hw_cfg = hw_cfg + }, +}; + +static void dpu_mdss_hw_init(struct dpu_mdss *dpu_mdss, u32 hw_rev) +{ + int i; + u32 count = 0; + struct dpu_hw_cfg *hw_cfg = NULL; + + for (i = 0; i < ARRAY_SIZE(cfg_handler); i++) { + if (cfg_handler[i].hw_rev == hw_rev) { + hw_cfg = cfg_handler[i].hw_cfg; + count = cfg_handler[i].hw_reg_count; + break; + } + } + + for (i = 0; i < count; i++ ) { + writel_relaxed(hw_cfg->val, + dpu_mdss->mmio + hw_cfg->offset); + hw_cfg++; + } + + return; +} + static int dpu_mdss_parse_data_bus_icc_path(struct drm_device *dev, struct dpu_mdss *dpu_mdss) { @@ -174,12 +224,18 @@ static int dpu_mdss_enable(struct msm_mdss *mdss) struct dpu_mdss *dpu_mdss = to_dpu_mdss(mdss); struct dss_module_power *mp = &dpu_mdss->mp; int ret; + u32 mdss_rev; dpu_mdss_icc_request_bw(mdss); ret = msm_dss_enable_clk(mp->clk_config, mp->num_clk, true); - if (ret) + if (ret) { DPU_ERROR("clock enable failed, ret:%d\n", ret); + return ret; + } + + mdss_rev = readl_relaxed(dpu_mdss->mmio + HW_REV); + dpu_mdss_hw_init(dpu_mdss, mdss_rev); return ret; } -- cgit v1.2.3 From 8a4f300b978edbbaa73ef9eca660e45eb9f13873 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Wed, 5 Feb 2020 13:05:30 +0200 Subject: RDMA/hfi1: Fix memory leak in _dev_comp_vect_mappings_create Make sure to free the allocated cpumask_var_t's to avoid the following reported memory leak by kmemleak: $ cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8897f812d6a8 (size 8): comm "kworker/1:1", pid 347, jiffies 4294751400 (age 101.703s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: [<00000000bff49664>] alloc_cpumask_var_node+0x4c/0xb0 [<0000000075d3ca81>] hfi1_comp_vectors_set_up+0x20f/0x800 [hfi1] [<0000000098d420df>] hfi1_init_dd+0x3311/0x4960 [hfi1] [<0000000071be7e52>] init_one+0x25e/0xf10 [hfi1] [<000000005483d4c2>] local_pci_probe+0xd4/0x180 [<000000007c3cbc6e>] work_for_cpu_fn+0x51/0xa0 [<000000001d626905>] process_one_work+0x8f0/0x17b0 [<000000007e569e7e>] worker_thread+0x536/0xb50 [<00000000fd39a4a5>] kthread+0x30c/0x3d0 [<0000000056f2edb3>] ret_from_fork+0x3a/0x50 Fixes: 5d18ee67d4c1 ("IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support") Link: https://lore.kernel.org/r/20200205110530.12129-1-kamalheib1@gmail.com Signed-off-by: Kamal Heib Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/affinity.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index c142b23bb401..1aeea5d65c01 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -479,6 +479,8 @@ static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd, rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu); } + free_cpumask_var(available_cpus); + free_cpumask_var(non_intr_cpus); return 0; fail: -- cgit v1.2.3 From e8e35c62ba517f73cca32bc9925d62f4c4981768 Mon Sep 17 00:00:00 2001 From: Akhil P Oommen Date: Fri, 24 Jan 2020 17:50:11 +0530 Subject: drm/msm/a6xx: Correct the highestbank configuration Highest bank bit configuration is different for a618 gpu. Update it with the correct configuration which is the reset value incidentally. Signed-off-by: Akhil P Oommen Signed-off-by: Sharat Masetty Fixes: e812744c5f95 ("drm: msm: a6xx: Add support for A618") Reviewed-by: Rob Clark Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index daf07800cde0..536d1960a188 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -470,10 +470,12 @@ static int a6xx_hw_init(struct msm_gpu *gpu) /* Select CP0 to always count cycles */ gpu_write(gpu, REG_A6XX_CP_PERFCTR_CP_SEL_0, PERF_CP_ALWAYS_COUNT); - gpu_write(gpu, REG_A6XX_RB_NC_MODE_CNTL, 2 << 1); - gpu_write(gpu, REG_A6XX_TPL1_NC_MODE_CNTL, 2 << 1); - gpu_write(gpu, REG_A6XX_SP_NC_MODE_CNTL, 2 << 1); - gpu_write(gpu, REG_A6XX_UCHE_MODE_CNTL, 2 << 21); + if (adreno_is_a630(adreno_gpu)) { + gpu_write(gpu, REG_A6XX_RB_NC_MODE_CNTL, 2 << 1); + gpu_write(gpu, REG_A6XX_TPL1_NC_MODE_CNTL, 2 << 1); + gpu_write(gpu, REG_A6XX_SP_NC_MODE_CNTL, 2 << 1); + gpu_write(gpu, REG_A6XX_UCHE_MODE_CNTL, 2 << 21); + } /* Enable fault detection */ gpu_write(gpu, REG_A6XX_RBBM_INTERFACE_HANG_INT_CNTL, -- cgit v1.2.3 From 7fd2dfc3694922eb7ace4801b7208cf9f62ebc7d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 29 Jan 2020 20:12:44 +0000 Subject: drm: msm: Fix return type of dsi_mgr_connector_mode_valid for kCFI I was hitting kCFI crashes when building with clang, and after some digging finally narrowed it down to the dsi_mgr_connector_mode_valid() function being implemented as returning an int, instead of an enum drm_mode_status. This patch fixes it, and appeases the opaque word of the kCFI gods (seriously, clang inlining everything makes the kCFI backtraces only really rough estimates of where things went wrong). Thanks as always to Sami for his help narrowing this down. Cc: Rob Clark Cc: Sean Paul Cc: Sami Tolvanen Cc: Todd Kjos Cc: Alistair Delva Cc: Amit Pundir Cc: Sumit Semwal Cc: freedreno@lists.freedesktop.org Cc: clang-built-linux@googlegroups.com Signed-off-by: John Stultz Reviewed-by: Nick Desaulniers Tested-by: Amit Pundir Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/dsi_manager.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi_manager.c b/drivers/gpu/drm/msm/dsi/dsi_manager.c index 104115d112eb..acc711fd14f8 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_manager.c +++ b/drivers/gpu/drm/msm/dsi/dsi_manager.c @@ -336,7 +336,7 @@ static int dsi_mgr_connector_get_modes(struct drm_connector *connector) return num; } -static int dsi_mgr_connector_mode_valid(struct drm_connector *connector, +static enum drm_mode_status dsi_mgr_connector_mode_valid(struct drm_connector *connector, struct drm_display_mode *mode) { int id = dsi_mgr_connector_get_id(connector); -- cgit v1.2.3 From 56d977d5610bc6a83cf5f2d69cec91f3a2b91f77 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 4 Feb 2020 10:42:28 -0700 Subject: drm/msm/a6xx: Remove unneeded GBIF unhalt Commit e812744c5f95 ("drm: msm: a6xx: Add support for A618") added a universal GBIF un-halt into a6xx_start(). This can cause problems for a630 targets which do not use GBIF and might have access protection enabled on the region now occupied by the GBIF registers. But it turns out that we didn't need to unhalt the GBIF in this path since the stop function already takes care of that after executing a flush but before turning off the headswitch. We should be confident that the GBIF is open for business when we restart the hardware. Signed-off-by: Jordan Crouse Tested-by: John Stultz Reviewed-by: Rob Clark Fixes: e812744c5f95 ("drm: msm: a6xx: Add support for A618") Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 536d1960a188..7c449d154b4d 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -378,18 +378,6 @@ static int a6xx_hw_init(struct msm_gpu *gpu) struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); int ret; - /* - * During a previous slumber, GBIF halt is asserted to ensure - * no further transaction can go through GPU before GPU - * headswitch is turned off. - * - * This halt is deasserted once headswitch goes off but - * incase headswitch doesn't goes off clear GBIF halt - * here to ensure GPU wake-up doesn't fail because of - * halted GPU transactions. - */ - gpu_write(gpu, REG_A6XX_GBIF_HALT, 0x0); - /* Make sure the GMU keeps the GPU on while we set it up */ a6xx_gmu_set_oob(&a6xx_gpu->gmu, GMU_OOB_GPU_SET); -- cgit v1.2.3 From 1636295a9f6931e8524c416ae333cd9ff7ef4661 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Wed, 5 Feb 2020 10:01:21 -0700 Subject: drm/msm/a6xx: Update the GMU bus tables for sc7180 Fixup the GMU bus table values for the sc7180 target. Signed-off-by: Jordan Crouse Reviewed-by: Rob Clark Fixes: e812744c5f95 ("drm: msm: a6xx: Add support for A618") Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_hfi.c | 85 ++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c index eda11abc5f01..e450e0b97211 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.c @@ -7,6 +7,7 @@ #include "a6xx_gmu.h" #include "a6xx_gmu.xml.h" +#include "a6xx_gpu.h" #define HFI_MSG_ID(val) [val] = #val @@ -216,48 +217,82 @@ static int a6xx_hfi_send_perf_table(struct a6xx_gmu *gmu) NULL, 0); } -static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu) +static void a618_build_bw_table(struct a6xx_hfi_msg_bw_table *msg) { - struct a6xx_hfi_msg_bw_table msg = { 0 }; + /* Send a single "off" entry since the 618 GMU doesn't do bus scaling */ + msg->bw_level_num = 1; + + msg->ddr_cmds_num = 3; + msg->ddr_wait_bitmask = 0x01; + + msg->ddr_cmds_addrs[0] = 0x50000; + msg->ddr_cmds_addrs[1] = 0x5003c; + msg->ddr_cmds_addrs[2] = 0x5000c; + + msg->ddr_cmds_data[0][0] = 0x40000000; + msg->ddr_cmds_data[0][1] = 0x40000000; + msg->ddr_cmds_data[0][2] = 0x40000000; /* - * The sdm845 GMU doesn't do bus frequency scaling on its own but it - * does need at least one entry in the list because it might be accessed - * when the GMU is shutting down. Send a single "off" entry. + * These are the CX (CNOC) votes - these are used by the GMU but the + * votes are known and fixed for the target */ + msg->cnoc_cmds_num = 1; + msg->cnoc_wait_bitmask = 0x01; + + msg->cnoc_cmds_addrs[0] = 0x5007c; + msg->cnoc_cmds_data[0][0] = 0x40000000; + msg->cnoc_cmds_data[1][0] = 0x60000001; +} - msg.bw_level_num = 1; +static void a6xx_build_bw_table(struct a6xx_hfi_msg_bw_table *msg) +{ + /* Send a single "off" entry since the 630 GMU doesn't do bus scaling */ + msg->bw_level_num = 1; - msg.ddr_cmds_num = 3; - msg.ddr_wait_bitmask = 0x07; + msg->ddr_cmds_num = 3; + msg->ddr_wait_bitmask = 0x07; - msg.ddr_cmds_addrs[0] = 0x50000; - msg.ddr_cmds_addrs[1] = 0x5005c; - msg.ddr_cmds_addrs[2] = 0x5000c; + msg->ddr_cmds_addrs[0] = 0x50000; + msg->ddr_cmds_addrs[1] = 0x5005c; + msg->ddr_cmds_addrs[2] = 0x5000c; - msg.ddr_cmds_data[0][0] = 0x40000000; - msg.ddr_cmds_data[0][1] = 0x40000000; - msg.ddr_cmds_data[0][2] = 0x40000000; + msg->ddr_cmds_data[0][0] = 0x40000000; + msg->ddr_cmds_data[0][1] = 0x40000000; + msg->ddr_cmds_data[0][2] = 0x40000000; /* * These are the CX (CNOC) votes. This is used but the values for the * sdm845 GMU are known and fixed so we can hard code them. */ - msg.cnoc_cmds_num = 3; - msg.cnoc_wait_bitmask = 0x05; + msg->cnoc_cmds_num = 3; + msg->cnoc_wait_bitmask = 0x05; - msg.cnoc_cmds_addrs[0] = 0x50034; - msg.cnoc_cmds_addrs[1] = 0x5007c; - msg.cnoc_cmds_addrs[2] = 0x5004c; + msg->cnoc_cmds_addrs[0] = 0x50034; + msg->cnoc_cmds_addrs[1] = 0x5007c; + msg->cnoc_cmds_addrs[2] = 0x5004c; - msg.cnoc_cmds_data[0][0] = 0x40000000; - msg.cnoc_cmds_data[0][1] = 0x00000000; - msg.cnoc_cmds_data[0][2] = 0x40000000; + msg->cnoc_cmds_data[0][0] = 0x40000000; + msg->cnoc_cmds_data[0][1] = 0x00000000; + msg->cnoc_cmds_data[0][2] = 0x40000000; + + msg->cnoc_cmds_data[1][0] = 0x60000001; + msg->cnoc_cmds_data[1][1] = 0x20000001; + msg->cnoc_cmds_data[1][2] = 0x60000001; +} + + +static int a6xx_hfi_send_bw_table(struct a6xx_gmu *gmu) +{ + struct a6xx_hfi_msg_bw_table msg = { 0 }; + struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); + struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; - msg.cnoc_cmds_data[1][0] = 0x60000001; - msg.cnoc_cmds_data[1][1] = 0x20000001; - msg.cnoc_cmds_data[1][2] = 0x60000001; + if (adreno_is_a618(adreno_gpu)) + a618_build_bw_table(&msg); + else + a6xx_build_bw_table(&msg); return a6xx_hfi_send_msg(gmu, HFI_H2F_MSG_BW_TABLE, &msg, sizeof(msg), NULL, 0); -- cgit v1.2.3 From 9cc68ee1d92e3ab5bd5c821e5c1f387b0e16a669 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Wed, 5 Feb 2020 13:48:17 -0700 Subject: drm/msm: Fix a6xx GMU shutdown sequence Commit e812744c5f95 ("drm: msm: a6xx: Add support for A618") missed updating the VBIF flush in a6xx_gmu_shutdown and instead inserted the new sequence into a6xx_pm_suspend along with a redundant GMU idle. Move a6xx_bus_clear_pending_transactions to a6xx_gmu.c and use it in the appropriate place in the shutdown routine and remove the redundant idle call. v2: Remove newly unused variable that was triggering a warning Signed-off-by: Jordan Crouse Reviewed-by: Rob Clark Fixes: e812744c5f95 ("drm: msm: a6xx: Add support for A618") Tested-by: Douglas Anderson Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 37 +++++++++++++++++++++++++----- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 43 ----------------------------------- 2 files changed, 31 insertions(+), 49 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 983afeaee737..748cd379065f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -796,12 +796,41 @@ bool a6xx_gmu_isidle(struct a6xx_gmu *gmu) return true; } +#define GBIF_CLIENT_HALT_MASK BIT(0) +#define GBIF_ARB_HALT_MASK BIT(1) + +static void a6xx_bus_clear_pending_transactions(struct adreno_gpu *adreno_gpu) +{ + struct msm_gpu *gpu = &adreno_gpu->base; + + if (!a6xx_has_gbif(adreno_gpu)) { + gpu_write(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL0, 0xf); + spin_until((gpu_read(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL1) & + 0xf) == 0xf); + gpu_write(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL0, 0); + + return; + } + + /* Halt new client requests on GBIF */ + gpu_write(gpu, REG_A6XX_GBIF_HALT, GBIF_CLIENT_HALT_MASK); + spin_until((gpu_read(gpu, REG_A6XX_GBIF_HALT_ACK) & + (GBIF_CLIENT_HALT_MASK)) == GBIF_CLIENT_HALT_MASK); + + /* Halt all AXI requests on GBIF */ + gpu_write(gpu, REG_A6XX_GBIF_HALT, GBIF_ARB_HALT_MASK); + spin_until((gpu_read(gpu, REG_A6XX_GBIF_HALT_ACK) & + (GBIF_ARB_HALT_MASK)) == GBIF_ARB_HALT_MASK); + + /* The GBIF halt needs to be explicitly cleared */ + gpu_write(gpu, REG_A6XX_GBIF_HALT, 0x0); +} + /* Gracefully try to shut down the GMU and by extension the GPU */ static void a6xx_gmu_shutdown(struct a6xx_gmu *gmu) { struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; - struct msm_gpu *gpu = &adreno_gpu->base; u32 val; /* @@ -819,11 +848,7 @@ static void a6xx_gmu_shutdown(struct a6xx_gmu *gmu) return; } - /* Clear the VBIF pipe before shutting down */ - gpu_write(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL0, 0xf); - spin_until((gpu_read(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL1) & 0xf) - == 0xf); - gpu_write(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL0, 0); + a6xx_bus_clear_pending_transactions(adreno_gpu); /* tell the GMU we want to slumber */ a6xx_gmu_notify_slumber(gmu); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 7c449d154b4d..68af24150de5 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -738,39 +738,6 @@ static const u32 a6xx_register_offsets[REG_ADRENO_REGISTER_MAX] = { REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_CNTL, REG_A6XX_CP_RB_CNTL), }; -#define GBIF_CLIENT_HALT_MASK BIT(0) -#define GBIF_ARB_HALT_MASK BIT(1) - -static void a6xx_bus_clear_pending_transactions(struct adreno_gpu *adreno_gpu) -{ - struct msm_gpu *gpu = &adreno_gpu->base; - - if(!a6xx_has_gbif(adreno_gpu)){ - gpu_write(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL0, 0xf); - spin_until((gpu_read(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL1) & - 0xf) == 0xf); - gpu_write(gpu, REG_A6XX_VBIF_XIN_HALT_CTRL0, 0); - - return; - } - - /* Halt new client requests on GBIF */ - gpu_write(gpu, REG_A6XX_GBIF_HALT, GBIF_CLIENT_HALT_MASK); - spin_until((gpu_read(gpu, REG_A6XX_GBIF_HALT_ACK) & - (GBIF_CLIENT_HALT_MASK)) == GBIF_CLIENT_HALT_MASK); - - /* Halt all AXI requests on GBIF */ - gpu_write(gpu, REG_A6XX_GBIF_HALT, GBIF_ARB_HALT_MASK); - spin_until((gpu_read(gpu, REG_A6XX_GBIF_HALT_ACK) & - (GBIF_ARB_HALT_MASK)) == GBIF_ARB_HALT_MASK); - - /* - * GMU needs DDR access in slumber path. Deassert GBIF halt now - * to allow for GMU to access system memory. - */ - gpu_write(gpu, REG_A6XX_GBIF_HALT, 0x0); -} - static int a6xx_pm_resume(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -795,16 +762,6 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu) devfreq_suspend_device(gpu->devfreq.devfreq); - /* - * Make sure the GMU is idle before continuing (because some transitions - * may use VBIF - */ - a6xx_gmu_wait_for_idle(&a6xx_gpu->gmu); - - /* Clear the VBIF pipe before shutting down */ - /* FIXME: This accesses the GPU - do we need to make sure it is on? */ - a6xx_bus_clear_pending_transactions(adreno_gpu); - return a6xx_gmu_stop(a6xx_gpu); } -- cgit v1.2.3 From a70ed0f2e6262e723ae8d70accb984ba309eacc2 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 10 Feb 2020 08:10:26 -0500 Subject: IB/hfi1: Acquire lock to release TID entries when user file is closed Each user context is allocated a certain number of RcvArray (TID) entries and these entries are managed through TID groups. These groups are put into one of three lists in each user context: tid_group_list, tid_used_list, and tid_full_list, depending on the number of used TID entries within each group. When TID packets are expected, one or more TID groups will be allocated. After the packets are received, the TID groups will be freed. Since multiple user threads may access the TID groups simultaneously, a mutex exp_mutex is used to synchronize the access. However, when the user file is closed, it tries to release all TID groups without acquiring the mutex first, which risks a race condition with another thread that may be releasing its TID groups, leading to data corruption. This patch addresses the issue by acquiring the mutex first before releasing the TID groups when the file is closed. Fixes: 3abb33ac6521 ("staging/hfi1: Add TID cache receive init and free funcs") Link: https://lore.kernel.org/r/20200210131026.87408.86853.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index f05742ac0949..2443423585b3 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -142,10 +142,12 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) { struct hfi1_ctxtdata *uctxt = fd->uctxt; + mutex_lock(&uctxt->exp_mutex); if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd); if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd); + mutex_unlock(&uctxt->exp_mutex); kfree(fd->invalid_tids); fd->invalid_tids = NULL; -- cgit v1.2.3 From be8638344c70bf492963ace206a9896606b6922d Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 10 Feb 2020 08:10:33 -0500 Subject: IB/hfi1: Close window for pq and request coliding Cleaning up a pq can result in the following warning and panic: WARNING: CPU: 52 PID: 77418 at lib/list_debug.c:53 __list_del_entry+0x63/0xd0 list_del corruption, ffff88cb2c6ac068->next is LIST_POISON1 (dead000000000100) Modules linked in: mmfs26(OE) mmfslinux(OE) tracedev(OE) 8021q garp mrp ib_isert iscsi_target_mod target_core_mod crc_t10dif crct10dif_generic opa_vnic rpcrdma ib_iser libiscsi scsi_transport_iscsi ib_ipoib(OE) bridge stp llc iTCO_wdt iTCO_vendor_support intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crct10dif_pclmul crct10dif_common crc32_pclmul ghash_clmulni_intel ast aesni_intel ttm lrw gf128mul glue_helper ablk_helper drm_kms_helper cryptd syscopyarea sysfillrect sysimgblt fb_sys_fops drm pcspkr joydev lpc_ich mei_me drm_panel_orientation_quirks i2c_i801 mei wmi ipmi_si ipmi_devintf ipmi_msghandler nfit libnvdimm acpi_power_meter acpi_pad hfi1(OE) rdmavt(OE) rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_core binfmt_misc numatools(OE) xpmem(OE) ip_tables nfsv3 nfs_acl nfs lockd grace sunrpc fscache igb ahci i2c_algo_bit libahci dca ptp libata pps_core crc32c_intel [last unloaded: i2c_algo_bit] CPU: 52 PID: 77418 Comm: pvbatch Kdump: loaded Tainted: G OE ------------ 3.10.0-957.38.3.el7.x86_64 #1 Hardware name: HPE.COM HPE SGI 8600-XA730i Gen10/X11DPT-SB-SG007, BIOS SBED1229 01/22/2019 Call Trace: [] dump_stack+0x19/0x1b [] __warn+0xd8/0x100 [] warn_slowpath_fmt+0x5f/0x80 [] __list_del_entry+0x63/0xd0 [] list_del+0xd/0x30 [] kmem_cache_destroy+0x50/0x110 [] hfi1_user_sdma_free_queues+0xf0/0x200 [hfi1] [] hfi1_file_close+0x70/0x1e0 [hfi1] [] __fput+0xec/0x260 [] ____fput+0xe/0x10 [] task_work_run+0xbb/0xe0 [] do_notify_resume+0xa5/0xc0 [] int_signal+0x12/0x17 BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] kmem_cache_close+0x7e/0x300 PGD 2cdab19067 PUD 2f7bfdb067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: mmfs26(OE) mmfslinux(OE) tracedev(OE) 8021q garp mrp ib_isert iscsi_target_mod target_core_mod crc_t10dif crct10dif_generic opa_vnic rpcrdma ib_iser libiscsi scsi_transport_iscsi ib_ipoib(OE) bridge stp llc iTCO_wdt iTCO_vendor_support intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crct10dif_pclmul crct10dif_common crc32_pclmul ghash_clmulni_intel ast aesni_intel ttm lrw gf128mul glue_helper ablk_helper drm_kms_helper cryptd syscopyarea sysfillrect sysimgblt fb_sys_fops drm pcspkr joydev lpc_ich mei_me drm_panel_orientation_quirks i2c_i801 mei wmi ipmi_si ipmi_devintf ipmi_msghandler nfit libnvdimm acpi_power_meter acpi_pad hfi1(OE) rdmavt(OE) rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_core binfmt_misc numatools(OE) xpmem(OE) ip_tables nfsv3 nfs_acl nfs lockd grace sunrpc fscache igb ahci i2c_algo_bit libahci dca ptp libata pps_core crc32c_intel [last unloaded: i2c_algo_bit] CPU: 52 PID: 77418 Comm: pvbatch Kdump: loaded Tainted: G W OE ------------ 3.10.0-957.38.3.el7.x86_64 #1 Hardware name: HPE.COM HPE SGI 8600-XA730i Gen10/X11DPT-SB-SG007, BIOS SBED1229 01/22/2019 task: ffff88cc26db9040 ti: ffff88b5393a8000 task.ti: ffff88b5393a8000 RIP: 0010:[] [] kmem_cache_close+0x7e/0x300 RSP: 0018:ffff88b5393abd60 EFLAGS: 00010287 RAX: 0000000000000000 RBX: ffff88cb2c6ac000 RCX: 0000000000000003 RDX: 0000000000000400 RSI: 0000000000000400 RDI: ffffffff9095b800 RBP: ffff88b5393abdb0 R08: ffffffff9095b808 R09: ffffffff8ff77c19 R10: ffff88b73ce1f160 R11: ffffddecddde9800 R12: ffff88cb2c6ac000 R13: 000000000000000c R14: ffff88cf3fdca780 R15: 0000000000000000 FS: 00002aaaaab52500(0000) GS:ffff88b73ce00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000002d27664000 CR4: 00000000007607e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: [] __kmem_cache_shutdown+0x14/0x80 [] kmem_cache_destroy+0x58/0x110 [] hfi1_user_sdma_free_queues+0xf0/0x200 [hfi1] [] hfi1_file_close+0x70/0x1e0 [hfi1] [] __fput+0xec/0x260 [] ____fput+0xe/0x10 [] task_work_run+0xbb/0xe0 [] do_notify_resume+0xa5/0xc0 [] int_signal+0x12/0x17 Code: 00 00 ba 00 04 00 00 0f 4f c2 3d 00 04 00 00 89 45 bc 0f 84 e7 01 00 00 48 63 45 bc 49 8d 04 c4 48 89 45 b0 48 8b 80 c8 00 00 00 <48> 8b 78 10 48 89 45 c0 48 83 c0 10 48 89 45 d0 48 8b 17 48 39 RIP [] kmem_cache_close+0x7e/0x300 RSP CR2: 0000000000000010 The panic is the result of slab entries being freed during the destruction of the pq slab. The code attempts to quiesce the pq, but looking for n_req == 0 doesn't account for new requests. Fix the issue by using SRCU to get a pq pointer and adjust the pq free logic to NULL the fd pq pointer prior to the quiesce. Fixes: e87473bc1b6c ("IB/hfi1: Only set fd pointer when base context is completely initialized") Link: https://lore.kernel.org/r/20200210131033.87408.81174.stgit@awfm-01.aw.intel.com Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/file_ops.c | 52 +++++++++++++++++++------------ drivers/infiniband/hw/hfi1/hfi.h | 5 ++- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 3 -- drivers/infiniband/hw/hfi1/user_sdma.c | 17 +++++++--- 4 files changed, 48 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index bef6946861b2..259115886d35 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -200,23 +200,24 @@ static int hfi1_file_open(struct inode *inode, struct file *fp) fd = kzalloc(sizeof(*fd), GFP_KERNEL); - if (fd) { - fd->rec_cpu_num = -1; /* no cpu affinity by default */ - fd->mm = current->mm; - mmgrab(fd->mm); - fd->dd = dd; - kobject_get(&fd->dd->kobj); - fp->private_data = fd; - } else { - fp->private_data = NULL; - - if (atomic_dec_and_test(&dd->user_refcount)) - complete(&dd->user_comp); - - return -ENOMEM; - } - + if (!fd || init_srcu_struct(&fd->pq_srcu)) + goto nomem; + spin_lock_init(&fd->pq_rcu_lock); + spin_lock_init(&fd->tid_lock); + spin_lock_init(&fd->invalid_lock); + fd->rec_cpu_num = -1; /* no cpu affinity by default */ + fd->mm = current->mm; + mmgrab(fd->mm); + fd->dd = dd; + kobject_get(&fd->dd->kobj); + fp->private_data = fd; return 0; +nomem: + kfree(fd); + fp->private_data = NULL; + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); + return -ENOMEM; } static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, @@ -301,21 +302,30 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) { struct hfi1_filedata *fd = kiocb->ki_filp->private_data; - struct hfi1_user_sdma_pkt_q *pq = fd->pq; + struct hfi1_user_sdma_pkt_q *pq; struct hfi1_user_sdma_comp_q *cq = fd->cq; int done = 0, reqs = 0; unsigned long dim = from->nr_segs; + int idx; - if (!cq || !pq) + idx = srcu_read_lock(&fd->pq_srcu); + pq = srcu_dereference(fd->pq, &fd->pq_srcu); + if (!cq || !pq) { + srcu_read_unlock(&fd->pq_srcu, idx); return -EIO; + } - if (!iter_is_iovec(from) || !dim) + if (!iter_is_iovec(from) || !dim) { + srcu_read_unlock(&fd->pq_srcu, idx); return -EINVAL; + } trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim); - if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { + srcu_read_unlock(&fd->pq_srcu, idx); return -ENOSPC; + } while (dim) { int ret; @@ -333,6 +343,7 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) reqs++; } + srcu_read_unlock(&fd->pq_srcu, idx); return reqs; } @@ -707,6 +718,7 @@ done: if (atomic_dec_and_test(&dd->user_refcount)) complete(&dd->user_comp); + cleanup_srcu_struct(&fdata->pq_srcu); kfree(fdata); return 0; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6365e8ffed9d..cae12f416ca0 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1444,10 +1444,13 @@ struct mmu_rb_handler; /* Private data for file operations */ struct hfi1_filedata { + struct srcu_struct pq_srcu; struct hfi1_devdata *dd; struct hfi1_ctxtdata *uctxt; struct hfi1_user_sdma_comp_q *cq; - struct hfi1_user_sdma_pkt_q *pq; + /* update side lock for SRCU */ + spinlock_t pq_rcu_lock; + struct hfi1_user_sdma_pkt_q __rcu *pq; u16 subctxt; /* for cpu affinity; -1 if none */ int rec_cpu_num; diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 2443423585b3..4da03f823474 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -87,9 +87,6 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd, { int ret = 0; - spin_lock_init(&fd->tid_lock); - spin_lock_init(&fd->invalid_lock); - fd->entry_to_rb = kcalloc(uctxt->expected_count, sizeof(struct rb_node *), GFP_KERNEL); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index fd754a16475a..c2f0d9ba93de 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -179,7 +179,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, pq = kzalloc(sizeof(*pq), GFP_KERNEL); if (!pq) return -ENOMEM; - pq->dd = dd; pq->ctxt = uctxt->ctxt; pq->subctxt = fd->subctxt; @@ -236,7 +235,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, goto pq_mmu_fail; } - fd->pq = pq; + rcu_assign_pointer(fd->pq, pq); fd->cq = cq; return 0; @@ -264,8 +263,14 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); - pq = fd->pq; + spin_lock(&fd->pq_rcu_lock); + pq = srcu_dereference_check(fd->pq, &fd->pq_srcu, + lockdep_is_held(&fd->pq_rcu_lock)); if (pq) { + rcu_assign_pointer(fd->pq, NULL); + spin_unlock(&fd->pq_rcu_lock); + synchronize_srcu(&fd->pq_srcu); + /* at this point there can be no more new requests */ if (pq->handler) hfi1_mmu_rb_unregister(pq->handler); iowait_sdma_drain(&pq->busy); @@ -277,7 +282,8 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); kfree(pq); - fd->pq = NULL; + } else { + spin_unlock(&fd->pq_rcu_lock); } if (fd->cq) { vfree(fd->cq->comps); @@ -321,7 +327,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, { int ret = 0, i; struct hfi1_ctxtdata *uctxt = fd->uctxt; - struct hfi1_user_sdma_pkt_q *pq = fd->pq; + struct hfi1_user_sdma_pkt_q *pq = + srcu_dereference(fd->pq, &fd->pq_srcu); struct hfi1_user_sdma_comp_q *cq = fd->cq; struct hfi1_devdata *dd = pq->dd; unsigned long idx = 0; -- cgit v1.2.3 From f92e48718889b3d49cee41853402aa88cac84a6b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 10 Feb 2020 08:10:40 -0500 Subject: IB/rdmavt: Reset all QPs when the device is shut down When the hfi1 device is shut down during a system reboot, it is possible that some QPs might have not not freed by ULPs. More requests could be post sent and a lingering timer could be triggered to schedule more packet sends, leading to a crash: BUG: unable to handle kernel NULL pointer dereference at 0000000000000102 IP: [ffffffff810a65f2] __queue_work+0x32/0x3c0 PGD 0 Oops: 0000 1 SMP Modules linked in: nvmet_rdma(OE) nvmet(OE) nvme(OE) dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) pal_raw(POE) pal_pmt(POE) pal_cache(POE) pal_pile(POE) pal(POE) pal_compatible(OE) rpcrdma sunrpc ib_isert iscsi_target_mod target_core_mod ib_iser libiscsi scsi_transport_iscsi ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib sb_edac edac_core intel_powerclamp coretemp intel_rapl iosf_mbi kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt iTCO_vendor_support mxm_wmi ipmi_ssif pcspkr ses enclosure joydev scsi_transport_sas i2c_i801 sg mei_me lpc_ich mei ioatdma shpchp ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter acpi_pad dm_multipath hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm mlx4_core crct10dif_pclmul crct10dif_common hfi1(OE) igb crc32c_intel rdmavt(OE) ahci ib_core libahci libata ptp megaraid_sas pps_core dca i2c_algo_bit i2c_core devlink dm_mirror dm_region_hash dm_log dm_mod CPU: 23 PID: 0 Comm: swapper/23 Tainted: P OE ------------ 3.10.0-693.el7.x86_64 #1 Hardware name: Intel Corporation S2600CWR/S2600CWR, BIOS SE5C610.86B.01.01.0028.121720182203 12/17/2018 task: ffff8808f4ec4f10 ti: ffff8808f4ed8000 task.ti: ffff8808f4ed8000 RIP: 0010:[ffffffff810a65f2] [ffffffff810a65f2] __queue_work+0x32/0x3c0 RSP: 0018:ffff88105df43d48 EFLAGS: 00010046 RAX: 0000000000000086 RBX: 0000000000000086 RCX: 0000000000000000 RDX: ffff880f74e758b0 RSI: 0000000000000000 RDI: 000000000000001f RBP: ffff88105df43d80 R08: ffff8808f3c583c8 R09: ffff8808f3c58000 R10: 0000000000000002 R11: ffff88105df43da8 R12: ffff880f74e758b0 R13: 000000000000001f R14: 0000000000000000 R15: ffff88105a300000 FS: 0000000000000000(0000) GS:ffff88105df40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000102 CR3: 00000000019f2000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffff88105b6dd708 0000001f00000286 0000000000000086 ffff88105a300000 ffff880f74e75800 0000000000000000 ffff88105a300000 ffff88105df43d98 ffffffff810a6b85 ffff88105a301e80 ffff88105df43dc8 ffffffffc0224cde Call Trace: IRQ [ffffffff810a6b85] queue_work_on+0x45/0x50 [ffffffffc0224cde] _hfi1_schedule_send+0x6e/0xc0 [hfi1] [ffffffffc0170570] ? get_map_page+0x60/0x60 [rdmavt] [ffffffffc0224d62] hfi1_schedule_send+0x32/0x70 [hfi1] [ffffffffc0170644] rvt_rc_timeout+0xd4/0x120 [rdmavt] [ffffffffc0170570] ? get_map_page+0x60/0x60 [rdmavt] [ffffffff81097316] call_timer_fn+0x36/0x110 [ffffffffc0170570] ? get_map_page+0x60/0x60 [rdmavt] [ffffffff8109982d] run_timer_softirq+0x22d/0x310 [ffffffff81090b3f] __do_softirq+0xef/0x280 [ffffffff816b6a5c] call_softirq+0x1c/0x30 [ffffffff8102d3c5] do_softirq+0x65/0xa0 [ffffffff81090ec5] irq_exit+0x105/0x110 [ffffffff816b76c2] smp_apic_timer_interrupt+0x42/0x50 [ffffffff816b5c1d] apic_timer_interrupt+0x6d/0x80 EOI [ffffffff81527a02] ? cpuidle_enter_state+0x52/0xc0 [ffffffff81527b48] cpuidle_idle_call+0xd8/0x210 [ffffffff81034fee] arch_cpu_idle+0xe/0x30 [ffffffff810e7bca] cpu_startup_entry+0x14a/0x1c0 [ffffffff81051af6] start_secondary+0x1b6/0x230 Code: 89 e5 41 57 41 56 49 89 f6 41 55 41 89 fd 41 54 49 89 d4 53 48 83 ec 10 89 7d d4 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 be 02 00 00 41 f6 86 02 01 00 00 01 0f 85 58 02 00 00 49 c7 c7 28 19 01 00 RIP [ffffffff810a65f2] __queue_work+0x32/0x3c0 RSP ffff88105df43d48 CR2: 0000000000000102 The solution is to reset the QPs before the device resources are freed. This reset will change the QP state to prevent post sends and delete timers to prevent callbacks. Fixes: 0acb0cc7ecc1 ("IB/rdmavt: Initialize and teardown of qpn table") Link: https://lore.kernel.org/r/20200210131040.87408.38161.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 84 ++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3cdf75d0c7a4..7858d499db03 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -61,6 +61,8 @@ #define RVT_RWQ_COUNT_THRESHOLD 16 static void rvt_rc_timeout(struct timer_list *t); +static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, + enum ib_qp_type type); /* * Convert the AETH RNR timeout code into the number of microseconds. @@ -452,40 +454,41 @@ no_qp_table: } /** - * free_all_qps - check for QPs still in use + * rvt_free_qp_cb - callback function to reset a qp + * @qp: the qp to reset + * @v: a 64-bit value + * + * This function resets the qp and removes it from the + * qp hash table. + */ +static void rvt_free_qp_cb(struct rvt_qp *qp, u64 v) +{ + unsigned int *qp_inuse = (unsigned int *)v; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + /* Reset the qp and remove it from the qp hash list */ + rvt_reset_qp(rdi, qp, qp->ibqp.qp_type); + + /* Increment the qp_inuse count */ + (*qp_inuse)++; +} + +/** + * rvt_free_all_qps - check for QPs still in use * @rdi: rvt device info structure * * There should not be any QPs still in use. * Free memory for table. + * Return the number of QPs still in use. */ static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi) { - unsigned long flags; - struct rvt_qp *qp; - unsigned n, qp_inuse = 0; - spinlock_t *ql; /* work around too long line below */ - - if (rdi->driver_f.free_all_qps) - qp_inuse = rdi->driver_f.free_all_qps(rdi); + unsigned int qp_inuse = 0; qp_inuse += rvt_mcast_tree_empty(rdi); - if (!rdi->qp_dev) - return qp_inuse; - - ql = &rdi->qp_dev->qpt_lock; - spin_lock_irqsave(ql, flags); - for (n = 0; n < rdi->qp_dev->qp_table_size; n++) { - qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n], - lockdep_is_held(ql)); - RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL); + rvt_qp_iter(rdi, (u64)&qp_inuse, rvt_free_qp_cb); - for (; qp; qp = rcu_dereference_protected(qp->next, - lockdep_is_held(ql))) - qp_inuse++; - } - spin_unlock_irqrestore(ql, flags); - synchronize_rcu(); return qp_inuse; } @@ -902,14 +905,14 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, } /** - * rvt_reset_qp - initialize the QP state to the reset state + * _rvt_reset_qp - initialize the QP state to the reset state * @qp: the QP to reset * @type: the QP type * * r_lock, s_hlock, and s_lock are required to be held by the caller */ -static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, - enum ib_qp_type type) +static void _rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, + enum ib_qp_type type) __must_hold(&qp->s_lock) __must_hold(&qp->s_hlock) __must_hold(&qp->r_lock) @@ -955,6 +958,27 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, lockdep_assert_held(&qp->s_lock); } +/** + * rvt_reset_qp - initialize the QP state to the reset state + * @rdi: the device info + * @qp: the QP to reset + * @type: the QP type + * + * This is the wrapper function to acquire the r_lock, s_hlock, and s_lock + * before calling _rvt_reset_qp(). + */ +static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, + enum ib_qp_type type) +{ + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + _rvt_reset_qp(rdi, qp, type); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); +} + /** rvt_free_qpn - Free a qpn from the bit map * @qpt: QP table * @qpn: queue pair number to free @@ -1546,7 +1570,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, switch (new_state) { case IB_QPS_RESET: if (qp->state != IB_QPS_RESET) - rvt_reset_qp(rdi, qp, ibqp->qp_type); + _rvt_reset_qp(rdi, qp, ibqp->qp_type); break; case IB_QPS_RTR: @@ -1695,13 +1719,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); - spin_lock_irq(&qp->r_lock); - spin_lock(&qp->s_hlock); - spin_lock(&qp->s_lock); rvt_reset_qp(rdi, qp, ibqp->qp_type); - spin_unlock(&qp->s_lock); - spin_unlock(&qp->s_hlock); - spin_unlock_irq(&qp->r_lock); wait_event(qp->wait, !atomic_read(&qp->refcount)); /* qpn is now available for use again */ -- cgit v1.2.3 From 8e4473bb50a1796c9c32b244e5dbc5ee24ead937 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 3 Feb 2020 21:28:25 -0500 Subject: ceph: do not execute direct write in parallel if O_APPEND is specified In O_APPEND & O_DIRECT mode, the data from different writers will be possibly overlapping each other since they take the shared lock. For example, both Writer1 and Writer2 are in O_APPEND and O_DIRECT mode: Writer1 Writer2 shared_lock() shared_lock() getattr(CAP_SIZE) getattr(CAP_SIZE) iocb->ki_pos = EOF iocb->ki_pos = EOF write(data1) write(data2) shared_unlock() shared_unlock() The data2 will overlap the data1 from the same file offset, the old EOF. Switch to exclusive lock instead when O_APPEND is specified. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c3b8e8e0bf17..7e0190b1f821 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1418,6 +1418,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; int err, want, got; + bool direct_lock = false; loff_t pos; loff_t limit = max(i_size_read(inode), fsc->max_file_size); @@ -1428,8 +1429,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; + if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) + direct_lock = true; + retry_snap: - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_start_io_direct(inode); else ceph_start_io_write(inode); @@ -1519,14 +1523,15 @@ retry_snap: /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) { + if (iocb->ki_flags & IOCB_DIRECT) written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); - ceph_end_io_direct(inode); - } else { + else written = ceph_sync_write(iocb, &data, pos, snapc); + if (direct_lock) + ceph_end_io_direct(inode); + else ceph_end_io_write(inode); - } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1577,7 +1582,7 @@ retry_snap: goto out_unlocked; out: - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_write(inode); -- cgit v1.2.3 From b27a939e8376a3f1ed09b9c33ef44d20f18ec3d0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 10 Feb 2020 22:51:08 +0100 Subject: ceph: canonicalize server path in place syzbot reported that 4fbc0c711b24 ("ceph: remove the extra slashes in the server path") had caused a regression where an allocation could be done under a spinlock -- compare_mount_options() is called by sget_fc() with sb_lock held. We don't really need the supplied server path, so canonicalize it in place and compare it directly. To make this work, the leading slash is kept around and the logic in ceph_real_mount() to skip it is restored. CEPH_MSG_CLIENT_SESSION now reports the same (i.e. canonicalized) path, with the leading slash of course. Fixes: 4fbc0c711b24 ("ceph: remove the extra slashes in the server path") Reported-by: syzbot+98704a51af8e3d9425a9@syzkaller.appspotmail.com Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- fs/ceph/super.c | 121 +++++++++++++------------------------------------------- fs/ceph/super.h | 2 +- 2 files changed, 29 insertions(+), 94 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1d9f083b8a11..64ea34ac330b 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -202,6 +202,26 @@ struct ceph_parse_opts_ctx { struct ceph_mount_options *opts; }; +/* + * Remove adjacent slashes and then the trailing slash, unless it is + * the only remaining character. + * + * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/". + */ +static void canonicalize_path(char *path) +{ + int i, j = 0; + + for (i = 0; path[i] != '\0'; i++) { + if (path[i] != '/' || j < 1 || path[j - 1] != '/') + path[j++] = path[i]; + } + + if (j > 1 && path[j - 1] == '/') + j--; + path[j] = '\0'; +} + /* * Parse the source parameter. Distinguish the server list from the path. * @@ -224,15 +244,16 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - kfree(fsopt->server_path); - /* * The server_path will include the whole chars from userland * including the leading '/'. */ + kfree(fsopt->server_path); fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); if (!fsopt->server_path) return -ENOMEM; + + canonicalize_path(fsopt->server_path); } else { dev_name_end = dev_name + strlen(dev_name); } @@ -456,73 +477,6 @@ static int strcmp_null(const char *s1, const char *s2) return strcmp(s1, s2); } -/** - * path_remove_extra_slash - Remove the extra slashes in the server path - * @server_path: the server path and could be NULL - * - * Return NULL if the path is NULL or only consists of "/", or a string - * without any extra slashes including the leading slash(es) and the - * slash(es) at the end of the server path, such as: - * "//dir1////dir2///" --> "dir1/dir2" - */ -static char *path_remove_extra_slash(const char *server_path) -{ - const char *path = server_path; - const char *cur, *end; - char *buf, *p; - int len; - - /* if the server path is omitted */ - if (!path) - return NULL; - - /* remove all the leading slashes */ - while (*path == '/') - path++; - - /* if the server path only consists of slashes */ - if (*path == '\0') - return NULL; - - len = strlen(path); - - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return ERR_PTR(-ENOMEM); - - end = path + len; - p = buf; - do { - cur = strchr(path, '/'); - if (!cur) - cur = end; - - len = cur - path; - - /* including one '/' */ - if (cur != end) - len += 1; - - memcpy(p, path, len); - p += len; - - while (cur <= end && *cur == '/') - cur++; - path = cur; - } while (path < end); - - *p = '\0'; - - /* - * remove the last slash if there has and just to make sure that - * we will get something like "dir1/dir2" - */ - if (*(--p) == '/') - *p = '\0'; - - return buf; -} - static int compare_mount_options(struct ceph_mount_options *new_fsopt, struct ceph_options *new_opt, struct ceph_fs_client *fsc) @@ -530,7 +484,6 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, struct ceph_mount_options *fsopt1 = new_fsopt; struct ceph_mount_options *fsopt2 = fsc->mount_options; int ofs = offsetof(struct ceph_mount_options, snapdir_name); - char *p1, *p2; int ret; ret = memcmp(fsopt1, fsopt2, ofs); @@ -540,21 +493,12 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); if (ret) return ret; + ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); if (ret) return ret; - p1 = path_remove_extra_slash(fsopt1->server_path); - if (IS_ERR(p1)) - return PTR_ERR(p1); - p2 = path_remove_extra_slash(fsopt2->server_path); - if (IS_ERR(p2)) { - kfree(p1); - return PTR_ERR(p2); - } - ret = strcmp_null(p1, p2); - kfree(p1); - kfree(p2); + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); if (ret) return ret; @@ -957,7 +901,9 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, mutex_lock(&fsc->client->mount_mutex); if (!fsc->sb->s_root) { - const char *path, *p; + const char *path = fsc->mount_options->server_path ? + fsc->mount_options->server_path + 1 : ""; + err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; @@ -969,22 +915,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto out; } - p = path_remove_extra_slash(fsc->mount_options->server_path); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out; - } - /* if the server path is omitted or just consists of '/' */ - if (!p) - path = ""; - else - path = p; dout("mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); root = open_root_dentry(fsc, path, started); - kfree(p); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1e456a9011bb..037cdfb2ad4f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -91,7 +91,7 @@ struct ceph_mount_options { char *snapdir_name; /* default ".snap" */ char *mds_namespace; /* default NULL */ - char *server_path; /* default "/" */ + char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ }; -- cgit v1.2.3 From 3b20bc2fe4c0cfd82d35838965dc7ff0b93415c6 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 11 Feb 2020 01:53:16 -0500 Subject: ceph: noacl mount option is effectively ignored For the old mount API, the module parameters parseing function will be called in ceph_mount() and also just after the default posix acl flag set, so we can control to enable/disable it via the mount option. But for the new mount API, it will call the module parameters parseing function before ceph_get_tree(), so the posix acl will always be enabled. Fixes: 82995cc6c5ae ("libceph, rbd, ceph: convert to use the new mount API") Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 64ea34ac330b..c7f150686a53 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1032,10 +1032,6 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); -#ifdef CONFIG_CEPH_FS_POSIX_ACL - fc->sb_flags |= SB_POSIXACL; -#endif - /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); pctx->opts = NULL; @@ -1158,6 +1154,10 @@ static int ceph_init_fs_context(struct fs_context *fc) fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + fc->fs_private = pctx; fc->ops = &ceph_context_ops; return 0; -- cgit v1.2.3 From d75a170fd848f037a1e28893ad10be7a4c51f8a6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 11 Feb 2020 17:05:21 +0100 Subject: ALSA: usb-audio: Fix UAC2/3 effect unit parsing We've got a regression report about M-Audio Fast Track C400 device, and the git bisection resulted in the commit e0ccdef92653 ("ALSA: usb-audio: Clean up check_input_term()"). This commit was about the rewrite of the input terminal parser, and it's not too obvious from the change what really broke. The answer is: it's the interpretation of UAC2/3 effect units. In the original code, UAC2 effect unit is as if through UAC1 processing unit because both UAC1 PU and UAC2/3 EU share the same number (0x07). The old code went through a complex switch-case fallthrough, finally bailing out in the middle: if (protocol == UAC_VERSION_2 && hdr[2] == UAC2_EFFECT_UNIT) { /* UAC2/UAC1 unit IDs overlap here in an * uncompatible way. Ignore this unit for now. */ return 0; } ... and this special handling was missing in the new code; the new code treats UAC2/3 effect unit as if it were equivalent with the processing unit. Actually, the old code was too confusing. The effect unit has an incompatible unit description with the processing unit, so we shouldn't have dealt with EU in the same way. This patch addresses the regression by changing the effect unit handling to the own parser function. The own parser function makes the clear distinct with PU, so it improves the readability, too. The EU parser just sets the type and the id like the old kernels. Once when the proper effect unit support is added, we can revisit this parser function, but for now, let's keep this simple setup as is. Fixes: e0ccdef92653 ("ALSA: usb-audio: Clean up check_input_term()") Cc: BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=206147 Link: https://lore.kernel.org/r/20200211160521.31990-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index d659fdb475e2..81b2db0edd5f 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -897,6 +897,15 @@ static int parse_term_proc_unit(struct mixer_build *state, return 0; } +static int parse_term_effect_unit(struct mixer_build *state, + struct usb_audio_term *term, + void *p1, int id) +{ + term->type = UAC3_EFFECT_UNIT << 16; /* virtual type */ + term->id = id; + return 0; +} + static int parse_term_uac2_clock_source(struct mixer_build *state, struct usb_audio_term *term, void *p1, int id) @@ -981,8 +990,7 @@ static int __check_input_term(struct mixer_build *state, int id, UAC3_PROCESSING_UNIT); case PTYPE(UAC_VERSION_2, UAC2_EFFECT_UNIT): case PTYPE(UAC_VERSION_3, UAC3_EFFECT_UNIT): - return parse_term_proc_unit(state, term, p1, id, - UAC3_EFFECT_UNIT); + return parse_term_effect_unit(state, term, p1, id); case PTYPE(UAC_VERSION_1, UAC1_EXTENSION_UNIT): case PTYPE(UAC_VERSION_2, UAC2_EXTENSION_UNIT_V2): case PTYPE(UAC_VERSION_3, UAC3_EXTENSION_UNIT): -- cgit v1.2.3 From 9b9be9e6dcf444de08fc675c61ac8e541b458d56 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 7 Jan 2020 11:38:33 +0100 Subject: ARM: dts: sti: Remove deprecated snps PHY properties for stih410-b2260 Remove "snps,phy-bus-name", "snps,phy-bus-id" and "snps,phy-addr" properties which are deprecated. Signed-off-by: Patrice Chotard --- arch/arm/boot/dts/stih410-b2260.dts | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm/boot/dts/stih410-b2260.dts b/arch/arm/boot/dts/stih410-b2260.dts index 4fbd8e9eb5b7..e2bb59783146 100644 --- a/arch/arm/boot/dts/stih410-b2260.dts +++ b/arch/arm/boot/dts/stih410-b2260.dts @@ -178,9 +178,6 @@ phy-mode = "rgmii"; pinctrl-0 = <&pinctrl_rgmii1 &pinctrl_rgmii1_mdio_1>; - snps,phy-bus-name = "stmmac"; - snps,phy-bus-id = <0>; - snps,phy-addr = <0>; snps,reset-gpio = <&pio0 7 0>; snps,reset-active-low; snps,reset-delays-us = <0 10000 1000000>; -- cgit v1.2.3 From f24667779b5348279e5e4328312a141a730a1fc7 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 16 Dec 2019 11:08:47 +0900 Subject: ARM: dts: sti: fixup sound frame-inversion for stihxxx-b2120.dtsi frame-inversion is "flag" not "uint32". This patch fixup it. Signed-off-by: Kuninori Morimoto Reviewed-by: Patrice Chotard Signed-off-by: Patrice Chotard --- arch/arm/boot/dts/stihxxx-b2120.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/stihxxx-b2120.dtsi b/arch/arm/boot/dts/stihxxx-b2120.dtsi index 60e11045ad76..d051f080e52e 100644 --- a/arch/arm/boot/dts/stihxxx-b2120.dtsi +++ b/arch/arm/boot/dts/stihxxx-b2120.dtsi @@ -46,7 +46,7 @@ /* DAC */ format = "i2s"; mclk-fs = <256>; - frame-inversion = <1>; + frame-inversion; cpu { sound-dai = <&sti_uni_player2>; }; -- cgit v1.2.3 From 7aa62404dfdefd759e6a5f8923a0b0704729295f Mon Sep 17 00:00:00 2001 From: Roman Li Date: Tue, 14 Jan 2020 13:56:08 -0500 Subject: drm/amd/display: Fix psr static frames calculation [Why] Driver crash with psr feature enabled due to divide-by-zero error. This is a regression after rework to calculate static screen frame number entry time. [How] Correct order of operations to avoid divide-by-zero. Signed-off-by: Roman Li Reviewed-by: Zhan Liu Acked-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 279541517a99..63e8a12a74bc 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8408,7 +8408,6 @@ bool amdgpu_dm_psr_enable(struct dc_stream_state *stream) /* Calculate number of static frames before generating interrupt to * enter PSR. */ - unsigned int frame_time_microsec = 1000000 / vsync_rate_hz; // Init fail safe of 2 frames static unsigned int num_frames_static = 2; @@ -8423,8 +8422,10 @@ bool amdgpu_dm_psr_enable(struct dc_stream_state *stream) * Calculate number of frames such that at least 30 ms of time has * passed. */ - if (vsync_rate_hz != 0) + if (vsync_rate_hz != 0) { + unsigned int frame_time_microsec = 1000000 / vsync_rate_hz; num_frames_static = (30000 / frame_time_microsec) + 1; + } params.triggers.cursor_update = true; params.triggers.overlay_update = true; -- cgit v1.2.3 From df36f6cf23ada812930afa8ee76681d4ad307c61 Mon Sep 17 00:00:00 2001 From: Sung Lee Date: Wed, 15 Jan 2020 11:55:06 -0500 Subject: drm/amd/display: Do not set optimized_require to false after plane disable [WHY] The optimized_require flag is needed to set watermarks and clocks lower in certain conditions. This flag is set to true and then set to false while programming front end in dcn20. [HOW] Do not set the flag to false while disabling plane. Signed-off-by: Sung Lee Reviewed-by: Tony Cheng Acked-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c index cfbbaffa8654..a444fed94184 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c @@ -572,7 +572,6 @@ void dcn20_plane_atomic_disable(struct dc *dc, struct pipe_ctx *pipe_ctx) dpp->funcs->dpp_dppclk_control(dpp, false, false); hubp->power_gated = true; - dc->optimized_required = false; /* We're powering off, no need to optimize */ hws->funcs.plane_atomic_power_down(dc, pipe_ctx->plane_res.dpp, -- cgit v1.2.3 From aad927b5a863178a9d921044e52d66e0ccf0aff9 Mon Sep 17 00:00:00 2001 From: Sung Lee Date: Mon, 20 Jan 2020 18:58:45 -0500 Subject: drm/amd/display: Use dcfclk to populate watermark ranges [WHY & HOW] Previously drain clk was unconstrained and fill clk was constrained on fclk. We want to change it to fill clk unconstrained and drain clock constrained to dcfclk. Signed-off-by: Sung Lee Reviewed-by: Tony Cheng Acked-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c index 7ae4c06232dd..034a5852a416 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c @@ -412,19 +412,19 @@ void build_watermark_ranges(struct clk_bw_params *bw_params, struct pp_smu_wm_ra ranges->reader_wm_sets[num_valid_sets].wm_inst = bw_params->wm_table.entries[i].wm_inst; ranges->reader_wm_sets[num_valid_sets].wm_type = bw_params->wm_table.entries[i].wm_type; - /* We will not select WM based on dcfclk, so leave it as unconstrained */ - ranges->reader_wm_sets[num_valid_sets].min_drain_clk_mhz = PP_SMU_WM_SET_RANGE_CLK_UNCONSTRAINED_MIN; - ranges->reader_wm_sets[num_valid_sets].max_drain_clk_mhz = PP_SMU_WM_SET_RANGE_CLK_UNCONSTRAINED_MAX; - /* fclk wil be used to select WM*/ + /* We will not select WM based on fclk, so leave it as unconstrained */ + ranges->reader_wm_sets[num_valid_sets].min_fill_clk_mhz = PP_SMU_WM_SET_RANGE_CLK_UNCONSTRAINED_MIN; + ranges->reader_wm_sets[num_valid_sets].max_fill_clk_mhz = PP_SMU_WM_SET_RANGE_CLK_UNCONSTRAINED_MAX; + /* dcfclk wil be used to select WM*/ if (ranges->reader_wm_sets[num_valid_sets].wm_type == WM_TYPE_PSTATE_CHG) { if (i == 0) - ranges->reader_wm_sets[num_valid_sets].min_fill_clk_mhz = 0; + ranges->reader_wm_sets[num_valid_sets].min_drain_clk_mhz = 0; else { /* add 1 to make it non-overlapping with next lvl */ - ranges->reader_wm_sets[num_valid_sets].min_fill_clk_mhz = bw_params->clk_table.entries[i - 1].fclk_mhz + 1; + ranges->reader_wm_sets[num_valid_sets].min_drain_clk_mhz = bw_params->clk_table.entries[i - 1].dcfclk_mhz + 1; } - ranges->reader_wm_sets[num_valid_sets].max_fill_clk_mhz = bw_params->clk_table.entries[i].fclk_mhz; + ranges->reader_wm_sets[num_valid_sets].max_drain_clk_mhz = bw_params->clk_table.entries[i].dcfclk_mhz; } else { /* unconstrained for memory retraining */ -- cgit v1.2.3 From a72f4ac1d778f7bde93dfee69bfc23377ec3d74f Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 26 Jan 2020 19:15:00 +0200 Subject: RDMA/core: Fix invalid memory access in spec_filter_size Add a check that the size specified in the flow spec header doesn't cause an overflow when calculating the filter size, and thus prevent access to invalid memory. The following crash from syzkaller revealed it. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 17834 Comm: syz-executor.3 Not tainted 5.5.0-rc5 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 RIP: 0010:memchr_inv+0xd3/0x330 Code: 89 f9 89 f5 83 e1 07 0f 85 f9 00 00 00 49 89 d5 49 c1 ed 03 45 85 ed 74 6f 48 89 d9 48 b8 00 00 00 00 00 fc ff df 48 c1 e9 03 <80> 3c 01 00 0f 85 0d 02 00 00 44 0f b6 e5 48 b8 01 01 01 01 01 01 RSP: 0018:ffffc9000a13fa50 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 7fff88810de9d820 RCX: 0ffff11021bd3b04 RDX: 000000000000fff8 RSI: 0000000000000000 RDI: 7fff88810de9d820 RBP: 0000000000000000 R08: ffff888110d69018 R09: 0000000000000009 R10: 0000000000000001 R11: ffffed10236267cc R12: 0000000000000004 R13: 0000000000001fff R14: ffff88810de9d820 R15: 0000000000000040 FS: 00007f9ee0e51700(0000) GS:ffff88811b100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000115ea0006 CR4: 0000000000360ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: spec_filter_size.part.16+0x34/0x50 ib_uverbs_kern_spec_to_ib_spec_filter+0x691/0x770 ib_uverbs_ex_create_flow+0x9ea/0x1b40 ib_uverbs_write+0xaa5/0xdf0 __vfs_write+0x7c/0x100 vfs_write+0x168/0x4a0 ksys_write+0xc8/0x200 do_syscall_64+0x9c/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x465b49 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f9ee0e50c58 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000465b49 RDX: 00000000000003a0 RSI: 00000000200007c0 RDI: 0000000000000004 RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f9ee0e516bc R13: 00000000004ca2da R14: 000000000070deb8 R15: 00000000ffffffff Modules linked in: Dumping ftrace buffer: (ftrace buffer empty) Fixes: 94e03f11ad1f ("IB/uverbs: Add support for flow tag") Link: https://lore.kernel.org/r/20200126171500.4623-1-leon@kernel.org Signed-off-by: Avihai Horon Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c8693f5231dd..025933752e1d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2745,12 +2745,6 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, return 0; } -static size_t kern_spec_filter_sz(const struct ib_uverbs_flow_spec_hdr *spec) -{ - /* Returns user space filter size, includes padding */ - return (spec->size - sizeof(struct ib_uverbs_flow_spec_hdr)) / 2; -} - static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size, u16 ib_real_filter_sz) { @@ -2894,11 +2888,16 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { - ssize_t kern_filter_sz; + size_t kern_filter_sz; void *kern_spec_mask; void *kern_spec_val; - kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); + if (check_sub_overflow((size_t)kern_spec->hdr.size, + sizeof(struct ib_uverbs_flow_spec_hdr), + &kern_filter_sz)) + return -EINVAL; + + kern_filter_sz /= 2; kern_spec_val = (void *)kern_spec + sizeof(struct ib_uverbs_flow_spec_hdr); -- cgit v1.2.3 From 10189e8e6fe8dcde13435f9354800429c4474fb1 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 26 Jan 2020 19:17:08 +0200 Subject: IB/mlx5: Return failure when rts2rts_qp_counters_set_id is not supported When binding a QP with a counter and the QP state is not RESET, return failure if the rts2rts_qp_counters_set_id is not supported by the device. This is to prevent cases like manual bind for Connect-IB devices from returning success when the feature is not supported. Fixes: d14133dd4161 ("IB/mlx5: Support set qp counter") Link: https://lore.kernel.org/r/20200126171708.5167-1-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a4f8e7030787..957f3a52589b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3441,9 +3441,6 @@ static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, struct mlx5_ib_qp_base *base; u32 set_id; - if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id)) - return 0; - if (counter) set_id = counter->id; else @@ -6576,6 +6573,7 @@ void mlx5_ib_drain_rq(struct ib_qp *qp) */ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter) { + struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); int err = 0; @@ -6585,6 +6583,11 @@ int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter) goto out; } + if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id)) { + err = -EOPNOTSUPP; + goto out; + } + if (mqp->state == IB_QPS_RTS) { err = __mlx5_ib_qp_set_counter(qp, counter); if (!err) -- cgit v1.2.3 From d219face9059f38ad187bde133451a2a308fdb7c Mon Sep 17 00:00:00 2001 From: Krishnamraju Eraparaju Date: Tue, 4 Feb 2020 14:42:30 +0530 Subject: RDMA/iw_cxgb4: initiate CLOSE when entering TERM As per draft-hilland-iwarp-verbs-v1.0, sec 6.2.3, always initiate a CLOSE when entering into TERM state. In c4iw_modify_qp(), disconnect operation should only be performed when the modify_qp call is invoked from ib_core. And all other internal modify_qp calls(invoked within iw_cxgb4) that needs 'disconnect' should call c4iw_ep_disconnect() explicitly after modify_qp. Otherwise, deadlocks like below can occur: Call Trace: schedule+0x2f/0xa0 schedule_preempt_disabled+0xa/0x10 __mutex_lock.isra.5+0x2d0/0x4a0 c4iw_ep_disconnect+0x39/0x430 => tries to reacquire ep lock again c4iw_modify_qp+0x468/0x10d0 rx_data+0x218/0x570 => acquires ep lock process_work+0x5f/0x70 process_one_work+0x1a7/0x3b0 worker_thread+0x30/0x390 kthread+0x112/0x130 ret_from_fork+0x35/0x40 Fixes: d2c33370ae73 ("RDMA/iw_cxgb4: Always disconnect when QP is transitioning to TERMINATE state") Link: https://lore.kernel.org/r/20200204091230.7210-1-krishna2@chelsio.com Signed-off-by: Krishnamraju Eraparaju Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 4 ++++ drivers/infiniband/hw/cxgb4/qp.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index ee1182f9b627..d69dece3b1d5 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3036,6 +3036,10 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } + /* As per draft-hilland-iwarp-verbs-v1.0, sec 6.2.3, + * when entering the TERM state the RNIC MUST initiate a CLOSE. + */ + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); } else pr_warn("TERM received tid %u no ep/qp\n", tid); diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index bbcac539777a..89ac2f9ae6dd 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1948,10 +1948,10 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, qhp->attr.layer_etype = attrs->layer_etype; qhp->attr.ecode = attrs->ecode; ep = qhp->ep; - c4iw_get_ep(&ep->com); - disconnect = 1; if (!internal) { + c4iw_get_ep(&ep->com); terminate = 1; + disconnect = 1; } else { terminate = qhp->attr.send_term; ret = rdma_fini(rhp, qhp, ep); -- cgit v1.2.3 From 663218a3e715fd9339d143a3e10088316b180f4f Mon Sep 17 00:00:00 2001 From: Krishnamraju Eraparaju Date: Fri, 7 Feb 2020 19:44:29 +0530 Subject: RDMA/siw: Remove unwanted WARN_ON in siw_cm_llp_data_ready() Warnings like below can fill up the dmesg while disconnecting RDMA connections. Hence, remove the unwanted WARN_ON. WARNING: CPU: 6 PID: 0 at drivers/infiniband/sw/siw/siw_cm.c:1229 siw_cm_llp_data_ready+0xc1/0xd0 [siw] RIP: 0010:siw_cm_llp_data_ready+0xc1/0xd0 [siw] Call Trace: tcp_data_queue+0x226/0xb40 tcp_rcv_established+0x220/0x620 tcp_v4_do_rcv+0x12a/0x1e0 tcp_v4_rcv+0xb05/0xc00 ip_local_deliver_finish+0x69/0x210 ip_local_deliver+0x6b/0xe0 ip_rcv+0x273/0x362 __netif_receive_skb_core+0xb35/0xc30 netif_receive_skb_internal+0x3d/0xb0 napi_gro_frags+0x13b/0x200 t4_ethrx_handler+0x433/0x7d0 [cxgb4] process_responses+0x318/0x580 [cxgb4] napi_rx_handler+0x14/0x100 [cxgb4] net_rx_action+0x149/0x3b0 __do_softirq+0xe3/0x30a irq_exit+0x100/0x110 do_IRQ+0x7f/0xe0 common_interrupt+0xf/0xf Link: https://lore.kernel.org/r/20200207141429.27927-1-krishna2@chelsio.com Signed-off-by: Krishnamraju Eraparaju Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 0c3f0588346e..c5651a96b196 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1225,10 +1225,9 @@ static void siw_cm_llp_data_ready(struct sock *sk) read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); - if (!cep) { - WARN_ON(1); + if (!cep) goto out; - } + siw_dbg_cep(cep, "state: %d\n", cep->state); switch (cep->state) { -- cgit v1.2.3 From 93f9d1a4ac5930654c17412e3911b46ece73755a Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Tue, 11 Feb 2020 11:22:35 -0500 Subject: ALSA: usb-audio: Apply sample rate quirk for Audioengine D1 The Audioengine D1 (0x2912:0x30c8) does support reading the sample rate, but it returns the rate in byte-reversed order. When setting sampling rate, the driver produces these warning messages: [168840.944226] usb 3-2.2: current rate 4500480 is different from the runtime rate 44100 [168854.930414] usb 3-2.2: current rate 8436480 is different from the runtime rate 48000 [168905.185825] usb 3-2.1.2: current rate 30465 is different from the runtime rate 96000 As can be seen from the hexadecimal conversion, the current rate read back is byte-reversed from the rate that was set. 44100 == 0x00ac44, 4500480 == 0x44ac00 48000 == 0x00bb80, 8436480 == 0x80bb00 96000 == 0x017700, 30465 == 0x007701 Rather than implementing a new quirk to reverse the order, just skip checking the rate to avoid spamming the log. Signed-off-by: Arvind Sankar Cc: Link: https://lore.kernel.org/r/20200211162235.1639889-1-nivedita@alum.mit.edu Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 3a5242e383b2..7f558f4b4520 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1440,6 +1440,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x1395, 0x740a): /* Sennheiser DECT */ case USB_ID(0x1901, 0x0191): /* GE B850V3 CP2114 audio interface */ case USB_ID(0x21b4, 0x0081): /* AudioQuest DragonFly */ + case USB_ID(0x2912, 0x30c8): /* Audioengine D1 */ return true; } -- cgit v1.2.3 From 11f0446534679e0a77441a19a65ed8b4a3d475f0 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 5 Feb 2020 09:41:42 +0100 Subject: s390/qdio: don't allocate *aob array with GFP_ATOMIC The only way to reach this allocation is via qdio_establish() qdio_detect_hsicq() qdio_enable_async_operation() and since qdio_establish() uses wait_event_*() just a few lines ealier, we can trust that it certainly is never called from atomic context. Signed-off-by: Julian Wiedmann Reviewed-by: Steffen Maier Signed-off-by: Vasily Gorbik --- drivers/s390/cio/qdio_setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index dc430bd86ade..3ab8e80d7bbc 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -536,7 +536,7 @@ void qdio_print_subchannel_info(struct qdio_irq *irq_ptr, int qdio_enable_async_operation(struct qdio_output_q *outq) { outq->aobs = kcalloc(QDIO_MAX_BUFFERS_PER_Q, sizeof(struct qaob *), - GFP_ATOMIC); + GFP_KERNEL); if (!outq->aobs) { outq->use_cq = 0; return -ENOMEM; -- cgit v1.2.3 From 27dc0700c3be7c681cea03c5230b93d02f623492 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 10 Feb 2020 11:27:37 -0500 Subject: s390/uv: Fix handling of length extensions The query parameter block might contain additional information and can be extended in the future. If the size of the block does not suffice we get an error code of rc=0x100. The buffer will contain all information up to the specified size and the hypervisor/guest simply do not need the additional information as they do not know about the new data. That means that we can (and must) accept rc=0x100 as success. Cc: stable@vger.kernel.org Reviewed-by: Cornelia Huck Fixes: 5abb9351dfd9 ("s390/uv: introduce guest side ultravisor code") Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/boot/uv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index ed007f4a6444..3f501159ee9f 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -15,7 +15,8 @@ void uv_query_info(void) if (!test_facility(158)) return; - if (uv_call(0, (uint64_t)&uvcb)) + /* rc==0x100 means that there is additional data we do not process */ + if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100) return; if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && -- cgit v1.2.3 From 0f8a206df7c920150d2aa45574fba0ab7ff6be4f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 8 Feb 2020 07:08:59 -0700 Subject: s390/time: Fix clk type in get_tod_clock Clang warns: In file included from ../arch/s390/boot/startup.c:3: In file included from ../include/linux/elf.h:5: In file included from ../arch/s390/include/asm/elf.h:132: In file included from ../include/linux/compat.h:10: In file included from ../include/linux/time.h:74: In file included from ../include/linux/time32.h:13: In file included from ../include/linux/timex.h:65: ../arch/s390/include/asm/timex.h:160:20: warning: passing 'unsigned char [16]' to parameter of type 'char *' converts between pointers to integer types with different sign [-Wpointer-sign] get_tod_clock_ext(clk); ^~~ ../arch/s390/include/asm/timex.h:149:44: note: passing argument to parameter 'clk' here static inline void get_tod_clock_ext(char *clk) ^ Change clk's type to just be char so that it matches what happens in get_tod_clock_ext. Fixes: 57b28f66316d ("[S390] s390_hypfs: Add new attributes") Link: https://github.com/ClangBuiltLinux/linux/issues/861 Link: http://lkml.kernel.org/r/20200208140858.47970-1-natechancellor@gmail.com Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/timex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 670f14a228e5..6bf3a45ccfec 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -155,7 +155,7 @@ static inline void get_tod_clock_ext(char *clk) static inline unsigned long long get_tod_clock(void) { - unsigned char clk[STORE_CLOCK_EXT_SIZE]; + char clk[STORE_CLOCK_EXT_SIZE]; get_tod_clock_ext(clk); return *((unsigned long long *)&clk[1]); -- cgit v1.2.3 From c452833387624d1990c9bbb0ee1e98c10c147478 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Feb 2020 11:58:56 -0300 Subject: tools include UAPI: Sync x86's syscalls_64.tbl, generic unistd.h and fcntl.h to pick up openat2 and pidfd_getfd fddb5d430ad9 ("open: introduce openat2(2) syscall") 9a2cef09c801 ("arch: wire up pidfd_getfd syscall") We also need to grab a copy of uapi/linux/openat2.h since it is now needed by fcntl.h, add it to tools/perf/check_headers.h. $ diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl --- tools/perf/arch/x86/entry/syscalls/syscall_64.tbl 2019-12-20 16:43:57.662429958 -0300 +++ arch/x86/entry/syscalls/syscall_64.tbl 2020-02-10 16:36:22.070012468 -0300 @@ -357,6 +357,8 @@ 433 common fspick __x64_sys_fspick 434 common pidfd_open __x64_sys_pidfd_open 435 common clone3 __x64_sys_clone3/ptregs +437 common openat2 __x64_sys_openat2 +438 common pidfd_getfd __x64_sys_pidfd_getfd # # x32-specific system call numbers start at 512 to avoid cache impact $ Update tools/'s copy of that file: $ cp arch/x86/entry/syscalls/syscall_64.tbl tools/perf/arch/x86/entry/syscalls/syscall_64.tbl See the result: $ diff -u /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c.before /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c --- /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c.before 2020-02-10 16:42:59.010636041 -0300 +++ /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c 2020-02-10 16:43:24.149958337 -0300 @@ -346,5 +346,7 @@ [433] = "fspick", [434] = "pidfd_open", [435] = "clone3", + [437] = "openat2", + [438] = "pidfd_getfd", }; -#define SYSCALLTBL_x86_64_MAX_ID 435 +#define SYSCALLTBL_x86_64_MAX_ID 438 $ Now one can use: perf trace -e openat2,pidfd_getfd To get just those syscalls or use in things like: perf trace -e open* To get all the open variant (open, openat, openat2, etc) or: perf trace pidfd* To get the pidfd syscalls. Cc: Adrian Hunter Cc: Aleksa Sarai Cc: Al Viro Cc: Christian Brauner Cc: Jiri Olsa Cc: Namhyung Kim Cc: Sargun Dhillon Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 7 +++- tools/include/uapi/linux/fcntl.h | 2 +- tools/include/uapi/linux/openat2.h | 39 +++++++++++++++++++++++ tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 2 ++ tools/perf/check-headers.sh | 1 + 5 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 tools/include/uapi/linux/openat2.h diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 1fc8faa6e973..3a3201e4618e 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -851,8 +851,13 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open) __SYSCALL(__NR_clone3, sys_clone3) #endif +#define __NR_openat2 437 +__SYSCALL(__NR_openat2, sys_openat2) +#define __NR_pidfd_getfd 438 +__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) + #undef __NR_syscalls -#define __NR_syscalls 436 +#define __NR_syscalls 439 /* * 32 bit systems traditionally used different diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h index 1f97b33c840e..ca88b7bce553 100644 --- a/tools/include/uapi/linux/fcntl.h +++ b/tools/include/uapi/linux/fcntl.h @@ -3,6 +3,7 @@ #define _UAPI_LINUX_FCNTL_H #include +#include #define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) #define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) @@ -100,5 +101,4 @@ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ - #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/tools/include/uapi/linux/openat2.h b/tools/include/uapi/linux/openat2.h new file mode 100644 index 000000000000..58b1eb711360 --- /dev/null +++ b/tools/include/uapi/linux/openat2.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_OPENAT2_H +#define _UAPI_LINUX_OPENAT2_H + +#include + +/* + * Arguments for how openat2(2) should open the target path. If only @flags and + * @mode are non-zero, then openat2(2) operates very similarly to openat(2). + * + * However, unlike openat(2), unknown or invalid bits in @flags result in + * -EINVAL rather than being silently ignored. @mode must be zero unless one of + * {O_CREAT, O_TMPFILE} are set. + * + * @flags: O_* flags. + * @mode: O_CREAT/O_TMPFILE file mode. + * @resolve: RESOLVE_* flags. + */ +struct open_how { + __u64 flags; + __u64 mode; + __u64 resolve; +}; + +/* how->resolve flags for openat2(2). */ +#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings + (includes bind-mounts). */ +#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style + "magic-links". */ +#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks + (implies OEXT_NO_MAGICLINKS) */ +#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like + "..", symlinks, and absolute + paths which escape the dirfd. */ +#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." + be scoped inside the dirfd + (similar to chroot(2)). */ + +#endif /* _UAPI_LINUX_OPENAT2_H */ diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index c29976eca4a8..44d510bc9b78 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -357,6 +357,8 @@ 433 common fspick __x64_sys_fspick 434 common pidfd_open __x64_sys_pidfd_open 435 common clone3 __x64_sys_clone3/ptregs +437 common openat2 __x64_sys_openat2 +438 common pidfd_getfd __x64_sys_pidfd_getfd # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 68039a96c1dc..bfb21d049e6c 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -13,6 +13,7 @@ include/uapi/linux/kcmp.h include/uapi/linux/kvm.h include/uapi/linux/in.h include/uapi/linux/mount.h +include/uapi/linux/openat2.h include/uapi/linux/perf_event.h include/uapi/linux/prctl.h include/uapi/linux/sched.h -- cgit v1.2.3 From 02213cec64bbef66d7ad9ddc3b7c47236da64343 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 10 Feb 2020 15:32:15 +0100 Subject: perf maps: Mark module DSOs with kernel type We add kernel module map into machine->kmaps, so it needs to be created as 'struct kmap', which is dependent on its dso having kernel type. Reported-by: Ravi Bangoria Signed-off-by: Jiri Olsa Tested-by: Kim Phillips Tested-by: Ravi Bangoria Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200210143218.24948-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index c8c5410315e8..e3e5490f6de5 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -686,6 +686,7 @@ static struct dso *machine__findnew_module_dso(struct machine *machine, dso__set_module_info(dso, m, machine); dso__set_long_name(dso, strdup(filename), true); + dso->kernel = DSO_TYPE_KERNEL; } dso__get(dso); -- cgit v1.2.3 From 4a4eb6154d67f7766cc7eb74e9f1db424073e832 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 10 Feb 2020 21:08:47 +0100 Subject: perf maps: Mark ksymbol DSOs with kernel type We add ksymbol map into machine->kmaps, so it needs to be created as 'struct kmap', which is dependent on its dso having kernel type. Reported-by: Ravi Bangoria Signed-off-by: Jiri Olsa Tested-by: Ravi Bangoria Tested-by: Kim Phillips Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200210200847.GA36715@krava Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index e3e5490f6de5..0ad026561c7f 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -727,9 +727,17 @@ static int machine__process_ksymbol_register(struct machine *machine, struct map *map = maps__find(&machine->kmaps, event->ksymbol.addr); if (!map) { - map = dso__new_map(event->ksymbol.name); - if (!map) + struct dso *dso = dso__new(event->ksymbol.name); + + if (dso) { + dso->kernel = DSO_TYPE_KERNEL; + map = map__new2(0, dso); + } + + if (!dso || !map) { + dso__put(dso); return -ENOMEM; + } map->start = event->ksymbol.addr; map->end = map->start + event->ksymbol.len; -- cgit v1.2.3 From 7ce66139a99ce57caaf47b64afed5cb6ed02c5ed Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 10 Feb 2020 15:32:17 +0100 Subject: perf maps: Fix map__clone() for struct kmap The map__clone() function can be called on kernel maps as well, so it needs to duplicate the whole kmap data. Reported-by: Ravi Bangoria Signed-off-by: Jiri Olsa Tested-by: Ravi Bangoria Tested-by: Kim Phillips Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200210143218.24948-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/map.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index f67960bedebb..cea05fc9595c 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -375,8 +375,13 @@ struct symbol *map__find_symbol_by_name(struct map *map, const char *name) struct map *map__clone(struct map *from) { - struct map *map = memdup(from, sizeof(*map)); + size_t size = sizeof(struct map); + struct map *map; + + if (from->dso && from->dso->kernel) + size += sizeof(struct kmap); + map = memdup(from, size); if (map != NULL) { refcount_set(&map->refcnt, 1); RB_CLEAR_NODE(&map->rb_node); -- cgit v1.2.3 From 484214f49bd0948d716832a94e4737ca4dd02c16 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 10 Feb 2020 15:32:18 +0100 Subject: perf maps: Move kmap::kmaps setup to maps__insert() So the kmaps pointer setup is centralized and we do not need to update it in all those places (2 current places and few more missing) after calling maps__insert(). Reported-by: Ravi Bangoria Signed-off-by: Jiri Olsa Tested-by: Ravi Bangoria Tested-by: Kim Phillips Cc: Alexander Shishkin Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20200210143218.24948-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 13 +------------ tools/perf/util/map.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 0ad026561c7f..fb5c2cd44d30 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -981,7 +981,6 @@ int machine__create_extra_kernel_map(struct machine *machine, kmap = map__kmap(map); - kmap->kmaps = &machine->kmaps; strlcpy(kmap->name, xm->name, KMAP_NAME_LEN); maps__insert(&machine->kmaps, map); @@ -1091,9 +1090,6 @@ int __weak machine__create_extra_kernel_maps(struct machine *machine __maybe_unu static int __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) { - struct kmap *kmap; - struct map *map; - /* In case of renewal the kernel map, destroy previous one */ machine__destroy_kernel_maps(machine); @@ -1102,14 +1098,7 @@ __machine__create_kernel_maps(struct machine *machine, struct dso *kernel) return -1; machine->vmlinux_map->map_ip = machine->vmlinux_map->unmap_ip = identity__map_ip; - map = machine__kernel_map(machine); - kmap = map__kmap(map); - if (!kmap) - return -1; - - kmap->kmaps = &machine->kmaps; - maps__insert(&machine->kmaps, map); - + maps__insert(&machine->kmaps, machine->vmlinux_map); return 0; } diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index cea05fc9595c..a08ca276098e 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -543,6 +543,16 @@ void maps__insert(struct maps *maps, struct map *map) __maps__insert(maps, map); ++maps->nr_maps; + if (map->dso && map->dso->kernel) { + struct kmap *kmap = map__kmap(map); + + if (kmap) + kmap->kmaps = maps; + else + pr_err("Internal error: kernel dso with non kernel map\n"); + } + + /* * If we already performed some search by name, then we need to add the just * inserted map and resort. -- cgit v1.2.3 From c75bec79fc080039e4575a0f239ea7b111aabe88 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Feb 2020 15:19:42 -0300 Subject: tools headers UAPI: Sync copy of arm64's asm/unistd.h with the kernel sources To get the changes in: 3e3c8ca5a351 ("arm64: Move __ARCH_WANT_SYS_CLONE3 definition to uapi headers") Silencing this tools/perf/ build warning: Warning: Kernel ABI header at 'tools/arch/arm64/include/uapi/asm/unistd.h' differs from latest version at 'arch/arm64/include/uapi/asm/unistd.h' diff -u tools/arch/arm64/include/uapi/asm/unistd.h arch/arm64/include/uapi/asm/unistd.h Which will probably end up enabling the use of "clone3" in 'perf trace -e', haven't checked the build with this change on an arm64 system. Cc: Adrian Hunter Cc: Amanieu d'Antras Cc: Christian Brauner Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/unistd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/arch/arm64/include/uapi/asm/unistd.h b/tools/arch/arm64/include/uapi/asm/unistd.h index 4703d218663a..f83a70e07df8 100644 --- a/tools/arch/arm64/include/uapi/asm/unistd.h +++ b/tools/arch/arm64/include/uapi/asm/unistd.h @@ -19,5 +19,6 @@ #define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_TIME32_SYSCALLS +#define __ARCH_WANT_SYS_CLONE3 #include -- cgit v1.2.3 From fc9199d46e644e6a978f69195cb849b21d2f485c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Feb 2020 15:25:21 -0300 Subject: tools headers UAPI: Sync prctl.h with the kernel sources To get the changes in: 8d19f1c8e193 ("prctl: PR_{G,S}ET_IO_FLUSHER to support controlling memory reclaim") Which ends up having this effect in tooling, i.e. the addition of the support to those prctl's options: $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h $ git diff diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 7da1b37b27aa..07b4f8131e36 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -234,4 +234,8 @@ struct prctl_mm_map { #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* Control reclaim behavior when allocating memory */ +#define PR_SET_IO_FLUSHER 57 +#define PR_GET_IO_FLUSHER 58 + #endif /* _LINUX_PRCTL_H */ $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after --- before 2020-02-11 15:24:35.339289912 -0300 +++ after 2020-02-11 15:24:56.319711315 -0300 @@ -51,6 +51,8 @@ [54] = "PAC_RESET_KEYS", [55] = "SET_TAGGED_ADDR_CTRL", [56] = "GET_TAGGED_ADDR_CTRL", + [57] = "SET_IO_FLUSHER", + [58] = "GET_IO_FLUSHER", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", $ Cc: Adrian Hunter Cc: Christian Brauner Cc: Jiri Olsa Cc: Mike Christie Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 7da1b37b27aa..07b4f8131e36 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -234,4 +234,8 @@ struct prctl_mm_map { #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* Control reclaim behavior when allocating memory */ +#define PR_SET_IO_FLUSHER 57 +#define PR_GET_IO_FLUSHER 58 + #endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3 From c0134b3366ba5f0aba41d56006b574d3be7f5ed3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Feb 2020 15:46:10 -0300 Subject: perf beauty prctl: Export the 'options' strarray So that we can use it with strtoul, allowing string to number conversions in filter expressions. Cc: Adrian Hunter Cc: Christian Brauner Cc: Jiri Olsa Cc: Mike Christie Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/beauty.h | 2 ++ tools/perf/trace/beauty/prctl.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h index 5a61043c2ff7..d6dfe68a7612 100644 --- a/tools/perf/trace/beauty/beauty.h +++ b/tools/perf/trace/beauty/beauty.h @@ -213,6 +213,8 @@ size_t syscall_arg__scnprintf_x86_arch_prctl_code(char *bf, size_t size, struct size_t syscall_arg__scnprintf_prctl_option(char *bf, size_t size, struct syscall_arg *arg); #define SCA_PRCTL_OPTION syscall_arg__scnprintf_prctl_option +extern struct strarray strarray__prctl_options; + size_t syscall_arg__scnprintf_prctl_arg2(char *bf, size_t size, struct syscall_arg *arg); #define SCA_PRCTL_ARG2 syscall_arg__scnprintf_prctl_arg2 diff --git a/tools/perf/trace/beauty/prctl.c b/tools/perf/trace/beauty/prctl.c index ba2179abed00..6fe5ad5f5d3a 100644 --- a/tools/perf/trace/beauty/prctl.c +++ b/tools/perf/trace/beauty/prctl.c @@ -11,9 +11,10 @@ #include "trace/beauty/generated/prctl_option_array.c" +DEFINE_STRARRAY(prctl_options, "PR_"); + static size_t prctl__scnprintf_option(int option, char *bf, size_t size, bool show_prefix) { - static DEFINE_STRARRAY(prctl_options, "PR_"); return strarray__scnprintf(&strarray__prctl_options, bf, size, "%d", show_prefix, option); } -- cgit v1.2.3 From d7a07b293216e5561705303751bc0d213e9fb328 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Feb 2020 15:54:08 -0300 Subject: perf trace: Resolve prctl's 'option' arg strings to numbers # perf trace -e syscalls:sys_enter_prctl --filter="option==SET_NAME" 0.000 Socket Thread/3860 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7fc50b9733e8) 0.053 SSL Cert #78/3860 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7fc50b9733e8) ^C # If one uses '-v' with 'perf trace', we can see the filter it puts in place: New filter for syscalls:sys_enter_prctl: (option==0xf) && (common_pid != 3859 && common_pid != 2757) We still need to allow using plain '-e prctl' and have this turn into creating a 'syscalls:sys_enter_prctl' event so that the filter can be applied only to it as right now '-e prctl' ends up using the 'raw_syscalls:sys_enter/sys_exit'. The end goal is to have something like: # perf trace -e prctl/option==SET_NAME/ And have that use tracepoint filters or eBPF ones. Cc: Adrian Hunter Cc: Christian Brauner Cc: Jiri Olsa Cc: Mike Christie Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 46a72ecac427..01d542007c8b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1065,7 +1065,9 @@ static struct syscall_fmt syscall_fmts[] = { { .name = "poll", .timeout = true, }, { .name = "ppoll", .timeout = true, }, { .name = "prctl", - .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ }, + .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ + .strtoul = STUL_STRARRAY, + .parm = &strarray__prctl_options, }, [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ }, [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, }, { .name = "pread", .alias = "pread64", }, -- cgit v1.2.3 From d6d829d92c6e82b2627d3bb0058403ff15ee0592 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Feb 2020 16:03:47 -0300 Subject: tools headers UAPI: Sync sched.h with the kernel To get the changes in: 769071ac9f20 ("ns: Introduce Time Namespace") Silencing this tools/perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/sched.h' differs from latest version at 'include/uapi/linux/sched.h' diff -u tools/include/uapi/linux/sched.h include/uapi/linux/sched.h Which enables 'perf trace' to decode the CLONE_NEWTIME bit in the 'flags' argument to the clone syscalls. Example of clone flags being decoded: [root@quaco ~]# perf trace -e clone* 0.000 qemu-system-x8/23923 clone(clone_flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, newsp: 0x7f0dad7f9870, parent_tidptr: 0x7f0dad7fa9d0, child_tidptr: 0x7f0dad7fa9d0, tls: 0x7f0dad7fa700) = 6806 (qemu-system-x86) ? qemu-system-x8/6806 ... [continued]: clone()) = 0 ^C[root@quaco ~]# At some point this should enable things like: # perf trace -e 'clone*/clone_flags&NEWTIME/' Cc: Adrian Hunter Cc: Andrei Vagin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Thomas Gleixner Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h index 4a0217832464..2e3bc22c6f20 100644 --- a/tools/include/uapi/linux/sched.h +++ b/tools/include/uapi/linux/sched.h @@ -36,6 +36,12 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +/* + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 + * syscalls only: + */ +#define CLONE_NEWTIME 0x00000080 /* New time namespace */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall -- cgit v1.2.3 From ff5ac61ee83c13f516544d29847d28be093a40ee Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2020 09:32:21 +0100 Subject: x86/ima: use correct identifier for SetupMode variable The IMA arch code attempts to inspect the "SetupMode" EFI variable by populating a variable called efi_SetupMode_name with the string "SecureBoot" and passing that to the EFI GetVariable service, which obviously does not yield the expected result. Given that the string is only referenced a single time, let's get rid of the intermediate variable, and pass the correct string as an immediate argument. While at it, do the same for "SecureBoot". Fixes: 399574c64eaf ("x86/ima: retry detecting secure boot mode") Fixes: 980ef4d22a95 ("x86/ima: check EFI SetupMode too") Cc: Matthew Garrett Signed-off-by: Ard Biesheuvel Cc: stable@vger.kernel.org # v5.3 Signed-off-by: Mimi Zohar --- arch/x86/kernel/ima_arch.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 4d4f5d9faac3..23054909c8dd 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -10,8 +10,6 @@ extern struct boot_params boot_params; static enum efi_secureboot_mode get_sb_mode(void) { - efi_char16_t efi_SecureBoot_name[] = L"SecureBoot"; - efi_char16_t efi_SetupMode_name[] = L"SecureBoot"; efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; efi_status_t status; unsigned long size; @@ -25,7 +23,7 @@ static enum efi_secureboot_mode get_sb_mode(void) } /* Get variable contents into buffer */ - status = efi.get_variable(efi_SecureBoot_name, &efi_variable_guid, + status = efi.get_variable(L"SecureBoot", &efi_variable_guid, NULL, &size, &secboot); if (status == EFI_NOT_FOUND) { pr_info("ima: secureboot mode disabled\n"); @@ -38,7 +36,7 @@ static enum efi_secureboot_mode get_sb_mode(void) } size = sizeof(setupmode); - status = efi.get_variable(efi_SetupMode_name, &efi_variable_guid, + status = efi.get_variable(L"SetupMode", &efi_variable_guid, NULL, &size, &setupmode); if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ -- cgit v1.2.3 From 2b63d0ec0daf79ba503fa8bfa25e07dc3da274f3 Mon Sep 17 00:00:00 2001 From: Aric Cyr Date: Tue, 21 Jan 2020 22:50:13 -0500 Subject: drm/amd/display: Check engine is not NULL before acquiring [Why] Engine can be NULL in some cases, so we must not acquire it. [How] Check for NULL engine before acquiring. Signed-off-by: Aric Cyr Reviewed-by: Harry Wentland Acked-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce/dce_aux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c b/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c index f1a5d2c6aa37..68c4049cbc2a 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_aux.c @@ -400,7 +400,7 @@ static bool acquire( { enum gpio_result result; - if (!is_engine_available(engine)) + if ((engine == NULL) || !is_engine_available(engine)) return false; result = dal_ddc_open(ddc, GPIO_MODE_HARDWARE, -- cgit v1.2.3 From 6c81917a0485ee2a1be0dc23321ac10ecfd9578b Mon Sep 17 00:00:00 2001 From: Yongqiang Sun Date: Thu, 23 Jan 2020 16:30:15 -0500 Subject: drm/amd/display: Limit minimum DPPCLK to 100MHz. [Why] Underflow is observed when plug in a 4K@60 monitor with 1366x768 eDP due to DPPCLK is too low. [How] Limit minimum DPPCLK to 100MHz. Signed-off-by: Yongqiang Sun Reviewed-by: Eric Yang Acked-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c index 034a5852a416..9ef3f7b91a1d 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c @@ -151,6 +151,12 @@ void rn_update_clocks(struct clk_mgr *clk_mgr_base, rn_vbios_smu_set_min_deep_sleep_dcfclk(clk_mgr, clk_mgr_base->clks.dcfclk_deep_sleep_khz); } + // workaround: Limit dppclk to 100Mhz to avoid lower eDP panel switch to plus 4K monitor underflow. + if (!IS_DIAG_DC(dc->ctx->dce_environment)) { + if (new_clocks->dppclk_khz < 100000) + new_clocks->dppclk_khz = 100000; + } + if (should_set_clock(safe_to_lower, new_clocks->dppclk_khz, clk_mgr->base.clks.dppclk_khz)) { if (clk_mgr->base.clks.dppclk_khz > new_clocks->dppclk_khz) dpp_clock_lowered = true; -- cgit v1.2.3 From c134c3cabae46a56ab2e1f5e5fa49405e1758838 Mon Sep 17 00:00:00 2001 From: Isabel Zhang Date: Mon, 27 Jan 2020 10:57:16 -0500 Subject: drm/amd/display: Add initialitions for PLL2 clock source [Why] Starting from 14nm, the PLL is built into the PHY and the PLL is mapped to PHY on 1 to 1 basis. In the code, the DP port is mapped to a PLL that was not initialized. This causes DP to HDMI dongle to not light up the display. [How] Initializations added for PLL2 when creating resources. Signed-off-by: Isabel Zhang Reviewed-by: Eric Yang Acked-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c index 0d506d30d6b6..33d0a176841a 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c @@ -60,6 +60,7 @@ #include "dcn20/dcn20_dccg.h" #include "dcn21_hubbub.h" #include "dcn10/dcn10_resource.h" +#include "dce110/dce110_resource.h" #include "dcn20/dcn20_dwb.h" #include "dcn20/dcn20_mmhubbub.h" @@ -856,6 +857,7 @@ static const struct dc_debug_options debug_defaults_diags = { enum dcn20_clk_src_array_id { DCN20_CLK_SRC_PLL0, DCN20_CLK_SRC_PLL1, + DCN20_CLK_SRC_PLL2, DCN20_CLK_SRC_TOTAL_DCN21 }; @@ -1718,6 +1720,10 @@ static bool dcn21_resource_construct( dcn21_clock_source_create(ctx, ctx->dc_bios, CLOCK_SOURCE_COMBO_PHY_PLL1, &clk_src_regs[1], false); + pool->base.clock_sources[DCN20_CLK_SRC_PLL2] = + dcn21_clock_source_create(ctx, ctx->dc_bios, + CLOCK_SOURCE_COMBO_PHY_PLL2, + &clk_src_regs[2], false); pool->base.clk_src_count = DCN20_CLK_SRC_TOTAL_DCN21; -- cgit v1.2.3 From f4d0242b7b43977923f778b4cf787425ef05776e Mon Sep 17 00:00:00 2001 From: James Zhu Date: Wed, 5 Feb 2020 09:20:22 -0500 Subject: drm/amdgpu/vcn2.5: fix DPG mode power off issue on instance 1 Support pause_state for multiple instance, and it will fix vcn2.5 DPG mode power off issue on instance 1. Signed-off-by: James Zhu Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 3 +-- drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c | 14 ++++++++------ drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 6 +++--- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index d6deb0eb1e15..6fe057329de2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -179,6 +179,7 @@ struct amdgpu_vcn_inst { struct amdgpu_irq_src irq; struct amdgpu_vcn_reg external; struct amdgpu_bo *dpg_sram_bo; + struct dpg_pause_state pause_state; void *dpg_sram_cpu_addr; uint64_t dpg_sram_gpu_addr; uint32_t *dpg_sram_curr_addr; @@ -190,8 +191,6 @@ struct amdgpu_vcn { const struct firmware *fw; /* VCN firmware */ unsigned num_enc_rings; enum amd_powergating_state cur_state; - struct dpg_pause_state pause_state; - bool indirect_sram; uint8_t num_vcn_inst; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c index 1a24fadd30e2..71f61afdc655 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c @@ -1207,9 +1207,10 @@ static int vcn_v1_0_pause_dpg_mode(struct amdgpu_device *adev, struct amdgpu_ring *ring; /* pause/unpause if state is changed */ - if (adev->vcn.pause_state.fw_based != new_state->fw_based) { + if (adev->vcn.inst[inst_idx].pause_state.fw_based != new_state->fw_based) { DRM_DEBUG("dpg pause state changed %d:%d -> %d:%d", - adev->vcn.pause_state.fw_based, adev->vcn.pause_state.jpeg, + adev->vcn.inst[inst_idx].pause_state.fw_based, + adev->vcn.inst[inst_idx].pause_state.jpeg, new_state->fw_based, new_state->jpeg); reg_data = RREG32_SOC15(UVD, 0, mmUVD_DPG_PAUSE) & @@ -1258,13 +1259,14 @@ static int vcn_v1_0_pause_dpg_mode(struct amdgpu_device *adev, reg_data &= ~UVD_DPG_PAUSE__NJ_PAUSE_DPG_REQ_MASK; WREG32_SOC15(UVD, 0, mmUVD_DPG_PAUSE, reg_data); } - adev->vcn.pause_state.fw_based = new_state->fw_based; + adev->vcn.inst[inst_idx].pause_state.fw_based = new_state->fw_based; } /* pause/unpause if state is changed */ - if (adev->vcn.pause_state.jpeg != new_state->jpeg) { + if (adev->vcn.inst[inst_idx].pause_state.jpeg != new_state->jpeg) { DRM_DEBUG("dpg pause state changed %d:%d -> %d:%d", - adev->vcn.pause_state.fw_based, adev->vcn.pause_state.jpeg, + adev->vcn.inst[inst_idx].pause_state.fw_based, + adev->vcn.inst[inst_idx].pause_state.jpeg, new_state->fw_based, new_state->jpeg); reg_data = RREG32_SOC15(UVD, 0, mmUVD_DPG_PAUSE) & @@ -1318,7 +1320,7 @@ static int vcn_v1_0_pause_dpg_mode(struct amdgpu_device *adev, reg_data &= ~UVD_DPG_PAUSE__JPEG_PAUSE_DPG_REQ_MASK; WREG32_SOC15(UVD, 0, mmUVD_DPG_PAUSE, reg_data); } - adev->vcn.pause_state.jpeg = new_state->jpeg; + adev->vcn.inst[inst_idx].pause_state.jpeg = new_state->jpeg; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c index 4f7216788f11..c387c81f8695 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c @@ -1137,9 +1137,9 @@ static int vcn_v2_0_pause_dpg_mode(struct amdgpu_device *adev, int ret_code; /* pause/unpause if state is changed */ - if (adev->vcn.pause_state.fw_based != new_state->fw_based) { + if (adev->vcn.inst[inst_idx].pause_state.fw_based != new_state->fw_based) { DRM_DEBUG("dpg pause state changed %d -> %d", - adev->vcn.pause_state.fw_based, new_state->fw_based); + adev->vcn.inst[inst_idx].pause_state.fw_based, new_state->fw_based); reg_data = RREG32_SOC15(UVD, 0, mmUVD_DPG_PAUSE) & (~UVD_DPG_PAUSE__NJ_PAUSE_DPG_ACK_MASK); @@ -1185,7 +1185,7 @@ static int vcn_v2_0_pause_dpg_mode(struct amdgpu_device *adev, reg_data &= ~UVD_DPG_PAUSE__NJ_PAUSE_DPG_REQ_MASK; WREG32_SOC15(UVD, 0, mmUVD_DPG_PAUSE, reg_data); } - adev->vcn.pause_state.fw_based = new_state->fw_based; + adev->vcn.inst[inst_idx].pause_state.fw_based = new_state->fw_based; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index 70fae7977f8f..97ab44c2f250 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -1367,9 +1367,9 @@ static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev, int ret_code; /* pause/unpause if state is changed */ - if (adev->vcn.pause_state.fw_based != new_state->fw_based) { + if (adev->vcn.inst[inst_idx].pause_state.fw_based != new_state->fw_based) { DRM_DEBUG("dpg pause state changed %d -> %d", - adev->vcn.pause_state.fw_based, new_state->fw_based); + adev->vcn.inst[inst_idx].pause_state.fw_based, new_state->fw_based); reg_data = RREG32_SOC15(UVD, inst_idx, mmUVD_DPG_PAUSE) & (~UVD_DPG_PAUSE__NJ_PAUSE_DPG_ACK_MASK); @@ -1414,7 +1414,7 @@ static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev, reg_data &= ~UVD_DPG_PAUSE__NJ_PAUSE_DPG_REQ_MASK; WREG32_SOC15(UVD, inst_idx, mmUVD_DPG_PAUSE, reg_data); } - adev->vcn.pause_state.fw_based = new_state->fw_based; + adev->vcn.inst[inst_idx].pause_state.fw_based = new_state->fw_based; } return 0; -- cgit v1.2.3 From 416611d9b6eebaeae58ed26cc7d23131c69126b1 Mon Sep 17 00:00:00 2001 From: Daniel Kolesa Date: Thu, 6 Feb 2020 20:14:35 +0100 Subject: amdgpu: Prevent build errors regarding soft/hard-float FP ABI tags On PowerPC, the compiler will tag object files with whether they use hard or soft float FP ABI and whether they use 64 or 128-bit long double ABI. On systems with 64-bit long double ABI, a tag will get emitted whenever a double is used, as on those systems a long double is the same as a double. This will prevent linkage as other files are being compiled with hard-float. On ppc64, this code will never actually get used for the time being, as the only currently existing hardware using it are the Renoir APUs. Therefore, until this is testable and can be fixed properly, at least make sure the build will not fail. Signed-off-by: Daniel Kolesa Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile b/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile index 3cd283195091..c0f6a8c7de7d 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/Makefile @@ -87,6 +87,12 @@ AMD_DISPLAY_FILES += $(AMD_DAL_CLK_MGR_DCN20) ############################################################################### CLK_MGR_DCN21 = rn_clk_mgr.o rn_clk_mgr_vbios_smu.o +# prevent build errors regarding soft-float vs hard-float FP ABI tags +# this code is currently unused on ppc64, as it applies to Renoir APUs only +ifdef CONFIG_PPC64 +CFLAGS_$(AMDDALPATH)/dc/clk_mgr/dcn21/rn_clk_mgr.o := $(call cc-option,-mno-gnu-attribute) +endif + AMD_DAL_CLK_MGR_DCN21 = $(addprefix $(AMDDALPATH)/dc/clk_mgr/dcn21/,$(CLK_MGR_DCN21)) AMD_DISPLAY_FILES += $(AMD_DAL_CLK_MGR_DCN21) -- cgit v1.2.3 From 46d1da733fbc867cd3c3aabec21aeaacd9a41771 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Thu, 6 Feb 2020 11:57:11 -0500 Subject: drm/amdgpu: fix amdgpu pmu to use hwc->config instead of hwc->conf hwc->conf was designated specifically for AMD APU IOMMU purposes. This could cause problems in performance and/or function since APU IOMMU implementation is elsewhere. Also hwc->conf and hwc->config are different members of an anonymous union so hwc->conf aliases as hw->last_tag. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c index 07914e34bc25..1311d6aec5d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c @@ -52,7 +52,7 @@ static int amdgpu_perf_event_init(struct perf_event *event) return -ENOENT; /* update the hw_perf_event struct with config data */ - hwc->conf = event->attr.config; + hwc->config = event->attr.config; return 0; } @@ -74,9 +74,9 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: if (!(flags & PERF_EF_RELOAD)) - pe->adev->df.funcs->pmc_start(pe->adev, hwc->conf, 1); + pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 1); - pe->adev->df.funcs->pmc_start(pe->adev, hwc->conf, 0); + pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 0); break; default: break; @@ -101,7 +101,7 @@ static void amdgpu_perf_read(struct perf_event *event) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_get_count(pe->adev, hwc->conf, + pe->adev->df.funcs->pmc_get_count(pe->adev, hwc->config, &count); break; default: @@ -126,7 +126,7 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_stop(pe->adev, hwc->conf, 0); + pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, 0); break; default: break; @@ -156,7 +156,8 @@ static int amdgpu_perf_add(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - retval = pe->adev->df.funcs->pmc_start(pe->adev, hwc->conf, 1); + retval = pe->adev->df.funcs->pmc_start(pe->adev, + hwc->config, 1); break; default: return 0; @@ -184,7 +185,7 @@ static void amdgpu_perf_del(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_stop(pe->adev, hwc->conf, 1); + pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, 1); break; default: break; -- cgit v1.2.3 From 2cabe0d4cd88f7386e9c5a82236ceda46080a80b Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Sun, 9 Feb 2020 16:21:09 +0800 Subject: drm/amdgpu: limit GDS clearing workaround in cold boot sequence GDS clear workaround will cause gfx failure in suspend/resume case. [ 98.679559] [drm:amdgpu_device_ip_late_init [amdgpu]] *ERROR* late_init of IP block failed -110 [ 98.679561] PM: dpm_run_callback(): pci_pm_resume+0x0/0xa0 returns -110 [ 98.679562] PM: Device 0000:03:00.0 failed to resume async: error -110 As this workaround is specific to the HW bug of GDS's ECC error existing in cold boot up, so bypass this workaround in suspend/ resume case after booting up. Signed-off-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 90f64b8bc358..be289f0fff37 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4374,9 +4374,12 @@ static int gfx_v9_0_ecc_late_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int r; - r = gfx_v9_0_do_edc_gds_workarounds(adev); - if (r) - return r; + /* limit gds clearing operation in cold boot sequence */ + if (!adev->in_suspend) { + r = gfx_v9_0_do_edc_gds_workarounds(adev); + if (r) + return r; + } /* requires IBs so do in late init after IB pool is initialized */ r = gfx_v9_0_do_edc_gpr_workarounds(adev); -- cgit v1.2.3 From b5336bfd6fe512521539e3f0f1cf4a5a6a45d380 Mon Sep 17 00:00:00 2001 From: James Zhu Date: Fri, 7 Feb 2020 08:11:00 -0500 Subject: drm/amdgpu/vcn2.5: fix warning Fix warning during switching to dpg pause mode for VCN firmware Version ENC: 1.1 DEC: 1 VEP: 0 Revision: 16 Signed-off-by: James Zhu Acked-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index 97ab44c2f250..2d64ba1adf99 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -1407,7 +1407,7 @@ static int vcn_v2_5_pause_dpg_mode(struct amdgpu_device *adev, RREG32_SOC15(UVD, inst_idx, mmUVD_SCRATCH2) & 0x7FFFFFFF); SOC15_WAIT_ON_RREG(UVD, inst_idx, mmUVD_POWER_STATUS, - 0x0, UVD_POWER_STATUS__UVD_POWER_STATUS_MASK, ret_code); + UVD_PGFSM_CONFIG__UVDM_UVDU_PWR_ON, UVD_POWER_STATUS__UVD_POWER_STATUS_MASK, ret_code); } } else { /* unpause dpg, no need to wait */ -- cgit v1.2.3 From 1094c34ec53bc147e53f52ed9fe50899ad075632 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Mon, 20 Jan 2020 14:56:31 -0500 Subject: drm/amd/display: Don't map ATOM_ENABLE to ATOM_INIT [Why] In DCN hardware sequencer we do actually call ATOM_INIT correctly per pipe. The workaround is not necessary for command table offloading. [How] Drop the workaround since it's not needed. Signed-off-by: Nicholas Kazlauskas Reviewed-by: Chris Park Acked-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/bios/command_table2.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/bios/command_table2.c b/drivers/gpu/drm/amd/display/dc/bios/command_table2.c index 629a07a2719b..c4ba6e84db65 100644 --- a/drivers/gpu/drm/amd/display/dc/bios/command_table2.c +++ b/drivers/gpu/drm/amd/display/dc/bios/command_table2.c @@ -711,10 +711,6 @@ static void enable_disp_power_gating_dmcub( power_gating.header.sub_type = DMUB_CMD__VBIOS_ENABLE_DISP_POWER_GATING; power_gating.power_gating.pwr = *pwr; - /* ATOM_ENABLE is old API in DMUB */ - if (power_gating.power_gating.pwr.enable == ATOM_ENABLE) - power_gating.power_gating.pwr.enable = ATOM_INIT; - dc_dmub_srv_cmd_queue(dmcub, &power_gating.header); dc_dmub_srv_cmd_execute(dmcub); dc_dmub_srv_wait_idle(dmcub); -- cgit v1.2.3 From 8fab6a2faa1eb388b5eaf1ead4394f380a6403be Mon Sep 17 00:00:00 2001 From: Sung Lee Date: Tue, 4 Feb 2020 15:49:54 -0500 Subject: drm/amd/display: DCN2.x Do not program DPPCLK if same value [WHY] Programming DPPCLK to the same value currently set may cause underflow while playing video in certain conditions. [HOW] Only program DPPCLK if clock is not the same as the previous value programmed. Signed-off-by: Sung Lee Reviewed-by: Yongqiang Sun Acked-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c index 495f01e9f2ca..49ce46b543ea 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c @@ -117,7 +117,7 @@ void dcn20_update_clocks_update_dpp_dto(struct clk_mgr_internal *clk_mgr, prev_dppclk_khz = clk_mgr->base.ctx->dc->current_state->res_ctx.pipe_ctx[i].plane_res.bw.dppclk_khz; - if (safe_to_lower || prev_dppclk_khz < dppclk_khz) { + if ((prev_dppclk_khz > dppclk_khz && safe_to_lower) || prev_dppclk_khz < dppclk_khz) { clk_mgr->dccg->funcs->update_dpp_dto( clk_mgr->dccg, dpp_inst, dppclk_khz); } -- cgit v1.2.3 From a934f9d866598ec777174d449a0dd903ea3db817 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Tue, 11 Feb 2020 12:10:44 +0800 Subject: drm/amdgpu: correct comment to clear up the confusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former comment looks to be one intended behavior in code, actually it's not. So correct it. Suggested-by: Christian König Signed-off-by: Guchun Chen Acked-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index be289f0fff37..b33a4eb39193 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4374,7 +4374,12 @@ static int gfx_v9_0_ecc_late_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int r; - /* limit gds clearing operation in cold boot sequence */ + /* + * Temp workaround to fix the issue that CP firmware fails to + * update read pointer when CPDMA is writing clearing operation + * to GDS in suspend/resume sequence on several cards. So just + * limit this operation in cold boot sequence. + */ if (!adev->in_suspend) { r = gfx_v9_0_do_edc_gds_workarounds(adev); if (r) -- cgit v1.2.3 From c1d66bc2e531b4ed3a9464b8e87144cc6b2fd63f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 6 Feb 2020 14:46:34 -0500 Subject: drm/amdgpu: update smu_v11_0_pptable.h Update to the latest changes. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.5.x --- .../gpu/drm/amd/powerplay/inc/smu_v11_0_pptable.h | 46 +++++++++++++++------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0_pptable.h b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0_pptable.h index b2f96a101124..7a63cf8e85ed 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0_pptable.h +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0_pptable.h @@ -39,21 +39,39 @@ #define SMU_11_0_PP_OVERDRIVE_VERSION 0x0800 #define SMU_11_0_PP_POWERSAVINGCLOCK_VERSION 0x0100 +enum SMU_11_0_ODFEATURE_CAP { + SMU_11_0_ODCAP_GFXCLK_LIMITS = 0, + SMU_11_0_ODCAP_GFXCLK_CURVE, + SMU_11_0_ODCAP_UCLK_MAX, + SMU_11_0_ODCAP_POWER_LIMIT, + SMU_11_0_ODCAP_FAN_ACOUSTIC_LIMIT, + SMU_11_0_ODCAP_FAN_SPEED_MIN, + SMU_11_0_ODCAP_TEMPERATURE_FAN, + SMU_11_0_ODCAP_TEMPERATURE_SYSTEM, + SMU_11_0_ODCAP_MEMORY_TIMING_TUNE, + SMU_11_0_ODCAP_FAN_ZERO_RPM_CONTROL, + SMU_11_0_ODCAP_AUTO_UV_ENGINE, + SMU_11_0_ODCAP_AUTO_OC_ENGINE, + SMU_11_0_ODCAP_AUTO_OC_MEMORY, + SMU_11_0_ODCAP_FAN_CURVE, + SMU_11_0_ODCAP_COUNT, +}; + enum SMU_11_0_ODFEATURE_ID { - SMU_11_0_ODFEATURE_GFXCLK_LIMITS = 1 << 0, //GFXCLK Limit feature - SMU_11_0_ODFEATURE_GFXCLK_CURVE = 1 << 1, //GFXCLK Curve feature - SMU_11_0_ODFEATURE_UCLK_MAX = 1 << 2, //UCLK Limit feature - SMU_11_0_ODFEATURE_POWER_LIMIT = 1 << 3, //Power Limit feature - SMU_11_0_ODFEATURE_FAN_ACOUSTIC_LIMIT = 1 << 4, //Fan Acoustic RPM feature - SMU_11_0_ODFEATURE_FAN_SPEED_MIN = 1 << 5, //Minimum Fan Speed feature - SMU_11_0_ODFEATURE_TEMPERATURE_FAN = 1 << 6, //Fan Target Temperature Limit feature - SMU_11_0_ODFEATURE_TEMPERATURE_SYSTEM = 1 << 7, //Operating Temperature Limit feature - SMU_11_0_ODFEATURE_MEMORY_TIMING_TUNE = 1 << 8, //AC Timing Tuning feature - SMU_11_0_ODFEATURE_FAN_ZERO_RPM_CONTROL = 1 << 9, //Zero RPM feature - SMU_11_0_ODFEATURE_AUTO_UV_ENGINE = 1 << 10, //Auto Under Volt GFXCLK feature - SMU_11_0_ODFEATURE_AUTO_OC_ENGINE = 1 << 11, //Auto Over Clock GFXCLK feature - SMU_11_0_ODFEATURE_AUTO_OC_MEMORY = 1 << 12, //Auto Over Clock MCLK feature - SMU_11_0_ODFEATURE_FAN_CURVE = 1 << 13, //VICTOR TODO + SMU_11_0_ODFEATURE_GFXCLK_LIMITS = 1 << SMU_11_0_ODCAP_GFXCLK_LIMITS, //GFXCLK Limit feature + SMU_11_0_ODFEATURE_GFXCLK_CURVE = 1 << SMU_11_0_ODCAP_GFXCLK_CURVE, //GFXCLK Curve feature + SMU_11_0_ODFEATURE_UCLK_MAX = 1 << SMU_11_0_ODCAP_UCLK_MAX, //UCLK Limit feature + SMU_11_0_ODFEATURE_POWER_LIMIT = 1 << SMU_11_0_ODCAP_POWER_LIMIT, //Power Limit feature + SMU_11_0_ODFEATURE_FAN_ACOUSTIC_LIMIT = 1 << SMU_11_0_ODCAP_FAN_ACOUSTIC_LIMIT, //Fan Acoustic RPM feature + SMU_11_0_ODFEATURE_FAN_SPEED_MIN = 1 << SMU_11_0_ODCAP_FAN_SPEED_MIN, //Minimum Fan Speed feature + SMU_11_0_ODFEATURE_TEMPERATURE_FAN = 1 << SMU_11_0_ODCAP_TEMPERATURE_FAN, //Fan Target Temperature Limit feature + SMU_11_0_ODFEATURE_TEMPERATURE_SYSTEM = 1 << SMU_11_0_ODCAP_TEMPERATURE_SYSTEM, //Operating Temperature Limit feature + SMU_11_0_ODFEATURE_MEMORY_TIMING_TUNE = 1 << SMU_11_0_ODCAP_MEMORY_TIMING_TUNE, //AC Timing Tuning feature + SMU_11_0_ODFEATURE_FAN_ZERO_RPM_CONTROL = 1 << SMU_11_0_ODCAP_FAN_ZERO_RPM_CONTROL, //Zero RPM feature + SMU_11_0_ODFEATURE_AUTO_UV_ENGINE = 1 << SMU_11_0_ODCAP_AUTO_UV_ENGINE, //Auto Under Volt GFXCLK feature + SMU_11_0_ODFEATURE_AUTO_OC_ENGINE = 1 << SMU_11_0_ODCAP_AUTO_OC_ENGINE, //Auto Over Clock GFXCLK feature + SMU_11_0_ODFEATURE_AUTO_OC_MEMORY = 1 << SMU_11_0_ODCAP_AUTO_OC_MEMORY, //Auto Over Clock MCLK feature + SMU_11_0_ODFEATURE_FAN_CURVE = 1 << SMU_11_0_ODCAP_FAN_CURVE, //Fan Curve feature SMU_11_0_ODFEATURE_COUNT = 14, }; #define SMU_11_0_MAX_ODFEATURE 32 //Maximum Number of OD Features -- cgit v1.2.3 From e33a8cfda5198fc09554fdd77ba246de42c886bd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 6 Feb 2020 14:53:06 -0500 Subject: drm/amdgpu:/navi10: use the ODCAP enum to index the caps array Rather than the FEATURE_ID flags. Avoids a possible reading past the end of the array. Reviewed-by: Evan Quan Reported-by: Aleksandr Mezin Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.5.x --- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 19a9846b730e..0d73a49166af 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -736,9 +736,9 @@ static bool navi10_is_support_fine_grained_dpm(struct smu_context *smu, enum smu return dpm_desc->SnapToDiscrete == 0 ? true : false; } -static inline bool navi10_od_feature_is_supported(struct smu_11_0_overdrive_table *od_table, enum SMU_11_0_ODFEATURE_ID feature) +static inline bool navi10_od_feature_is_supported(struct smu_11_0_overdrive_table *od_table, enum SMU_11_0_ODFEATURE_CAP cap) { - return od_table->cap[feature]; + return od_table->cap[cap]; } static void navi10_od_setting_get_range(struct smu_11_0_overdrive_table *od_table, @@ -846,7 +846,7 @@ static int navi10_print_clk_levels(struct smu_context *smu, case SMU_OD_SCLK: if (!smu->od_enabled || !od_table || !od_settings) break; - if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_GFXCLK_LIMITS)) + if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_GFXCLK_LIMITS)) break; size += sprintf(buf + size, "OD_SCLK:\n"); size += sprintf(buf + size, "0: %uMhz\n1: %uMhz\n", od_table->GfxclkFmin, od_table->GfxclkFmax); @@ -854,7 +854,7 @@ static int navi10_print_clk_levels(struct smu_context *smu, case SMU_OD_MCLK: if (!smu->od_enabled || !od_table || !od_settings) break; - if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_UCLK_MAX)) + if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_UCLK_MAX)) break; size += sprintf(buf + size, "OD_MCLK:\n"); size += sprintf(buf + size, "1: %uMHz\n", od_table->UclkFmax); @@ -862,7 +862,7 @@ static int navi10_print_clk_levels(struct smu_context *smu, case SMU_OD_VDDC_CURVE: if (!smu->od_enabled || !od_table || !od_settings) break; - if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_GFXCLK_CURVE)) + if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_GFXCLK_CURVE)) break; size += sprintf(buf + size, "OD_VDDC_CURVE:\n"); for (i = 0; i < 3; i++) { @@ -887,7 +887,7 @@ static int navi10_print_clk_levels(struct smu_context *smu, break; size = sprintf(buf, "%s:\n", "OD_RANGE"); - if (navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_GFXCLK_LIMITS)) { + if (navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_GFXCLK_LIMITS)) { navi10_od_setting_get_range(od_settings, SMU_11_0_ODSETTING_GFXCLKFMIN, &min_value, NULL); navi10_od_setting_get_range(od_settings, SMU_11_0_ODSETTING_GFXCLKFMAX, @@ -896,14 +896,14 @@ static int navi10_print_clk_levels(struct smu_context *smu, min_value, max_value); } - if (navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_UCLK_MAX)) { + if (navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_UCLK_MAX)) { navi10_od_setting_get_range(od_settings, SMU_11_0_ODSETTING_UCLKFMAX, &min_value, &max_value); size += sprintf(buf + size, "MCLK: %7uMhz %10uMhz\n", min_value, max_value); } - if (navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_GFXCLK_CURVE)) { + if (navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_GFXCLK_CURVE)) { navi10_od_setting_get_range(od_settings, SMU_11_0_ODSETTING_VDDGFXCURVEFREQ_P1, &min_value, &max_value); size += sprintf(buf + size, "VDDC_CURVE_SCLK[0]: %7uMhz %10uMhz\n", @@ -2056,7 +2056,7 @@ static int navi10_od_edit_dpm_table(struct smu_context *smu, enum PP_OD_DPM_TABL switch (type) { case PP_OD_EDIT_SCLK_VDDC_TABLE: - if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_GFXCLK_LIMITS)) { + if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_GFXCLK_LIMITS)) { pr_warn("GFXCLK_LIMITS not supported!\n"); return -ENOTSUPP; } @@ -2102,7 +2102,7 @@ static int navi10_od_edit_dpm_table(struct smu_context *smu, enum PP_OD_DPM_TABL } break; case PP_OD_EDIT_MCLK_VDDC_TABLE: - if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_UCLK_MAX)) { + if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_UCLK_MAX)) { pr_warn("UCLK_MAX not supported!\n"); return -ENOTSUPP; } @@ -2143,7 +2143,7 @@ static int navi10_od_edit_dpm_table(struct smu_context *smu, enum PP_OD_DPM_TABL } break; case PP_OD_EDIT_VDDC_CURVE: - if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODFEATURE_GFXCLK_CURVE)) { + if (!navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_GFXCLK_CURVE)) { pr_warn("GFXCLK_CURVE not supported!\n"); return -ENOTSUPP; } -- cgit v1.2.3 From ea128834dd76f9a72a35d011c651fa96658f06a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2020 17:52:32 +0100 Subject: ACPICA: Introduce acpi_any_gpe_status_set() Introduce a new helper function, acpi_any_gpe_status_set(), for checking the status bits of all enabled GPEs in one go. It is needed to distinguish spurious SCIs from genuine ones when deciding whether or not to wake up the system from suspend-to-idle. Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/achware.h | 2 ++ drivers/acpi/acpica/evxfgpe.c | 32 +++++++++++++++++++ drivers/acpi/acpica/hwgpe.c | 71 +++++++++++++++++++++++++++++++++++++++++++ include/acpi/acpixf.h | 1 + 4 files changed, 106 insertions(+) diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 67f282e9e0af..6ad0517553d5 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -101,6 +101,8 @@ acpi_status acpi_hw_enable_all_runtime_gpes(void); acpi_status acpi_hw_enable_all_wakeup_gpes(void); +u8 acpi_hw_check_all_gpes(void); + acpi_status acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 2c39ff2a7406..f2de66bfd8a7 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -795,6 +795,38 @@ acpi_status acpi_enable_all_wakeup_gpes(void) ACPI_EXPORT_SYMBOL(acpi_enable_all_wakeup_gpes) +/****************************************************************************** + * + * FUNCTION: acpi_any_gpe_status_set + * + * PARAMETERS: None + * + * RETURN: Whether or not the status bit is set for any GPE + * + * DESCRIPTION: Check the status bits of all enabled GPEs and return TRUE if any + * of them is set or FALSE otherwise. + * + ******************************************************************************/ +u32 acpi_any_gpe_status_set(void) +{ + acpi_status status; + u8 ret; + + ACPI_FUNCTION_TRACE(acpi_any_gpe_status_set); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return (FALSE); + } + + ret = acpi_hw_check_all_gpes(); + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return (ret); +} + +ACPI_EXPORT_SYMBOL(acpi_any_gpe_status_set) + /******************************************************************************* * * FUNCTION: acpi_install_gpe_block diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 1b4252bdcd0b..f4c285c2f595 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -444,6 +444,53 @@ acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, return (AE_OK); } +/****************************************************************************** + * + * FUNCTION: acpi_hw_get_gpe_block_status + * + * PARAMETERS: gpe_xrupt_info - GPE Interrupt info + * gpe_block - Gpe Block info + * + * RETURN: Success + * + * DESCRIPTION: Produce a combined GPE status bits mask for the given block. + * + ******************************************************************************/ + +static acpi_status +acpi_hw_get_gpe_block_status(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, + void *ret_ptr) +{ + struct acpi_gpe_register_info *gpe_register_info; + u64 in_enable, in_status; + acpi_status status; + u8 *ret = ret_ptr; + u32 i; + + /* Examine each GPE Register within the block */ + + for (i = 0; i < gpe_block->register_count; i++) { + gpe_register_info = &gpe_block->register_info[i]; + + status = acpi_hw_read(&in_enable, + &gpe_register_info->enable_address); + if (ACPI_FAILURE(status)) { + continue; + } + + status = acpi_hw_read(&in_status, + &gpe_register_info->status_address); + if (ACPI_FAILURE(status)) { + continue; + } + + *ret |= in_enable & in_status; + } + + return (AE_OK); +} + /****************************************************************************** * * FUNCTION: acpi_hw_disable_all_gpes @@ -510,4 +557,28 @@ acpi_status acpi_hw_enable_all_wakeup_gpes(void) return_ACPI_STATUS(status); } +/****************************************************************************** + * + * FUNCTION: acpi_hw_check_all_gpes + * + * PARAMETERS: None + * + * RETURN: Combined status of all GPEs + * + * DESCRIPTION: Check all enabled GPEs in all GPE blocks and return TRUE if the + * status bit is set for at least one of them of FALSE otherwise. + * + ******************************************************************************/ + +u8 acpi_hw_check_all_gpes(void) +{ + u8 ret = 0; + + ACPI_FUNCTION_TRACE(acpi_hw_check_all_gpes); + + (void)acpi_ev_walk_gpe_list(acpi_hw_get_gpe_block_status, &ret); + + return (ret != 0); +} + #endif /* !ACPI_REDUCED_HARDWARE */ diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 00994b1b8681..5867777bb7d0 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -752,6 +752,7 @@ ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u3 ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) +ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_any_gpe_status_set(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_get_gpe_device(u32 gpe_index, -- cgit v1.2.3 From fdde0ff8590b4c1c41b3227f5ac4265fccccb96b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2020 17:53:52 +0100 Subject: ACPI: PM: s2idle: Prevent spurious SCIs from waking up the system If the platform triggers a spurious SCI even though the status bit is not set for any GPE when the system is suspended to idle, it will be treated as a genuine wakeup, so avoid that by checking if any GPEs are active at all before returning 'true' from acpi_s2idle_wake(). Link: https://bugzilla.kernel.org/show_bug.cgi?id=206413 Fixes: 56b991849009 ("PM: sleep: Simplify suspend-to-idle control flow") Reported-by: Tsuchiya Yuto Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 2c695b196cd2..152f7fc0b200 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -1006,10 +1006,16 @@ static bool acpi_s2idle_wake(void) return true; /* - * If there are no EC events to process, the wakeup is regarded - * as a genuine one. + * If there are no EC events to process and at least one of the + * other enabled GPEs is active, the wakeup is regarded as a + * genuine one. + * + * Note that the checks below must be carried out in this order + * to avoid returning prematurely due to a change of the EC GPE + * status bit from unset to set between the checks with the + * status bits of all the other GPEs unset. */ - if (!acpi_ec_dispatch_gpe()) + if (acpi_any_gpe_status_set() && !acpi_ec_dispatch_gpe()) return true; /* -- cgit v1.2.3 From 6f4ecbe284df5f22e386a640d9a4b32cede62030 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 3 Feb 2020 15:31:14 +0100 Subject: soc/tegra: fuse: Fix build with Tegra194 configuration If only Tegra194 support is enabled, the tegra30_fuse_read() and tegra30_fuse_init() function are not declared and cause a build failure. Add Tegra194 to the preprocessor guard to make sure these functions are available for Tegra194-only builds as well. Link: https://lore.kernel.org/r/20200203143114.3967295-1-thierry.reding@gmail.com Reported-by: kbuild test robot Signed-off-by: Thierry Reding Signed-off-by: Olof Johansson --- drivers/soc/tegra/fuse/fuse-tegra30.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/soc/tegra/fuse/fuse-tegra30.c b/drivers/soc/tegra/fuse/fuse-tegra30.c index f68f4e1c215d..e6037f900fb7 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra30.c +++ b/drivers/soc/tegra/fuse/fuse-tegra30.c @@ -36,7 +36,8 @@ defined(CONFIG_ARCH_TEGRA_124_SOC) || \ defined(CONFIG_ARCH_TEGRA_132_SOC) || \ defined(CONFIG_ARCH_TEGRA_210_SOC) || \ - defined(CONFIG_ARCH_TEGRA_186_SOC) + defined(CONFIG_ARCH_TEGRA_186_SOC) || \ + defined(CONFIG_ARCH_TEGRA_194_SOC) static u32 tegra30_fuse_read_early(struct tegra_fuse *fuse, unsigned int offset) { if (WARN_ON(!fuse->base)) -- cgit v1.2.3 From ad1e03b2b3d4430baaa109b77bc308dc73050de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 10 Feb 2020 17:10:46 +0100 Subject: core: Don't skip generic XDP program execution for cloned SKBs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current generic XDP handler skips execution of XDP programs entirely if an SKB is marked as cloned. This leads to some surprising behaviour, as packets can end up being cloned in various ways, which will make an XDP program not see all the traffic on an interface. This was discovered by a simple test case where an XDP program that always returns XDP_DROP is installed on a veth device. When combining this with the Scapy packet sniffer (which uses an AF_PACKET) socket on the sending side, SKBs reliably end up in the cloned state, causing them to be passed through to the receiving interface instead of being dropped. A minimal reproducer script for this is included below. This patch fixed the issue by simply triggering the existing linearisation code for cloned SKBs instead of skipping the XDP program execution. This behaviour is in line with the behaviour of the native XDP implementation for the veth driver, which will reallocate and copy the SKB data if the SKB is marked as shared. Reproducer Python script (requires BCC and Scapy): from scapy.all import TCP, IP, Ether, sendp, sniff, AsyncSniffer, Raw, UDP from bcc import BPF import time, sys, subprocess, shlex SKB_MODE = (1 << 1) DRV_MODE = (1 << 2) PYTHON=sys.executable def client(): time.sleep(2) # Sniffing on the sender causes skb_cloned() to be set s = AsyncSniffer() s.start() for p in range(10): sendp(Ether(dst="aa:aa:aa:aa:aa:aa", src="cc:cc:cc:cc:cc:cc")/IP()/UDP()/Raw("Test"), verbose=False) time.sleep(0.1) s.stop() return 0 def server(mode): prog = BPF(text="int dummy_drop(struct xdp_md *ctx) {return XDP_DROP;}") func = prog.load_func("dummy_drop", BPF.XDP) prog.attach_xdp("a_to_b", func, mode) time.sleep(1) s = sniff(iface="a_to_b", count=10, timeout=15) if len(s): print(f"Got {len(s)} packets - should have gotten 0") return 1 else: print("Got no packets - as expected") return 0 if len(sys.argv) < 2: print(f"Usage: {sys.argv[0]} ") sys.exit(1) if sys.argv[1] == "client": sys.exit(client()) elif sys.argv[1] == "server": mode = SKB_MODE if sys.argv[2] == 'skb' else DRV_MODE sys.exit(server(mode)) else: try: mode = sys.argv[1] if mode not in ('skb', 'drv'): print(f"Usage: {sys.argv[0]} ") sys.exit(1) print(f"Running in {mode} mode") for cmd in [ 'ip netns add netns_a', 'ip netns add netns_b', 'ip -n netns_a link add a_to_b type veth peer name b_to_a netns netns_b', # Disable ipv6 to make sure there's no address autoconf traffic 'ip netns exec netns_a sysctl -qw net.ipv6.conf.a_to_b.disable_ipv6=1', 'ip netns exec netns_b sysctl -qw net.ipv6.conf.b_to_a.disable_ipv6=1', 'ip -n netns_a link set dev a_to_b address aa:aa:aa:aa:aa:aa', 'ip -n netns_b link set dev b_to_a address cc:cc:cc:cc:cc:cc', 'ip -n netns_a link set dev a_to_b up', 'ip -n netns_b link set dev b_to_a up']: subprocess.check_call(shlex.split(cmd)) server = subprocess.Popen(shlex.split(f"ip netns exec netns_a {PYTHON} {sys.argv[0]} server {mode}")) client = subprocess.Popen(shlex.split(f"ip netns exec netns_b {PYTHON} {sys.argv[0]} client")) client.wait() server.wait() sys.exit(server.returncode) finally: subprocess.run(shlex.split("ip netns delete netns_a")) subprocess.run(shlex.split("ip netns delete netns_b")) Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Reported-by: Stepan Horacek Suggested-by: Paolo Abeni Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index a69e8bd7ed74..a6316b336128 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4527,14 +4527,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_cloned(skb) || skb_is_tc_redirected(skb)) + if (skb_is_tc_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ - if (skb_is_nonlinear(skb) || + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; -- cgit v1.2.3 From f27f37a04a69890ac85d9155f03ee2d23b678d8f Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 10 Feb 2020 10:59:18 -0800 Subject: i40e: Fix the conditional for i40e_vc_validate_vqs_bitmaps Commit d9d6a9aed3f6 ("i40e: Fix virtchnl_queue_select bitmap validation") introduced a necessary change for verifying how queue bitmaps from the iavf driver get validated. Unfortunately, the conditional was reversed. Fix this. Fixes: d9d6a9aed3f6 ("i40e: Fix virtchnl_queue_select bitmap validation") Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 69523ac85639..56b9e445732b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2362,7 +2362,7 @@ static int i40e_vc_enable_queues_msg(struct i40e_vf *vf, u8 *msg) goto error_param; } - if (i40e_vc_validate_vqs_bitmaps(vqs)) { + if (!i40e_vc_validate_vqs_bitmaps(vqs)) { aq_ret = I40E_ERR_PARAM; goto error_param; } @@ -2424,7 +2424,7 @@ static int i40e_vc_disable_queues_msg(struct i40e_vf *vf, u8 *msg) goto error_param; } - if (i40e_vc_validate_vqs_bitmaps(vqs)) { + if (!i40e_vc_validate_vqs_bitmaps(vqs)) { aq_ret = I40E_ERR_PARAM; goto error_param; } -- cgit v1.2.3 From 457fed775c97ac2c0cd1672aaf2ff2c8a6235e87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Feb 2020 11:36:13 -0800 Subject: net/smc: fix leak of kernel memory to user space As nlmsg_put() does not clear the memory that is reserved, it this the caller responsability to make sure all of this memory will be written, in order to not reveal prior content. While we are at it, we can provide the socket cookie even if clsock is not set. syzbot reported : BUG: KMSAN: uninit-value in __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] BUG: KMSAN: uninit-value in __fswab32 include/uapi/linux/swab.h:59 [inline] BUG: KMSAN: uninit-value in __swab32p include/uapi/linux/swab.h:179 [inline] BUG: KMSAN: uninit-value in __be32_to_cpup include/uapi/linux/byteorder/little_endian.h:82 [inline] BUG: KMSAN: uninit-value in get_unaligned_be32 include/linux/unaligned/access_ok.h:30 [inline] BUG: KMSAN: uninit-value in ____bpf_skb_load_helper_32 net/core/filter.c:240 [inline] BUG: KMSAN: uninit-value in ____bpf_skb_load_helper_32_no_cache net/core/filter.c:255 [inline] BUG: KMSAN: uninit-value in bpf_skb_load_helper_32_no_cache+0x14a/0x390 net/core/filter.c:252 CPU: 1 PID: 5262 Comm: syz-executor.5 Not tainted 5.5.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] __fswab32 include/uapi/linux/swab.h:59 [inline] __swab32p include/uapi/linux/swab.h:179 [inline] __be32_to_cpup include/uapi/linux/byteorder/little_endian.h:82 [inline] get_unaligned_be32 include/linux/unaligned/access_ok.h:30 [inline] ____bpf_skb_load_helper_32 net/core/filter.c:240 [inline] ____bpf_skb_load_helper_32_no_cache net/core/filter.c:255 [inline] bpf_skb_load_helper_32_no_cache+0x14a/0x390 net/core/filter.c:252 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_kmalloc_large+0x73/0xc0 mm/kmsan/kmsan_hooks.c:128 kmalloc_large_node_hook mm/slub.c:1406 [inline] kmalloc_large_node+0x282/0x2c0 mm/slub.c:3841 __kmalloc_node_track_caller+0x44b/0x1200 mm/slub.c:4368 __kmalloc_reserve net/core/skbuff.c:141 [inline] __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:209 alloc_skb include/linux/skbuff.h:1049 [inline] netlink_dump+0x44b/0x1ab0 net/netlink/af_netlink.c:2224 __netlink_dump_start+0xbb2/0xcf0 net/netlink/af_netlink.c:2352 netlink_dump_start include/linux/netlink.h:233 [inline] smc_diag_handler_dump+0x2ba/0x300 net/smc/smc_diag.c:242 sock_diag_rcv_msg+0x211/0x610 net/core/sock_diag.c:256 netlink_rcv_skb+0x451/0x650 net/netlink/af_netlink.c:2477 sock_diag_rcv+0x63/0x80 net/core/sock_diag.c:275 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0xf9e/0x1100 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x1248/0x14d0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] kernel_sendmsg+0x433/0x440 net/socket.c:679 sock_no_sendpage+0x235/0x300 net/core/sock.c:2740 kernel_sendpage net/socket.c:3776 [inline] sock_sendpage+0x1e1/0x2c0 net/socket.c:937 pipe_to_sendpage+0x38c/0x4c0 fs/splice.c:458 splice_from_pipe_feed fs/splice.c:512 [inline] __splice_from_pipe+0x539/0xed0 fs/splice.c:636 splice_from_pipe fs/splice.c:671 [inline] generic_splice_sendpage+0x1d5/0x2d0 fs/splice.c:844 do_splice_from fs/splice.c:863 [inline] do_splice fs/splice.c:1170 [inline] __do_sys_splice fs/splice.c:1447 [inline] __se_sys_splice+0x2380/0x3350 fs/splice.c:1427 __x64_sys_splice+0x6e/0x90 fs/splice.c:1427 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: f16a7dd5cf27 ("smc: netlink interface for SMC sockets") Signed-off-by: Eric Dumazet Cc: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_diag.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index f38727ecf8b2..e1f64f4ba236 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -39,16 +39,15 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + memset(r, 0, sizeof(*r)); r->diag_family = sk->sk_family; + sock_diag_save_cookie(sk, r->id.idiag_cookie); if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; - sock_diag_save_cookie(sk, r->id.idiag_cookie); if (sk->sk_protocol == SMCPROTO_SMC) { - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From 91a65b7d3ed8450f31ab717a65dcb5f9ceb5ab02 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:40 +0000 Subject: net: ena: fix potential crash when rxfh key is NULL When ethtool -X is called without an hkey, ena_com_fill_hash_function() is called with key=NULL, which is passed to memcpy causing a crash. This commit fixes this issue by checking key is not NULL. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index ea62604fdf8c..e54c44fdcaa7 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -2297,15 +2297,16 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, switch (func) { case ENA_ADMIN_TOEPLITZ: - if (key_len > sizeof(hash_key->key)) { - pr_err("key len (%hu) is bigger than the max supported (%zu)\n", - key_len, sizeof(hash_key->key)); - return -EINVAL; + if (key) { + if (key_len != sizeof(hash_key->key)) { + pr_err("key len (%hu) doesn't equal the supported size (%zu)\n", + key_len, sizeof(hash_key->key)); + return -EINVAL; + } + memcpy(hash_key->key, key, key_len); + rss->hash_init_val = init_val; + hash_key->keys_num = key_len >> 2; } - - memcpy(hash_key->key, key, key_len); - rss->hash_init_val = init_val; - hash_key->keys_num = key_len >> 2; break; case ENA_ADMIN_CRC32: rss->hash_init_val = init_val; -- cgit v1.2.3 From 2a6e5fa2f4c25b66c763428a3e65363214946931 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:41 +0000 Subject: net: ena: fix uses of round_jiffies() >From the documentation of round_jiffies(): "Rounds a time delta in the future (in jiffies) up or down to (approximately) full seconds. This is useful for timers for which the exact time they fire does not matter too much, as long as they fire approximately every X seconds. By rounding these timers to whole seconds, all such timers will fire at the same time, rather than at various times spread out. The goal of this is to have the CPU wake up less, which saves power." There are 2 parts to this patch: ================================ Part 1: ------- In our case we need timer_service to be called approximately every X=1 seconds, and the exact time does not matter, so using round_jiffies() is the right way to go. Therefore we add round_jiffies() to the mod_timer() in ena_timer_service(). Part 2: ------- round_jiffies() is used in check_for_missing_keep_alive() when getting the jiffies of the expiration of the keep_alive timeout. Here it is actually a mistake to use round_jiffies() because we want the exact time when keep_alive should expire and not an approximate rounded time, which can cause early, false positive, timeouts. Therefore we remove round_jiffies() in the calculation of keep_alive_expired() in check_for_missing_keep_alive(). Fixes: 82ef30f13be0 ("net: ena: add hardware hints capability to the driver") Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 894e8c1a8cf1..0b2fd96b93d7 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -3706,8 +3706,8 @@ static void check_for_missing_keep_alive(struct ena_adapter *adapter) if (adapter->keep_alive_timeout == ENA_HW_HINTS_NO_TIMEOUT) return; - keep_alive_expired = round_jiffies(adapter->last_keep_alive_jiffies + - adapter->keep_alive_timeout); + keep_alive_expired = adapter->last_keep_alive_jiffies + + adapter->keep_alive_timeout; if (unlikely(time_is_before_jiffies(keep_alive_expired))) { netif_err(adapter, drv, adapter->netdev, "Keep alive watchdog timeout.\n"); @@ -3809,7 +3809,7 @@ static void ena_timer_service(struct timer_list *t) } /* Reset the timer */ - mod_timer(&adapter->timer_service, jiffies + HZ); + mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ)); } static int ena_calc_max_io_queue_num(struct pci_dev *pdev, -- cgit v1.2.3 From cf6d17fde93bdda23c9b02dd5906a12bf8c55209 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:42 +0000 Subject: net: ena: add missing ethtool TX timestamping indication Current implementation of the driver calls skb_tx_timestamp()to add a software tx timestamp to the skb, however the software-transmit capability is not reported in ethtool -T. This commit updates the ethtool structure to report the software-transmit capability in ethtool -T using the standard ethtool_op_get_ts_info(). This function reports all software timestamping capabilities (tx and rx), as well as setting phc_index = -1. phc_index is the index of the PTP hardware clock device that will be used for hardware timestamps. Since we don't have such a device in ENA, using the default -1 value is the correct setting. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Ezequiel Lara Gomez Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index b4e891d49a94..08273eb226ee 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -814,6 +814,7 @@ static const struct ethtool_ops ena_ethtool_ops = { .set_channels = ena_set_channels, .get_tunable = ena_get_tunable, .set_tunable = ena_set_tunable, + .get_ts_info = ethtool_op_get_ts_info, }; void ena_set_ethtool_ops(struct net_device *netdev) -- cgit v1.2.3 From 0d1c3de7b8c78a5e44b74b62ede4a63629f5d811 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:43 +0000 Subject: net: ena: fix incorrect default RSS key Bug description: When running "ethtool -x " the key shows up as all zeros. When we use "ethtool -X hfunc toeplitz hkey " to set the key and then try to retrieve it using "ethtool -x " then we return the correct key because we return the one we saved. Bug cause: We don't fetch the key from the device but instead return the key that we have saved internally which is by default set to zero upon allocation. Fix: This commit fixes the issue by initializing the key to a random value using netdev_rss_key_fill(). Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 15 +++++++++++++++ drivers/net/ethernet/amazon/ena/ena_com.h | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index e54c44fdcaa7..d6b894b06fa3 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -1041,6 +1041,19 @@ static int ena_com_get_feature(struct ena_com_dev *ena_dev, feature_ver); } +static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) +{ + struct ena_admin_feature_rss_flow_hash_control *hash_key = + (ena_dev->rss).hash_key; + + netdev_rss_key_fill(&hash_key->key, sizeof(hash_key->key)); + /* The key is stored in the device in u32 array + * as well as the API requires the key to be passed in this + * format. Thus the size of our array should be divided by 4 + */ + hash_key->keys_num = sizeof(hash_key->key) / sizeof(u32); +} + static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) { struct ena_rss *rss = &ena_dev->rss; @@ -2631,6 +2644,8 @@ int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) if (unlikely(rc)) goto err_hash_key; + ena_com_hash_key_fill_default_key(ena_dev); + rc = ena_com_hash_ctrl_init(ena_dev); if (unlikely(rc)) goto err_hash_ctrl; diff --git a/drivers/net/ethernet/amazon/ena/ena_com.h b/drivers/net/ethernet/amazon/ena/ena_com.h index 0ce37d54ed10..9b5bd28ed0ac 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.h +++ b/drivers/net/ethernet/amazon/ena/ena_com.h @@ -44,6 +44,7 @@ #include #include #include +#include #include "ena_common_defs.h" #include "ena_admin_defs.h" -- cgit v1.2.3 From 6a4f7dc82d1e3abd3feb0c60b5041056fcd9880c Mon Sep 17 00:00:00 2001 From: Sameeh Jubran Date: Tue, 11 Feb 2020 15:17:44 +0000 Subject: net: ena: rss: do not allocate key when not supported Currently we allocate the key whether the device supports setting the key or not. This commit adds a check to the allocation function and handles the error accordingly. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index d6b894b06fa3..6f758ece86f6 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -1057,6 +1057,20 @@ static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) static int ena_com_hash_key_allocate(struct ena_com_dev *ena_dev) { struct ena_rss *rss = &ena_dev->rss; + struct ena_admin_feature_rss_flow_hash_control *hash_key; + struct ena_admin_get_feat_resp get_resp; + int rc; + + hash_key = (ena_dev->rss).hash_key; + + rc = ena_com_get_feature_ex(ena_dev, &get_resp, + ENA_ADMIN_RSS_HASH_FUNCTION, + ena_dev->rss.hash_key_dma_addr, + sizeof(ena_dev->rss.hash_key), 0); + if (unlikely(rc)) { + hash_key = NULL; + return -EOPNOTSUPP; + } rss->hash_key = dma_alloc_coherent(ena_dev->dmadev, sizeof(*rss->hash_key), @@ -2640,11 +2654,15 @@ int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 indr_tbl_log_size) if (unlikely(rc)) goto err_indr_tbl; + /* The following function might return unsupported in case the + * device doesn't support setting the key / hash function. We can safely + * ignore this error and have indirection table support only. + */ rc = ena_com_hash_key_allocate(ena_dev); - if (unlikely(rc)) + if (unlikely(rc) && rc != -EOPNOTSUPP) goto err_hash_key; - - ena_com_hash_key_fill_default_key(ena_dev); + else if (rc != -EOPNOTSUPP) + ena_com_hash_key_fill_default_key(ena_dev); rc = ena_com_hash_ctrl_init(ena_dev); if (unlikely(rc)) -- cgit v1.2.3 From 0c8923c0a64fb5d14bebb9a9065d2dc25ac5e600 Mon Sep 17 00:00:00 2001 From: Sameeh Jubran Date: Tue, 11 Feb 2020 15:17:45 +0000 Subject: net: ena: rss: fix failure to get indirection table On old hardware, getting / setting the hash function is not supported while gettting / setting the indirection table is. This commit enables us to still show the indirection table on older hardwares by setting the hash function and key to NULL. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_ethtool.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 08273eb226ee..d44ae4fc9c4e 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -648,7 +648,21 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, if (rc) return rc; + /* We call this function in order to check if the device + * supports getting/setting the hash function. + */ rc = ena_com_get_hash_function(adapter->ena_dev, &ena_func, key); + + if (rc) { + if (rc == -EOPNOTSUPP) { + key = NULL; + hfunc = NULL; + rc = 0; + } + + return rc; + } + if (rc) return rc; -- cgit v1.2.3 From 4844470d472d660c26149ad764da2406adb13423 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:46 +0000 Subject: net: ena: rss: store hash function as values and not bits The device receives, stores and retrieves the hash function value as bits and not as their enum value. The bug: * In ena_com_set_hash_function() we set cmd.u.flow_hash_func.selected_func to the bit value of rss->hash_func. (1 << rss->hash_func) * In ena_com_get_hash_function() we retrieve the hash function and store it's bit value in rss->hash_func. (Now the bit value of rss->hash_func is stored in rss->hash_func instead of it's enum value) The fix: This commit fixes the issue by converting the retrieved hash function values from the device to the matching enum value of the set bit using ffs(). ffs() finds the first set bit's index in a word. Since the function returns 1 for the LSB's index, we need to subtract 1 from the returned value (note that BIT(0) is 1). Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 6f758ece86f6..8ab192cb26b7 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -2370,7 +2370,11 @@ int ena_com_get_hash_function(struct ena_com_dev *ena_dev, if (unlikely(rc)) return rc; - rss->hash_func = get_resp.u.flow_hash_func.selected_func; + /* ffs() returns 1 in case the lsb is set */ + rss->hash_func = ffs(get_resp.u.flow_hash_func.selected_func); + if (rss->hash_func) + rss->hash_func--; + if (func) *func = rss->hash_func; -- cgit v1.2.3 From 92569fd27f5cb0ccbdf7c7d70044b690e89a0277 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:47 +0000 Subject: net: ena: fix incorrectly saving queue numbers when setting RSS indirection table The indirection table has the indices of the Rx queues. When we store it during set indirection operation, we convert the indices to our internal representation of the indices. Our internal representation of the indices is: even indices for Tx and uneven indices for Rx, where every Tx/Rx pair are in a consecutive order starting from 0. For example if the driver has 3 queues (3 for Tx and 3 for Rx) then the indices are as follows: 0 1 2 3 4 5 Tx Rx Tx Rx Tx Rx The BUG: The issue is that when we satisfy a get request for the indirection table, we don't convert the indices back to the original representation. The FIX: Simply apply the inverse function for the indices of the indirection table after we set it. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_ethtool.c | 24 +++++++++++++++++++++++- drivers/net/ethernet/amazon/ena/ena_netdev.h | 2 ++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index d44ae4fc9c4e..9a7a6ef39f9a 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -636,6 +636,28 @@ static u32 ena_get_rxfh_key_size(struct net_device *netdev) return ENA_HASH_KEY_SIZE; } +static int ena_indirection_table_get(struct ena_adapter *adapter, u32 *indir) +{ + struct ena_com_dev *ena_dev = adapter->ena_dev; + int i, rc; + + if (!indir) + return 0; + + rc = ena_com_indirect_table_get(ena_dev, indir); + if (rc) + return rc; + + /* Our internal representation of the indices is: even indices + * for Tx and uneven indices for Rx. We need to convert the Rx + * indices to be consecutive + */ + for (i = 0; i < ENA_RX_RSS_TABLE_SIZE; i++) + indir[i] = ENA_IO_RXQ_IDX_TO_COMBINED_IDX(indir[i]); + + return rc; +} + static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) { @@ -644,7 +666,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, u8 func; int rc; - rc = ena_com_indirect_table_get(adapter->ena_dev, indir); + rc = ena_indirection_table_get(adapter, indir); if (rc) return rc; diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 094324fd0edc..8795e0b1dc3c 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -130,6 +130,8 @@ #define ENA_IO_TXQ_IDX(q) (2 * (q)) #define ENA_IO_RXQ_IDX(q) (2 * (q) + 1) +#define ENA_IO_TXQ_IDX_TO_COMBINED_IDX(q) ((q) / 2) +#define ENA_IO_RXQ_IDX_TO_COMBINED_IDX(q) (((q) - 1) / 2) #define ENA_MGMNT_IRQ_IDX 0 #define ENA_IO_IRQ_FIRST_IDX 1 -- cgit v1.2.3 From e3f89f91e98ce07dc0f121a3b70d21aca749ba39 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:48 +0000 Subject: net: ena: fix corruption of dev_idx_to_host_tbl The function ena_com_ind_tbl_convert_from_device() has an overflow bug as explained below. Either way, this function is not needed at all since we don't retrieve the indirection table from the device at any point which means that this conversion is not needed. The bug: The for loop iterates over all io_sq_queues, when passing the actual number of used queues the io_sq_queues[i].idx equals 0 since they are uninitialized which results in the following code to be executed till the end of the loop: dev_idx_to_host_tbl[0] = i; This results dev_idx_to_host_tbl[0] in being equal to ENA_TOTAL_NUM_QUEUES - 1. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 8ab192cb26b7..74743fd8a1e0 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -1281,30 +1281,6 @@ static int ena_com_ind_tbl_convert_to_device(struct ena_com_dev *ena_dev) return 0; } -static int ena_com_ind_tbl_convert_from_device(struct ena_com_dev *ena_dev) -{ - u16 dev_idx_to_host_tbl[ENA_TOTAL_NUM_QUEUES] = { (u16)-1 }; - struct ena_rss *rss = &ena_dev->rss; - u8 idx; - u16 i; - - for (i = 0; i < ENA_TOTAL_NUM_QUEUES; i++) - dev_idx_to_host_tbl[ena_dev->io_sq_queues[i].idx] = i; - - for (i = 0; i < 1 << rss->tbl_log_size; i++) { - if (rss->rss_ind_tbl[i].cq_idx > ENA_TOTAL_NUM_QUEUES) - return -EINVAL; - idx = (u8)rss->rss_ind_tbl[i].cq_idx; - - if (dev_idx_to_host_tbl[idx] > ENA_TOTAL_NUM_QUEUES) - return -EINVAL; - - rss->host_rss_ind_tbl[i] = dev_idx_to_host_tbl[idx]; - } - - return 0; -} - static void ena_com_update_intr_delay_resolution(struct ena_com_dev *ena_dev, u16 intr_delay_resolution) { @@ -2638,10 +2614,6 @@ int ena_com_indirect_table_get(struct ena_com_dev *ena_dev, u32 *ind_tbl) if (!ind_tbl) return 0; - rc = ena_com_ind_tbl_convert_from_device(ena_dev); - if (unlikely(rc)) - return rc; - for (i = 0; i < (1 << rss->tbl_log_size); i++) ind_tbl[i] = rss->host_rss_ind_tbl[i]; -- cgit v1.2.3 From 470793a78ce344bd53d31e0c2d537f71ba957547 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:49 +0000 Subject: net: ena: make ena rxfh support ETH_RSS_HASH_NO_CHANGE As the name suggests ETH_RSS_HASH_NO_CHANGE is received upon changing the key or indirection table using ethtool while keeping the same hash function. Also add a function for retrieving the current hash function from the ena-com layer. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: Saeed Bshara Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 5 +++++ drivers/net/ethernet/amazon/ena/ena_com.h | 8 ++++++++ drivers/net/ethernet/amazon/ena/ena_ethtool.c | 3 +++ 3 files changed, 16 insertions(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 74743fd8a1e0..0f93d1092435 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -1041,6 +1041,11 @@ static int ena_com_get_feature(struct ena_com_dev *ena_dev, feature_ver); } +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev) +{ + return ena_dev->rss.hash_func; +} + static void ena_com_hash_key_fill_default_key(struct ena_com_dev *ena_dev) { struct ena_admin_feature_rss_flow_hash_control *hash_key = diff --git a/drivers/net/ethernet/amazon/ena/ena_com.h b/drivers/net/ethernet/amazon/ena/ena_com.h index 9b5bd28ed0ac..469f298199a7 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.h +++ b/drivers/net/ethernet/amazon/ena/ena_com.h @@ -656,6 +656,14 @@ int ena_com_rss_init(struct ena_com_dev *ena_dev, u16 log_size); */ void ena_com_rss_destroy(struct ena_com_dev *ena_dev); +/* ena_com_get_current_hash_function - Get RSS hash function + * @ena_dev: ENA communication layer struct + * + * Return the current hash function. + * @return: 0 or one of the ena_admin_hash_functions values. + */ +int ena_com_get_current_hash_function(struct ena_com_dev *ena_dev); + /* ena_com_fill_hash_function - Fill RSS hash function * @ena_dev: ENA communication layer struct * @func: The hash function (Toeplitz or crc) diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 9a7a6ef39f9a..452b0e770370 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -736,6 +736,9 @@ static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, } switch (hfunc) { + case ETH_RSS_HASH_NO_CHANGE: + func = ena_com_get_current_hash_function(ena_dev); + break; case ETH_RSS_HASH_TOP: func = ENA_ADMIN_TOEPLITZ; break; -- cgit v1.2.3 From 886d2089276e40d460731765083a741c5c762461 Mon Sep 17 00:00:00 2001 From: Sameeh Jubran Date: Tue, 11 Feb 2020 15:17:50 +0000 Subject: net: ena: ethtool: use correct value for crc32 hash Up till kernel 4.11 there was no enum defined for crc32 hash in ethtool, thus the xor enum was used for supporting crc32. Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_ethtool.c b/drivers/net/ethernet/amazon/ena/ena_ethtool.c index 452b0e770370..ced1d577b62a 100644 --- a/drivers/net/ethernet/amazon/ena/ena_ethtool.c +++ b/drivers/net/ethernet/amazon/ena/ena_ethtool.c @@ -693,7 +693,7 @@ static int ena_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, func = ETH_RSS_HASH_TOP; break; case ENA_ADMIN_CRC32: - func = ETH_RSS_HASH_XOR; + func = ETH_RSS_HASH_CRC32; break; default: netif_err(adapter, drv, netdev, @@ -742,7 +742,7 @@ static int ena_set_rxfh(struct net_device *netdev, const u32 *indir, case ETH_RSS_HASH_TOP: func = ENA_ADMIN_TOEPLITZ; break; - case ETH_RSS_HASH_XOR: + case ETH_RSS_HASH_CRC32: func = ENA_ADMIN_CRC32; break; default: -- cgit v1.2.3 From c207979f5ae10ed70aff1bb13f39f0736973de99 Mon Sep 17 00:00:00 2001 From: Arthur Kiyanovski Date: Tue, 11 Feb 2020 15:17:51 +0000 Subject: net: ena: ena-com.c: prevent NULL pointer dereference comp_ctx can be NULL in a very rare case when an admin command is executed during the execution of ena_remove(). The bug scenario is as follows: * ena_destroy_device() sets the comp_ctx to be NULL * An admin command is executed before executing unregister_netdev(), this can still happen because our device can still receive callbacks from the netdev infrastructure such as ethtool commands. * When attempting to access the comp_ctx, the bug occurs since it's set to NULL Fix: Added a check that comp_ctx is not NULL Fixes: 1738cd3ed342 ("net: ena: Add a driver for Amazon Elastic Network Adapters (ENA)") Signed-off-by: Sameeh Jubran Signed-off-by: Arthur Kiyanovski Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 0f93d1092435..1fb58f9ad80b 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -200,6 +200,11 @@ static void comp_ctxt_release(struct ena_com_admin_queue *queue, static struct ena_comp_ctx *get_comp_ctxt(struct ena_com_admin_queue *queue, u16 command_id, bool capture) { + if (unlikely(!queue->comp_ctx)) { + pr_err("Completion context is NULL\n"); + return NULL; + } + if (unlikely(command_id >= queue->q_depth)) { pr_err("command id is larger than the queue size. cmd_id: %u queue size %d\n", command_id, queue->q_depth); -- cgit v1.2.3 From 2b3b6497c38d123934de68ea82a247b557d95290 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Mon, 10 Feb 2020 16:15:14 +0800 Subject: ALSA: hda/realtek - Add more codec supported Headset Button Add supported Headset Button for ALC215/ALC285/ALC289. Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/r/948f70b4488f4cc2b629a39ce4e4be33@realtek.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4770fb3f51fb..3ee88adf57e7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5701,8 +5701,11 @@ static void alc_fixup_headset_jack(struct hda_codec *codec, break; case HDA_FIXUP_ACT_INIT: switch (codec->core.vendor_id) { + case 0x10ec0215: case 0x10ec0225: + case 0x10ec0285: case 0x10ec0295: + case 0x10ec0289: case 0x10ec0299: alc_write_coef_idx(codec, 0x48, 0xd011); alc_update_coef_idx(codec, 0x49, 0x007f, 0x0045); -- cgit v1.2.3 From 7dafba3762d6c0083ded00a48f8c1a158bc86717 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 12 Feb 2020 09:10:47 +0100 Subject: ALSA: hda/realtek - Fix silent output on MSI-GL73 MSI-GL73 laptop with ALC1220 codec requires a similar workaround for Clevo laptops to enforce the DAC/mixer connection path. Set up a quirk entry for that. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=204159 Cc: Link: https://lore.kernel.org/r/20200212081047.27727-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 3ee88adf57e7..6c8cb4ce517e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2447,6 +2447,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1071, 0x8258, "Evesham Voyaeger", ALC882_FIXUP_EAPD), SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE), SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), + SND_PCI_QUIRK(0x1462, 0x1276, "MSI-GL73", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x7350, "MSI-7350", ALC889_FIXUP_CD), SND_PCI_QUIRK(0x1462, 0xda57, "MSI Z270-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK_VENDOR(0x1462, "MSI", ALC882_FIXUP_GPIO3), -- cgit v1.2.3 From bab0c318ba3da32483da8aad37b9ef98fd8edafb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 11 Feb 2020 18:40:58 +0100 Subject: KVM: x86: do not reset microcode version on INIT or RESET Do not initialize the microcode version at RESET or INIT, only on vCPU creation. Microcode updates are not lost during INIT, and exact behavior across a warm RESET is not specified by the architecture. Since we do not support a microcode update directly from the hypervisor, but only as a result of userspace setting the microcode version MSR, it's simpler for userspace if we do nothing in KVM and let userspace emulate behavior for RESET as it sees fit. Userspace can tie the fix to the availability of MSR_IA32_UCODE_REV in the list of emulated MSRs. Reported-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a3e32d61d60c..bef0ba35f121 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2175,7 +2175,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u32 dummy; u32 eax = 1; - vcpu->arch.microcode_version = 0x01000065; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; @@ -2266,6 +2265,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) init_vmcb(svm); svm_init_osvw(vcpu); + vcpu->arch.microcode_version = 0x01000065; return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9a6664886f2e..d625b4b0e7b4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4238,7 +4238,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->msr_ia32_umwait_control = 0; - vcpu->arch.microcode_version = 0x100000000ULL; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); @@ -6763,6 +6762,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; + vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; /* -- cgit v1.2.3 From 1f03b2bcd0d7cad4af107339cdef80ed377fe2a8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 7 Feb 2020 16:34:10 +0000 Subject: KVM: Disable preemption in kvm_get_running_vcpu() Accessing a per-cpu variable only makes sense when preemption is disabled (and the kernel does check this when the right debug options are switched on). For kvm_get_running_vcpu(), it is fine to return the value after re-enabling preemption, as the preempt notifiers will make sure that this is kept consistent across task migration (the comment above the function hints at it, but lacks the crucial preemption management). While we're at it, move the comment from the ARM code, which explains why the whole thing works. Fixes: 7495e22bb165 ("KVM: Move running VCPU from ARM to common code"). Cc: Paolo Bonzini Reported-by: Zenghui Yu Tested-by: Zenghui Yu Reviewed-by: Peter Xu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/318984f6-bc36-33a3-abc6-bf2295974b06@huawei.com Message-id: <20200207163410.31276-1-maz@kernel.org> Signed-off-by: Paolo Bonzini --- virt/kvm/arm/vgic/vgic-mmio.c | 12 ------------ virt/kvm/kvm_main.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index d656ebd5f9d4..97fb2a40e6ba 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -179,18 +179,6 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, return value; } -/* - * This function will return the VCPU that performed the MMIO access and - * trapped from within the VM, and will return NULL if this is a userspace - * access. - * - * We can disable preemption locally around accessing the per-CPU variable, - * and use the resolved vcpu pointer after enabling preemption again, because - * even if the current thread is migrated to another CPU, reading the per-CPU - * value later will give us the same value as we update the per-CPU variable - * in the preempt notifier handlers. - */ - /* Must be called with irq->irq_lock held */ static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, bool is_uaccess) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 67ae2d5c37b2..70f03ce0e5c1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4409,12 +4409,22 @@ static void kvm_sched_out(struct preempt_notifier *pn, /** * kvm_get_running_vcpu - get the vcpu running on the current CPU. - * Thanks to preempt notifiers, this can also be called from - * preemptible context. + * + * We can disable preemption locally around accessing the per-CPU variable, + * and use the resolved vcpu pointer after enabling preemption again, + * because even if the current thread is migrated to another CPU, reading + * the per-CPU value later will give us the same value as we update the + * per-CPU variable in the preempt notifier handlers. */ struct kvm_vcpu *kvm_get_running_vcpu(void) { - return __this_cpu_read(kvm_running_vcpu); + struct kvm_vcpu *vcpu; + + preempt_disable(); + vcpu = __this_cpu_read(kvm_running_vcpu); + preempt_enable(); + + return vcpu; } /** -- cgit v1.2.3 From 9556e5c7c40e3d2e7c9417d1e766e3bd88b598e5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 24 Jan 2020 19:22:55 +0000 Subject: drm/i915: Stub out i915_gpu_coredump_put i915_gpu_coreddump_put is currently only defined if CONFIG_DRM_I915_CAPTURE_ERROR is enabled, provide a stub otherwise. Reported-by: Mike Lothian Fixes: 742379c0c400 ("drm/i915: Start chopping up the GPU error capture") Signed-off-by: Chris Wilson Cc: Mike Lothian Cc: Andi Shyti Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20200124192255.541355-1-chris@chris-wilson.co.uk (cherry picked from commit 7e36505d0cf82f2920f2fd22ebb14a8b540396a3) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gpu_error.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 41c1475e1500..e4a6afed3bbf 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -318,6 +318,10 @@ i915_error_state_store(struct i915_gpu_coredump *error) { } +static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu) +{ +} + static inline struct i915_gpu_coredump * i915_first_error_state(struct drm_i915_private *i915) { -- cgit v1.2.3 From 7c34bb03983e3c1e42ad2749514dec9e5a19c336 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sun, 26 Jan 2020 10:23:43 +0000 Subject: drm/i915: Tighten atomicity of i915_active_acquire vs i915_active_release As we use a mutex to serialise the first acquire (as it may be a lengthy operation), but only an atomic decrement for the release, we have to be careful in case a second thread races and completes both acquire/release as the first finishes its acquire. Thread A Thread B i915_active_acquire i915_active_acquire atomic_read() == 0 atomic_read() == 0 mutex_lock() mutex_lock() atomic_read() == 0 ref->active(); atomic_inc() mutex_unlock() atomic_read() == 1 i915_active_release atomic_dec_and_test() -> 0 ref->retire() atomic_inc() -> 1 mutex_unlock() So thread A has acquired the ref->active_count but since the ref was still active at the time, it did not initialise it. By switching the check inside the mutex to an atomic increment only if already active, we close the race. Fixes: c9ad602feabe ("drm/i915: Split i915_active.mutex into an irq-safe spinlock for the rbtree") Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200126102346.1877661-3-chris@chris-wilson.co.uk (cherry picked from commit ac0e331a628b5ded087eab09fad2ffb082ac61ba) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_active.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index f3da5c06f331..4fcd567ff818 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -416,13 +416,15 @@ int i915_active_acquire(struct i915_active *ref) if (err) return err; - if (!atomic_read(&ref->count) && ref->active) - err = ref->active(ref); - if (!err) { - spin_lock_irq(&ref->tree_lock); /* vs __active_retire() */ - debug_active_activate(ref); - atomic_inc(&ref->count); - spin_unlock_irq(&ref->tree_lock); + if (likely(!i915_active_acquire_if_busy(ref))) { + if (ref->active) + err = ref->active(ref); + if (!err) { + spin_lock_irq(&ref->tree_lock); /* __active_retire() */ + debug_active_activate(ref); + atomic_inc(&ref->count); + spin_unlock_irq(&ref->tree_lock); + } } mutex_unlock(&ref->mutex); -- cgit v1.2.3 From 5b92415e64e145e7da60420ead66b62aa41917bf Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 27 Jan 2020 15:28:29 +0000 Subject: drm/i915/gt: Acquire ce->active before ce->pin_count/ce->pin_mutex Similar to commit ac0e331a628b ("drm/i915: Tighten atomicity of i915_active_acquire vs i915_active_release") we have the same race of trying to pin the context underneath a mutex while allowing the decrement to be atomic outside of that mutex. This leads to the problem where two threads may simultaneously try to pin the context and the second not notice that they needed to repin the context. <2> [198.669621] kernel BUG at drivers/gpu/drm/i915/gt/intel_timeline.c:387! <4> [198.669703] invalid opcode: 0000 [#1] PREEMPT SMP PTI <4> [198.669712] CPU: 0 PID: 1246 Comm: gem_exec_create Tainted: G U W 5.5.0-rc6-CI-CI_DRM_7755+ #1 <4> [198.669723] Hardware name: /NUC7i5BNB, BIOS BNKBL357.86A.0054.2017.1025.1822 10/25/2017 <4> [198.669776] RIP: 0010:timeline_advance+0x7b/0xe0 [i915] <4> [198.669785] Code: 00 48 c7 c2 10 f1 46 a0 48 c7 c7 70 1b 32 a0 e8 bb dd e7 e0 bf 01 00 00 00 e8 d1 af e7 e0 31 f6 bf 09 00 00 00 e8 35 ef d8 e0 <0f> 0b 48 c7 c1 48 fa 49 a0 ba 84 01 00 00 48 c7 c6 10 f1 46 a0 48 <4> [198.669803] RSP: 0018:ffffc900004c3a38 EFLAGS: 00010296 <4> [198.669810] RAX: ffff888270b35140 RBX: ffff88826f32ee00 RCX: 0000000000000006 <4> [198.669818] RDX: 00000000000017c5 RSI: 0000000000000000 RDI: 0000000000000009 <4> [198.669826] RBP: ffffc900004c3a64 R08: 0000000000000000 R09: 0000000000000000 <4> [198.669834] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88826f9b5980 <4> [198.669841] R13: 0000000000000cc0 R14: ffffc900004c3dc0 R15: ffff888253610068 <4> [198.669849] FS: 00007f63e663fe40(0000) GS:ffff888276c00000(0000) knlGS:0000000000000000 <4> [198.669857] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4> [198.669864] CR2: 00007f171f8e39a8 CR3: 000000026b1f6005 CR4: 00000000003606f0 <4> [198.669872] Call Trace: <4> [198.669924] intel_timeline_get_seqno+0x12/0x40 [i915] <4> [198.669977] __i915_request_create+0x76/0x5a0 [i915] <4> [198.670024] i915_request_create+0x86/0x1c0 [i915] <4> [198.670068] i915_gem_do_execbuffer+0xbf2/0x2500 [i915] <4> [198.670082] ? __lock_acquire+0x460/0x15d0 <4> [198.670128] i915_gem_execbuffer2_ioctl+0x11f/0x470 [i915] <4> [198.670171] ? i915_gem_execbuffer_ioctl+0x300/0x300 [i915] <4> [198.670181] drm_ioctl_kernel+0xa7/0xf0 <4> [198.670188] drm_ioctl+0x2e1/0x390 <4> [198.670233] ? i915_gem_execbuffer_ioctl+0x300/0x300 [i915] Fixes: 841350223816 ("drm/i915/gt: Drop mutex serialisation between context pin/unpin") References: ac0e331a628b ("drm/i915: Tighten atomicity of i915_active_acquire vs i915_active_release") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200127152829.2842149-1-chris@chris-wilson.co.uk (cherry picked from commit e5429340bfa2dc43a07c3329e0c30cdae4cc0b35) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_context.c | 46 ++++++++++++++++++--------------- drivers/gpu/drm/i915/i915_active.h | 6 +++++ 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index 23137b2a8689..57e8a051ddc2 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -67,21 +67,18 @@ static int intel_context_active_acquire(struct intel_context *ce) { int err; - err = i915_active_acquire(&ce->active); - if (err) - return err; + __i915_active_acquire(&ce->active); + + if (intel_context_is_barrier(ce)) + return 0; /* Preallocate tracking nodes */ - if (!intel_context_is_barrier(ce)) { - err = i915_active_acquire_preallocate_barrier(&ce->active, - ce->engine); - if (err) { - i915_active_release(&ce->active); - return err; - } - } + err = i915_active_acquire_preallocate_barrier(&ce->active, + ce->engine); + if (err) + i915_active_release(&ce->active); - return 0; + return err; } static void intel_context_active_release(struct intel_context *ce) @@ -101,13 +98,19 @@ int __intel_context_do_pin(struct intel_context *ce) return err; } - if (mutex_lock_interruptible(&ce->pin_mutex)) - return -EINTR; + err = i915_active_acquire(&ce->active); + if (err) + return err; + + if (mutex_lock_interruptible(&ce->pin_mutex)) { + err = -EINTR; + goto out_release; + } - if (likely(!atomic_read(&ce->pin_count))) { + if (likely(!atomic_add_unless(&ce->pin_count, 1, 0))) { err = intel_context_active_acquire(ce); if (unlikely(err)) - goto err; + goto out_unlock; err = ce->ops->pin(ce); if (unlikely(err)) @@ -117,18 +120,19 @@ int __intel_context_do_pin(struct intel_context *ce) ce->ring->head, ce->ring->tail); smp_mb__before_atomic(); /* flush pin before it is visible */ + atomic_inc(&ce->pin_count); } - atomic_inc(&ce->pin_count); GEM_BUG_ON(!intel_context_is_pinned(ce)); /* no overflow! */ - - mutex_unlock(&ce->pin_mutex); - return 0; + GEM_BUG_ON(i915_active_is_idle(&ce->active)); + goto out_unlock; err_active: intel_context_active_release(ce); -err: +out_unlock: mutex_unlock(&ce->pin_mutex); +out_release: + i915_active_release(&ce->active); return err; } diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h index b571f675c795..51e1e854ca55 100644 --- a/drivers/gpu/drm/i915/i915_active.h +++ b/drivers/gpu/drm/i915/i915_active.h @@ -188,6 +188,12 @@ int i915_active_acquire(struct i915_active *ref); bool i915_active_acquire_if_busy(struct i915_active *ref); void i915_active_release(struct i915_active *ref); +static inline void __i915_active_acquire(struct i915_active *ref) +{ + GEM_BUG_ON(!atomic_read(&ref->count)); + atomic_inc(&ref->count); +} + static inline bool i915_active_is_idle(const struct i915_active *ref) { -- cgit v1.2.3 From 52144db1309897f279b53e8df8a0d17e1cda7960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Wed, 29 Jan 2020 15:23:45 -0800 Subject: drm/i915: Fix preallocated barrier list append MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the first and the last nodes were being added to ref->preallocated_barriers. Renaming variables to make it more easy to read. Fixes: 841350223816 ("drm/i915/gt: Drop mutex serialisation between context pin/unpin") Cc: Chris Wilson Cc: Maarten Lankhorst Signed-off-by: José Roberto de Souza Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20200129232345.84512-1-jose.souza@intel.com (cherry picked from commit d4c3c0b8221a72107eaf35c80c40716b81ca463e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_active.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 4fcd567ff818..b0a499753526 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -607,7 +607,7 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref, struct intel_engine_cs *engine) { intel_engine_mask_t tmp, mask = engine->mask; - struct llist_node *pos = NULL, *next; + struct llist_node *first = NULL, *last = NULL; struct intel_gt *gt = engine->gt; int err; @@ -625,6 +625,7 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref, */ for_each_engine_masked(engine, gt, mask, tmp) { u64 idx = engine->kernel_context->timeline->fence_context; + struct llist_node *prev = first; struct active_node *node; node = reuse_idle_barrier(ref, idx); @@ -658,23 +659,23 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref, GEM_BUG_ON(rcu_access_pointer(node->base.fence) != ERR_PTR(-EAGAIN)); GEM_BUG_ON(barrier_to_engine(node) != engine); - next = barrier_to_ll(node); - next->next = pos; - if (!pos) - pos = next; + first = barrier_to_ll(node); + first->next = prev; + if (!last) + last = first; intel_engine_pm_get(engine); } GEM_BUG_ON(!llist_empty(&ref->preallocated_barriers)); - llist_add_batch(next, pos, &ref->preallocated_barriers); + llist_add_batch(first, last, &ref->preallocated_barriers); return 0; unwind: - while (pos) { - struct active_node *node = barrier_from_ll(pos); + while (first) { + struct active_node *node = barrier_from_ll(first); - pos = pos->next; + first = first->next; atomic_dec(&ref->count); intel_engine_pm_put(barrier_to_engine(node)); -- cgit v1.2.3 From 2933803bdcd8ac67c0b97a0bb158e0762d5ae236 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 30 Jan 2020 14:39:31 +0000 Subject: drm/i915/gem: Tighten checks and acquiring the mmap object Make sure we hold the rcu lock as we acquire the rcu protected reference of the object when looking it up from the associated mmap vma. Closes: https://gitlab.freedesktop.org/drm/intel/issues/1083 Fixes: cc662126b413 ("drm/i915: Introduce DRM_I915_GEM_MMAP_OFFSET") Signed-off-by: Chris Wilson Cc: Abdiel Janulgue Cc: Matthew Auld Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20200130143931.1906301-1-chris@chris-wilson.co.uk (cherry picked from commit 280d14a69da2e71f43408537c008f2775d5e5360) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 39 +++++++++--------------------- drivers/gpu/drm/i915/gem/i915_gem_object.h | 12 +++++++-- 2 files changed, 21 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index e9be2508c04f..0b6a442108de 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -807,60 +807,43 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) struct drm_vma_offset_node *node; struct drm_file *priv = filp->private_data; struct drm_device *dev = priv->minor->dev; + struct drm_i915_gem_object *obj = NULL; struct i915_mmap_offset *mmo = NULL; - struct drm_gem_object *obj = NULL; struct file *anon; if (drm_dev_is_unplugged(dev)) return -ENODEV; + rcu_read_lock(); drm_vma_offset_lock_lookup(dev->vma_offset_manager); node = drm_vma_offset_exact_lookup_locked(dev->vma_offset_manager, vma->vm_pgoff, vma_pages(vma)); - if (likely(node)) { - mmo = container_of(node, struct i915_mmap_offset, - vma_node); - /* - * In our dependency chain, the drm_vma_offset_node - * depends on the validity of the mmo, which depends on - * the gem object. However the only reference we have - * at this point is the mmo (as the parent of the node). - * Try to check if the gem object was at least cleared. - */ - if (!mmo || !mmo->obj) { - drm_vma_offset_unlock_lookup(dev->vma_offset_manager); - return -EINVAL; - } + if (node && drm_vma_node_is_allowed(node, priv)) { /* * Skip 0-refcnted objects as it is in the process of being * destroyed and will be invalid when the vma manager lock * is released. */ - obj = &mmo->obj->base; - if (!kref_get_unless_zero(&obj->refcount)) - obj = NULL; + mmo = container_of(node, struct i915_mmap_offset, vma_node); + obj = i915_gem_object_get_rcu(mmo->obj); } drm_vma_offset_unlock_lookup(dev->vma_offset_manager); + rcu_read_unlock(); if (!obj) - return -EINVAL; - - if (!drm_vma_node_is_allowed(node, priv)) { - drm_gem_object_put_unlocked(obj); - return -EACCES; - } + return node ? -EACCES : -EINVAL; - if (i915_gem_object_is_readonly(to_intel_bo(obj))) { + if (i915_gem_object_is_readonly(obj)) { if (vma->vm_flags & VM_WRITE) { - drm_gem_object_put_unlocked(obj); + i915_gem_object_put(obj); return -EINVAL; } vma->vm_flags &= ~VM_MAYWRITE; } - anon = mmap_singleton(to_i915(obj->dev)); + anon = mmap_singleton(to_i915(dev)); if (IS_ERR(anon)) { - drm_gem_object_put_unlocked(obj); + i915_gem_object_put(obj); return PTR_ERR(anon); } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index db70a3306e59..9c86f2dea947 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -69,6 +69,15 @@ i915_gem_object_lookup_rcu(struct drm_file *file, u32 handle) return idr_find(&file->object_idr, handle); } +static inline struct drm_i915_gem_object * +i915_gem_object_get_rcu(struct drm_i915_gem_object *obj) +{ + if (obj && !kref_get_unless_zero(&obj->base.refcount)) + obj = NULL; + + return obj; +} + static inline struct drm_i915_gem_object * i915_gem_object_lookup(struct drm_file *file, u32 handle) { @@ -76,8 +85,7 @@ i915_gem_object_lookup(struct drm_file *file, u32 handle) rcu_read_lock(); obj = i915_gem_object_lookup_rcu(file, handle); - if (obj && !kref_get_unless_zero(&obj->base.refcount)) - obj = NULL; + obj = i915_gem_object_get_rcu(obj); rcu_read_unlock(); return obj; -- cgit v1.2.3 From 307f1cfa269657c63cfe2c932386fcc24684d9dd Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 7 Feb 2020 02:36:04 -0800 Subject: KVM: x86: Mask off reserved bit from #DB exception payload KVM defines the #DB payload as compatible with the 'pending debug exceptions' field under VMX, not DR6. Mask off bit 12 when applying the payload to DR6, as it is reserved on DR6 but not the 'pending debug exceptions' field. Fixes: f10c729ff965 ("kvm: vmx: Defer setting of DR6 until #DB delivery") Signed-off-by: Oliver Upton Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fbabb2f06273..95b753dab207 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -438,6 +438,14 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * for #DB exceptions under VMX. */ vcpu->arch.dr6 ^= payload & DR6_RTM; + + /* + * The #DB payload is defined as compatible with the 'pending + * debug exceptions' field under VMX, not DR6. While bit 12 is + * defined in the 'pending debug exceptions' field (enabled + * breakpoint), it is reserved and must be zero in DR6. + */ + vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: vcpu->arch.cr2 = payload; -- cgit v1.2.3 From 684c0422da71da0cd81319c90b8099b563b13da4 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 7 Feb 2020 02:36:05 -0800 Subject: KVM: nVMX: Handle pending #DB when injecting INIT VM-exit SDM 27.3.4 states that the 'pending debug exceptions' VMCS field will be populated if a VM-exit caused by an INIT signal takes priority over a debug-trap. Emulate this behavior when synthesizing an INIT signal VM-exit into L1. Fixes: 4b9852f4f389 ("KVM: x86: Fix INIT signal handling in various CPU states") Signed-off-by: Oliver Upton Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 657c2eda357c..1586aaae3a6f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3575,6 +3575,33 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); } +/* + * Returns true if a debug trap is pending delivery. + * + * In KVM, debug traps bear an exception payload. As such, the class of a #DB + * exception may be inferred from the presence of an exception payload. + */ +static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.exception.pending && + vcpu->arch.exception.nr == DB_VECTOR && + vcpu->arch.exception.payload; +} + +/* + * Certain VM-exits set the 'pending debug exceptions' field to indicate a + * recognized #DB (data or single-step) that has yet to be delivered. Since KVM + * represents these debug traps with a payload that is said to be compatible + * with the 'pending debug exceptions' field, write the payload to the VMCS + * field if a VM-exit is delivered before the debug trap. + */ +static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) +{ + if (vmx_pending_dbg_trap(vcpu)) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vcpu->arch.exception.payload); +} + static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3587,6 +3614,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; + nested_vmx_update_pending_dbg(vcpu); clear_bit(KVM_APIC_INIT, &apic->pending_events); nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); return 0; -- cgit v1.2.3 From a06230b62b898e51cfb1de256b2042a09a691f58 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 7 Feb 2020 02:36:06 -0800 Subject: KVM: x86: Deliver exception payload on KVM_GET_VCPU_EVENTS KVM allows the deferral of exception payloads when a vCPU is in guest mode to allow the L1 hypervisor to intercept certain events (#PF, #DB) before register state has been modified. However, this behavior is incompatible with the KVM_{GET,SET}_VCPU_EVENTS ABI, as userspace expects register state to have been immediately modified. Userspace may opt-in for the payload deferral behavior with the KVM_CAP_EXCEPTION_PAYLOAD per-VM capability. As such, kvm_multiple_exception() will immediately manipulate guest registers if the capability hasn't been requested. Since the deferral is only necessary if a userspace ioctl were to be serviced at the same as a payload bearing exception is recognized, this behavior can be relaxed. Instead, opportunistically defer the payload from kvm_multiple_exception() and deliver the payload before completing a KVM_GET_VCPU_EVENTS ioctl. Signed-off-by: Oliver Upton Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 95b753dab207..4d3310df1758 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -498,19 +498,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; - /* - * In guest mode, payload delivery should be deferred, - * so that the L1 hypervisor can intercept #PF before - * CR2 is modified (or intercept #DB before DR6 is - * modified under nVMX). However, for ABI - * compatibility with KVM_GET_VCPU_EVENTS and - * KVM_SET_VCPU_EVENTS, we can't delay payload - * delivery unless userspace has enabled this - * functionality via the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD. - */ - if (!vcpu->kvm->arch.exception_payload_enabled || - !is_guest_mode(vcpu)) + if (!is_guest_mode(vcpu)) kvm_deliver_exception_payload(vcpu); return; } @@ -3803,6 +3791,21 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, { process_nmi(vcpu); + /* + * In guest mode, payload delivery should be deferred, + * so that the L1 hypervisor can intercept #PF before + * CR2 is modified (or intercept #DB before DR6 is + * modified under nVMX). Unless the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of + * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we + * opportunistically defer the exception payload, deliver it if the + * capability hasn't been requested before processing a + * KVM_GET_VCPU_EVENTS. + */ + if (!vcpu->kvm->arch.exception_payload_enabled && + vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) + kvm_deliver_exception_payload(vcpu); + /* * The API doesn't provide the instruction length for software * exceptions, so don't report them. As long as the guest RIP -- cgit v1.2.3 From f65b9dba5733de8e285cf9d7e8672b46dd0cb709 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 10:13:25 -0300 Subject: tools headers uapi: Sync linux/fscrypt.h with the kernel sources To pick the changes from: e933adde6f97 ("fscrypt: include in UAPI header") 93edd392cad7 ("fscrypt: support passing a keyring key to FS_IOC_ADD_ENCRYPTION_KEY") That don't trigger any changes in tooling. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/fscrypt.h' differs from latest version at 'include/uapi/linux/fscrypt.h' diff -u tools/include/uapi/linux/fscrypt.h include/uapi/linux/fscrypt.h Cc: Adrian Hunter Cc: Eric Biggers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fscrypt.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/fscrypt.h b/tools/include/uapi/linux/fscrypt.h index 1beb174ad950..0d8a6f47711c 100644 --- a/tools/include/uapi/linux/fscrypt.h +++ b/tools/include/uapi/linux/fscrypt.h @@ -8,6 +8,7 @@ #ifndef _UAPI_LINUX_FSCRYPT_H #define _UAPI_LINUX_FSCRYPT_H +#include #include /* Encryption policy flags */ @@ -109,11 +110,22 @@ struct fscrypt_key_specifier { } u; }; +/* + * Payload of Linux keyring key of type "fscrypt-provisioning", referenced by + * fscrypt_add_key_arg::key_id as an alternative to fscrypt_add_key_arg::raw. + */ +struct fscrypt_provisioning_key_payload { + __u32 type; + __u32 __reserved; + __u8 raw[]; +}; + /* Struct passed to FS_IOC_ADD_ENCRYPTION_KEY */ struct fscrypt_add_key_arg { struct fscrypt_key_specifier key_spec; __u32 raw_size; - __u32 __reserved[9]; + __u32 key_id; + __u32 __reserved[8]; __u8 raw[]; }; -- cgit v1.2.3 From 365f9cc195a7fae8ac541129cd2a31ad87e46221 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 10:25:27 -0300 Subject: tools headers UAPI: Sync drm/i915_drm.h with the kernel sources To pick the change in: cc662126b413 ("drm/i915: Introduce DRM_I915_GEM_MMAP_OFFSET") That don't result in any changes in tooling, just silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/drm/i915_drm.h' differs from latest version at 'include/uapi/drm/i915_drm.h' diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Abdiel Janulgue Cc: Adrian Hunter Cc: Chris Wilson Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index 5400d7e057f1..829c0a48577f 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -395,6 +395,7 @@ typedef struct _drm_i915_sarea { #define DRM_IOCTL_I915_GEM_PWRITE DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_PWRITE, struct drm_i915_gem_pwrite) #define DRM_IOCTL_I915_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP, struct drm_i915_gem_mmap) #define DRM_IOCTL_I915_GEM_MMAP_GTT DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP_GTT, struct drm_i915_gem_mmap_gtt) +#define DRM_IOCTL_I915_GEM_MMAP_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MMAP_GTT, struct drm_i915_gem_mmap_offset) #define DRM_IOCTL_I915_GEM_SET_DOMAIN DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_SET_DOMAIN, struct drm_i915_gem_set_domain) #define DRM_IOCTL_I915_GEM_SW_FINISH DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_SW_FINISH, struct drm_i915_gem_sw_finish) #define DRM_IOCTL_I915_GEM_SET_TILING DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_SET_TILING, struct drm_i915_gem_set_tiling) @@ -793,6 +794,37 @@ struct drm_i915_gem_mmap_gtt { __u64 offset; }; +struct drm_i915_gem_mmap_offset { + /** Handle for the object being mapped. */ + __u32 handle; + __u32 pad; + /** + * Fake offset to use for subsequent mmap call + * + * This is a fixed-size type for 32/64 compatibility. + */ + __u64 offset; + + /** + * Flags for extended behaviour. + * + * It is mandatory that one of the MMAP_OFFSET types + * (GTT, WC, WB, UC, etc) should be included. + */ + __u64 flags; +#define I915_MMAP_OFFSET_GTT 0 +#define I915_MMAP_OFFSET_WC 1 +#define I915_MMAP_OFFSET_WB 2 +#define I915_MMAP_OFFSET_UC 3 + + /* + * Zero-terminated chain of extensions. + * + * No current extensions defined; mbz. + */ + __u64 extensions; +}; + struct drm_i915_gem_set_domain { /** Handle for the object */ __u32 handle; -- cgit v1.2.3 From df5a5f3cf24608457bb5e57297dd9f0d528be58f Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 13 Dec 2019 21:54:15 +0800 Subject: perf tools: Add arm64 version of get_cpuid() Add an arm64 version of get_cpuid(), which is used for various annotation and headers - for example, I now get the CPUID in "perf report --header", as shown in this snippet: # hostname : ubuntu # os release : 5.5.0-rc1-dirty # perf version : 5.5.rc1.gbf8a13dc9851 # arch : aarch64 # nrcpus online : 96 # nrcpus avail : 96 # cpuid : 0x00000000480fd010 Since much of the code to read the MIDR is already in get_cpuid_str(), factor out this code. Tester notes: I tested this patch on my new ARM64 Kunpeng 920 server. [root@node1 zsk]# ./perf --version perf version 5.6.rc1.g2cdb955b7252 Both perf list and perf stat can work. Signed-off-by: John Garry Tested-by: Shaokun Zhang Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxarm@huawei.com Link: http://lore.kernel.org/lkml/1576245255-210926-1-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/arm64/util/header.c | 63 ++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/tools/perf/arch/arm64/util/header.c b/tools/perf/arch/arm64/util/header.c index a32e4b72a98f..d730666ab95d 100644 --- a/tools/perf/arch/arm64/util/header.c +++ b/tools/perf/arch/arm64/util/header.c @@ -1,8 +1,10 @@ #include #include #include +#include #include #include +#include #include "debug.h" #include "header.h" @@ -12,26 +14,21 @@ #define MIDR_VARIANT_SHIFT 20 #define MIDR_VARIANT_MASK (0xf << MIDR_VARIANT_SHIFT) -char *get_cpuid_str(struct perf_pmu *pmu) +static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) { - char *buf = NULL; - char path[PATH_MAX]; const char *sysfs = sysfs__mountpoint(); - int cpu; u64 midr = 0; - struct perf_cpu_map *cpus; - FILE *file; + int cpu; - if (!sysfs || !pmu || !pmu->cpus) - return NULL; + if (!sysfs || sz < MIDR_SIZE) + return EINVAL; - buf = malloc(MIDR_SIZE); - if (!buf) - return NULL; + cpus = perf_cpu_map__get(cpus); - /* read midr from list of cpus mapped to this pmu */ - cpus = perf_cpu_map__get(pmu->cpus); for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { + char path[PATH_MAX]; + FILE *file; + scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d"MIDR, sysfs, cpus->map[cpu]); @@ -57,12 +54,48 @@ char *get_cpuid_str(struct perf_pmu *pmu) break; } - if (!midr) { + perf_cpu_map__put(cpus); + + if (!midr) + return EINVAL; + + return 0; +} + +int get_cpuid(char *buf, size_t sz) +{ + struct perf_cpu_map *cpus = perf_cpu_map__new(NULL); + int ret; + + if (!cpus) + return EINVAL; + + ret = _get_cpuid(buf, sz, cpus); + + perf_cpu_map__put(cpus); + + return ret; +} + +char *get_cpuid_str(struct perf_pmu *pmu) +{ + char *buf = NULL; + int res; + + if (!pmu || !pmu->cpus) + return NULL; + + buf = malloc(MIDR_SIZE); + if (!buf) + return NULL; + + /* read midr from list of cpus mapped to this pmu */ + res = _get_cpuid(buf, MIDR_SIZE, pmu->cpus); + if (res) { pr_err("failed to get cpuid string for PMU %s\n", pmu->name); free(buf); buf = NULL; } - perf_cpu_map__put(cpus); return buf; } -- cgit v1.2.3 From 47f8d94ac5241e30a5ab0b6b91b963b54765ec7c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 10:53:06 -0300 Subject: tools headers UAPI: Sync asm-generic/mman-common.h with the kernel To pick the changes from: d41938d2cbee ("mm: Reserve asm-generic prot flags 0x10 and 0x20 for arch use") No changes in tooling, just a rebuild as files needed got touched. This addresses the following perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/mman-common.h' differs from latest version at 'include/uapi/asm-generic/mman-common.h' diff -u tools/include/uapi/asm-generic/mman-common.h include/uapi/asm-generic/mman-common.h Cc: Adrian Hunter Cc: Dave Martin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Will Deacon Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/mman-common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index c160a5354eb6..f94f65d429be 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -11,6 +11,8 @@ #define PROT_WRITE 0x2 /* page can be written */ #define PROT_EXEC 0x4 /* page can be executed */ #define PROT_SEM 0x8 /* page may be used for atomic ops */ +/* 0x10 reserved for arch-specific use */ +/* 0x20 reserved for arch-specific use */ #define PROT_NONE 0x0 /* page can not be accessed */ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ -- cgit v1.2.3 From 8c65582f82ee736b63b3c0cd9c7c5b4572f1f4d6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 11:04:23 -0300 Subject: tools include UAPI: Sync sound/asound.h copy Picking the changes from: 46b770f720bd ("ALSA: uapi: Fix sparse warning") a103a3989993 ("ALSA: control: Fix incompatible protocol error") bd3eb4e87eb3 ("ALSA: ctl: bump protocol version up to v2.1.0") ff16351e3f30 ("ALSA: ctl: remove dimen member from elem_info structure") 542283566679 ("ALSA: ctl: remove unused macro for timestamping of elem_value") 7fd7d6c50451 ("ALSA: uapi: Fix typos and header inclusion in asound.h") 1cfaef961703 ("ALSA: bump uapi version numbers") 80fe7430c708 ("ALSA: add new 32-bit layout for snd_pcm_mmap_status/control") 07094ae6f952 ("ALSA: Avoid using timespec for struct snd_timer_tread") d9e5582c4bb2 ("ALSA: Avoid using timespec for struct snd_rawmidi_status") 3ddee7f88aaf ("ALSA: Avoid using timespec for struct snd_pcm_status") a4e7dd35b9da ("ALSA: Avoid using timespec for struct snd_ctl_elem_value") a07804cc7472 ("ALSA: Avoid using timespec for struct snd_timer_status") Which entails no changes in the tooling side. To silence this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/sound/asound.h' differs from latest version at 'include/uapi/sound/asound.h' diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Adrian Hunter Cc: Arnd Bergmann Cc: Baolin Wang Cc: Jiri Olsa Cc: Namhyung Kim Cc: Ranjani Sridharan Cc: Takashi Iwai Cc: Takashi Sakamoto Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 155 ++++++++++++++++++++++++++++++++------ 1 file changed, 132 insertions(+), 23 deletions(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index df1153cea0b7..535a7229e1d9 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -26,7 +26,9 @@ #if defined(__KERNEL__) || defined(__linux__) #include +#include #else +#include #include #endif @@ -154,7 +156,7 @@ struct snd_hwdep_dsp_image { * * *****************************************************************************/ -#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 14) +#define SNDRV_PCM_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 15) typedef unsigned long snd_pcm_uframes_t; typedef signed long snd_pcm_sframes_t; @@ -301,7 +303,9 @@ typedef int __bitwise snd_pcm_subformat_t; #define SNDRV_PCM_INFO_DRAIN_TRIGGER 0x40000000 /* internal kernel flag - trigger in drain */ #define SNDRV_PCM_INFO_FIFO_IN_FRAMES 0x80000000 /* internal kernel flag - FIFO size is in frames */ - +#if (__BITS_PER_LONG == 32 && defined(__USE_TIME_BITS64)) || defined __KERNEL__ +#define __SND_STRUCT_TIME64 +#endif typedef int __bitwise snd_pcm_state_t; #define SNDRV_PCM_STATE_OPEN ((__force snd_pcm_state_t) 0) /* stream is open */ @@ -317,8 +321,17 @@ typedef int __bitwise snd_pcm_state_t; enum { SNDRV_PCM_MMAP_OFFSET_DATA = 0x00000000, - SNDRV_PCM_MMAP_OFFSET_STATUS = 0x80000000, - SNDRV_PCM_MMAP_OFFSET_CONTROL = 0x81000000, + SNDRV_PCM_MMAP_OFFSET_STATUS_OLD = 0x80000000, + SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD = 0x81000000, + SNDRV_PCM_MMAP_OFFSET_STATUS_NEW = 0x82000000, + SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW = 0x83000000, +#ifdef __SND_STRUCT_TIME64 + SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_NEW, + SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW, +#else + SNDRV_PCM_MMAP_OFFSET_STATUS = SNDRV_PCM_MMAP_OFFSET_STATUS_OLD, + SNDRV_PCM_MMAP_OFFSET_CONTROL = SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD, +#endif }; union snd_pcm_sync_id { @@ -456,8 +469,13 @@ enum { SNDRV_PCM_AUDIO_TSTAMP_TYPE_LAST = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK_SYNCHRONIZED }; +#ifndef __KERNEL__ +/* explicit padding avoids incompatibility between i386 and x86-64 */ +typedef struct { unsigned char pad[sizeof(time_t) - sizeof(int)]; } __time_pad; + struct snd_pcm_status { snd_pcm_state_t state; /* stream state */ + __time_pad pad1; /* align to timespec */ struct timespec trigger_tstamp; /* time when stream was started/stopped/paused */ struct timespec tstamp; /* reference timestamp */ snd_pcm_uframes_t appl_ptr; /* appl ptr */ @@ -473,17 +491,48 @@ struct snd_pcm_status { __u32 audio_tstamp_accuracy; /* in ns units, only valid if indicated in audio_tstamp_data */ unsigned char reserved[52-2*sizeof(struct timespec)]; /* must be filled with zero */ }; +#endif + +/* + * For mmap operations, we need the 64-bit layout, both for compat mode, + * and for y2038 compatibility. For 64-bit applications, the two definitions + * are identical, so we keep the traditional version. + */ +#ifdef __SND_STRUCT_TIME64 +#define __snd_pcm_mmap_status64 snd_pcm_mmap_status +#define __snd_pcm_mmap_control64 snd_pcm_mmap_control +#define __snd_pcm_sync_ptr64 snd_pcm_sync_ptr +#ifdef __KERNEL__ +#define __snd_timespec64 __kernel_timespec +#else +#define __snd_timespec64 timespec +#endif +struct __snd_timespec { + __s32 tv_sec; + __s32 tv_nsec; +}; +#else +#define __snd_pcm_mmap_status snd_pcm_mmap_status +#define __snd_pcm_mmap_control snd_pcm_mmap_control +#define __snd_pcm_sync_ptr snd_pcm_sync_ptr +#define __snd_timespec timespec +struct __snd_timespec64 { + __s64 tv_sec; + __s64 tv_nsec; +}; -struct snd_pcm_mmap_status { +#endif + +struct __snd_pcm_mmap_status { snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ int pad1; /* Needed for 64 bit alignment */ snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ - struct timespec tstamp; /* Timestamp */ + struct __snd_timespec tstamp; /* Timestamp */ snd_pcm_state_t suspended_state; /* RO: suspended stream state */ - struct timespec audio_tstamp; /* from sample counter or wall clock */ + struct __snd_timespec audio_tstamp; /* from sample counter or wall clock */ }; -struct snd_pcm_mmap_control { +struct __snd_pcm_mmap_control { snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ }; @@ -492,14 +541,59 @@ struct snd_pcm_mmap_control { #define SNDRV_PCM_SYNC_PTR_APPL (1<<1) /* get appl_ptr from driver (r/w op) */ #define SNDRV_PCM_SYNC_PTR_AVAIL_MIN (1<<2) /* get avail_min from driver */ -struct snd_pcm_sync_ptr { +struct __snd_pcm_sync_ptr { unsigned int flags; union { - struct snd_pcm_mmap_status status; + struct __snd_pcm_mmap_status status; + unsigned char reserved[64]; + } s; + union { + struct __snd_pcm_mmap_control control; + unsigned char reserved[64]; + } c; +}; + +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) +typedef char __pad_before_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; +typedef char __pad_after_uframe[0]; +#endif + +#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN) +typedef char __pad_before_uframe[0]; +typedef char __pad_after_uframe[sizeof(__u64) - sizeof(snd_pcm_uframes_t)]; +#endif + +struct __snd_pcm_mmap_status64 { + snd_pcm_state_t state; /* RO: state - SNDRV_PCM_STATE_XXXX */ + __u32 pad1; /* Needed for 64 bit alignment */ + __pad_before_uframe __pad1; + snd_pcm_uframes_t hw_ptr; /* RO: hw ptr (0...boundary-1) */ + __pad_after_uframe __pad2; + struct __snd_timespec64 tstamp; /* Timestamp */ + snd_pcm_state_t suspended_state;/* RO: suspended stream state */ + __u32 pad3; /* Needed for 64 bit alignment */ + struct __snd_timespec64 audio_tstamp; /* sample counter or wall clock */ +}; + +struct __snd_pcm_mmap_control64 { + __pad_before_uframe __pad1; + snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ + __pad_before_uframe __pad2; + + __pad_before_uframe __pad3; + snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ + __pad_after_uframe __pad4; +}; + +struct __snd_pcm_sync_ptr64 { + __u32 flags; + __u32 pad1; + union { + struct __snd_pcm_mmap_status64 status; unsigned char reserved[64]; } s; union { - struct snd_pcm_mmap_control control; + struct __snd_pcm_mmap_control64 control; unsigned char reserved[64]; } c; }; @@ -584,6 +678,8 @@ enum { #define SNDRV_PCM_IOCTL_STATUS _IOR('A', 0x20, struct snd_pcm_status) #define SNDRV_PCM_IOCTL_DELAY _IOR('A', 0x21, snd_pcm_sframes_t) #define SNDRV_PCM_IOCTL_HWSYNC _IO('A', 0x22) +#define __SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct __snd_pcm_sync_ptr) +#define __SNDRV_PCM_IOCTL_SYNC_PTR64 _IOWR('A', 0x23, struct __snd_pcm_sync_ptr64) #define SNDRV_PCM_IOCTL_SYNC_PTR _IOWR('A', 0x23, struct snd_pcm_sync_ptr) #define SNDRV_PCM_IOCTL_STATUS_EXT _IOWR('A', 0x24, struct snd_pcm_status) #define SNDRV_PCM_IOCTL_CHANNEL_INFO _IOR('A', 0x32, struct snd_pcm_channel_info) @@ -614,7 +710,7 @@ enum { * Raw MIDI section - /dev/snd/midi?? */ -#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 0) +#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 1) enum { SNDRV_RAWMIDI_STREAM_OUTPUT = 0, @@ -648,13 +744,16 @@ struct snd_rawmidi_params { unsigned char reserved[16]; /* reserved for future use */ }; +#ifndef __KERNEL__ struct snd_rawmidi_status { int stream; + __time_pad pad1; struct timespec tstamp; /* Timestamp */ size_t avail; /* available bytes */ size_t xruns; /* count of overruns since last status (in bytes) */ unsigned char reserved[16]; /* reserved for future use */ }; +#endif #define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) #define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) @@ -667,7 +766,7 @@ struct snd_rawmidi_status { * Timer section - /dev/snd/timer */ -#define SNDRV_TIMER_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 6) +#define SNDRV_TIMER_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 7) enum { SNDRV_TIMER_CLASS_NONE = -1, @@ -761,6 +860,7 @@ struct snd_timer_params { unsigned char reserved[60]; /* reserved */ }; +#ifndef __KERNEL__ struct snd_timer_status { struct timespec tstamp; /* Timestamp - last update */ unsigned int resolution; /* current period resolution in ns */ @@ -769,10 +869,11 @@ struct snd_timer_status { unsigned int queue; /* used queue size */ unsigned char reserved[64]; /* reserved */ }; +#endif #define SNDRV_TIMER_IOCTL_PVERSION _IOR('T', 0x00, int) #define SNDRV_TIMER_IOCTL_NEXT_DEVICE _IOWR('T', 0x01, struct snd_timer_id) -#define SNDRV_TIMER_IOCTL_TREAD _IOW('T', 0x02, int) +#define SNDRV_TIMER_IOCTL_TREAD_OLD _IOW('T', 0x02, int) #define SNDRV_TIMER_IOCTL_GINFO _IOWR('T', 0x03, struct snd_timer_ginfo) #define SNDRV_TIMER_IOCTL_GPARAMS _IOW('T', 0x04, struct snd_timer_gparams) #define SNDRV_TIMER_IOCTL_GSTATUS _IOWR('T', 0x05, struct snd_timer_gstatus) @@ -785,6 +886,15 @@ struct snd_timer_status { #define SNDRV_TIMER_IOCTL_STOP _IO('T', 0xa1) #define SNDRV_TIMER_IOCTL_CONTINUE _IO('T', 0xa2) #define SNDRV_TIMER_IOCTL_PAUSE _IO('T', 0xa3) +#define SNDRV_TIMER_IOCTL_TREAD64 _IOW('T', 0xa4, int) + +#if __BITS_PER_LONG == 64 +#define SNDRV_TIMER_IOCTL_TREAD SNDRV_TIMER_IOCTL_TREAD_OLD +#else +#define SNDRV_TIMER_IOCTL_TREAD ((sizeof(__kernel_long_t) >= sizeof(time_t)) ? \ + SNDRV_TIMER_IOCTL_TREAD_OLD : \ + SNDRV_TIMER_IOCTL_TREAD64) +#endif struct snd_timer_read { unsigned int resolution; @@ -810,11 +920,15 @@ enum { SNDRV_TIMER_EVENT_MRESUME = SNDRV_TIMER_EVENT_RESUME + 10, }; +#ifndef __KERNEL__ struct snd_timer_tread { int event; + __time_pad pad1; struct timespec tstamp; unsigned int val; + __time_pad pad2; }; +#endif /**************************************************************************** * * @@ -822,7 +936,7 @@ struct snd_timer_tread { * * ****************************************************************************/ -#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 7) +#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 8) struct snd_ctl_card_info { int card; /* card number */ @@ -860,7 +974,7 @@ typedef int __bitwise snd_ctl_elem_iface_t; #define SNDRV_CTL_ELEM_ACCESS_WRITE (1<<1) #define SNDRV_CTL_ELEM_ACCESS_READWRITE (SNDRV_CTL_ELEM_ACCESS_READ|SNDRV_CTL_ELEM_ACCESS_WRITE) #define SNDRV_CTL_ELEM_ACCESS_VOLATILE (1<<2) /* control value may be changed without a notification */ -#define SNDRV_CTL_ELEM_ACCESS_TIMESTAMP (1<<3) /* when was control changed */ +// (1 << 3) is unused. #define SNDRV_CTL_ELEM_ACCESS_TLV_READ (1<<4) /* TLV read is possible */ #define SNDRV_CTL_ELEM_ACCESS_TLV_WRITE (1<<5) /* TLV write is possible */ #define SNDRV_CTL_ELEM_ACCESS_TLV_READWRITE (SNDRV_CTL_ELEM_ACCESS_TLV_READ|SNDRV_CTL_ELEM_ACCESS_TLV_WRITE) @@ -926,11 +1040,7 @@ struct snd_ctl_elem_info { } enumerated; unsigned char reserved[128]; } value; - union { - unsigned short d[4]; /* dimensions */ - unsigned short *d_ptr; /* indirect - obsoleted */ - } dimen; - unsigned char reserved[64-4*sizeof(unsigned short)]; + unsigned char reserved[64]; }; struct snd_ctl_elem_value { @@ -955,8 +1065,7 @@ struct snd_ctl_elem_value { } bytes; struct snd_aes_iec958 iec958; } value; /* RO */ - struct timespec tstamp; - unsigned char reserved[128-sizeof(struct timespec)]; + unsigned char reserved[128]; }; struct snd_ctl_tlv { -- cgit v1.2.3 From 9e2750fc80b5cc606365201132d49fed00570dd1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 16 Jan 2020 18:47:52 +0000 Subject: drm/i915: Keep track of request among the scheduling lists If we keep track of when the i915_request.sched.link is on the HW runlist, or in the priority queue we can simplify our interactions with the request (such as during rescheduling). This also simplifies the next patch where we introduce a new in-between list, for requests that are ready but neither on the run list or in the queue. v2: Update i915_sched_node.link explanation for current usage where it is a link on both the queue and on the runlists. Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200116184754.2860848-1-chris@chris-wilson.co.uk (cherry picked from commit 672c368f9398042b629740cc9816e8e939eff2db) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 13 ++++++++----- drivers/gpu/drm/i915/i915_request.c | 4 +++- drivers/gpu/drm/i915/i915_request.h | 17 +++++++++++++++++ drivers/gpu/drm/i915/i915_scheduler.c | 22 ++++++++++------------ 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index d879e5e926af..f1f49c4aa7af 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -985,6 +985,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); list_move(&rq->sched.link, pl); + set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + active = rq; } else { struct intel_engine_cs *owner = rq->context->engine; @@ -2431,11 +2433,12 @@ static void execlists_preempt(struct timer_list *timer) } static void queue_request(struct intel_engine_cs *engine, - struct i915_sched_node *node, - int prio) + struct i915_request *rq) { - GEM_BUG_ON(!list_empty(&node->link)); - list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); + GEM_BUG_ON(!list_empty(&rq->sched.link)); + list_add_tail(&rq->sched.link, + i915_sched_lookup_priolist(engine, rq_prio(rq))); + set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); } static void __submit_queue_imm(struct intel_engine_cs *engine) @@ -2471,7 +2474,7 @@ static void execlists_submit_request(struct i915_request *request) /* Will be called from irq-context when using foreign fences. */ spin_lock_irqsave(&engine->active.lock, flags); - queue_request(engine, &request->sched, rq_prio(request)); + queue_request(engine, request); GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); GEM_BUG_ON(list_empty(&request->sched.link)); diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index be185886e4fc..9ed0d3bc7249 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -408,8 +408,10 @@ bool __i915_request_submit(struct i915_request *request) xfer: /* We may be recursing from the signal callback of another i915 fence */ spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); - if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) + if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)) { list_move_tail(&request->sched.link, &engine->active.requests); + clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags); + } if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) && !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) && diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 031433691a06..6f5bbfa95513 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -70,6 +70,18 @@ enum { */ I915_FENCE_FLAG_ACTIVE = DMA_FENCE_FLAG_USER_BITS, + /* + * I915_FENCE_FLAG_PQUEUE - this request is ready for execution + * + * Using the scheduler, when a request is ready for execution it is put + * into the priority queue, and removed from that queue when transferred + * to the HW runlists. We want to track its membership within the + * priority queue so that we can easily check before rescheduling. + * + * See i915_request_in_priority_queue() + */ + I915_FENCE_FLAG_PQUEUE, + /* * I915_FENCE_FLAG_SIGNAL - this request is currently on signal_list * @@ -361,6 +373,11 @@ static inline bool i915_request_is_active(const struct i915_request *rq) return test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags); } +static inline bool i915_request_in_priority_queue(const struct i915_request *rq) +{ + return test_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); +} + /** * Returns true if seq1 is later than seq2. */ diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index bf87c70bfdd9..5d96cfba40f8 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -326,20 +326,18 @@ static void __i915_schedule(struct i915_sched_node *node, node->attr.priority = prio; - if (list_empty(&node->link)) { - /* - * If the request is not in the priolist queue because - * it is not yet runnable, then it doesn't contribute - * to our preemption decisions. On the other hand, - * if the request is on the HW, it too is not in the - * queue; but in that case we may still need to reorder - * the inflight requests. - */ + /* + * Once the request is ready, it will be placed into the + * priority lists and then onto the HW runlist. Before the + * request is ready, it does not contribute to our preemption + * decisions and we can safely ignore it, as it will, and + * any preemption required, be dealt with upon submission. + * See engine->submit_request() + */ + if (list_empty(&node->link)) continue; - } - if (!intel_engine_is_virtual(engine) && - !i915_request_is_active(node_to_request(node))) { + if (i915_request_in_priority_queue(node_to_request(node))) { if (!cache.priolist) cache.priolist = i915_sched_lookup_priolist(engine, -- cgit v1.2.3 From c3f1ed90e6ffbf4e22010522351779f920e53d0d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 16 Jan 2020 18:47:53 +0000 Subject: drm/i915/gt: Allow temporary suspension of inflight requests In order to support out-of-line error capture, we need to remove the active request from HW and put it to one side while a worker compresses and stores all the details associated with that request. (As that compression may take an arbitrary user-controlled amount of time, we want to let the engine continue running on other workloads while the hanging request is dumped.) Not only do we need to remove the active request, but we also have to remove its context and all requests that were dependent on it (both in flight, queued and future submission). Finally once the capture is complete, we need to be able to resubmit the request and its dependents and allow them to execute. v2: Replace stack recursion with a simple list. v3: Check all the parents, not just the first, when searching for a stuck ancestor! References: https://gitlab.freedesktop.org/drm/intel/issues/738 Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200116184754.2860848-2-chris@chris-wilson.co.uk (cherry picked from commit 32ff621fd74496f0c33644125fb69ff175859b1f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 13 +++ drivers/gpu/drm/i915/gt/intel_engine_types.h | 1 + drivers/gpu/drm/i915/gt/intel_lrc.c | 167 ++++++++++++++++++++++++++- drivers/gpu/drm/i915/gt/selftest_lrc.c | 103 +++++++++++++++++ drivers/gpu/drm/i915/i915_request.h | 43 +++++++ 5 files changed, 321 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index f451ef376548..06ff7695fa29 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -671,6 +671,7 @@ void intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass) { INIT_LIST_HEAD(&engine->active.requests); + INIT_LIST_HEAD(&engine->active.hold); spin_lock_init(&engine->active.lock); lockdep_set_subclass(&engine->active.lock, subclass); @@ -1422,6 +1423,17 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq) } } +static unsigned long list_count(struct list_head *list) +{ + struct list_head *pos; + unsigned long count = 0; + + list_for_each(pos, list) + count++; + + return count; +} + void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m, const char *header, ...) @@ -1491,6 +1503,7 @@ void intel_engine_dump(struct intel_engine_cs *engine, hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE); } } + drm_printf(m, "\tOn hold?: %lu\n", list_count(&engine->active.hold)); spin_unlock_irqrestore(&engine->active.lock, flags); drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 00287515e7af..77e68c7643de 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -295,6 +295,7 @@ struct intel_engine_cs { struct { spinlock_t lock; struct list_head requests; + struct list_head hold; /* ready requests, but on hold */ } active; struct llist_head barrier_tasks; diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index f1f49c4aa7af..93b35cb72aa6 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1635,8 +1635,8 @@ static void defer_request(struct i915_request *rq, struct list_head * const pl) !i915_request_completed(rq)); GEM_BUG_ON(i915_request_is_active(w)); - if (list_empty(&w->sched.link)) - continue; /* Not yet submitted; unready */ + if (!i915_request_is_ready(w)) + continue; if (rq_prio(w) < rq_prio(rq)) continue; @@ -2354,6 +2354,145 @@ static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) } } +static void __execlists_hold(struct i915_request *rq) +{ + LIST_HEAD(list); + + do { + struct i915_dependency *p; + + if (i915_request_is_active(rq)) + __i915_request_unsubmit(rq); + + RQ_TRACE(rq, "on hold\n"); + clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + list_move_tail(&rq->sched.link, &rq->engine->active.hold); + i915_request_set_hold(rq); + + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { + struct i915_request *w = + container_of(p->waiter, typeof(*w), sched); + + /* Leave semaphores spinning on the other engines */ + if (w->engine != rq->engine) + continue; + + if (!i915_request_is_ready(w)) + continue; + + if (i915_request_completed(w)) + continue; + + if (i915_request_on_hold(rq)) + continue; + + list_move_tail(&w->sched.link, &list); + } + + rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); + } while (rq); +} + +__maybe_unused +static void execlists_hold(struct intel_engine_cs *engine, + struct i915_request *rq) +{ + spin_lock_irq(&engine->active.lock); + + /* + * Transfer this request onto the hold queue to prevent it + * being resumbitted to HW (and potentially completed) before we have + * released it. Since we may have already submitted following + * requests, we need to remove those as well. + */ + GEM_BUG_ON(i915_request_on_hold(rq)); + GEM_BUG_ON(rq->engine != engine); + __execlists_hold(rq); + + spin_unlock_irq(&engine->active.lock); +} + +static bool hold_request(const struct i915_request *rq) +{ + struct i915_dependency *p; + + /* + * If one of our ancestors is on hold, we must also be on hold, + * otherwise we will bypass it and execute before it. + */ + list_for_each_entry(p, &rq->sched.signalers_list, signal_link) { + const struct i915_request *s = + container_of(p->signaler, typeof(*s), sched); + + if (s->engine != rq->engine) + continue; + + if (i915_request_on_hold(s)) + return true; + } + + return false; +} + +static void __execlists_unhold(struct i915_request *rq) +{ + LIST_HEAD(list); + + do { + struct i915_dependency *p; + + GEM_BUG_ON(!i915_request_on_hold(rq)); + GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); + + i915_request_clear_hold(rq); + list_move_tail(&rq->sched.link, + i915_sched_lookup_priolist(rq->engine, + rq_prio(rq))); + set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + RQ_TRACE(rq, "hold release\n"); + + /* Also release any children on this engine that are ready */ + list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { + struct i915_request *w = + container_of(p->waiter, typeof(*w), sched); + + if (w->engine != rq->engine) + continue; + + if (!i915_request_on_hold(rq)) + continue; + + /* Check that no other parents are also on hold */ + if (hold_request(rq)) + continue; + + list_move_tail(&w->sched.link, &list); + } + + rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); + } while (rq); +} + +__maybe_unused +static void execlists_unhold(struct intel_engine_cs *engine, + struct i915_request *rq) +{ + spin_lock_irq(&engine->active.lock); + + /* + * Move this request back to the priority queue, and all of its + * children and grandchildren that were suspended along with it. + */ + __execlists_unhold(rq); + + if (rq_prio(rq) > engine->execlists.queue_priority_hint) { + engine->execlists.queue_priority_hint = rq_prio(rq); + tasklet_hi_schedule(&engine->execlists.tasklet); + } + + spin_unlock_irq(&engine->active.lock); +} + static noinline void preempt_reset(struct intel_engine_cs *engine) { const unsigned int bit = I915_RESET_ENGINE + engine->id; @@ -2466,6 +2605,13 @@ static void submit_queue(struct intel_engine_cs *engine, __submit_queue_imm(engine); } +static bool ancestor_on_hold(const struct intel_engine_cs *engine, + const struct i915_request *rq) +{ + GEM_BUG_ON(i915_request_on_hold(rq)); + return !list_empty(&engine->active.hold) && hold_request(rq); +} + static void execlists_submit_request(struct i915_request *request) { struct intel_engine_cs *engine = request->engine; @@ -2474,12 +2620,17 @@ static void execlists_submit_request(struct i915_request *request) /* Will be called from irq-context when using foreign fences. */ spin_lock_irqsave(&engine->active.lock, flags); - queue_request(engine, request); + if (unlikely(ancestor_on_hold(engine, request))) { + list_add_tail(&request->sched.link, &engine->active.hold); + i915_request_set_hold(request); + } else { + queue_request(engine, request); - GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); - GEM_BUG_ON(list_empty(&request->sched.link)); + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); + GEM_BUG_ON(list_empty(&request->sched.link)); - submit_queue(engine, request); + submit_queue(engine, request); + } spin_unlock_irqrestore(&engine->active.lock, flags); } @@ -3328,6 +3479,10 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine) i915_priolist_free(p); } + /* On-hold requests will be flushed to timeline upon their release */ + list_for_each_entry(rq, &engine->active.hold, sched.link) + mark_eio(rq); + /* Cancel all attached virtual engines */ while ((rb = rb_first_cached(&execlists->virtual))) { struct virtual_engine *ve = diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 15cda024e3e4..b208c2176bbd 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -285,6 +285,108 @@ static int live_unlite_preempt(void *arg) return live_unlite_restore(arg, I915_USER_PRIORITY(I915_PRIORITY_MAX)); } +static int live_hold_reset(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * In order to support offline error capture for fast preempt reset, + * we need to decouple the guilty request and ensure that it and its + * descendents are not executed while the capture is in progress. + */ + + if (!intel_has_reset_engine(gt)) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + unsigned long heartbeat; + struct i915_request *rq; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + engine_heartbeat_disable(engine, &heartbeat); + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(gt); + err = -ETIME; + goto out; + } + + /* We have our request executing, now remove it and reset */ + + if (test_and_set_bit(I915_RESET_ENGINE + id, + >->reset.flags)) { + spin_unlock_irq(&engine->active.lock); + intel_gt_set_wedged(gt); + err = -EBUSY; + goto out; + } + tasklet_disable(&engine->execlists.tasklet); + + engine->execlists.tasklet.func(engine->execlists.tasklet.data); + GEM_BUG_ON(execlists_active(&engine->execlists) != rq); + + execlists_hold(engine, rq); + GEM_BUG_ON(!i915_request_on_hold(rq)); + + intel_engine_reset(engine, NULL); + GEM_BUG_ON(rq->fence.error != -EIO); + + tasklet_enable(&engine->execlists.tasklet); + clear_and_wake_up_bit(I915_RESET_ENGINE + id, + >->reset.flags); + + /* Check that we do not resubmit the held request */ + i915_request_get(rq); + if (!i915_request_wait(rq, 0, HZ / 5)) { + pr_err("%s: on hold request completed!\n", + engine->name); + i915_request_put(rq); + err = -EIO; + goto out; + } + GEM_BUG_ON(!i915_request_on_hold(rq)); + + /* But is resubmitted on release */ + execlists_unhold(engine, rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + pr_err("%s: held request did not complete!\n", + engine->name); + intel_gt_set_wedged(gt); + err = -ETIME; + } + i915_request_put(rq); + +out: + engine_heartbeat_enable(engine, heartbeat); + intel_context_put(ce); + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + static int emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx) { @@ -3315,6 +3417,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_sanitycheck), SUBTEST(live_unlite_switch), SUBTEST(live_unlite_preempt), + SUBTEST(live_hold_reset), SUBTEST(live_timeslice_preempt), SUBTEST(live_timeslice_queue), SUBTEST(live_busywait_preempt), diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 6f5bbfa95513..f57eadcf3583 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -90,6 +90,13 @@ enum { */ I915_FENCE_FLAG_SIGNAL, + /* + * I915_FENCE_FLAG_HOLD - this request is currently on hold + * + * This request has been suspended, pending an ongoing investigation. + */ + I915_FENCE_FLAG_HOLD, + /* * I915_FENCE_FLAG_NOPREEMPT - this request should not be preempted * @@ -471,6 +478,27 @@ static inline bool i915_request_is_running(const struct i915_request *rq) return __i915_request_has_started(rq); } +/** + * i915_request_is_running - check if the request is ready for execution + * @rq: the request + * + * Upon construction, the request is instructed to wait upon various + * signals before it is ready to be executed by the HW. That is, we do + * not want to start execution and read data before it is written. In practice, + * this is controlled with a mixture of interrupts and semaphores. Once + * the submit fence is completed, the backend scheduler will place the + * request into its queue and from there submit it for execution. So we + * can detect when a request is eligible for execution (and is under control + * of the scheduler) by querying where it is in any of the scheduler's lists. + * + * Returns true if the request is ready for execution (it may be inflight), + * false otherwise. + */ +static inline bool i915_request_is_ready(const struct i915_request *rq) +{ + return !list_empty(&rq->sched.link); +} + static inline bool i915_request_completed(const struct i915_request *rq) { if (i915_request_signaled(rq)) @@ -500,6 +528,21 @@ static inline bool i915_request_has_sentinel(const struct i915_request *rq) return unlikely(test_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags)); } +static inline bool i915_request_on_hold(const struct i915_request *rq) +{ + return unlikely(test_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags)); +} + +static inline void i915_request_set_hold(struct i915_request *rq) +{ + set_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags); +} + +static inline void i915_request_clear_hold(struct i915_request *rq) +{ + clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags); +} + static inline struct intel_timeline * i915_request_timeline(struct i915_request *rq) { -- cgit v1.2.3 From ad18ba7b5eebf58209a898de8519f6bb2280620b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 16 Jan 2020 18:47:54 +0000 Subject: drm/i915/execlists: Offline error capture Currently, we skip error capture upon forced preemption. We apply forced preemption when there is a higher priority request that should be running but is being blocked, and we skip inline error capture so that the preemption request is not further delayed by a user controlled capture -- extending the denial of service. However, preemption reset is also used for heartbeats and regular GPU hangs. By skipping the error capture, we remove the ability to debug GPU hangs. In order to capture the error without delaying the preemption request further, we can do an out-of-line capture by removing the guilty request from the execution queue and scheduling a worker to dump that request. When removing a request, we need to remove the entire context and all descendants from the execution queue, so that they do not jump past. Closes: https://gitlab.freedesktop.org/drm/intel/issues/738 Fixes: 3a7a92aba8fb ("drm/i915/execlists: Force preemption") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200116184754.2860848-3-chris@chris-wilson.co.uk (cherry picked from commit 748317386afb235e11616098d2c7772e49776b58) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 122 +++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 93b35cb72aa6..3a30767ff0c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2393,7 +2393,6 @@ static void __execlists_hold(struct i915_request *rq) } while (rq); } -__maybe_unused static void execlists_hold(struct intel_engine_cs *engine, struct i915_request *rq) { @@ -2473,7 +2472,6 @@ static void __execlists_unhold(struct i915_request *rq) } while (rq); } -__maybe_unused static void execlists_unhold(struct intel_engine_cs *engine, struct i915_request *rq) { @@ -2493,6 +2491,123 @@ static void execlists_unhold(struct intel_engine_cs *engine, spin_unlock_irq(&engine->active.lock); } +struct execlists_capture { + struct work_struct work; + struct i915_request *rq; + struct i915_gpu_coredump *error; +}; + +static void execlists_capture_work(struct work_struct *work) +{ + struct execlists_capture *cap = container_of(work, typeof(*cap), work); + const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; + struct intel_engine_cs *engine = cap->rq->engine; + struct intel_gt_coredump *gt = cap->error->gt; + struct intel_engine_capture_vma *vma; + + /* Compress all the objects attached to the request, slow! */ + vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); + if (vma) { + struct i915_vma_compress *compress = + i915_vma_capture_prepare(gt); + + intel_engine_coredump_add_vma(gt->engine, vma, compress); + i915_vma_capture_finish(gt, compress); + } + + gt->simulated = gt->engine->simulated; + cap->error->simulated = gt->simulated; + + /* Publish the error state, and announce it to the world */ + i915_error_state_store(cap->error); + i915_gpu_coredump_put(cap->error); + + /* Return this request and all that depend upon it for signaling */ + execlists_unhold(engine, cap->rq); + + kfree(cap); +} + +static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) +{ + const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct execlists_capture *cap; + + cap = kmalloc(sizeof(*cap), gfp); + if (!cap) + return NULL; + + cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); + if (!cap->error) + goto err_cap; + + cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); + if (!cap->error->gt) + goto err_gpu; + + cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); + if (!cap->error->gt->engine) + goto err_gt; + + return cap; + +err_gt: + kfree(cap->error->gt); +err_gpu: + kfree(cap->error); +err_cap: + kfree(cap); + return NULL; +} + +static void execlists_capture(struct intel_engine_cs *engine) +{ + struct execlists_capture *cap; + + if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) + return; + + /* + * We need to _quickly_ capture the engine state before we reset. + * We are inside an atomic section (softirq) here and we are delaying + * the forced preemption event. + */ + cap = capture_regs(engine); + if (!cap) + return; + + cap->rq = execlists_active(&engine->execlists); + GEM_BUG_ON(!cap->rq); + + cap->rq = active_request(cap->rq->context->timeline, cap->rq); + GEM_BUG_ON(!cap->rq); + + /* + * Remove the request from the execlists queue, and take ownership + * of the request. We pass it to our worker who will _slowly_ compress + * all the pages the _user_ requested for debugging their batch, after + * which we return it to the queue for signaling. + * + * By removing them from the execlists queue, we also remove the + * requests from being processed by __unwind_incomplete_requests() + * during the intel_engine_reset(), and so they will *not* be replayed + * afterwards. + * + * Note that because we have not yet reset the engine at this point, + * it is possible for the request that we have identified as being + * guilty, did in fact complete and we will then hit an arbitration + * point allowing the outstanding preemption to succeed. The likelihood + * of that is very low (as capturing of the engine registers should be + * fast enough to run inside an irq-off atomic section!), so we will + * simply hold that request accountable for being non-preemptible + * long enough to force the reset. + */ + execlists_hold(engine, cap->rq); + + INIT_WORK(&cap->work, execlists_capture_work); + schedule_work(&cap->work); +} + static noinline void preempt_reset(struct intel_engine_cs *engine) { const unsigned int bit = I915_RESET_ENGINE + engine->id; @@ -2510,6 +2625,9 @@ static noinline void preempt_reset(struct intel_engine_cs *engine) ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n", READ_ONCE(engine->props.preempt_timeout_ms), jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); + + ring_set_paused(engine, 1); /* Freeze the current request in place */ + execlists_capture(engine); intel_engine_reset(engine, "preemption time out"); tasklet_enable(&engine->execlists.tasklet); -- cgit v1.2.3 From 317e0395cc230736e474e499e01992bcdace5a73 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 22 Jan 2020 14:02:41 +0000 Subject: drm/i915/execlists: Take a reference while capturing the guilty request Thanks to preempt-to-busy, we leave the request on the HW as we submit the preemption request. This means that the request may complete at any moment as we process HW events, and in particular the request may be retired as we are planning to capture it for a preemption timeout. Be more careful while obtaining the request to capture after a preemption timeout, and check to see if it completed before we were able to put it on the on-hold list. If we do see it did complete just before we capture the request, proclaim the preemption-timeout a false positive and pardon the reset as we should hit an arbitration point momentarily and so be able to process the preemption. Note that even after we move the request to be on hold it may be retired (as the reset to stop the HW comes after), so we do require to hold our own reference as we work on the request for capture (and all of the peeking at state within the request needs to be carefully protected). Fixes: c3f1ed90e6ff ("drm/i915/gt: Allow temporary suspension of inflight requests") Closes: https://gitlab.freedesktop.org/drm/intel/issues/997 Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200122140243.495621-1-chris@chris-wilson.co.uk (cherry picked from commit 4ba5c086a1d8e38d6927967ae1a3271a6ab7a927) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 39 +++++++++++++++++++++++++++------- drivers/gpu/drm/i915/gt/selftest_lrc.c | 3 +-- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 3a30767ff0c4..4810c62144ae 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2393,11 +2393,16 @@ static void __execlists_hold(struct i915_request *rq) } while (rq); } -static void execlists_hold(struct intel_engine_cs *engine, +static bool execlists_hold(struct intel_engine_cs *engine, struct i915_request *rq) { spin_lock_irq(&engine->active.lock); + if (i915_request_completed(rq)) { /* too late! */ + rq = NULL; + goto unlock; + } + /* * Transfer this request onto the hold queue to prevent it * being resumbitted to HW (and potentially completed) before we have @@ -2408,7 +2413,9 @@ static void execlists_hold(struct intel_engine_cs *engine, GEM_BUG_ON(rq->engine != engine); __execlists_hold(rq); +unlock: spin_unlock_irq(&engine->active.lock); + return rq; } static bool hold_request(const struct i915_request *rq) @@ -2524,6 +2531,7 @@ static void execlists_capture_work(struct work_struct *work) /* Return this request and all that depend upon it for signaling */ execlists_unhold(engine, cap->rq); + i915_request_put(cap->rq); kfree(cap); } @@ -2560,12 +2568,12 @@ err_cap: return NULL; } -static void execlists_capture(struct intel_engine_cs *engine) +static bool execlists_capture(struct intel_engine_cs *engine) { struct execlists_capture *cap; if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) - return; + return true; /* * We need to _quickly_ capture the engine state before we reset. @@ -2574,13 +2582,17 @@ static void execlists_capture(struct intel_engine_cs *engine) */ cap = capture_regs(engine); if (!cap) - return; + return true; cap->rq = execlists_active(&engine->execlists); GEM_BUG_ON(!cap->rq); + rcu_read_lock(); cap->rq = active_request(cap->rq->context->timeline, cap->rq); - GEM_BUG_ON(!cap->rq); + cap->rq = i915_request_get_rcu(cap->rq); + rcu_read_unlock(); + if (!cap->rq) + goto err_free; /* * Remove the request from the execlists queue, and take ownership @@ -2602,10 +2614,19 @@ static void execlists_capture(struct intel_engine_cs *engine) * simply hold that request accountable for being non-preemptible * long enough to force the reset. */ - execlists_hold(engine, cap->rq); + if (!execlists_hold(engine, cap->rq)) + goto err_rq; INIT_WORK(&cap->work, execlists_capture_work); schedule_work(&cap->work); + return true; + +err_rq: + i915_request_put(cap->rq); +err_free: + i915_gpu_coredump_put(cap->error); + kfree(cap); + return false; } static noinline void preempt_reset(struct intel_engine_cs *engine) @@ -2627,8 +2648,10 @@ static noinline void preempt_reset(struct intel_engine_cs *engine) jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); ring_set_paused(engine, 1); /* Freeze the current request in place */ - execlists_capture(engine); - intel_engine_reset(engine, "preemption time out"); + if (execlists_capture(engine)) + intel_engine_reset(engine, "preemption time out"); + else + ring_set_paused(engine, 0); tasklet_enable(&engine->execlists.tasklet); clear_and_wake_up_bit(bit, lock); diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index b208c2176bbd..e3a408baad0f 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -335,7 +335,6 @@ static int live_hold_reset(void *arg) if (test_and_set_bit(I915_RESET_ENGINE + id, >->reset.flags)) { - spin_unlock_irq(&engine->active.lock); intel_gt_set_wedged(gt); err = -EBUSY; goto out; @@ -345,6 +344,7 @@ static int live_hold_reset(void *arg) engine->execlists.tasklet.func(engine->execlists.tasklet.data); GEM_BUG_ON(execlists_active(&engine->execlists) != rq); + i915_request_get(rq); execlists_hold(engine, rq); GEM_BUG_ON(!i915_request_on_hold(rq)); @@ -356,7 +356,6 @@ static int live_hold_reset(void *arg) >->reset.flags); /* Check that we do not resubmit the held request */ - i915_request_get(rq); if (!i915_request_wait(rq, 0, HZ / 5)) { pr_err("%s: on hold request completed!\n", engine->name); -- cgit v1.2.3 From a2f90f4ff3746c92896e2b7af8763d6fe5206dbc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 22 Jan 2020 14:02:42 +0000 Subject: drm/i915/execlists: Reclaim the hanging virtual request If we encounter a hang on a virtual engine, as we process the hang the request may already have been moved back to the virtual engine (we are processing the hang on the physical engine). We need to reclaim the request from the virtual engine so that the locking is consistent and local to the real engine on which we will hold the request for error state capturing. v2: Pull the reclamation into execlists_hold() and assert that cannot be called from outside of the reset (i.e. with the tasklet disabled). v3: Added selftest v4: Drop the reference owned by the virtual engine Fixes: ad18ba7b5eeb ("drm/i915/execlists: Offline error capture") Testcase: igt/gem_exec_balancer/hang Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200122140243.495621-2-chris@chris-wilson.co.uk (cherry picked from commit 989df3a7bd2abe566521e61d1aebf603eb013b7f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 29 ++++++ drivers/gpu/drm/i915/gt/selftest_lrc.c | 156 +++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 4810c62144ae..a13a8c4b65ab 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2403,6 +2403,35 @@ static bool execlists_hold(struct intel_engine_cs *engine, goto unlock; } + if (rq->engine != engine) { /* preempted virtual engine */ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + + /* + * intel_context_inflight() is only protected by virtue + * of process_csb() being called only by the tasklet (or + * directly from inside reset while the tasklet is suspended). + * Assert that neither of those are allowed to run while we + * poke at the request queues. + */ + GEM_BUG_ON(!reset_in_progress(&engine->execlists)); + + /* + * An unsubmitted request along a virtual engine will + * remain on the active (this) engine until we are able + * to process the context switch away (and so mark the + * context as no longer in flight). That cannot have happened + * yet, otherwise we would not be hanging! + */ + spin_lock(&ve->base.active.lock); + GEM_BUG_ON(intel_context_inflight(rq->context) != engine); + GEM_BUG_ON(ve->request != rq); + ve->request = NULL; + spin_unlock(&ve->base.active.lock); + i915_request_put(rq); + + rq->engine = engine; + } + /* * Transfer this request onto the hold queue to prevent it * being resumbitted to HW (and potentially completed) before we have diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index e3a408baad0f..65718ca2326e 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -3410,6 +3410,161 @@ static int live_virtual_bond(void *arg) return 0; } +static int reset_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + struct intel_engine_cs *engine; + struct intel_context *ve; + unsigned long *heartbeat; + struct igt_spinner spin; + struct i915_request *rq; + unsigned int n; + int err = 0; + + /* + * In order to support offline error capture for fast preempt reset, + * we need to decouple the guilty request and ensure that it and its + * descendents are not executed while the capture is in progress. + */ + + heartbeat = kmalloc_array(nsibling, sizeof(*heartbeat), GFP_KERNEL); + if (!heartbeat) + return -ENOMEM; + + if (igt_spinner_init(&spin, gt)) { + err = -ENOMEM; + goto out_free; + } + + ve = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ve)) { + err = PTR_ERR(ve); + goto out_spin; + } + + for (n = 0; n < nsibling; n++) + engine_heartbeat_disable(siblings[n], &heartbeat[n]); + + rq = igt_spinner_create_request(&spin, ve, MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_heartbeat; + } + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(gt); + err = -ETIME; + goto out_heartbeat; + } + + engine = rq->engine; + GEM_BUG_ON(engine == ve->engine); + + /* Take ownership of the reset and tasklet */ + if (test_and_set_bit(I915_RESET_ENGINE + engine->id, + >->reset.flags)) { + intel_gt_set_wedged(gt); + err = -EBUSY; + goto out_heartbeat; + } + tasklet_disable(&engine->execlists.tasklet); + + engine->execlists.tasklet.func(engine->execlists.tasklet.data); + GEM_BUG_ON(execlists_active(&engine->execlists) != rq); + + /* Fake a preemption event; failed of course */ + spin_lock_irq(&engine->active.lock); + __unwind_incomplete_requests(engine); + spin_unlock_irq(&engine->active.lock); + GEM_BUG_ON(rq->engine != ve->engine); + + /* Reset the engine while keeping our active request on hold */ + execlists_hold(engine, rq); + GEM_BUG_ON(!i915_request_on_hold(rq)); + + intel_engine_reset(engine, NULL); + GEM_BUG_ON(rq->fence.error != -EIO); + + /* Release our grasp on the engine, letting CS flow again */ + tasklet_enable(&engine->execlists.tasklet); + clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, >->reset.flags); + + /* Check that we do not resubmit the held request */ + i915_request_get(rq); + if (!i915_request_wait(rq, 0, HZ / 5)) { + pr_err("%s: on hold request completed!\n", + engine->name); + intel_gt_set_wedged(gt); + err = -EIO; + goto out_rq; + } + GEM_BUG_ON(!i915_request_on_hold(rq)); + + /* But is resubmitted on release */ + execlists_unhold(engine, rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + pr_err("%s: held request did not complete!\n", + engine->name); + intel_gt_set_wedged(gt); + err = -ETIME; + } + +out_rq: + i915_request_put(rq); +out_heartbeat: + for (n = 0; n < nsibling; n++) + engine_heartbeat_enable(siblings[n], heartbeat[n]); + + intel_context_put(ve); +out_spin: + igt_spinner_fini(&spin); +out_free: + kfree(heartbeat); + return err; +} + +static int live_virtual_reset(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class, inst; + + /* + * Check that we handle a reset event within a virtual engine. + * Only the physical engine is reset, but we have to check the flow + * of the virtual requests around the reset, and make sure it is not + * forgotten. + */ + + if (USES_GUC_SUBMISSION(gt->i915)) + return 0; + + if (!intel_has_reset_engine(gt)) + return 0; + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + int nsibling, err; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!gt->engine_class[class][inst]) + continue; + + siblings[nsibling++] = gt->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + err = reset_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + } + + return 0; +} + int intel_execlists_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { @@ -3435,6 +3590,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_virtual_mask), SUBTEST(live_virtual_preserved), SUBTEST(live_virtual_bond), + SUBTEST(live_virtual_reset), }; if (!HAS_EXECLISTS(i915)) -- cgit v1.2.3 From 2aaaa5ee1c3d624a5bcad4ee25f954559c565bc2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 22 Jan 2020 14:02:43 +0000 Subject: drm/i915: Mark the removal of the i915_request from the sched.link Keep the rq->fence.flags consistent with the status of the rq->sched.link, and clear the associated bits when decoupling the link on retirement (as we may wish to inspect those flags independent of other state). Fixes: c3f1ed90e6ff ("drm/i915/gt: Allow temporary suspension of inflight requests") References: https://gitlab.freedesktop.org/drm/intel/issues/997 Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200122140243.495621-3-chris@chris-wilson.co.uk (cherry picked from commit b4a9a149f91ea345da76bcfe3f8a39715ac346a6) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_request.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 9ed0d3bc7249..78a5f5d3c070 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -221,6 +221,8 @@ static void remove_from_engine(struct i915_request *rq) locked = engine; } list_del_init(&rq->sched.link); + clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags); spin_unlock_irq(&locked->active.lock); } -- cgit v1.2.3 From 7636b586392fc57b84c089147b5e22e52d9650d5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 12:29:41 -0300 Subject: tools headers x86: Sync disabled-features.h To silence the following tools/perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/disabled-features.h' differs from latest version at 'arch/x86/include/asm/disabled-features.h' diff -u tools/arch/x86/include/asm/disabled-features.h arch/x86/include/asm/disabled-features.h Picking up the changes in: 45fc24e89b7c ("x86/mpx: remove MPX from arch/x86") that didn't entail any functionality change in the tooling side. Cc: Adrian Hunter Cc: Dave Hansen Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/disabled-features.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h index 8e1d0bb46361..4ea8584682f9 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -10,12 +10,6 @@ * cpu_feature_enabled(). */ -#ifdef CONFIG_X86_INTEL_MPX -# define DISABLE_MPX 0 -#else -# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) -#endif - #ifdef CONFIG_X86_SMAP # define DISABLE_SMAP 0 #else @@ -74,7 +68,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_MPX|DISABLE_SMAP) +#define DISABLED_MASK9 (DISABLE_SMAP) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 -- cgit v1.2.3 From 71dd65289793df31e9f10c6b112e5e32dfd89c1d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 12:33:34 -0300 Subject: tools arch x86: Sync asm/cpufeatures.h with the kernel sources To pick up the changes from: 85c17291e2eb ("x86/cpufeatures: Add flag to track whether MSR IA32_FEAT_CTL is configured") f444a5ff95dc ("x86/cpufeatures: Add support for fast short REP; MOVSB") These don't cause any changes in tooling, just silences this perf build warning: Warning: Kernel ABI header at 'tools/arch/x86/include/asm/cpufeatures.h' differs from latest version at 'arch/x86/include/asm/cpufeatures.h' diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Cc: Adrian Hunter Cc: Borislav Petkov Cc: Jiri Olsa Cc: Namhyung Kim Cc: Sean Christopherson Cc: Tony Luck Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index e9b62498fe75..f3327cb56edf 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -220,6 +220,7 @@ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ +#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -357,6 +358,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */ #define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ -- cgit v1.2.3 From 391df72fbd144878e2f905d86f1e9a85a059216a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 12:41:20 -0300 Subject: tools headers kvm: Sync kvm headers with the kernel sources To pick up the changes from: 290a6bb06de9 ("arm64: KVM: Add UAPI notes for swapped registers") No tools changes are caused by this. This addresses these tools/perf build warnings: Cc: Adrian Hunter Cc: Andrew Jones Cc: Jiri Olsa Cc: Marc Zyngier Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/kvm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 820e5751ada7..ba85bb23f060 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -220,10 +220,18 @@ struct kvm_vcpu_events { #define KVM_REG_ARM_PTIMER_CVAL ARM64_SYS_REG(3, 3, 14, 2, 2) #define KVM_REG_ARM_PTIMER_CNT ARM64_SYS_REG(3, 3, 14, 0, 1) -/* EL0 Virtual Timer Registers */ +/* + * EL0 Virtual Timer Registers + * + * WARNING: + * KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT are not defined + * with the appropriate register encodings. Their values have been + * accidentally swapped. As this is set API, the definitions here + * must be used, rather than ones derived from the encodings. + */ #define KVM_REG_ARM_TIMER_CTL ARM64_SYS_REG(3, 3, 14, 3, 1) -#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) #define KVM_REG_ARM_TIMER_CVAL ARM64_SYS_REG(3, 3, 14, 0, 2) +#define KVM_REG_ARM_TIMER_CNT ARM64_SYS_REG(3, 3, 14, 3, 2) /* KVM-as-firmware specific pseudo-registers */ #define KVM_REG_ARM_FW (0x0014 << KVM_REG_ARM_COPROC_SHIFT) -- cgit v1.2.3 From 2a8d017d46a3f48dad3319e569cd0aee61bab8fc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Feb 2020 12:45:24 -0300 Subject: tools headers kvm: Sync linux/kvm.h with the kernel sources To pick up the changes from: 7de3f1423ff9 ("KVM: s390: Add new reset vcpu API") So far we're ignoring those arch specific ioctls, we need to revisit this at some time to have arch specific tables, etc: $ grep S390 tools/perf/trace/beauty/kvm_ioctl.sh egrep -v " ((ARM|PPC|S390)_|[GS]ET_(DEBUGREGS|PIT2|XSAVE|TSC_KHZ)|CREATE_SPAPR_TCE_64)" | \ $ This addresses these tools/perf build warnings: Warning: Kernel ABI header at 'tools/arch/arm/include/uapi/asm/kvm.h' differs from latest version at 'arch/arm/include/uapi/asm/kvm.h' diff -u tools/arch/arm/include/uapi/asm/kvm.h arch/arm/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Christian Borntraeger Cc: Janosch Frank Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index f0a16b4adbbd..4b95f9a31a2f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1009,6 +1009,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176 #define KVM_CAP_ARM_NISV_TO_USER 177 #define KVM_CAP_ARM_INJECT_EXT_DABT 178 +#define KVM_CAP_S390_VCPU_RESETS 179 #ifdef KVM_CAP_IRQ_ROUTING @@ -1473,6 +1474,10 @@ struct kvm_enc_region { /* Available with KVM_CAP_ARM_SVE */ #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) +/* Available with KVM_CAP_S390_VCPU_RESETS */ +#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) +#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- cgit v1.2.3 From f311ade3a7adf31658ed882aaab9f9879fdccef7 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Sat, 1 Feb 2020 20:38:38 +0000 Subject: btrfs: ref-verify: fix memory leaks In btrfs_ref_tree_mod(), 'ref' and 'ra' are allocated through kzalloc() and kmalloc(), respectively. In the following code, if an error occurs, the execution will be redirected to 'out' or 'out_unlock' and the function will be exited. However, on some of the paths, 'ref' and 'ra' are not deallocated, leading to memory leaks. For example, if 'action' is BTRFS_ADD_DELAYED_EXTENT, add_block_entry() will be invoked. If the return value indicates an error, the execution will be redirected to 'out'. But, 'ref' is not deallocated on this path, causing a memory leak. To fix the above issues, deallocate both 'ref' and 'ra' before exiting from the function when an error is encountered. CC: stable@vger.kernel.org # 4.15+ Signed-off-by: Wenwen Wang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index b57f3618e58e..454a1015d026 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -744,6 +744,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, */ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { + kfree(ref); kfree(ra); ret = PTR_ERR(be); goto out; @@ -757,6 +758,8 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "re-allocated a block that still has references to it!"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); goto out_unlock; } @@ -819,6 +822,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a existing root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } @@ -834,6 +838,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "attempting to add another ref for an existing ref on a tree block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } -- cgit v1.2.3 From ac05ca913e9f3871126d61da275bfe8516ff01ca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 31 Jan 2020 14:06:07 +0000 Subject: Btrfs: fix race between using extent maps and merging them We have a few cases where we allow an extent map that is in an extent map tree to be merged with other extents in the tree. Such cases include the unpinning of an extent after the respective ordered extent completed or after logging an extent during a fast fsync. This can lead to subtle and dangerous problems because when doing the merge some other task might be using the same extent map and as consequence see an inconsistent state of the extent map - for example sees the new length but has seen the old start offset. With luck this triggers a BUG_ON(), and not some silent bug, such as the following one in __do_readpage(): $ cat -n fs/btrfs/extent_io.c 3061 static int __do_readpage(struct extent_io_tree *tree, 3062 struct page *page, (...) 3127 em = __get_extent_map(inode, page, pg_offset, cur, 3128 end - cur + 1, get_extent, em_cached); 3129 if (IS_ERR_OR_NULL(em)) { 3130 SetPageError(page); 3131 unlock_extent(tree, cur, end); 3132 break; 3133 } 3134 extent_offset = cur - em->start; 3135 BUG_ON(extent_map_end(em) <= cur); (...) Consider the following example scenario, where we end up hitting the BUG_ON() in __do_readpage(). We have an inode with a size of 8KiB and 2 extent maps: extent A: file offset 0, length 4KiB, disk_bytenr = X, persisted on disk by a previous transaction extent B: file offset 4KiB, length 4KiB, disk_bytenr = X + 4KiB, not yet persisted but writeback started for it already. The extent map is pinned since there's writeback and an ordered extent in progress, so it can not be merged with extent map A yet The following sequence of steps leads to the BUG_ON(): 1) The ordered extent for extent B completes, the respective page gets its writeback bit cleared and the extent map is unpinned, at that point it is not yet merged with extent map A because it's in the list of modified extents; 2) Due to memory pressure, or some other reason, the MM subsystem releases the page corresponding to extent B - btrfs_releasepage() is called and returns 1, meaning the page can be released as it's not dirty, not under writeback anymore and the extent range is not locked in the inode's iotree. However the extent map is not released, either because we are not in a context that allows memory allocations to block or because the inode's size is smaller than 16MiB - in this case our inode has a size of 8KiB; 3) Task B needs to read extent B and ends up __do_readpage() through the btrfs_readpage() callback. At __do_readpage() it gets a reference to extent map B; 4) Task A, doing a fast fsync, calls clear_em_loggin() against extent map B while holding the write lock on the inode's extent map tree - this results in try_merge_map() being called and since it's possible to merge extent map B with extent map A now (the extent map B was removed from the list of modified extents), the merging begins - it sets extent map B's start offset to 0 (was 4KiB), but before it increments the map's length to 8KiB (4kb + 4KiB), task A is at: BUG_ON(extent_map_end(em) <= cur); The call to extent_map_end() sees the extent map has a start of 0 and a length still at 4KiB, so it returns 4KiB and 'cur' is 4KiB, so the BUG_ON() is triggered. So it's dangerous to modify an extent map that is in the tree, because some other task might have got a reference to it before and still using it, and needs to see a consistent map while using it. Generally this is very rare since most paths that lookup and use extent maps also have the file range locked in the inode's iotree. The fsync path is pretty much the only exception where we don't do it to avoid serialization with concurrent reads. Fix this by not allowing an extent map do be merged if if it's being used by tasks other then the one attempting to merge the extent map (when the reference count of the extent map is greater than 2). Reported-by: ryusuke1925 Reported-by: Koki Mitani Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206211 CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6f417ff68980..bd6229fb2b6f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -237,6 +237,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) struct extent_map *merge = NULL; struct rb_node *rb; + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. + */ + if (refcount_read(&em->refs) > 2) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) -- cgit v1.2.3 From e8294f2f6aa6208ed0923aa6d70cea3be178309a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 17:12:16 +0100 Subject: btrfs: print message when tree-log replay starts There's no logged information about tree-log replay although this is something that points to previous unclean unmount. Other filesystems report that as well. Suggested-by: Chris Murphy CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7fa9bb79ad08..89422aa8e9d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3164,6 +3164,7 @@ int __cold open_ctree(struct super_block *sb, /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; -- cgit v1.2.3 From 10a3a3edc5b89a8cd095bc63495fb1e0f42047d9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2020 17:12:28 +0100 Subject: btrfs: log message when rw remount is attempted with unclean tree-log A remount to a read-write filesystem is not safe when there's tree-log to be replayed. Files that could be opened until now might be affected by the changes in the tree-log. A regular mount is needed to replay the log so the filesystem presents the consistent view with the pending changes included. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0616a5434793..67c63858812a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1834,6 +1834,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); ret = -EINVAL; goto restore; } -- cgit v1.2.3 From 28553fa992cb28be6a65566681aac6cafabb4f2d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 7 Feb 2020 12:23:09 +0000 Subject: Btrfs: fix race between shrinking truncate and fiemap When there is a fiemap executing in parallel with a shrinking truncate we can end up in a situation where we have extent maps for which we no longer have corresponding file extent items. This is generally harmless and at the moment the only consequences are missing file extent items representing holes after we expand the file size again after the truncate operation removed the prealloc extent items, and stale information for future fiemap calls (reporting extents that no longer exist or may have been reallocated to other files for example). Consider the following example: 1) Our inode has a size of 128KiB, one 128KiB extent at file offset 0 and a 1MiB prealloc extent at file offset 128KiB; 2) Task A starts doing a shrinking truncate of our inode to reduce it to a size of 64KiB. Before it searches the subvolume tree for file extent items to delete, it drops all the extent maps in the range from 64KiB to (u64)-1 by calling btrfs_drop_extent_cache(); 3) Task B starts doing a fiemap against our inode. When looking up for the inode's extent maps in the range from 128KiB to (u64)-1, it doesn't find any in the inode's extent map tree, since they were removed by task A. Because it didn't find any in the extent map tree, it scans the inode's subvolume tree for file extent items, and it finds the 1MiB prealloc extent at file offset 128KiB, then it creates an extent map based on that file extent item and adds it to inode's extent map tree (this ends up being done by btrfs_get_extent() <- btrfs_get_extent_fiemap() <- get_extent_skip_holes()); 4) Task A then drops the prealloc extent at file offset 128KiB and shrinks the 128KiB extent file offset 0 to a length of 64KiB. The truncation operation finishes and we end up with an extent map representing a 1MiB prealloc extent at file offset 128KiB, despite we don't have any more that extent; After this the two types of problems we have are: 1) Future calls to fiemap always report that a 1MiB prealloc extent exists at file offset 128KiB. This is stale information, no longer correct; 2) If the size of the file is increased, by a truncate operation that increases the file size or by a write into a file offset > 64KiB for example, we end up not inserting file extent items to represent holes for any range between 128KiB and 128KiB + 1MiB, since the hole expansion function, btrfs_cont_expand() will skip hole insertion for any range for which an extent map exists that represents a prealloc extent. This causes fsck to complain about missing file extent items when not using the NO_HOLES feature. The second issue could be often triggered by test case generic/561 from fstests, which runs fsstress and duperemove in parallel, and duperemove does frequent fiemap calls. Essentially the problems happens because fiemap does not acquire the inode's lock while truncate does, and fiemap locks the file range in the inode's iotree while truncate does not. So fix the issue by making btrfs_truncate_inode_items() lock the file range from the new file size to (u64)-1, so that it serializes with fiemap. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b3ec93ff911..7d26b4bfb2c6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4085,6 +4085,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + struct extent_state *cached_state = NULL; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4101,6 +4103,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = READA_BACK; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + /* * We want to drop from the next block forward in case this new size is * not block aligned since we will be keeping the last block of the @@ -4367,6 +4372,9 @@ out: btrfs_ordered_update_i_size(inode, last_size, NULL); } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 2fe77100553f3ac6b2105db8ae14b5ea4b43c108 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 11 Feb 2020 09:59:10 -0800 Subject: selftests/bpf: Fix error checking on reading the tcp_fastopen sysctl There is a typo in checking the "saved_tcp_fo" and instead "saved_tcp_syncookie" is checked again. This patch fixes it and also breaks them into separate if statements such that the test will abort asap. Reported-by: David Binderman Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200211175910.3235321-1-kafai@fb.com --- tools/testing/selftests/bpf/prog_tests/select_reuseport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index 098bcae5f827..b577666d028e 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -822,8 +822,10 @@ void test_select_reuseport(void) goto out; saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL); + if (saved_tcp_fo < 0) + goto out; saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL); - if (saved_tcp_syncookie < 0 || saved_tcp_syncookie < 0) + if (saved_tcp_syncookie < 0) goto out; if (enable_fastopen()) -- cgit v1.2.3 From eecd618b45166fdddea3b6366b18479c2be0e11c Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 12 Feb 2020 10:32:08 +0000 Subject: selftests/bpf: Mark SYN cookie test skipped for UDP sockets SYN cookie test with reuseport BPF doesn't make sense for UDP sockets. We don't run it but the test_progs test runner doesn't know about it. Mark the test as skipped so the test_progs can report correctly how many tests were skipped. Fixes: 7ee0d4e97b88 ("selftests/bpf: Switch reuseport tests for test_progs framework") Reported-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200212103208.438419-1-jakub@cloudflare.com --- tools/testing/selftests/bpf/prog_tests/select_reuseport.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c index b577666d028e..0800036ed654 100644 --- a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c +++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c @@ -506,8 +506,10 @@ static void test_syncookie(int type, sa_family_t family) .pass_on_failure = 0, }; - if (type != SOCK_STREAM) + if (type != SOCK_STREAM) { + test__skip(); return; + } /* * +1 for TCP-SYN and -- cgit v1.2.3 From d91771848f0ae2eec250a9345926a1a3558fa943 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 12 Feb 2020 11:09:34 +0100 Subject: arm64: time: Replace by The arm64 time code is not a clock provider, and just needs to call of_clk_init(). Hence it can include instead of . Reviewed-by: Stephen Boyd Signed-off-by: Geert Uytterhoeven Signed-off-by: Will Deacon --- arch/arm64/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 73f06d4b3aae..eebbc8d7123e 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From a013d141eceee0f7747385e900da2858141aa0f3 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 12 Feb 2020 17:28:10 +0800 Subject: btrfs: sysfs, add UUID/devinfo kobject Create directory /sys/fs/btrfs/UUID/devinfo to hold devices directories by the id (unlike /devices). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 15 +++++++++++++++ fs/btrfs/volumes.h | 1 + 2 files changed, 16 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7436422194da..6bac61c42c05 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -901,6 +901,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { + if (fs_devs->devinfo_kobj) { + kobject_del(fs_devs->devinfo_kobj); + kobject_put(fs_devs->devinfo_kobj); + fs_devs->devinfo_kobj = NULL; + } + if (fs_devs->devices_kobj) { kobject_del(fs_devs->devices_kobj); kobject_put(fs_devs->devices_kobj); @@ -1369,6 +1375,15 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) return -ENOMEM; } + fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", + &fs_devs->fsid_kobj); + if (!fs_devs->devinfo_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs devinfo kobject"); + btrfs_sysfs_remove_fsid(fs_devs); + return -ENOMEM; + } + return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 690d4f5a0653..309cda477589 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -258,6 +258,7 @@ struct btrfs_fs_devices { /* sysfs kobjects */ struct kobject fsid_kobj; struct kobject *devices_kobj; + struct kobject *devinfo_kobj; struct completion kobj_unregister; }; -- cgit v1.2.3 From 1b9867eb6120db85f8dca8ff42789d9ec9ee16a5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 12 Feb 2020 17:28:11 +0800 Subject: btrfs: sysfs, move device id directories to UUID/devinfo Originally it was planned to create device id directories under UUID/devinfo, but it got under UUID/devices by mistake. We really want it under definfo so the bare device node names are not mixed with device ids and are easy to enumerate. Fixes: 668e48af7a94 ("btrfs: sysfs, add devid/dev_state kobject and device attributes") Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6bac61c42c05..3c10e78924d0 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1295,7 +1295,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, init_completion(&dev->kobj_unregister); error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devices_kobj, "%llu", + fs_devices->devinfo_kobj, "%llu", dev->devid); if (error) { kobject_put(&dev->devid_kobj); -- cgit v1.2.3 From b50f4f940b736b63df1d8a0ddcd28f0a580233eb Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 10 Feb 2020 11:04:55 +0100 Subject: dt-bindings: display: sunxi: Fix compatible Commit f5a98bfe7b37 ("dt-bindings: display: Convert Allwinner display pipeline to schemas") introduced a YAML schema for the Allwinner TCON DT binding, but the H6 TCON-TV compatible was mistakenly set to fallback on the A83t's, while the initial documentation and the DT are using R40's. Fix that. Fixes: f5a98bfe7b37 ("dt-bindings: display: Convert Allwinner display pipeline to schemas") Signed-off-by: Maxime Ripard Acked-by: Rob Herring Link: https://patchwork.freedesktop.org/patch/msgid/20200210100455.78695-1-maxime@cerno.tech --- .../devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml index 86ad617d2327..5ff9cf26ca38 100644 --- a/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml +++ b/Documentation/devicetree/bindings/display/allwinner,sun4i-a10-tcon.yaml @@ -43,9 +43,13 @@ properties: - enum: - allwinner,sun8i-h3-tcon-tv - allwinner,sun50i-a64-tcon-tv - - allwinner,sun50i-h6-tcon-tv - const: allwinner,sun8i-a83t-tcon-tv + - items: + - enum: + - allwinner,sun50i-h6-tcon-tv + - const: allwinner,sun8i-r40-tcon-tv + reg: maxItems: 1 -- cgit v1.2.3 From e6980a727154b793adb218fbc7b4d6af52a7e364 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Fri, 17 Jan 2020 16:34:28 +0100 Subject: drm/modes: Make sure to parse valid rotation value from cmdline A rotation value should have exactly one rotation angle. At the moment there is no validation for this when parsing video= parameters from the command line. This causes problems later on when we try to combine the command line rotation with the panel orientation. To make sure that we generate a valid rotation value: - Set DRM_MODE_ROTATE_0 by default (if no rotate= option is set) - Validate that there is exactly one rotation angle set (i.e. specifying the rotate= option multiple times is invalid) Signed-off-by: Stephan Gerhold Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20200117153429.54700-2-stephan@gerhold.net --- drivers/gpu/drm/drm_modes.c | 7 +++++++ drivers/gpu/drm/selftests/drm_cmdline_selftests.h | 1 + drivers/gpu/drm/selftests/test-drm_cmdline_parser.c | 15 +++++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/drm_modes.c b/drivers/gpu/drm/drm_modes.c index 10336b144c72..d4d64518e11b 100644 --- a/drivers/gpu/drm/drm_modes.c +++ b/drivers/gpu/drm/drm_modes.c @@ -1698,6 +1698,13 @@ static int drm_mode_parse_cmdline_options(const char *str, if (rotation && freestanding) return -EINVAL; + if (!(rotation & DRM_MODE_ROTATE_MASK)) + rotation |= DRM_MODE_ROTATE_0; + + /* Make sure there is exactly one rotation defined */ + if (!is_power_of_2(rotation & DRM_MODE_ROTATE_MASK)) + return -EINVAL; + mode->rotation_reflection = rotation; return 0; diff --git a/drivers/gpu/drm/selftests/drm_cmdline_selftests.h b/drivers/gpu/drm/selftests/drm_cmdline_selftests.h index ceac7af9a172..29e367db6118 100644 --- a/drivers/gpu/drm/selftests/drm_cmdline_selftests.h +++ b/drivers/gpu/drm/selftests/drm_cmdline_selftests.h @@ -53,6 +53,7 @@ cmdline_test(drm_cmdline_test_rotate_0) cmdline_test(drm_cmdline_test_rotate_90) cmdline_test(drm_cmdline_test_rotate_180) cmdline_test(drm_cmdline_test_rotate_270) +cmdline_test(drm_cmdline_test_rotate_multiple) cmdline_test(drm_cmdline_test_rotate_invalid_val) cmdline_test(drm_cmdline_test_rotate_truncated) cmdline_test(drm_cmdline_test_hmirror) diff --git a/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c b/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c index 520f3e66a384..d96cd890def6 100644 --- a/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c +++ b/drivers/gpu/drm/selftests/test-drm_cmdline_parser.c @@ -856,6 +856,17 @@ static int drm_cmdline_test_rotate_270(void *ignored) return 0; } +static int drm_cmdline_test_rotate_multiple(void *ignored) +{ + struct drm_cmdline_mode mode = { }; + + FAIL_ON(drm_mode_parse_command_line_for_connector("720x480,rotate=0,rotate=90", + &no_connector, + &mode)); + + return 0; +} + static int drm_cmdline_test_rotate_invalid_val(void *ignored) { struct drm_cmdline_mode mode = { }; @@ -888,7 +899,7 @@ static int drm_cmdline_test_hmirror(void *ignored) FAIL_ON(!mode.specified); FAIL_ON(mode.xres != 720); FAIL_ON(mode.yres != 480); - FAIL_ON(mode.rotation_reflection != DRM_MODE_REFLECT_X); + FAIL_ON(mode.rotation_reflection != (DRM_MODE_ROTATE_0 | DRM_MODE_REFLECT_X)); FAIL_ON(mode.refresh_specified); @@ -913,7 +924,7 @@ static int drm_cmdline_test_vmirror(void *ignored) FAIL_ON(!mode.specified); FAIL_ON(mode.xres != 720); FAIL_ON(mode.yres != 480); - FAIL_ON(mode.rotation_reflection != DRM_MODE_REFLECT_Y); + FAIL_ON(mode.rotation_reflection != (DRM_MODE_ROTATE_0 | DRM_MODE_REFLECT_Y)); FAIL_ON(mode.refresh_specified); -- cgit v1.2.3 From 5c320b6ce7510653bce68cecf80cf5b2d67e907f Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Fri, 17 Jan 2020 16:34:29 +0100 Subject: drm/modes: Allow DRM_MODE_ROTATE_0 when applying video mode parameters At the moment, only DRM_MODE_ROTATE_180 is allowed when we try to apply the rotation from the video mode parameters. It is also useful to allow DRM_MODE_ROTATE_0 in case there is only a reflect option in the video mode parameter (e.g. video=540x960,reflect_x). DRM_MODE_ROTATE_0 means "no rotation" and should therefore not require any special handling, so we can just add it to the if condition. Signed-off-by: Stephan Gerhold Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20200117153429.54700-3-stephan@gerhold.net --- drivers/gpu/drm/drm_client_modeset.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_client_modeset.c b/drivers/gpu/drm/drm_client_modeset.c index 6d4a29e99ae2..3035584f6dc7 100644 --- a/drivers/gpu/drm/drm_client_modeset.c +++ b/drivers/gpu/drm/drm_client_modeset.c @@ -951,7 +951,8 @@ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation) * depending on the hardware this may require the framebuffer * to be in a specific tiling format. */ - if ((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180 || + if (((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_0 && + (*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180) || !plane->rotation_property) return false; -- cgit v1.2.3 From 0f90522591fd09dd201065c53ebefdfe3c6b55cb Mon Sep 17 00:00:00 2001 From: Firo Yang Date: Wed, 12 Feb 2020 06:09:17 +0100 Subject: enic: prevent waking up stopped tx queues over watchdog reset Recent months, our customer reported several kernel crashes all preceding with following message: NETDEV WATCHDOG: eth2 (enic): transmit queue 0 timed out Error message of one of those crashes: BUG: unable to handle kernel paging request at ffffffffa007e090 After analyzing severl vmcores, I found that most of crashes are caused by memory corruption. And all the corrupted memory areas are overwritten by data of network packets. Moreover, I also found that the tx queues were enabled over watchdog reset. After going through the source code, I found that in enic_stop(), the tx queues stopped by netif_tx_disable() could be woken up over a small time window between netif_tx_disable() and the napi_disable() by the following code path: napi_poll-> enic_poll_msix_wq-> vnic_cq_service-> enic_wq_service-> netif_wake_subqueue(enic->netdev, q_number)-> test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state) In turn, upper netowrk stack could queue skb to ENIC NIC though enic_hard_start_xmit(). And this might introduce some race condition. Our customer comfirmed that this kind of kernel crash doesn't occur over 90 days since they applied this patch. Signed-off-by: Firo Yang Signed-off-by: David S. Miller --- drivers/net/ethernet/cisco/enic/enic_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index bbd7b3175f09..ddf60dc9ad16 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -2013,10 +2013,10 @@ static int enic_stop(struct net_device *netdev) napi_disable(&enic->napi[i]); netif_carrier_off(netdev); - netif_tx_disable(netdev); if (vnic_dev_get_intr_mode(enic->vdev) == VNIC_DEV_INTR_MODE_MSIX) for (i = 0; i < enic->wq_count; i++) napi_disable(&enic->napi[enic_cq_wq(enic, i)]); + netif_tx_disable(netdev); if (!enic_is_dynamic(enic) && !enic_is_sriov_vf(enic)) enic_dev_del_station_addr(enic); -- cgit v1.2.3 From 67f68f977a12657028e866c013d43dd87320d210 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 12 Feb 2020 09:48:57 -0800 Subject: Revert "xhci: Fix memory leak when caching protocol extended capability PSI tables" This reverts commit fc57313d1017dd6b6f37a94e88daa8df54368ecc. Marek reports that it breaks things: This patch landed in today's linux-next (20200211) and causes NULL pointer dereference during second suspend/resume cycle on Samsung Exynos5422-based (arm 32bit) Odroid XU3lite board: A more complete fix will be added soon. Reported-by: Marek Szyprowski Fixes: fc57313d1017 ("xhci: Fix memory leak when caching protocol extended capability PSI tables") Cc: Paul Menzel Cc: Sajja Venkateswara Rao Cc: stable # v4.4+ Cc: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 25 +++++++------------ drivers/usb/host/xhci-mem.c | 58 +++++++++++++++++---------------------------- drivers/usb/host/xhci.h | 14 +++-------- 3 files changed, 33 insertions(+), 64 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index af92b2576fe9..7a3a29e5e9d2 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -55,7 +55,6 @@ static u8 usb_bos_descriptor [] = { static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, u16 wLength) { - struct xhci_port_cap *port_cap = NULL; int i, ssa_count; u32 temp; u16 desc_size, ssp_cap_size, ssa_size = 0; @@ -65,24 +64,16 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, ssp_cap_size = sizeof(usb_bos_descriptor) - desc_size; /* does xhci support USB 3.1 Enhanced SuperSpeed */ - for (i = 0; i < xhci->num_port_caps; i++) { - if (xhci->port_caps[i].maj_rev == 0x03 && - xhci->port_caps[i].min_rev >= 0x01) { - usb3_1 = true; - port_cap = &xhci->port_caps[i]; - break; - } - } - - if (usb3_1) { + if (xhci->usb3_rhub.min_rev >= 0x01) { /* does xhci provide a PSI table for SSA speed attributes? */ - if (port_cap->psi_count) { + if (xhci->usb3_rhub.psi_count) { /* two SSA entries for each unique PSI ID, RX and TX */ - ssa_count = port_cap->psi_uid_count * 2; + ssa_count = xhci->usb3_rhub.psi_uid_count * 2; ssa_size = ssa_count * sizeof(u32); ssp_cap_size -= 16; /* skip copying the default SSA */ } desc_size += ssp_cap_size; + usb3_1 = true; } memcpy(buf, &usb_bos_descriptor, min(desc_size, wLength)); @@ -108,7 +99,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, } /* If PSI table exists, add the custom speed attributes from it */ - if (usb3_1 && port_cap->psi_count) { + if (usb3_1 && xhci->usb3_rhub.psi_count) { u32 ssp_cap_base, bm_attrib, psi, psi_mant, psi_exp; int offset; @@ -120,7 +111,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, /* attribute count SSAC bits 4:0 and ID count SSIC bits 8:5 */ bm_attrib = (ssa_count - 1) & 0x1f; - bm_attrib |= (port_cap->psi_uid_count - 1) << 5; + bm_attrib |= (xhci->usb3_rhub.psi_uid_count - 1) << 5; put_unaligned_le32(bm_attrib, &buf[ssp_cap_base + 4]); if (wLength < desc_size + ssa_size) @@ -133,8 +124,8 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, * USB 3.1 requires two SSA entries (RX and TX) for every link */ offset = desc_size; - for (i = 0; i < port_cap->psi_count; i++) { - psi = port_cap->psi[i]; + for (i = 0; i < xhci->usb3_rhub.psi_count; i++) { + psi = xhci->usb3_rhub.psi[i]; psi &= ~USB_SSP_SUBLINK_SPEED_RSVD; psi_exp = XHCI_EXT_PORT_PSIE(psi); psi_mant = XHCI_EXT_PORT_PSIM(psi); diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index bd5b152df6c0..0e2701649369 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1915,16 +1915,17 @@ no_bw: xhci->usb3_rhub.num_ports = 0; xhci->num_active_eps = 0; kfree(xhci->usb2_rhub.ports); + kfree(xhci->usb2_rhub.psi); kfree(xhci->usb3_rhub.ports); + kfree(xhci->usb3_rhub.psi); kfree(xhci->hw_ports); kfree(xhci->rh_bw); kfree(xhci->ext_caps); - for (i = 0; i < xhci->num_port_caps; i++) - kfree(xhci->port_caps[i].psi); - kfree(xhci->port_caps); xhci->usb2_rhub.ports = NULL; + xhci->usb2_rhub.psi = NULL; xhci->usb3_rhub.ports = NULL; + xhci->usb3_rhub.psi = NULL; xhci->hw_ports = NULL; xhci->rh_bw = NULL; xhci->ext_caps = NULL; @@ -2125,7 +2126,6 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, u8 major_revision, minor_revision; struct xhci_hub *rhub; struct device *dev = xhci_to_hcd(xhci)->self.sysdev; - struct xhci_port_cap *port_cap; temp = readl(addr); major_revision = XHCI_EXT_PORT_MAJOR(temp); @@ -2160,39 +2160,31 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, /* WTF? "Valid values are ‘1’ to MaxPorts" */ return; - port_cap = &xhci->port_caps[xhci->num_port_caps++]; - if (xhci->num_port_caps > max_caps) - return; - - port_cap->maj_rev = major_revision; - port_cap->min_rev = minor_revision; - port_cap->psi_count = XHCI_EXT_PORT_PSIC(temp); - - if (port_cap->psi_count) { - port_cap->psi = kcalloc_node(port_cap->psi_count, - sizeof(*port_cap->psi), - GFP_KERNEL, dev_to_node(dev)); - if (!port_cap->psi) - port_cap->psi_count = 0; + rhub->psi_count = XHCI_EXT_PORT_PSIC(temp); + if (rhub->psi_count) { + rhub->psi = kcalloc_node(rhub->psi_count, sizeof(*rhub->psi), + GFP_KERNEL, dev_to_node(dev)); + if (!rhub->psi) + rhub->psi_count = 0; - port_cap->psi_uid_count++; - for (i = 0; i < port_cap->psi_count; i++) { - port_cap->psi[i] = readl(addr + 4 + i); + rhub->psi_uid_count++; + for (i = 0; i < rhub->psi_count; i++) { + rhub->psi[i] = readl(addr + 4 + i); /* count unique ID values, two consecutive entries can * have the same ID if link is assymetric */ - if (i && (XHCI_EXT_PORT_PSIV(port_cap->psi[i]) != - XHCI_EXT_PORT_PSIV(port_cap->psi[i - 1]))) - port_cap->psi_uid_count++; + if (i && (XHCI_EXT_PORT_PSIV(rhub->psi[i]) != + XHCI_EXT_PORT_PSIV(rhub->psi[i - 1]))) + rhub->psi_uid_count++; xhci_dbg(xhci, "PSIV:%d PSIE:%d PLT:%d PFD:%d LP:%d PSIM:%d\n", - XHCI_EXT_PORT_PSIV(port_cap->psi[i]), - XHCI_EXT_PORT_PSIE(port_cap->psi[i]), - XHCI_EXT_PORT_PLT(port_cap->psi[i]), - XHCI_EXT_PORT_PFD(port_cap->psi[i]), - XHCI_EXT_PORT_LP(port_cap->psi[i]), - XHCI_EXT_PORT_PSIM(port_cap->psi[i])); + XHCI_EXT_PORT_PSIV(rhub->psi[i]), + XHCI_EXT_PORT_PSIE(rhub->psi[i]), + XHCI_EXT_PORT_PLT(rhub->psi[i]), + XHCI_EXT_PORT_PFD(rhub->psi[i]), + XHCI_EXT_PORT_LP(rhub->psi[i]), + XHCI_EXT_PORT_PSIM(rhub->psi[i])); } } /* cache usb2 port capabilities */ @@ -2227,7 +2219,6 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, continue; } hw_port->rhub = rhub; - hw_port->port_cap = port_cap; rhub->num_ports++; } /* FIXME: Should we disable ports not in the Extended Capabilities? */ @@ -2318,11 +2309,6 @@ static int xhci_setup_port_arrays(struct xhci_hcd *xhci, gfp_t flags) if (!xhci->ext_caps) return -ENOMEM; - xhci->port_caps = kcalloc_node(cap_count, sizeof(*xhci->port_caps), - flags, dev_to_node(dev)); - if (!xhci->port_caps) - return -ENOMEM; - offset = cap_start; while (offset) { diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 3ecee10fdcdc..13d8838cd552 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1702,20 +1702,12 @@ struct xhci_bus_state { * Intel Lynx Point LP xHCI host. */ #define XHCI_MAX_REXIT_TIMEOUT_MS 20 -struct xhci_port_cap { - u32 *psi; /* array of protocol speed ID entries */ - u8 psi_count; - u8 psi_uid_count; - u8 maj_rev; - u8 min_rev; -}; struct xhci_port { __le32 __iomem *addr; int hw_portnum; int hcd_portnum; struct xhci_hub *rhub; - struct xhci_port_cap *port_cap; }; struct xhci_hub { @@ -1727,6 +1719,9 @@ struct xhci_hub { /* supported prococol extended capabiliy values */ u8 maj_rev; u8 min_rev; + u32 *psi; /* array of protocol speed ID entries */ + u8 psi_count; + u8 psi_uid_count; }; /* There is one xhci_hcd structure per controller */ @@ -1885,9 +1880,6 @@ struct xhci_hcd { /* cached usb2 extened protocol capabilites */ u32 *ext_caps; unsigned int num_ext_caps; - /* cached extended protocol port capabilities */ - struct xhci_port_cap *port_caps; - unsigned int num_port_caps; /* Compliance Mode Recovery Data */ struct timer_list comp_mode_recovery_timer; u32 port_status_u0; -- cgit v1.2.3 From cf0ee7c60c89641f6e4d1d3c7867fe32b9e30300 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 11 Feb 2020 17:01:58 +0200 Subject: xhci: Fix memory leak when caching protocol extended capability PSI tables - take 2 xhci driver assumed that xHC controllers have at most one custom supported speed table (PSI) for all usb 3.x ports. Memory was allocated for one PSI table under the xhci hub structure. Turns out this is not the case, some controllers have a separate "supported protocol capability" entry with a PSI table for each port. This means each usb3 roothub port can in theory support different custom speeds. To solve this, cache all supported protocol capabilities with their PSI tables in an array, and add pointers to the xhci port structure so that every port points to its capability entry in the array. When creating the SuperSpeedPlus USB Device Capability BOS descriptor for the xhci USB 3.1 roothub we for now will use only data from the first USB 3.1 capable protocol capability entry in the array. This could be improved later, this patch focuses resolving the memory leak. Reported-by: Paul Menzel Reported-by: Sajja Venkateswara Rao Fixes: 47189098f8be ("xhci: parse xhci protocol speed ID list for usb 3.1 usage") Cc: stable # v4.4+ Signed-off-by: Mathias Nyman Tested-by: Marek Szyprowski Link: https://lore.kernel.org/r/20200211150158.14475-1-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 25 +++++++++++++------ drivers/usb/host/xhci-mem.c | 59 ++++++++++++++++++++++++++++----------------- drivers/usb/host/xhci.h | 14 ++++++++--- 3 files changed, 65 insertions(+), 33 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 7a3a29e5e9d2..af92b2576fe9 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -55,6 +55,7 @@ static u8 usb_bos_descriptor [] = { static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, u16 wLength) { + struct xhci_port_cap *port_cap = NULL; int i, ssa_count; u32 temp; u16 desc_size, ssp_cap_size, ssa_size = 0; @@ -64,16 +65,24 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, ssp_cap_size = sizeof(usb_bos_descriptor) - desc_size; /* does xhci support USB 3.1 Enhanced SuperSpeed */ - if (xhci->usb3_rhub.min_rev >= 0x01) { + for (i = 0; i < xhci->num_port_caps; i++) { + if (xhci->port_caps[i].maj_rev == 0x03 && + xhci->port_caps[i].min_rev >= 0x01) { + usb3_1 = true; + port_cap = &xhci->port_caps[i]; + break; + } + } + + if (usb3_1) { /* does xhci provide a PSI table for SSA speed attributes? */ - if (xhci->usb3_rhub.psi_count) { + if (port_cap->psi_count) { /* two SSA entries for each unique PSI ID, RX and TX */ - ssa_count = xhci->usb3_rhub.psi_uid_count * 2; + ssa_count = port_cap->psi_uid_count * 2; ssa_size = ssa_count * sizeof(u32); ssp_cap_size -= 16; /* skip copying the default SSA */ } desc_size += ssp_cap_size; - usb3_1 = true; } memcpy(buf, &usb_bos_descriptor, min(desc_size, wLength)); @@ -99,7 +108,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, } /* If PSI table exists, add the custom speed attributes from it */ - if (usb3_1 && xhci->usb3_rhub.psi_count) { + if (usb3_1 && port_cap->psi_count) { u32 ssp_cap_base, bm_attrib, psi, psi_mant, psi_exp; int offset; @@ -111,7 +120,7 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, /* attribute count SSAC bits 4:0 and ID count SSIC bits 8:5 */ bm_attrib = (ssa_count - 1) & 0x1f; - bm_attrib |= (xhci->usb3_rhub.psi_uid_count - 1) << 5; + bm_attrib |= (port_cap->psi_uid_count - 1) << 5; put_unaligned_le32(bm_attrib, &buf[ssp_cap_base + 4]); if (wLength < desc_size + ssa_size) @@ -124,8 +133,8 @@ static int xhci_create_usb3_bos_desc(struct xhci_hcd *xhci, char *buf, * USB 3.1 requires two SSA entries (RX and TX) for every link */ offset = desc_size; - for (i = 0; i < xhci->usb3_rhub.psi_count; i++) { - psi = xhci->usb3_rhub.psi[i]; + for (i = 0; i < port_cap->psi_count; i++) { + psi = port_cap->psi[i]; psi &= ~USB_SSP_SUBLINK_SPEED_RSVD; psi_exp = XHCI_EXT_PORT_PSIE(psi); psi_mant = XHCI_EXT_PORT_PSIM(psi); diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 0e2701649369..884c601bfa15 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1915,17 +1915,17 @@ no_bw: xhci->usb3_rhub.num_ports = 0; xhci->num_active_eps = 0; kfree(xhci->usb2_rhub.ports); - kfree(xhci->usb2_rhub.psi); kfree(xhci->usb3_rhub.ports); - kfree(xhci->usb3_rhub.psi); kfree(xhci->hw_ports); kfree(xhci->rh_bw); kfree(xhci->ext_caps); + for (i = 0; i < xhci->num_port_caps; i++) + kfree(xhci->port_caps[i].psi); + kfree(xhci->port_caps); + xhci->num_port_caps = 0; xhci->usb2_rhub.ports = NULL; - xhci->usb2_rhub.psi = NULL; xhci->usb3_rhub.ports = NULL; - xhci->usb3_rhub.psi = NULL; xhci->hw_ports = NULL; xhci->rh_bw = NULL; xhci->ext_caps = NULL; @@ -2126,6 +2126,7 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, u8 major_revision, minor_revision; struct xhci_hub *rhub; struct device *dev = xhci_to_hcd(xhci)->self.sysdev; + struct xhci_port_cap *port_cap; temp = readl(addr); major_revision = XHCI_EXT_PORT_MAJOR(temp); @@ -2160,31 +2161,39 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, /* WTF? "Valid values are ‘1’ to MaxPorts" */ return; - rhub->psi_count = XHCI_EXT_PORT_PSIC(temp); - if (rhub->psi_count) { - rhub->psi = kcalloc_node(rhub->psi_count, sizeof(*rhub->psi), - GFP_KERNEL, dev_to_node(dev)); - if (!rhub->psi) - rhub->psi_count = 0; + port_cap = &xhci->port_caps[xhci->num_port_caps++]; + if (xhci->num_port_caps > max_caps) + return; + + port_cap->maj_rev = major_revision; + port_cap->min_rev = minor_revision; + port_cap->psi_count = XHCI_EXT_PORT_PSIC(temp); + + if (port_cap->psi_count) { + port_cap->psi = kcalloc_node(port_cap->psi_count, + sizeof(*port_cap->psi), + GFP_KERNEL, dev_to_node(dev)); + if (!port_cap->psi) + port_cap->psi_count = 0; - rhub->psi_uid_count++; - for (i = 0; i < rhub->psi_count; i++) { - rhub->psi[i] = readl(addr + 4 + i); + port_cap->psi_uid_count++; + for (i = 0; i < port_cap->psi_count; i++) { + port_cap->psi[i] = readl(addr + 4 + i); /* count unique ID values, two consecutive entries can * have the same ID if link is assymetric */ - if (i && (XHCI_EXT_PORT_PSIV(rhub->psi[i]) != - XHCI_EXT_PORT_PSIV(rhub->psi[i - 1]))) - rhub->psi_uid_count++; + if (i && (XHCI_EXT_PORT_PSIV(port_cap->psi[i]) != + XHCI_EXT_PORT_PSIV(port_cap->psi[i - 1]))) + port_cap->psi_uid_count++; xhci_dbg(xhci, "PSIV:%d PSIE:%d PLT:%d PFD:%d LP:%d PSIM:%d\n", - XHCI_EXT_PORT_PSIV(rhub->psi[i]), - XHCI_EXT_PORT_PSIE(rhub->psi[i]), - XHCI_EXT_PORT_PLT(rhub->psi[i]), - XHCI_EXT_PORT_PFD(rhub->psi[i]), - XHCI_EXT_PORT_LP(rhub->psi[i]), - XHCI_EXT_PORT_PSIM(rhub->psi[i])); + XHCI_EXT_PORT_PSIV(port_cap->psi[i]), + XHCI_EXT_PORT_PSIE(port_cap->psi[i]), + XHCI_EXT_PORT_PLT(port_cap->psi[i]), + XHCI_EXT_PORT_PFD(port_cap->psi[i]), + XHCI_EXT_PORT_LP(port_cap->psi[i]), + XHCI_EXT_PORT_PSIM(port_cap->psi[i])); } } /* cache usb2 port capabilities */ @@ -2219,6 +2228,7 @@ static void xhci_add_in_port(struct xhci_hcd *xhci, unsigned int num_ports, continue; } hw_port->rhub = rhub; + hw_port->port_cap = port_cap; rhub->num_ports++; } /* FIXME: Should we disable ports not in the Extended Capabilities? */ @@ -2309,6 +2319,11 @@ static int xhci_setup_port_arrays(struct xhci_hcd *xhci, gfp_t flags) if (!xhci->ext_caps) return -ENOMEM; + xhci->port_caps = kcalloc_node(cap_count, sizeof(*xhci->port_caps), + flags, dev_to_node(dev)); + if (!xhci->port_caps) + return -ENOMEM; + offset = cap_start; while (offset) { diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 13d8838cd552..3ecee10fdcdc 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1702,12 +1702,20 @@ struct xhci_bus_state { * Intel Lynx Point LP xHCI host. */ #define XHCI_MAX_REXIT_TIMEOUT_MS 20 +struct xhci_port_cap { + u32 *psi; /* array of protocol speed ID entries */ + u8 psi_count; + u8 psi_uid_count; + u8 maj_rev; + u8 min_rev; +}; struct xhci_port { __le32 __iomem *addr; int hw_portnum; int hcd_portnum; struct xhci_hub *rhub; + struct xhci_port_cap *port_cap; }; struct xhci_hub { @@ -1719,9 +1727,6 @@ struct xhci_hub { /* supported prococol extended capabiliy values */ u8 maj_rev; u8 min_rev; - u32 *psi; /* array of protocol speed ID entries */ - u8 psi_count; - u8 psi_uid_count; }; /* There is one xhci_hcd structure per controller */ @@ -1880,6 +1885,9 @@ struct xhci_hcd { /* cached usb2 extened protocol capabilites */ u32 *ext_caps; unsigned int num_ext_caps; + /* cached extended protocol port capabilities */ + struct xhci_port_cap *port_caps; + unsigned int num_port_caps; /* Compliance Mode Recovery Data */ struct timer_list comp_mode_recovery_timer; u32 port_status_u0; -- cgit v1.2.3 From b692056db8ecc7f452b934f016c17348282b7699 Mon Sep 17 00:00:00 2001 From: Richard Dodd Date: Wed, 12 Feb 2020 14:22:18 +0000 Subject: USB: Fix novation SourceControl XL after suspend Currently, the SourceControl will stay in power-down mode after resuming from suspend. This patch resets the device after suspend to power it up. Signed-off-by: Richard Dodd Cc: stable Link: https://lore.kernel.org/r/20200212142220.36892-1-richard.o.dodd@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index f27468966a3d..2b24336a72e5 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -449,6 +449,9 @@ static const struct usb_device_id usb_quirk_list[] = { /* INTEL VALUE SSD */ { USB_DEVICE(0x8086, 0xf1a5), .driver_info = USB_QUIRK_RESET_RESUME }, + /* novation SoundControl XL */ + { USB_DEVICE(0x1235, 0x0061), .driver_info = USB_QUIRK_RESET_RESUME }, + { } /* terminating entry must be last */ }; -- cgit v1.2.3 From 461d8deb26a7d70254bc0391feb4fd8a95e674e8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 11 Feb 2020 20:04:21 -0800 Subject: USB: misc: iowarrior: add support for 2 OEMed devices Add support for two OEM devices that are identical to existing IO-Warrior devices, except for the USB device id. Cc: Christoph Jung Cc: stable Link: https://lore.kernel.org/r/20200212040422.2991-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index dce44fbf031f..990acbe14852 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -34,6 +34,10 @@ /* full speed iowarrior */ #define USB_DEVICE_ID_CODEMERCS_IOW56 0x1503 +/* OEMed devices */ +#define USB_DEVICE_ID_CODEMERCS_IOW24SAG 0x158a +#define USB_DEVICE_ID_CODEMERCS_IOW56AM 0x158b + /* Get a minor range for your devices from the usb maintainer */ #ifdef CONFIG_USB_DYNAMIC_MINORS #define IOWARRIOR_MINOR_BASE 0 @@ -133,6 +137,8 @@ static const struct usb_device_id iowarrior_ids[] = { {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOWPV1)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOWPV2)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56)}, + {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW24SAG)}, + {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56AM)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, iowarrior_ids); @@ -357,6 +363,7 @@ static ssize_t iowarrior_write(struct file *file, } switch (dev->product_id) { case USB_DEVICE_ID_CODEMERCS_IOW24: + case USB_DEVICE_ID_CODEMERCS_IOW24SAG: case USB_DEVICE_ID_CODEMERCS_IOWPV1: case USB_DEVICE_ID_CODEMERCS_IOWPV2: case USB_DEVICE_ID_CODEMERCS_IOW40: @@ -371,6 +378,7 @@ static ssize_t iowarrior_write(struct file *file, goto exit; break; case USB_DEVICE_ID_CODEMERCS_IOW56: + case USB_DEVICE_ID_CODEMERCS_IOW56AM: /* The IOW56 uses asynchronous IO and more urbs */ if (atomic_read(&dev->write_busy) == MAX_WRITES_IN_FLIGHT) { /* Wait until we are below the limit for submitted urbs */ @@ -493,6 +501,7 @@ static long iowarrior_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case IOW_WRITE: if (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW24 || + dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW24SAG || dev->product_id == USB_DEVICE_ID_CODEMERCS_IOWPV1 || dev->product_id == USB_DEVICE_ID_CODEMERCS_IOWPV2 || dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW40) { @@ -767,7 +776,8 @@ static int iowarrior_probe(struct usb_interface *interface, goto error; } - if (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) { + if ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM)) { res = usb_find_last_int_out_endpoint(iface_desc, &dev->int_out_endpoint); if (res) { @@ -780,7 +790,8 @@ static int iowarrior_probe(struct usb_interface *interface, /* we have to check the report_size often, so remember it in the endianness suitable for our machine */ dev->report_size = usb_endpoint_maxp(dev->int_in_endpoint); if ((dev->interface->cur_altsetting->desc.bInterfaceNumber == 0) && - (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56)) + ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM))) /* IOWarrior56 has wMaxPacketSize different from report size */ dev->report_size = 7; -- cgit v1.2.3 From 5f6f8da2d7b5a431d3f391d0d73ace8edfb42af7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 11 Feb 2020 20:04:22 -0800 Subject: USB: misc: iowarrior: add support for the 28 and 28L devices Add new device ids for the 28 and 28L devices. These have 4 interfaces instead of 2, but the driver binds the same, so the driver changes are minimal. Cc: Christoph Jung Cc: stable Link: https://lore.kernel.org/r/20200212040422.2991-2-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 990acbe14852..d20b60acfe8a 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -33,6 +33,9 @@ #define USB_DEVICE_ID_CODEMERCS_IOWPV2 0x1512 /* full speed iowarrior */ #define USB_DEVICE_ID_CODEMERCS_IOW56 0x1503 +/* fuller speed iowarrior */ +#define USB_DEVICE_ID_CODEMERCS_IOW28 0x1504 +#define USB_DEVICE_ID_CODEMERCS_IOW28L 0x1505 /* OEMed devices */ #define USB_DEVICE_ID_CODEMERCS_IOW24SAG 0x158a @@ -139,6 +142,8 @@ static const struct usb_device_id iowarrior_ids[] = { {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW24SAG)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56AM)}, + {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28)}, + {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28L)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, iowarrior_ids); @@ -379,6 +384,8 @@ static ssize_t iowarrior_write(struct file *file, break; case USB_DEVICE_ID_CODEMERCS_IOW56: case USB_DEVICE_ID_CODEMERCS_IOW56AM: + case USB_DEVICE_ID_CODEMERCS_IOW28: + case USB_DEVICE_ID_CODEMERCS_IOW28L: /* The IOW56 uses asynchronous IO and more urbs */ if (atomic_read(&dev->write_busy) == MAX_WRITES_IN_FLIGHT) { /* Wait until we are below the limit for submitted urbs */ @@ -777,7 +784,9 @@ static int iowarrior_probe(struct usb_interface *interface, } if ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) || - (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM)) { + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L)) { res = usb_find_last_int_out_endpoint(iface_desc, &dev->int_out_endpoint); if (res) { @@ -791,7 +800,9 @@ static int iowarrior_probe(struct usb_interface *interface, dev->report_size = usb_endpoint_maxp(dev->int_in_endpoint); if ((dev->interface->cur_altsetting->desc.bInterfaceNumber == 0) && ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) || - (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM))) + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L))) /* IOWarrior56 has wMaxPacketSize different from report size */ dev->report_size = 7; -- cgit v1.2.3 From b9287f2ac321ecac56eb51e6231f6579683dcdae Mon Sep 17 00:00:00 2001 From: Kunihiko Hayashi Date: Wed, 12 Feb 2020 19:55:34 +0900 Subject: net: ethernet: ave: Add capability of rgmii-id mode This allows you to specify the type of rgmii-id that will enable phy internal delay in ethernet phy-mode. This adds all RGMII cases to all of get_pinmode() except LD11, because LD11 SoC doesn't support RGMII due to the constraint of the hardware. When RGMII phy mode is specified in the devicetree for LD11, the driver will abort with an error. Signed-off-by: Kunihiko Hayashi Signed-off-by: David S. Miller --- drivers/net/ethernet/socionext/sni_ave.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index b7032422393f..67ddf782d98a 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1810,6 +1810,9 @@ static int ave_pro4_get_pinmode(struct ave_private *priv, break; case PHY_INTERFACE_MODE_MII: case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: priv->pinmode_val = 0; break; default: @@ -1854,6 +1857,9 @@ static int ave_ld20_get_pinmode(struct ave_private *priv, priv->pinmode_val = SG_ETPINMODE_RMII(0); break; case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: priv->pinmode_val = 0; break; default: @@ -1876,6 +1882,9 @@ static int ave_pxs3_get_pinmode(struct ave_private *priv, priv->pinmode_val = SG_ETPINMODE_RMII(arg); break; case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: + case PHY_INTERFACE_MODE_RGMII_TXID: priv->pinmode_val = 0; break; default: -- cgit v1.2.3 From 1880b1f1d686b17387b5bf45654eb1d087ead918 Mon Sep 17 00:00:00 2001 From: Ravulapati Vishnu vardhan rao Date: Tue, 11 Feb 2020 18:42:28 +0530 Subject: ASoC: amd: Buffer Size instead of MAX Buffer Because of MAX BUFFER size in register,when user/app give small buffer size produces noise of old data in buffer. This patch rectifies this noise when using different buffer sizes less than MAX BUFFER. Signed-off-by: Ravulapati Vishnu vardhan rao Link: https://lore.kernel.org/r/1581426768-8937-1-git-send-email-Vishnuvardhanrao.Ravulapati@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/raven/acp3x-i2s.c | 8 ++++++++ sound/soc/amd/raven/acp3x-pcm-dma.c | 7 +------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sound/soc/amd/raven/acp3x-i2s.c b/sound/soc/amd/raven/acp3x-i2s.c index 31cd4008e33f..91a388184e52 100644 --- a/sound/soc/amd/raven/acp3x-i2s.c +++ b/sound/soc/amd/raven/acp3x-i2s.c @@ -170,6 +170,7 @@ static int acp3x_i2s_trigger(struct snd_pcm_substream *substream, struct snd_soc_card *card; struct acp3x_platform_info *pinfo; u32 ret, val, period_bytes, reg_val, ier_val, water_val; + u32 buf_size, buf_reg; prtd = substream->private_data; rtd = substream->runtime->private_data; @@ -183,6 +184,8 @@ static int acp3x_i2s_trigger(struct snd_pcm_substream *substream, } period_bytes = frames_to_bytes(substream->runtime, substream->runtime->period_size); + buf_size = frames_to_bytes(substream->runtime, + substream->runtime->buffer_size); switch (cmd) { case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_RESUME: @@ -196,6 +199,7 @@ static int acp3x_i2s_trigger(struct snd_pcm_substream *substream, mmACP_BT_TX_INTR_WATERMARK_SIZE; reg_val = mmACP_BTTDM_ITER; ier_val = mmACP_BTTDM_IER; + buf_reg = mmACP_BT_TX_RINGBUFSIZE; break; case I2S_SP_INSTANCE: default: @@ -203,6 +207,7 @@ static int acp3x_i2s_trigger(struct snd_pcm_substream *substream, mmACP_I2S_TX_INTR_WATERMARK_SIZE; reg_val = mmACP_I2STDM_ITER; ier_val = mmACP_I2STDM_IER; + buf_reg = mmACP_I2S_TX_RINGBUFSIZE; } } else { switch (rtd->i2s_instance) { @@ -211,6 +216,7 @@ static int acp3x_i2s_trigger(struct snd_pcm_substream *substream, mmACP_BT_RX_INTR_WATERMARK_SIZE; reg_val = mmACP_BTTDM_IRER; ier_val = mmACP_BTTDM_IER; + buf_reg = mmACP_BT_RX_RINGBUFSIZE; break; case I2S_SP_INSTANCE: default: @@ -218,9 +224,11 @@ static int acp3x_i2s_trigger(struct snd_pcm_substream *substream, mmACP_I2S_RX_INTR_WATERMARK_SIZE; reg_val = mmACP_I2STDM_IRER; ier_val = mmACP_I2STDM_IER; + buf_reg = mmACP_I2S_RX_RINGBUFSIZE; } } rv_writel(period_bytes, rtd->acp3x_base + water_val); + rv_writel(buf_size, rtd->acp3x_base + buf_reg); val = rv_readl(rtd->acp3x_base + reg_val); val = val | BIT(0); rv_writel(val, rtd->acp3x_base + reg_val); diff --git a/sound/soc/amd/raven/acp3x-pcm-dma.c b/sound/soc/amd/raven/acp3x-pcm-dma.c index aecc3c061679..d62c0d90c41e 100644 --- a/sound/soc/amd/raven/acp3x-pcm-dma.c +++ b/sound/soc/amd/raven/acp3x-pcm-dma.c @@ -110,7 +110,7 @@ static void config_acp3x_dma(struct i2s_stream_instance *rtd, int direction) { u16 page_idx; u32 low, high, val, acp_fifo_addr, reg_fifo_addr; - u32 reg_ringbuf_size, reg_dma_size, reg_fifo_size; + u32 reg_dma_size, reg_fifo_size; dma_addr_t addr; addr = rtd->dma_addr; @@ -157,7 +157,6 @@ static void config_acp3x_dma(struct i2s_stream_instance *rtd, int direction) if (direction == SNDRV_PCM_STREAM_PLAYBACK) { switch (rtd->i2s_instance) { case I2S_BT_INSTANCE: - reg_ringbuf_size = mmACP_BT_TX_RINGBUFSIZE; reg_dma_size = mmACP_BT_TX_DMA_SIZE; acp_fifo_addr = ACP_SRAM_PTE_OFFSET + BT_PB_FIFO_ADDR_OFFSET; @@ -169,7 +168,6 @@ static void config_acp3x_dma(struct i2s_stream_instance *rtd, int direction) case I2S_SP_INSTANCE: default: - reg_ringbuf_size = mmACP_I2S_TX_RINGBUFSIZE; reg_dma_size = mmACP_I2S_TX_DMA_SIZE; acp_fifo_addr = ACP_SRAM_PTE_OFFSET + SP_PB_FIFO_ADDR_OFFSET; @@ -181,7 +179,6 @@ static void config_acp3x_dma(struct i2s_stream_instance *rtd, int direction) } else { switch (rtd->i2s_instance) { case I2S_BT_INSTANCE: - reg_ringbuf_size = mmACP_BT_RX_RINGBUFSIZE; reg_dma_size = mmACP_BT_RX_DMA_SIZE; acp_fifo_addr = ACP_SRAM_PTE_OFFSET + BT_CAPT_FIFO_ADDR_OFFSET; @@ -193,7 +190,6 @@ static void config_acp3x_dma(struct i2s_stream_instance *rtd, int direction) case I2S_SP_INSTANCE: default: - reg_ringbuf_size = mmACP_I2S_RX_RINGBUFSIZE; reg_dma_size = mmACP_I2S_RX_DMA_SIZE; acp_fifo_addr = ACP_SRAM_PTE_OFFSET + SP_CAPT_FIFO_ADDR_OFFSET; @@ -203,7 +199,6 @@ static void config_acp3x_dma(struct i2s_stream_instance *rtd, int direction) rtd->acp3x_base + mmACP_I2S_RX_RINGBUFADDR); } } - rv_writel(MAX_BUFFER, rtd->acp3x_base + reg_ringbuf_size); rv_writel(DMA_SIZE, rtd->acp3x_base + reg_dma_size); rv_writel(acp_fifo_addr, rtd->acp3x_base + reg_fifo_addr); rv_writel(FIFO_SIZE, rtd->acp3x_base + reg_fifo_size); -- cgit v1.2.3 From 57d7713196ccd83010fcaa82b9f02d740c9e6bb2 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 12 Feb 2020 11:59:47 +0100 Subject: usb: gadget: udc-xilinx: Fix xudc_stop() kernel-doc format The patch removes "driver" parameter which has been removed without updating kernel-doc format. Fixes: 22835b807e7c ("usb: gadget: remove unnecessary 'driver' argument") Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/c753b529bdcdfdd40a3cf69121527ec8c63775cb.1581505183.git.michal.simek@xilinx.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/udc-xilinx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/usb/gadget/udc/udc-xilinx.c b/drivers/usb/gadget/udc/udc-xilinx.c index 29d8e5f8bb58..b1cfc8279c3d 100644 --- a/drivers/usb/gadget/udc/udc-xilinx.c +++ b/drivers/usb/gadget/udc/udc-xilinx.c @@ -1399,7 +1399,6 @@ err: /** * xudc_stop - stops the device. * @gadget: pointer to the usb gadget structure - * @driver: pointer to usb gadget driver structure * * Return: zero always */ -- cgit v1.2.3 From efeda80da38d0b4afd77a12bd4a44f657567d26c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 5 Feb 2020 09:01:54 -0500 Subject: NFSv4: Fix revalidation of dentries with delegations If a dentry was not initially looked up while we were holding a delegation, then we do still need to revalidate that it still holds the same name. If there are multiple hard links to the same file, then all the hard links need validation. Reported-by: Benjamin Coddington Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington [Anna: Put nfs_unset_verifier_delegated() under CONFIG_NFS_V4] Signed-off-by: Anna Schumaker --- fs/nfs/delegation.c | 6 +++ fs/nfs/dir.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-- fs/nfs/inode.c | 1 + include/linux/nfs_fs.h | 26 +++--------- 4 files changed, 115 insertions(+), 23 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 4a841071d8a7..d856326836a2 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -42,6 +42,8 @@ static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation) if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; atomic_long_dec(&nfs_active_delegations); + if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + nfs_clear_verifier_delegated(delegation->inode); } } @@ -276,6 +278,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) ret = delegation; spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(&nfsi->vfs_inode); out: return ret; } @@ -689,6 +693,8 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode) ret = delegation; } spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(inode); } out: rcu_read_unlock(); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b4e7558e42ab..193d6fb363b7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -986,14 +986,113 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, * full lookup on all child dentries of 'dir' whenever a change occurs * on the server that might have invalidated our dcache. * + * Note that we reserve bit '0' as a tag to let us know when a dentry + * was revalidated while holding a delegation on its inode. + * * The caller should be holding dir->i_lock */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute++; + NFS_I(dir)->cache_change_attribute += 2; } EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); +/** + * nfs_verify_change_attribute - Detects NFS remote directory changes + * @dir: pointer to parent directory inode + * @verf: previously saved change attribute + * + * Return "false" if the verifiers doesn't match the change attribute. + * This would usually indicate that the directory contents have changed on + * the server, and that any dentries need revalidating. + */ +static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf) +{ + return (verf & ~1UL) == nfs_save_change_attribute(dir); +} + +static void nfs_set_verifier_delegated(unsigned long *verf) +{ + *verf |= 1UL; +} + +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs_unset_verifier_delegated(unsigned long *verf) +{ + *verf &= ~1UL; +} +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ + +static bool nfs_test_verifier_delegated(unsigned long verf) +{ + return verf & 1; +} + +static bool nfs_verifier_is_delegated(struct dentry *dentry) +{ + return nfs_test_verifier_delegated(dentry->d_time); +} + +static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) +{ + struct inode *inode = d_inode(dentry); + + if (!nfs_verifier_is_delegated(dentry) && + !nfs_verify_change_attribute(d_inode(dentry->d_parent), verf)) + goto out; + if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + nfs_set_verifier_delegated(&verf); +out: + dentry->d_time = verf; +} + +/** + * nfs_set_verifier - save a parent directory verifier in the dentry + * @dentry: pointer to dentry + * @verf: verifier to save + * + * Saves the parent directory verifier in @dentry. If the inode has + * a delegation, we also tag the dentry as having been revalidated + * while holding a delegation so that we know we don't have to + * look it up again after a directory change. + */ +void nfs_set_verifier(struct dentry *dentry, unsigned long verf) +{ + + spin_lock(&dentry->d_lock); + nfs_set_verifier_locked(dentry, verf); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL_GPL(nfs_set_verifier); + +#if IS_ENABLED(CONFIG_NFS_V4) +/** + * nfs_clear_verifier_delegated - clear the dir verifier delegation tag + * @inode: pointer to inode + * + * Iterates through the dentries in the inode alias list and clears + * the tag used to indicate that the dentry has been revalidated + * while holding a delegation. + * This function is intended for use when the delegation is being + * returned or revoked. + */ +void nfs_clear_verifier_delegated(struct inode *inode) +{ + struct dentry *alias; + + if (!inode) + return; + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + nfs_unset_verifier_delegated(&alias->d_time); + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ + /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy @@ -1235,7 +1334,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, goto out_bad; } - if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ @@ -1675,7 +1774,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, if (inode == NULL) goto full_reval; - if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1309e6f47f3d..11bf15800ac9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2114,6 +2114,7 @@ static void init_once(void *foo) init_rwsem(&nfsi->rmdir_sem); mutex_init(&nfsi->commit_mutex); nfs4_init_once(nfsi); + nfsi->cache_change_attribute = 0; } static int __init nfs_init_inodecache(void) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a5f8f03ecd59..5d5b91e54f73 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -337,35 +337,17 @@ static inline int nfs_server_capable(struct inode *inode, int cap) return NFS_SERVER(inode)->caps & cap; } -static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf) -{ - dentry->d_time = verf; -} - /** * nfs_save_change_attribute - Returns the inode attribute change cookie * @dir - pointer to parent directory inode - * The "change attribute" is updated every time we finish an operation - * that will result in a metadata change on the server. + * The "cache change attribute" is updated when we need to revalidate + * our dentry cache after a directory was seen to change on the server. */ static inline unsigned long nfs_save_change_attribute(struct inode *dir) { return NFS_I(dir)->cache_change_attribute; } -/** - * nfs_verify_change_attribute - Detects NFS remote directory changes - * @dir - pointer to parent directory inode - * @chattr - previously saved change attribute - * Return "false" if the verifiers doesn't match the change attribute. - * This would usually indicate that the directory contents have changed on - * the server, and that any dentries need revalidating. - */ -static inline int nfs_verify_change_attribute(struct inode *dir, unsigned long chattr) -{ - return chattr == NFS_I(dir)->cache_change_attribute; -} - /* * linux/fs/nfs/inode.c */ @@ -495,6 +477,10 @@ extern const struct file_operations nfs_dir_operations; extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); +extern void nfs_set_verifier(struct dentry * dentry, unsigned long verf); +#if IS_ENABLED(CONFIG_NFS_V4) +extern void nfs_clear_verifier_delegated(struct inode *inode); +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label); -- cgit v1.2.3 From 1ecaabed4e4a0d1027eadd54eb0e179350a79f99 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 6 Feb 2020 11:47:08 +0100 Subject: selftests: KVM: Replace get_{gdt,idt}_base() by get_{gdt,idt}() get_gdt_base() and get_idt_base() only return the base address of the descriptor tables. Soon we will need to get the size as well. Change the prototype of those functions so that they return the whole desc_ptr struct instead of the address field. Signed-off-by: Eric Auger Reviewed-by: Vitaly Kuznetsov Reviewed-by: Miaohe Lin Reviewed-by: Wei Huang Reviewed-by: Krish Sadhukhan Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 8 ++++---- tools/testing/selftests/kvm/lib/x86_64/vmx.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index aa6451b3f740..6f7fffaea2e8 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -220,20 +220,20 @@ static inline void set_cr4(uint64_t val) __asm__ __volatile__("mov %0, %%cr4" : : "r" (val) : "memory"); } -static inline uint64_t get_gdt_base(void) +static inline struct desc_ptr get_gdt(void) { struct desc_ptr gdt; __asm__ __volatile__("sgdt %[gdt]" : /* output */ [gdt]"=m"(gdt)); - return gdt.address; + return gdt; } -static inline uint64_t get_idt_base(void) +static inline struct desc_ptr get_idt(void) { struct desc_ptr idt; __asm__ __volatile__("sidt %[idt]" : /* output */ [idt]"=m"(idt)); - return idt.address; + return idt; } #define SET_XMM(__var, __xmm) \ diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c index 85064baf5e97..7aaa99ca4dbc 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c @@ -288,9 +288,9 @@ static inline void init_vmcs_host_state(void) vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE)); vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE)); vmwrite(HOST_TR_BASE, - get_desc64_base((struct desc64 *)(get_gdt_base() + get_tr()))); - vmwrite(HOST_GDTR_BASE, get_gdt_base()); - vmwrite(HOST_IDTR_BASE, get_idt_base()); + get_desc64_base((struct desc64 *)(get_gdt().address + get_tr()))); + vmwrite(HOST_GDTR_BASE, get_gdt().address); + vmwrite(HOST_IDTR_BASE, get_idt().address); vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP)); vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP)); } -- cgit v1.2.3 From 20ba262f8631aadefa87921481fe569ecc387f20 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 6 Feb 2020 11:47:09 +0100 Subject: selftests: KVM: AMD Nested test infrastructure Add the basic infrastructure needed to test AMD nested SVM. This is largely copied from the KVM unit test infrastructure. Signed-off-by: Eric Auger Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 2 +- .../selftests/kvm/include/x86_64/processor.h | 20 ++ tools/testing/selftests/kvm/include/x86_64/svm.h | 297 +++++++++++++++++++++ .../selftests/kvm/include/x86_64/svm_util.h | 38 +++ tools/testing/selftests/kvm/lib/x86_64/svm.c | 161 +++++++++++ 5 files changed, 517 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/kvm/include/x86_64/svm.h create mode 100644 tools/testing/selftests/kvm/include/x86_64/svm_util.h create mode 100644 tools/testing/selftests/kvm/lib/x86_64/svm.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 67abc1dd50ee..fb2fa62d7dd5 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -8,7 +8,7 @@ KSFT_KHDR_INSTALL := 1 UNAME_M := $(shell uname -m) LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c -LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/ucall.c +LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 6f7fffaea2e8..12475047869f 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -56,6 +56,26 @@ enum x86_register { R15, }; +/* General Registers in 64-Bit Mode */ +struct gpr64_regs { + u64 rax; + u64 rcx; + u64 rdx; + u64 rbx; + u64 rsp; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + struct desc64 { uint16_t limit0; uint16_t base0; diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h new file mode 100644 index 000000000000..f4ea2355dbc2 --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/svm.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * tools/testing/selftests/kvm/include/x86_64/svm.h + * This is a copy of arch/x86/include/asm/svm.h + * + */ + +#ifndef SELFTEST_KVM_SVM_H +#define SELFTEST_KVM_SVM_H + +enum { + INTERCEPT_INTR, + INTERCEPT_NMI, + INTERCEPT_SMI, + INTERCEPT_INIT, + INTERCEPT_VINTR, + INTERCEPT_SELECTIVE_CR0, + INTERCEPT_STORE_IDTR, + INTERCEPT_STORE_GDTR, + INTERCEPT_STORE_LDTR, + INTERCEPT_STORE_TR, + INTERCEPT_LOAD_IDTR, + INTERCEPT_LOAD_GDTR, + INTERCEPT_LOAD_LDTR, + INTERCEPT_LOAD_TR, + INTERCEPT_RDTSC, + INTERCEPT_RDPMC, + INTERCEPT_PUSHF, + INTERCEPT_POPF, + INTERCEPT_CPUID, + INTERCEPT_RSM, + INTERCEPT_IRET, + INTERCEPT_INTn, + INTERCEPT_INVD, + INTERCEPT_PAUSE, + INTERCEPT_HLT, + INTERCEPT_INVLPG, + INTERCEPT_INVLPGA, + INTERCEPT_IOIO_PROT, + INTERCEPT_MSR_PROT, + INTERCEPT_TASK_SWITCH, + INTERCEPT_FERR_FREEZE, + INTERCEPT_SHUTDOWN, + INTERCEPT_VMRUN, + INTERCEPT_VMMCALL, + INTERCEPT_VMLOAD, + INTERCEPT_VMSAVE, + INTERCEPT_STGI, + INTERCEPT_CLGI, + INTERCEPT_SKINIT, + INTERCEPT_RDTSCP, + INTERCEPT_ICEBP, + INTERCEPT_WBINVD, + INTERCEPT_MONITOR, + INTERCEPT_MWAIT, + INTERCEPT_MWAIT_COND, + INTERCEPT_XSETBV, + INTERCEPT_RDPRU, +}; + + +struct __attribute__ ((__packed__)) vmcb_control_area { + u32 intercept_cr; + u32 intercept_dr; + u32 intercept_exceptions; + u64 intercept; + u8 reserved_1[40]; + u16 pause_filter_thresh; + u16 pause_filter_count; + u64 iopm_base_pa; + u64 msrpm_base_pa; + u64 tsc_offset; + u32 asid; + u8 tlb_ctl; + u8 reserved_2[3]; + u32 int_ctl; + u32 int_vector; + u32 int_state; + u8 reserved_3[4]; + u32 exit_code; + u32 exit_code_hi; + u64 exit_info_1; + u64 exit_info_2; + u32 exit_int_info; + u32 exit_int_info_err; + u64 nested_ctl; + u64 avic_vapic_bar; + u8 reserved_4[8]; + u32 event_inj; + u32 event_inj_err; + u64 nested_cr3; + u64 virt_ext; + u32 clean; + u32 reserved_5; + u64 next_rip; + u8 insn_len; + u8 insn_bytes[15]; + u64 avic_backing_page; /* Offset 0xe0 */ + u8 reserved_6[8]; /* Offset 0xe8 */ + u64 avic_logical_id; /* Offset 0xf0 */ + u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[768]; +}; + + +#define TLB_CONTROL_DO_NOTHING 0 +#define TLB_CONTROL_FLUSH_ALL_ASID 1 +#define TLB_CONTROL_FLUSH_ASID 3 +#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 + +#define V_TPR_MASK 0x0f + +#define V_IRQ_SHIFT 8 +#define V_IRQ_MASK (1 << V_IRQ_SHIFT) + +#define V_GIF_SHIFT 9 +#define V_GIF_MASK (1 << V_GIF_SHIFT) + +#define V_INTR_PRIO_SHIFT 16 +#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT) + +#define V_IGN_TPR_SHIFT 20 +#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) + +#define V_INTR_MASKING_SHIFT 24 +#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) + +#define V_GIF_ENABLE_SHIFT 25 +#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT) + +#define AVIC_ENABLE_SHIFT 31 +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) + +#define LBR_CTL_ENABLE_MASK BIT_ULL(0) +#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) + +#define SVM_INTERRUPT_SHADOW_MASK 1 + +#define SVM_IOIO_STR_SHIFT 2 +#define SVM_IOIO_REP_SHIFT 3 +#define SVM_IOIO_SIZE_SHIFT 4 +#define SVM_IOIO_ASIZE_SHIFT 7 + +#define SVM_IOIO_TYPE_MASK 1 +#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT) +#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT) +#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) +#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) + +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + +#define SVM_NESTED_CTL_NP_ENABLE BIT(0) +#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) + +struct __attribute__ ((__packed__)) vmcb_seg { + u16 selector; + u16 attrib; + u32 limit; + u64 base; +}; + +struct __attribute__ ((__packed__)) vmcb_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u8 reserved_5[24]; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_6[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; +}; + +struct __attribute__ ((__packed__)) vmcb { + struct vmcb_control_area control; + struct vmcb_save_area save; +}; + +#define SVM_CPUID_FUNC 0x8000000a + +#define SVM_VM_CR_SVM_DISABLE 4 + +#define SVM_SELECTOR_S_SHIFT 4 +#define SVM_SELECTOR_DPL_SHIFT 5 +#define SVM_SELECTOR_P_SHIFT 7 +#define SVM_SELECTOR_AVL_SHIFT 8 +#define SVM_SELECTOR_L_SHIFT 9 +#define SVM_SELECTOR_DB_SHIFT 10 +#define SVM_SELECTOR_G_SHIFT 11 + +#define SVM_SELECTOR_TYPE_MASK (0xf) +#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT) +#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT) +#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT) +#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT) +#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT) +#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT) +#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT) + +#define SVM_SELECTOR_WRITE_MASK (1 << 1) +#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK +#define SVM_SELECTOR_CODE_MASK (1 << 3) + +#define INTERCEPT_CR0_READ 0 +#define INTERCEPT_CR3_READ 3 +#define INTERCEPT_CR4_READ 4 +#define INTERCEPT_CR8_READ 8 +#define INTERCEPT_CR0_WRITE (16 + 0) +#define INTERCEPT_CR3_WRITE (16 + 3) +#define INTERCEPT_CR4_WRITE (16 + 4) +#define INTERCEPT_CR8_WRITE (16 + 8) + +#define INTERCEPT_DR0_READ 0 +#define INTERCEPT_DR1_READ 1 +#define INTERCEPT_DR2_READ 2 +#define INTERCEPT_DR3_READ 3 +#define INTERCEPT_DR4_READ 4 +#define INTERCEPT_DR5_READ 5 +#define INTERCEPT_DR6_READ 6 +#define INTERCEPT_DR7_READ 7 +#define INTERCEPT_DR0_WRITE (16 + 0) +#define INTERCEPT_DR1_WRITE (16 + 1) +#define INTERCEPT_DR2_WRITE (16 + 2) +#define INTERCEPT_DR3_WRITE (16 + 3) +#define INTERCEPT_DR4_WRITE (16 + 4) +#define INTERCEPT_DR5_WRITE (16 + 5) +#define INTERCEPT_DR6_WRITE (16 + 6) +#define INTERCEPT_DR7_WRITE (16 + 7) + +#define SVM_EVTINJ_VEC_MASK 0xff + +#define SVM_EVTINJ_TYPE_SHIFT 8 +#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT) +#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT) + +#define SVM_EVTINJ_VALID (1 << 31) +#define SVM_EVTINJ_VALID_ERR (1 << 11) + +#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK +#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK + +#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR +#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI +#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT +#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT + +#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID +#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR + +#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 +#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 +#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 + +#define SVM_EXITINFO_REG_MASK 0x0F + +#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP) + +#endif /* SELFTEST_KVM_SVM_H */ diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h new file mode 100644 index 000000000000..cd037917fece --- /dev/null +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * tools/testing/selftests/kvm/include/x86_64/svm_utils.h + * Header for nested SVM testing + * + * Copyright (C) 2020, Red Hat, Inc. + */ + +#ifndef SELFTEST_KVM_SVM_UTILS_H +#define SELFTEST_KVM_SVM_UTILS_H + +#include +#include "svm.h" +#include "processor.h" + +#define CPUID_SVM_BIT 2 +#define CPUID_SVM BIT_ULL(CPUID_SVM_BIT) + +#define SVM_EXIT_VMMCALL 0x081 + +struct svm_test_data { + /* VMCB */ + struct vmcb *vmcb; /* gva */ + void *vmcb_hva; + uint64_t vmcb_gpa; + + /* host state-save area */ + struct vmcb_save_area *save_area; /* gva */ + void *save_area_hva; + uint64_t save_area_gpa; +}; + +struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva); +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp); +void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa); +void nested_svm_check_supported(void); + +#endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c new file mode 100644 index 000000000000..6e05a8fc3fe0 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * tools/testing/selftests/kvm/lib/x86_64/svm.c + * Helpers used for nested SVM testing + * Largely inspired from KVM unit test svm.c + * + * Copyright (C) 2020, Red Hat, Inc. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "../kvm_util_internal.h" +#include "processor.h" +#include "svm_util.h" + +struct gpr64_regs guest_regs; +u64 rflags; + +/* Allocate memory regions for nested SVM tests. + * + * Input Args: + * vm - The VM to allocate guest-virtual addresses in. + * + * Output Args: + * p_svm_gva - The guest virtual address for the struct svm_test_data. + * + * Return: + * Pointer to structure with the addresses of the SVM areas. + */ +struct svm_test_data * +vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva) +{ + vm_vaddr_t svm_gva = vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + struct svm_test_data *svm = addr_gva2hva(vm, svm_gva); + + svm->vmcb = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb); + svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb); + + svm->save_area = (void *)vm_vaddr_alloc(vm, getpagesize(), + 0x10000, 0, 0); + svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area); + svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area); + + *p_svm_gva = svm_gva; + return svm; +} + +static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, + u64 base, u32 limit, u32 attr) +{ + seg->selector = selector; + seg->attrib = attr; + seg->limit = limit; + seg->base = base; +} + +void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp) +{ + struct vmcb *vmcb = svm->vmcb; + uint64_t vmcb_gpa = svm->vmcb_gpa; + struct vmcb_save_area *save = &vmcb->save; + struct vmcb_control_area *ctrl = &vmcb->control; + u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK + | SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK; + u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK + | SVM_SELECTOR_L_MASK | SVM_SELECTOR_G_MASK; + uint64_t efer; + + efer = rdmsr(MSR_EFER); + wrmsr(MSR_EFER, efer | EFER_SVME); + wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa); + + memset(vmcb, 0, sizeof(*vmcb)); + asm volatile ("vmsave\n\t" : : "a" (vmcb_gpa) : "memory"); + vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr); + vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->ds, get_ds(), 0, -1U, data_seg_attr); + vmcb_set_seg(&save->gdtr, 0, get_gdt().address, get_gdt().size, 0); + vmcb_set_seg(&save->idtr, 0, get_idt().address, get_idt().size, 0); + + ctrl->asid = 1; + save->cpl = 0; + save->efer = rdmsr(MSR_EFER); + asm volatile ("mov %%cr4, %0" : "=r"(save->cr4) : : "memory"); + asm volatile ("mov %%cr3, %0" : "=r"(save->cr3) : : "memory"); + asm volatile ("mov %%cr0, %0" : "=r"(save->cr0) : : "memory"); + asm volatile ("mov %%dr7, %0" : "=r"(save->dr7) : : "memory"); + asm volatile ("mov %%dr6, %0" : "=r"(save->dr6) : : "memory"); + asm volatile ("mov %%cr2, %0" : "=r"(save->cr2) : : "memory"); + save->g_pat = rdmsr(MSR_IA32_CR_PAT); + save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR); + ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | + (1ULL << INTERCEPT_VMMCALL); + + vmcb->save.rip = (u64)guest_rip; + vmcb->save.rsp = (u64)guest_rsp; + guest_regs.rdi = (u64)svm; +} + +/* + * save/restore 64-bit general registers except rax, rip, rsp + * which are directly handed through the VMCB guest processor state + */ +#define SAVE_GPR_C \ + "xchg %%rbx, guest_regs+0x20\n\t" \ + "xchg %%rcx, guest_regs+0x10\n\t" \ + "xchg %%rdx, guest_regs+0x18\n\t" \ + "xchg %%rbp, guest_regs+0x30\n\t" \ + "xchg %%rsi, guest_regs+0x38\n\t" \ + "xchg %%rdi, guest_regs+0x40\n\t" \ + "xchg %%r8, guest_regs+0x48\n\t" \ + "xchg %%r9, guest_regs+0x50\n\t" \ + "xchg %%r10, guest_regs+0x58\n\t" \ + "xchg %%r11, guest_regs+0x60\n\t" \ + "xchg %%r12, guest_regs+0x68\n\t" \ + "xchg %%r13, guest_regs+0x70\n\t" \ + "xchg %%r14, guest_regs+0x78\n\t" \ + "xchg %%r15, guest_regs+0x80\n\t" + +#define LOAD_GPR_C SAVE_GPR_C + +/* + * selftests do not use interrupts so we dropped clgi/sti/cli/stgi + * for now. registers involved in LOAD/SAVE_GPR_C are eventually + * unmodified so they do not need to be in the clobber list. + */ +void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa) +{ + asm volatile ( + "vmload\n\t" + "mov rflags, %%r15\n\t" // rflags + "mov %%r15, 0x170(%[vmcb])\n\t" + "mov guest_regs, %%r15\n\t" // rax + "mov %%r15, 0x1f8(%[vmcb])\n\t" + LOAD_GPR_C + "vmrun\n\t" + SAVE_GPR_C + "mov 0x170(%[vmcb]), %%r15\n\t" // rflags + "mov %%r15, rflags\n\t" + "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax + "mov %%r15, guest_regs\n\t" + "vmsave\n\t" + : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa) + : "r15", "memory"); +} + +void nested_svm_check_supported(void) +{ + struct kvm_cpuid_entry2 *entry = + kvm_get_supported_cpuid_entry(0x80000001); + + if (!(entry->ecx & CPUID_SVM)) { + fprintf(stderr, "nested SVM not enabled, skipping test\n"); + exit(KSFT_SKIP); + } +} + -- cgit v1.2.3 From 1ea2cc0cd7c676668841f63c915fb55244f0268c Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 6 Feb 2020 11:47:10 +0100 Subject: selftests: KVM: SVM: Add vmcall test L2 guest calls vmcall and L1 checks the exit status does correspond. Signed-off-by: Eric Auger Reviewed-by: Vitaly Kuznetsov Reviewed-by: Miaohe Lin Tested-by: Wei Huang Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 1 + .../testing/selftests/kvm/x86_64/svm_vmcall_test.c | 79 ++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index fb2fa62d7dd5..d91c53b726e6 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -26,6 +26,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test +TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c new file mode 100644 index 000000000000..e280f68f6365 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * svm_vmcall_test + * + * Copyright (C) 2020, Red Hat, Inc. + * + * Nested SVM testing: VMCALL + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + +#define VCPU_ID 5 + +static struct kvm_vm *vm; + +static void l2_guest_code(struct svm_test_data *svm) +{ + __asm__ __volatile__("vmcall"); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + #define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + + /* Prepare for L2 execution. */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(vmcb, svm->vmcb_gpa); + + GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t svm_gva; + + nested_svm_check_supported(); + + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); + vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid()); + + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vm, VCPU_ID, 1, svm_gva); + + for (;;) { + volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_ASSERT(false, "%s", + (const char *)uc.args[0]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_ASSERT(false, + "Unknown ucall 0x%x.", uc.cmd); + } + } +done: + kvm_vm_free(vm); + return 0; +} -- cgit v1.2.3 From 20796447a1abee9afd0c136d5c60651bfbaf46b8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 7 Feb 2020 23:27:51 +0800 Subject: KVM: x86: remove duplicated KVM_REQ_EVENT request The KVM_REQ_EVENT request is already made in kvm_set_rflags(). We should not make it again. Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4d3310df1758..5e762c8afcce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8953,7 +8953,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); -- cgit v1.2.3 From 331ca0f89fc206f06b341f7fe037d7d8662b1b9f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 7 Feb 2020 23:22:07 +0800 Subject: KVM: apic: reuse smp_wmb() in kvm_make_request() kvm_make_request() provides smp_wmb() so pending_events changes are guaranteed to be visible. Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Kuznetsov Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index eafc631d305c..afcd30d44cbb 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1080,9 +1080,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; /* assumes that there are only KVM_APIC_INIT/SIPI */ apic->pending_events = (1UL << KVM_APIC_INIT); - /* make sure pending_events is visible before sending - * the request */ - smp_wmb(); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } -- cgit v1.2.3 From 7a02674d154d38da33517855b6d1d4cfc27a9a04 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 6 Feb 2020 14:14:34 -0800 Subject: KVM: x86/mmu: Avoid retpoline on ->page_fault() with TDP Wrap calls to ->page_fault() with a small shim to directly invoke the TDP fault handler when the kernel is using retpolines and TDP is being used. Single out the TDP fault handler and annotate the TDP path as likely to coerce the compiler into preferring it over the indirect function call. Rename tdp_page_fault() to kvm_tdp_page_fault(), as it's exposed outside of mmu.c to allow inlining the shim. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 13 +++++++++++++ arch/x86/kvm/mmu/mmu.c | 11 +++++------ arch/x86/kvm/x86.c | 2 +- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index d55674f44a18..a647601c9e1c 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -102,6 +102,19 @@ static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) kvm_get_active_pcid(vcpu)); } +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault); + +static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + u32 err, bool prefault) +{ +#ifdef CONFIG_RETPOLINE + if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault)) + return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault); +#endif + return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault); +} + /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7011a4e54866..87e9ba27ada1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4219,8 +4219,8 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault) +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, + bool prefault) { int max_level; @@ -4925,7 +4925,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) return; context->mmu_role.as_u64 = new_role.as_u64; - context->page_fault = tdp_page_fault; + context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; @@ -5436,9 +5436,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, } if (r == RET_PF_INVALID) { - r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, - lower_32_bits(error_code), - false); + r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, + lower_32_bits(error_code), false); WARN_ON(r == RET_PF_INVALID); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5e762c8afcce..fd9e2f633d14 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10192,7 +10192,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) return; - vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) -- cgit v1.2.3 From ffdbd50dca67b1f12d6f531a0eaf2028d793e54f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 7 Feb 2020 23:22:45 +0800 Subject: KVM: nVMX: Fix some comment typos and coding style Fix some typos in the comments. Also fix coding style. [Sean Christopherson rewrites the comment of write_fault_to_shadow_pgtable field in struct kvm_vcpu_arch.] Signed-off-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 16 +++++++++++++--- arch/x86/kvm/vmx/nested.c | 5 +++-- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4dffbc10d3f8..40a0c0fd95ca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -781,9 +781,19 @@ struct kvm_vcpu_arch { u64 msr_kvm_poll_control; /* - * Indicate whether the access faults on its page table in guest - * which is set when fix page fault and used to detect unhandeable - * instruction. + * Indicates the guest is trying to write a gfn that contains one or + * more of the PTEs used to translate the write itself, i.e. the access + * is changing its own translation in the guest page tables. KVM exits + * to userspace if emulation of the faulting instruction fails and this + * flag is set, as KVM cannot make forward progress. + * + * If emulation fails for a write to guest page tables, KVM unprotects + * (zaps) the shadow page for the target gfn and resumes the guest to + * retry the non-emulatable instruction (on hardware). Unprotecting the + * gfn doesn't allow forward progress for a self-changing access because + * doing so also zaps the translation for the gfn, i.e. retrying the + * instruction will hit a !PRESENT fault, which results in a new shadow + * page and sends KVM back to square one. */ bool write_fault_to_shadow_pgtable; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1586aaae3a6f..3589cd3c0fcc 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -544,7 +544,8 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, } } -static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { +static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) +{ int msr; for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { @@ -1981,7 +1982,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, } /* - * Clean fields data can't de used on VMLAUNCH and when we switch + * Clean fields data can't be used on VMLAUNCH and when we switch * between different L2 guests as KVM keeps a single VMCS12 per L1. */ if (from_launch || evmcs_gpa_changed) -- cgit v1.2.3 From 148d735eb55d32848c3379e460ce365f2c1cbe4b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 7 Feb 2020 09:37:41 -0800 Subject: KVM: nVMX: Use correct root level for nested EPT shadow page tables Hardcode the EPT page-walk level for L2 to be 4 levels, as KVM's MMU currently also hardcodes the page walk level for nested EPT to be 4 levels. The L2 guest is all but guaranteed to soft hang on its first instruction when L1 is using EPT, as KVM will construct 4-level page tables and then tell hardware to use 5-level page tables. Fixes: 855feb673640 ("KVM: MMU: Add 5 level EPT & Shadow page table support.") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d625b4b0e7b4..3be25ecae145 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2947,6 +2947,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int get_ept_level(struct kvm_vcpu *vcpu) { + /* Nested EPT currently only supports 4-level walks. */ + if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) + return 4; if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; -- cgit v1.2.3 From f6ab0107a4942dbf9a5cf0cca3f37e184870a360 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 7 Feb 2020 09:37:42 -0800 Subject: KVM: x86/mmu: Fix struct guest_walker arrays for 5-level paging Define PT_MAX_FULL_LEVELS as PT64_ROOT_MAX_LEVEL, i.e. 5, to fix shadow paging for 5-level guest page tables. PT_MAX_FULL_LEVELS is used to size the arrays that track guest pages table information, i.e. using a "max levels" of 4 causes KVM to access garbage beyond the end of an array when querying state for level 5 entries. E.g. FNAME(gpte_changed) will read garbage and most likely return %true for a level 5 entry, soft-hanging the guest because FNAME(fetch) will restart the guest instead of creating SPTEs because it thinks the guest PTE has changed. Note, KVM doesn't yet support 5-level nested EPT, so PT_MAX_FULL_LEVELS gets to stay "4" for the PTTYPE_EPT case. Fixes: 855feb673640 ("KVM: MMU: Add 5 level EPT & Shadow page table support.") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 4e1ef0473663..e4c8a4cbf407 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -33,7 +33,7 @@ #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 - #define PT_MAX_FULL_LEVELS 4 + #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #define CMPXCHG cmpxchg #else #define CMPXCHG cmpxchg64 -- cgit v1.2.3 From ff479025349cef3106e165a761281851fd018282 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 7 Feb 2020 15:27:13 +0100 Subject: selftests: KVM: Remove unused x86_register enum x86_register enum is not used, let's remove it. Signed-off-by: Eric Auger Suggested-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- .../testing/selftests/kvm/include/x86_64/processor.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 12475047869f..7428513a4c68 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -36,26 +36,6 @@ #define X86_CR4_SMAP (1ul << 21) #define X86_CR4_PKE (1ul << 22) -/* The enum values match the intruction encoding of each register */ -enum x86_register { - RAX = 0, - RCX, - RDX, - RBX, - RSP, - RBP, - RSI, - RDI, - R8, - R9, - R10, - R11, - R12, - R13, - R14, - R15, -}; - /* General Registers in 64-Bit Mode */ struct gpr64_regs { u64 rax; -- cgit v1.2.3 From 9446e6fce0ab9dfd44b96f630b4e3a0a0ab879fd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 12 Feb 2020 13:27:10 +0100 Subject: KVM: x86: fix WARN_ON check of an unsigned less than zero The check cpu->hv_clock.system_time < 0 is redundant since system_time is a u64 and hence can never be less than zero. But what was actually meant is to check that the result is positive, since kernel_ns and v->kvm->arch.kvmclock_offset are both s64. Reported-by: Colin King Suggested-by: Sean Christopherson Addresses-Coverity: ("Macro compares unsigned to 0") Reviewed-by: Miaohe Lin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fd9e2f633d14..fb5d64ebc35d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2444,7 +2444,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - WARN_ON(vcpu->hv_clock.system_time < 0); + WARN_ON((s64)vcpu->hv_clock.system_time < 0); /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; -- cgit v1.2.3 From 7bd460fc1dfa7d82d99493fc5d7b5f9c7b679af4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:39 +0100 Subject: docs: kvm: add arm/pvtime.rst to index.rst Add this file to a new kvm/arm index.rst, in order for it to be shown as part of the virt book. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/arm/index.rst | 10 ++++++++++ Documentation/virt/kvm/index.rst | 2 ++ 2 files changed, 12 insertions(+) create mode 100644 Documentation/virt/kvm/arm/index.rst diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst new file mode 100644 index 000000000000..e039d9b1e076 --- /dev/null +++ b/Documentation/virt/kvm/arm/index.rst @@ -0,0 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=== +ARM +=== + +.. toctree:: + :maxdepth: 2 + + pvtime diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index ada224a511fe..488c6370a447 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -10,3 +10,5 @@ KVM amd-memory-encryption cpuid vcpu-requests + + arm/index -- cgit v1.2.3 From 7d94ab169b8f047ad61ff6ea8e3e7602d0516623 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:40 +0100 Subject: docs: virt: convert UML documentation to ReST Despite being an old document, it contains lots of information that could still be useful. The document has a nice style with makes easy to convert to ReST. So, let's convert it to ReST. This patch does: - Use proper markups for titles; - Mark and proper indent literal blocks; - don't use an 'o' character for lists; - other minor changes required for the doc to be parsed. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/index.rst | 1 + Documentation/virt/uml/UserModeLinux-HOWTO.txt | 4589 ------------------------ Documentation/virt/uml/user_mode_linux.rst | 4460 +++++++++++++++++++++++ 3 files changed, 4461 insertions(+), 4589 deletions(-) delete mode 100644 Documentation/virt/uml/UserModeLinux-HOWTO.txt create mode 100644 Documentation/virt/uml/user_mode_linux.rst diff --git a/Documentation/virt/index.rst b/Documentation/virt/index.rst index 062ffb527043..0a8f7fda64ad 100644 --- a/Documentation/virt/index.rst +++ b/Documentation/virt/index.rst @@ -8,6 +8,7 @@ Linux Virtualization Support :maxdepth: 2 kvm/index + uml/user_mode_linux paravirt_ops .. only:: html and subproject diff --git a/Documentation/virt/uml/UserModeLinux-HOWTO.txt b/Documentation/virt/uml/UserModeLinux-HOWTO.txt deleted file mode 100644 index 87b80f589e1c..000000000000 --- a/Documentation/virt/uml/UserModeLinux-HOWTO.txt +++ /dev/null @@ -1,4589 +0,0 @@ - User Mode Linux HOWTO - User Mode Linux Core Team - Mon Nov 18 14:16:16 EST 2002 - - This document describes the use and abuse of Jeff Dike's User Mode - Linux: a port of the Linux kernel as a normal Intel Linux process. - ______________________________________________________________________ - - Table of Contents - - 1. Introduction - - 1.1 How is User Mode Linux Different? - 1.2 Why Would I Want User Mode Linux? - - 2. Compiling the kernel and modules - - 2.1 Compiling the kernel - 2.2 Compiling and installing kernel modules - 2.3 Compiling and installing uml_utilities - - 3. Running UML and logging in - - 3.1 Running UML - 3.2 Logging in - 3.3 Examples - - 4. UML on 2G/2G hosts - - 4.1 Introduction - 4.2 The problem - 4.3 The solution - - 5. Setting up serial lines and consoles - - 5.1 Specifying the device - 5.2 Specifying the channel - 5.3 Examples - - 6. Setting up the network - - 6.1 General setup - 6.2 Userspace daemons - 6.3 Specifying ethernet addresses - 6.4 UML interface setup - 6.5 Multicast - 6.6 TUN/TAP with the uml_net helper - 6.7 TUN/TAP with a preconfigured tap device - 6.8 Ethertap - 6.9 The switch daemon - 6.10 Slip - 6.11 Slirp - 6.12 pcap - 6.13 Setting up the host yourself - - 7. Sharing Filesystems between Virtual Machines - - 7.1 A warning - 7.2 Using layered block devices - 7.3 Note! - 7.4 Another warning - 7.5 uml_moo : Merging a COW file with its backing file - - 8. Creating filesystems - - 8.1 Create the filesystem file - 8.2 Assign the file to a UML device - 8.3 Creating and mounting the filesystem - - 9. Host file access - - 9.1 Using hostfs - 9.2 hostfs as the root filesystem - 9.3 Building hostfs - - 10. The Management Console - 10.1 version - 10.2 halt and reboot - 10.3 config - 10.4 remove - 10.5 sysrq - 10.6 help - 10.7 cad - 10.8 stop - 10.9 go - - 11. Kernel debugging - - 11.1 Starting the kernel under gdb - 11.2 Examining sleeping processes - 11.3 Running ddd on UML - 11.4 Debugging modules - 11.5 Attaching gdb to the kernel - 11.6 Using alternate debuggers - - 12. Kernel debugging examples - - 12.1 The case of the hung fsck - 12.2 Episode 2: The case of the hung fsck - - 13. What to do when UML doesn't work - - 13.1 Strange compilation errors when you build from source - 13.2 (obsolete) - 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem - 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' - 13.5 UML doesn't work when /tmp is an NFS filesystem - 13.6 UML hangs on boot when compiled with gprof support - 13.7 syslogd dies with a SIGTERM on startup - 13.8 TUN/TAP networking doesn't work on a 2.4 host - 13.9 You can network to the host but not to other machines on the net - 13.10 I have no root and I want to scream - 13.11 UML build conflict between ptrace.h and ucontext.h - 13.12 The UML BogoMips is exactly half the host's BogoMips - 13.13 When you run UML, it immediately segfaults - 13.14 xterms appear, then immediately disappear - 13.15 Any other panic, hang, or strange behavior - - 14. Diagnosing Problems - - 14.1 Case 1 : Normal kernel panics - 14.2 Case 2 : Tracing thread panics - 14.3 Case 3 : Tracing thread panics caused by other threads - 14.4 Case 4 : Hangs - - 15. Thanks - - 15.1 Code and Documentation - 15.2 Flushing out bugs - 15.3 Buglets and clean-ups - 15.4 Case Studies - 15.5 Other contributions - - - ______________________________________________________________________ - - 1. Introduction - - Welcome to User Mode Linux. It's going to be fun. - - - - 1.1. How is User Mode Linux Different? - - Normally, the Linux Kernel talks straight to your hardware (video - card, keyboard, hard drives, etc), and any programs which run ask the - kernel to operate the hardware, like so: - - - - +-----------+-----------+----+ - | Process 1 | Process 2 | ...| - +-----------+-----------+----+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - The User Mode Linux Kernel is different; instead of talking to the - hardware, it talks to a `real' Linux kernel (called the `host kernel' - from now on), like any other program. Programs can then run inside - User-Mode Linux as if they were running under a normal kernel, like - so: - - - - +----------------+ - | Process 2 | ...| - +-----------+----------------+ - | Process 1 | User-Mode Linux| - +----------------------------+ - | Linux Kernel | - +----------------------------+ - | Hardware | - +----------------------------+ - - - - - - 1.2. Why Would I Want User Mode Linux? - - - 1. If User Mode Linux crashes, your host kernel is still fine. - - 2. You can run a usermode kernel as a non-root user. - - 3. You can debug the User Mode Linux like any normal process. - - 4. You can run gprof (profiling) and gcov (coverage testing). - - 5. You can play with your kernel without breaking things. - - 6. You can use it as a sandbox for testing new apps. - - 7. You can try new development kernels safely. - - 8. You can run different distributions simultaneously. - - 9. It's extremely fun. - - - - - - 2. Compiling the kernel and modules - - - - - 2.1. Compiling the kernel - - - Compiling the user mode kernel is just like compiling any other - kernel. Let's go through the steps, using 2.4.0-prerelease (current - as of this writing) as an example: - - - 1. Download the latest UML patch from - - the download page - . - - - 3. Make a directory and unpack the kernel into it. - - - - host% - mkdir ~/uml - - - - - - - host% - cd ~/uml - - - - - - - host% - tar -xzvf linux-2.4.0-prerelease.tar.bz2 - - - - - - - 4. Apply the patch using - - - - host% - cd ~/uml/linux - - - - host% - bzcat uml-patch-2.4.0-prerelease.bz2 | patch -p1 - - - - - - - 5. Run your favorite config; `make xconfig ARCH=um' is the most - convenient. `make config ARCH=um' and 'make menuconfig ARCH=um' - will work as well. The defaults will give you a useful kernel. If - you want to change something, go ahead, it probably won't hurt - anything. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries - will not run. They will immediately segfault. See ``UML on 2G/2G - hosts'' for the scoop on running UML on your system. - - - - 6. Finish with `make linux ARCH=um': the result is a file called - `linux' in the top directory of your source tree. - - Make sure that you don't build this kernel in /usr/src/linux. On some - distributions, /usr/include/asm is a link into this pool. The user- - mode build changes the other end of that link, and things that include - stop compiling. - - The sources are also available from cvs at the project's cvs page, - which has directions on getting the sources. You can also browse the - CVS pool from there. - - If you get the CVS sources, you will have to check them out into an - empty directory. You will then have to copy each file into the - corresponding directory in the appropriate kernel pool. - - If you don't have the latest kernel pool, you can get the - corresponding user-mode sources with - - - host% cvs co -r v_2_3_x linux - - - - - where 'x' is the version in your pool. Note that you will not get the - bug fixes and enhancements that have gone into subsequent releases. - - - 2.2. Compiling and installing kernel modules - - UML modules are built in the same way as the native kernel (with the - exception of the 'ARCH=um' that you always need for UML): - - - host% make modules ARCH=um - - - - - Any modules that you want to load into this kernel need to be built in - the user-mode pool. Modules from the native kernel won't work. - - You can install them by using ftp or something to copy them into the - virtual machine and dropping them into /lib/modules/`uname -r`. - - You can also get the kernel build process to install them as follows: - - 1. with the kernel not booted, mount the root filesystem in the top - level of the kernel pool: - - - host% mount root_fs mnt -o loop - - - - - - - 2. run - - - host% - make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um - - - - - - - 3. unmount the filesystem - - - host% umount mnt - - - - - - - 4. boot the kernel on it - - - When the system is booted, you can use insmod as usual to get the - modules into the kernel. A number of things have been loaded into UML - as modules, especially filesystems and network protocols and filters, - so most symbols which need to be exported probably already are. - However, if you do find symbols that need exporting, let us - know, and - they'll be "taken care of". - - - - 2.3. Compiling and installing uml_utilities - - Many features of the UML kernel require a user-space helper program, - so a uml_utilities package is distributed separately from the kernel - patch which provides these helpers. Included within this is: - - o port-helper - Used by consoles which connect to xterms or ports - - o tunctl - Configuration tool to create and delete tap devices - - o uml_net - Setuid binary for automatic tap device configuration - - o uml_switch - User-space virtual switch required for daemon - transport - - The uml_utilities tree is compiled with: - - - host# - make && make install - - - - - Note that UML kernel patches may require a specific version of the - uml_utilities distribution. If you don't keep up with the mailing - lists, ensure that you have the latest release of uml_utilities if you - are experiencing problems with your UML kernel, particularly when - dealing with consoles or command-line switches to the helper programs - - - - - - - - - 3. Running UML and logging in - - - - 3.1. Running UML - - It runs on 2.2.15 or later, and all 2.4 kernels. - - - Booting UML is straightforward. Simply run 'linux': it will try to - mount the file `root_fs' in the current directory. You do not need to - run it as root. If your root filesystem is not named `root_fs', then - you need to put a `ubd0=root_fs_whatever' switch on the linux command - line. - - - You will need a filesystem to boot UML from. There are a number - available for download from here . There are also several tools - which can be - used to generate UML-compatible filesystem images from media. - The kernel will boot up and present you with a login prompt. - - - Note: If the host is configured with a 2G/2G address space split - rather than the usual 3G/1G split, then the packaged UML binaries will - not run. They will immediately segfault. See ``UML on 2G/2G hosts'' - for the scoop on running UML on your system. - - - - 3.2. Logging in - - - - The prepackaged filesystems have a root account with password 'root' - and a user account with password 'user'. The login banner will - generally tell you how to log in. So, you log in and you will find - yourself inside a little virtual machine. Our filesystems have a - variety of commands and utilities installed (and it is fairly easy to - add more), so you will have a lot of tools with which to poke around - the system. - - There are a couple of other ways to log in: - - o On a virtual console - - - - Each virtual console that is configured (i.e. the device exists in - /dev and /etc/inittab runs a getty on it) will come up in its own - xterm. If you get tired of the xterms, read ``Setting up serial - lines and consoles'' to see how to attach the consoles to - something else, like host ptys. - - - - o Over the serial line - - - In the boot output, find a line that looks like: - - - - serial line 0 assigned pty /dev/ptyp1 - - - - - Attach your favorite terminal program to the corresponding tty. I.e. - for minicom, the command would be - - - host% minicom -o -p /dev/ttyp1 - - - - - - - o Over the net - - - If the network is running, then you can telnet to the virtual - machine and log in to it. See ``Setting up the network'' to learn - about setting up a virtual network. - - When you're done using it, run halt, and the kernel will bring itself - down and the process will exit. - - - 3.3. Examples - - Here are some examples of UML in action: - - o A login session - - o A virtual network - - - - - - - - 4. UML on 2G/2G hosts - - - - - 4.1. Introduction - - - Most Linux machines are configured so that the kernel occupies the - upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and - processes use the lower 3G (0x00000000 - 0xbfffffff). However, some - machine are configured with a 2G/2G split, with the kernel occupying - the upper 2G (0x80000000 - 0xffffffff) and processes using the lower - 2G (0x00000000 - 0x7fffffff). - - - - - 4.2. The problem - - - The prebuilt UML binaries on this site will not run on 2G/2G hosts - because UML occupies the upper .5G of the 3G process address space - (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right - in the middle of the kernel address space, so UML won't even load - it - will immediately segfault. - - - - - 4.3. The solution - - - The fix for this is to rebuild UML from source after enabling - CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to - load itself in the top .5G of that smaller process address space, - where it will run fine. See ``Compiling the kernel and modules'' if - you need help building UML from source. - - - - - - - - - - - 5. Setting up serial lines and consoles - - - It is possible to attach UML serial lines and consoles to many types - of host I/O channels by specifying them on the command line. - - - You can attach them to host ptys, ttys, file descriptors, and ports. - This allows you to do things like - - o have a UML console appear on an unused host console, - - o hook two virtual machines together by having one attach to a pty - and having the other attach to the corresponding tty - - o make a virtual machine accessible from the net by attaching a - console to a port on the host. - - - The general format of the command line option is device=channel. - - - - 5.1. Specifying the device - - Devices are specified with "con" or "ssl" (console or serial line, - respectively), optionally with a device number if you are talking - about a specific device. - - - Using just "con" or "ssl" describes all of the consoles or serial - lines. If you want to talk about console #3 or serial line #10, they - would be "con3" and "ssl10", respectively. - - - A specific device name will override a less general "con=" or "ssl=". - So, for example, you can assign a pty to each of the serial lines - except for the first two like this: - - - ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 - - - - - The specificity of the device name is all that matters; order on the - command line is irrelevant. - - - - 5.2. Specifying the channel - - There are a number of different types of channels to attach a UML - device to, each with a different way of specifying exactly what to - attach to. - - o pseudo-terminals - device=pty pts terminals - device=pts - - - This will cause UML to allocate a free host pseudo-terminal for the - device. The terminal that it got will be announced in the boot - log. You access it by attaching a terminal program to the - corresponding tty: - - o screen /dev/pts/n - - o screen /dev/ttyxx - - o minicom -o -p /dev/ttyxx - minicom seems not able to handle pts - devices - - o kermit - start it up, 'open' the device, then 'connect' - - - - - - o terminals - device=tty:tty device file - - - This will make UML attach the device to the specified tty (i.e - - - con1=tty:/dev/tty3 - - - - - will attach UML's console 1 to the host's /dev/tty3). If the tty that - you specify is the slave end of a tty/pty pair, something else must - have already opened the corresponding pty in order for this to work. - - - - - - o xterms - device=xterm - - - UML will run an xterm and the device will be attached to it. - - - - - - o Port - device=port:port number - - - This will attach the UML devices to the specified host port. - Attaching console 1 to the host's port 9000 would be done like - this: - - - con1=port:9000 - - - - - Attaching all the serial lines to that port would be done similarly: - - - ssl=port:9000 - - - - - You access these devices by telnetting to that port. Each active tel- - net session gets a different device. If there are more telnets to a - port than UML devices attached to it, then the extra telnet sessions - will block until an existing telnet detaches, or until another device - becomes active (i.e. by being activated in /etc/inittab). - - This channel has the advantage that you can both attach multiple UML - devices to it and know how to access them without reading the UML boot - log. It is also unique in allowing access to a UML from remote - machines without requiring that the UML be networked. This could be - useful in allowing public access to UMLs because they would be - accessible from the net, but wouldn't need any kind of network - filtering or access control because they would have no network access. - - - If you attach the main console to a portal, then the UML boot will - appear to hang. In reality, it's waiting for a telnet to connect, at - which point the boot will proceed. - - - - - - o already-existing file descriptors - device=file descriptor - - - If you set up a file descriptor on the UML command line, you can - attach a UML device to it. This is most commonly used to put the - main console back on stdin and stdout after assigning all the other - consoles to something else: - - - con0=fd:0,fd:1 con=pts - - - - - - - - - o Nothing - device=null - - - This allows the device to be opened, in contrast to 'none', but - reads will block, and writes will succeed and the data will be - thrown out. - - - - - - o None - device=none - - - This causes the device to disappear. - - - - You can also specify different input and output channels for a device - by putting a comma between them: - - - ssl3=tty:/dev/tty2,xterm - - - - - will cause serial line 3 to accept input on the host's /dev/tty2 and - display output on an xterm. That's a silly example - the most common - use of this syntax is to reattach the main console to stdin and stdout - as shown above. - - - If you decide to move the main console away from stdin/stdout, the - initial boot output will appear in the terminal that you're running - UML in. However, once the console driver has been officially - initialized, then the boot output will start appearing wherever you - specified that console 0 should be. That device will receive all - subsequent output. - - - - 5.3. Examples - - There are a number of interesting things you can do with this - capability. - - - First, this is how you get rid of those bleeding console xterms by - attaching them to host ptys: - - - con=pty con0=fd:0,fd:1 - - - - - This will make a UML console take over an unused host virtual console, - so that when you switch to it, you will see the UML login prompt - rather than the host login prompt: - - - con1=tty:/dev/tty6 - - - - - You can attach two virtual machines together with what amounts to a - serial line as follows: - - Run one UML with a serial line attached to a pty - - - - ssl1=pty - - - - - Look at the boot log to see what pty it got (this example will assume - that it got /dev/ptyp1). - - Boot the other UML with a serial line attached to the corresponding - tty - - - - ssl1=tty:/dev/ttyp1 - - - - - Log in, make sure that it has no getty on that serial line, attach a - terminal program like minicom to it, and you should see the login - prompt of the other virtual machine. - - - 6. Setting up the network - - - - This page describes how to set up the various transports and to - provide a UML instance with network access to the host, other machines - on the local net, and the rest of the net. - - - As of 2.4.5, UML networking has been completely redone to make it much - easier to set up, fix bugs, and add new features. - - - There is a new helper, uml_net, which does the host setup that - requires root privileges. - - - There are currently five transport types available for a UML virtual - machine to exchange packets with other hosts: - - o ethertap - - o TUN/TAP - - o Multicast - - o a switch daemon - - o slip - - o slirp - - o pcap - - The TUN/TAP, ethertap, slip, and slirp transports allow a UML - instance to exchange packets with the host. They may be directed - to the host or the host may just act as a router to provide access - to other physical or virtual machines. - - - The pcap transport is a synthetic read-only interface, using the - libpcap binary to collect packets from interfaces on the host and - filter them. This is useful for building preconfigured traffic - monitors or sniffers. - - - The daemon and multicast transports provide a completely virtual - network to other virtual machines. This network is completely - disconnected from the physical network unless one of the virtual - machines on it is acting as a gateway. - - - With so many host transports, which one should you use? Here's when - you should use each one: - - o ethertap - if you want access to the host networking and it is - running 2.2 - - o TUN/TAP - if you want access to the host networking and it is - running 2.4. Also, the TUN/TAP transport is able to use a - preconfigured device, allowing it to avoid using the setuid uml_net - helper, which is a security advantage. - - o Multicast - if you want a purely virtual network and you don't want - to set up anything but the UML - - o a switch daemon - if you want a purely virtual network and you - don't mind running the daemon in order to get somewhat better - performance - - o slip - there is no particular reason to run the slip backend unless - ethertap and TUN/TAP are just not available for some reason - - o slirp - if you don't have root access on the host to setup - networking, or if you don't want to allocate an IP to your UML - - o pcap - not much use for actual network connectivity, but great for - monitoring traffic on the host - - Ethertap is available on 2.4 and works fine. TUN/TAP is preferred - to it because it has better performance and ethertap is officially - considered obsolete in 2.4. Also, the root helper only needs to - run occasionally for TUN/TAP, rather than handling every packet, as - it does with ethertap. This is a slight security advantage since - it provides fewer opportunities for a nasty UML user to somehow - exploit the helper's root privileges. - - - 6.1. General setup - - First, you must have the virtual network enabled in your UML. If are - running a prebuilt kernel from this site, everything is already - enabled. If you build the kernel yourself, under the "Network device - support" menu, enable "Network device support", and then the three - transports. - - - The next step is to provide a network device to the virtual machine. - This is done by describing it on the kernel command line. - - The general format is - - - eth = , - - - - - For example, a virtual ethernet device may be attached to a host - ethertap device as follows: - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - This sets up eth0 inside the virtual machine to attach itself to the - host /dev/tap0, assigns it an ethernet address, and assigns the host - tap0 interface an IP address. - - - - Note that the IP address you assign to the host end of the tap device - must be different than the IP you assign to the eth device inside UML. - If you are short on IPs and don't want to consume two per UML, then - you can reuse the host's eth IP address for the host ends of the tap - devices. Internally, the UMLs must still get unique IPs for their eth - devices. You can also give the UMLs non-routable IPs (192.168.x.x or - 10.x.x.x) and have the host masquerade them. This will let outgoing - connections work, but incoming connections won't without more work, - such as port forwarding from the host. - Also note that when you configure the host side of an interface, it is - only acting as a gateway. It will respond to pings sent to it - locally, but is not useful to do that since it's a host interface. - You are not talking to the UML when you ping that interface and get a - response. - - - You can also add devices to a UML and remove them at runtime. See the - ``The Management Console'' page for details. - - - The sections below describe this in more detail. - - - Once you've decided how you're going to set up the devices, you boot - UML, log in, configure the UML side of the devices, and set up routes - to the outside world. At that point, you will be able to talk to any - other machines, physical or virtual, on the net. - - - If ifconfig inside UML fails and the network refuses to come up, run - tell you what went wrong. - - - - 6.2. Userspace daemons - - You will likely need the setuid helper, or the switch daemon, or both. - They are both installed with the RPM and deb, so if you've installed - either, you can skip the rest of this section. - - - If not, then you need to check them out of CVS, build them, and - install them. The helper is uml_net, in CVS /tools/uml_net, and the - daemon is uml_switch, in CVS /tools/uml_router. They are both built - with a plain 'make'. Both need to be installed in a directory that's - in your path - /usr/bin is recommend. On top of that, uml_net needs - to be setuid root. - - - - 6.3. Specifying ethernet addresses - - Below, you will see that the TUN/TAP, ethertap, and daemon interfaces - allow you to specify hardware addresses for the virtual ethernet - devices. This is generally not necessary. If you don't have a - specific reason to do it, you probably shouldn't. If one is not - specified on the command line, the driver will assign one based on the - device IP address. It will provide the address fe:fd:nn:nn:nn:nn - where nn.nn.nn.nn is the device IP address. This is nearly always - sufficient to guarantee a unique hardware address for the device. A - couple of exceptions are: - - o Another set of virtual ethernet devices are on the same network and - they are assigned hardware addresses using a different scheme which - may conflict with the UML IP address-based scheme - - o You aren't going to use the device for IP networking, so you don't - assign the device an IP address - - If you let the driver provide the hardware address, you should make - sure that the device IP address is known before the interface is - brought up. So, inside UML, this will guarantee that: - - - - UML# - ifconfig eth0 192.168.0.250 up - - - - - If you decide to assign the hardware address yourself, make sure that - the first byte of the address is even. Addresses with an odd first - byte are broadcast addresses, which you don't want assigned to a - device. - - - - 6.4. UML interface setup - - Once the network devices have been described on the command line, you - should boot UML and log in. - - - The first thing to do is bring the interface up: - - - UML# ifconfig ethn ip-address up - - - - - You should be able to ping the host at this point. - - - To reach the rest of the world, you should set a default route to the - host: - - - UML# route add default gw host ip - - - - - Again, with host ip of 192.168.0.4: - - - UML# route add default gw 192.168.0.4 - - - - - This page used to recommend setting a network route to your local net. - This is wrong, because it will cause UML to try to figure out hardware - addresses of the local machines by arping on the interface to the - host. Since that interface is basically a single strand of ethernet - with two nodes on it (UML and the host) and arp requests don't cross - networks, they will fail to elicit any responses. So, what you want - is for UML to just blindly throw all packets at the host and let it - figure out what to do with them, which is what leaving out the network - route and adding the default route does. - - - Note: If you can't communicate with other hosts on your physical - ethernet, it's probably because of a network route that's - automatically set up. If you run 'route -n' and see a route that - looks like this: - - - - - Destination Gateway Genmask Flags Metric Ref Use Iface - 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 - - - - - with a mask that's not 255.255.255.255, then replace it with a route - to your host: - - - UML# - route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 - - - - - - - UML# - route add -host 192.168.0.4 dev eth0 - - - - - This, plus the default route to the host, will allow UML to exchange - packets with any machine on your ethernet. - - - - 6.5. Multicast - - The simplest way to set up a virtual network between multiple UMLs is - to use the mcast transport. This was written by Harald Welte and is - present in UML version 2.4.5-5um and later. Your system must have - multicast enabled in the kernel and there must be a multicast-capable - network device on the host. Normally, this is eth0, but if there is - no ethernet card on the host, then you will likely get strange error - messages when you bring the device up inside UML. - - - To use it, run two UMLs with - - - eth0=mcast - - - - - on their command lines. Log in, configure the ethernet device in each - machine with different IP addresses: - - - UML1# ifconfig eth0 192.168.0.254 - - - - - - - UML2# ifconfig eth0 192.168.0.253 - - - - - and they should be able to talk to each other. - - The full set of command line options for this transport are - - - - ethn=mcast,ethernet address,multicast - address,multicast port,ttl - - - - - Harald's original README is here and explains these in detail, as well as - some other issues. - - There is also a related point-to-point only "ucast" transport. - This is useful when your network does not support multicast, and - all network connections are simple point to point links. - - The full set of command line options for this transport are - - - ethn=ucast,ethernet address,remote address,listen port,remote port - - - - - 6.6. TUN/TAP with the uml_net helper - - TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the - host. The TUN/TAP backend has been in UML since 2.4.9-3um. - - - The easiest way to get up and running is to let the setuid uml_net - helper do the host setup for you. This involves insmod-ing the tun.o - module if necessary, configuring the device, and setting up IP - forwarding, routing, and proxy arp. If you are new to UML networking, - do this first. If you're concerned about the security implications of - the setuid helper, use it to get up and running, then read the next - section to see how to have UML use a preconfigured tap device, which - avoids the use of uml_net. - - - If you specify an IP address for the host side of the device, the - uml_net helper will do all necessary setup on the host - the only - requirement is that TUN/TAP be available, either built in to the host - kernel or as the tun.o module. - - The format of the command line switch to attach a device to a TUN/TAP - device is - - - eth =tuntap,,, - - - - - For example, this argument will attach the UML's eth0 to the next - available tap device and assign an ethernet address to it based on its - IP address - - - eth0=tuntap,,,192.168.0.254 - - - - - - - Note that the IP address that must be used for the eth device inside - UML is fixed by the routing and proxy arp that is set up on the - TUN/TAP device on the host. You can use a different one, but it won't - work because reply packets won't reach the UML. This is a feature. - It prevents a nasty UML user from doing things like setting the UML IP - to the same as the network's nameserver or mail server. - - - There are a couple potential problems with running the TUN/TAP - transport on a 2.4 host kernel - - o TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host - kernel or use the ethertap transport. - - o With an upgraded kernel, TUN/TAP may fail with - - - File descriptor in bad state - - - - - This is due to a header mismatch between the upgraded kernel and the - kernel that was originally installed on the machine. The fix is to - make sure that /usr/src/linux points to the headers for the running - kernel. - - These were pointed out by Tim Robinson in - name="this uml- - user post"> . - - - - 6.7. TUN/TAP with a preconfigured tap device - - If you prefer not to have UML use uml_net (which is somewhat - insecure), with UML 2.4.17-11, you can set up a TUN/TAP device - beforehand. The setup needs to be done as root, but once that's done, - there is no need for root assistance. Setting up the device is done - as follows: - - o Create the device with tunctl (available from the UML utilities - tarball) - - - - - host# tunctl -u uid - - - - - where uid is the user id or username that UML will be run as. This - will tell you what device was created. - - o Configure the device IP (change IP addresses and device name to - suit) - - - - - host# ifconfig tap0 192.168.0.254 up - - - - - - o Set up routing and arping if desired - this is my recipe, there are - other ways of doing the same thing - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' - - host# - route add -host 192.168.0.253 dev tap0 - - - - - - - host# - bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' - - - - - - - host# - arp -Ds 192.168.0.253 eth0 pub - - - - - Note that this must be done every time the host boots - this configu- - ration is not stored across host reboots. So, it's probably a good - idea to stick it in an rc file. An even better idea would be a little - utility which reads the information from a config file and sets up - devices at boot time. - - o Rather than using up two IPs and ARPing for one of them, you can - also provide direct access to your LAN by the UML by using a - bridge. - - - host# - brctl addbr br0 - - - - - - - host# - ifconfig eth0 0.0.0.0 promisc up - - - - - - - host# - ifconfig tap0 0.0.0.0 promisc up - - - - - - - host# - ifconfig br0 192.168.0.1 netmask 255.255.255.0 up - - - - - - - - host# - brctl stp br0 off - - - - - - - host# - brctl setfd br0 1 - - - - - - - host# - brctl sethello br0 1 - - - - - - - host# - brctl addif br0 eth0 - - - - - - - host# - brctl addif br0 tap0 - - - - - Note that 'br0' should be setup using ifconfig with the existing IP - address of eth0, as eth0 no longer has its own IP. - - o - - - Also, the /dev/net/tun device must be writable by the user running - UML in order for the UML to use the device that's been configured - for it. The simplest thing to do is - - - host# chmod 666 /dev/net/tun - - - - - Making it world-writable looks bad, but it seems not to be - exploitable as a security hole. However, it does allow anyone to cre- - ate useless tap devices (useless because they can't configure them), - which is a DOS attack. A somewhat more secure alternative would to be - to create a group containing all the users who have preconfigured tap - devices and chgrp /dev/net/tun to that group with mode 664 or 660. - - - o Once the device is set up, run UML with 'eth0=tuntap,device name' - (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the - mconsole config command). - - o Bring the eth device up in UML and you're in business. - - If you don't want that tap device any more, you can make it non- - persistent with - - - host# tunctl -d tap device - - - - - Finally, tunctl has a -b (for brief mode) switch which causes it to - output only the name of the tap device it created. This makes it - suitable for capture by a script: - - - host# TAP=`tunctl -u 1000 -b` - - - - - - - 6.8. Ethertap - - Ethertap is the general mechanism on 2.2 for userspace processes to - exchange packets with the kernel. - - - - To use this transport, you need to describe the virtual network device - on the UML command line. The general format for this is - - - eth =ethertap, , , - - - - - So, the previous example - - - eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 - - - - - attaches the UML eth0 device to the host /dev/tap0, assigns it the - ethernet address fe:fd:0:0:0:1, and assigns the IP address - 192.168.0.254 to the tap device. - - - - The tap device is mandatory, but the others are optional. If the - ethernet address is omitted, one will be assigned to it. - - - The presence of the tap IP address will cause the helper to run and do - whatever host setup is needed to allow the virtual machine to - communicate with the outside world. If you're not sure you know what - you're doing, this is the way to go. - - - If it is absent, then you must configure the tap device and whatever - arping and routing you will need on the host. However, even in this - case, the uml_net helper still needs to be in your path and it must be - setuid root if you're not running UML as root. This is because the - tap device doesn't support SIGIO, which UML needs in order to use - something as a source of input. So, the helper is used as a - convenient asynchronous IO thread. - - If you're using the uml_net helper, you can ignore the following host - setup - uml_net will do it for you. You just need to make sure you - have ethertap available, either built in to the host kernel or - available as a module. - - - If you want to set things up yourself, you need to make sure that the - appropriate /dev entry exists. If it doesn't, become root and create - it as follows: - - - mknod /dev/tap c 36 + 16 - - - - - For example, this is how to create /dev/tap0: - - - mknod /dev/tap0 c 36 0 + 16 - - - - - You also need to make sure that the host kernel has ethertap support. - If ethertap is enabled as a module, you apparently need to insmod - ethertap once for each ethertap device you want to enable. So, - - - host# - insmod ethertap - - - - - will give you the tap0 interface. To get the tap1 interface, you need - to run - - - host# - insmod ethertap unit=1 -o ethertap1 - - - - - - - - 6.9. The switch daemon - - Note: This is the daemon formerly known as uml_router, but which was - renamed so the network weenies of the world would stop growling at me. - - - The switch daemon, uml_switch, provides a mechanism for creating a - totally virtual network. By default, it provides no connection to the - host network (but see -tap, below). - - - The first thing you need to do is run the daemon. Running it with no - arguments will make it listen on a default pair of unix domain - sockets. - - - If you want it to listen on a different pair of sockets, use - - - -unix control socket data socket - - - - - - If you want it to act as a hub rather than a switch, use - - - -hub - - - - - - If you want the switch to be connected to host networking (allowing - the umls to get access to the outside world through the host), use - - - -tap tap0 - - - - - - Note that the tap device must be preconfigured (see "TUN/TAP with a - preconfigured tap device", above). If you're using a different tap - device than tap0, specify that instead of tap0. - - - uml_switch can be backgrounded as follows - - - host% - uml_switch [ options ] < /dev/null > /dev/null - - - - - The reason it doesn't background by default is that it listens to - stdin for EOF. When it sees that, it exits. - - - The general format of the kernel command line switch is - - - - ethn=daemon,ethernet address,socket - type,control socket,data socket - - - - - You can leave off everything except the 'daemon'. You only need to - specify the ethernet address if the one that will be assigned to it - isn't acceptable for some reason. The rest of the arguments describe - how to communicate with the daemon. You should only specify them if - you told the daemon to use different sockets than the default. So, if - you ran the daemon with no arguments, running the UML on the same - machine with - eth0=daemon - - - - - will cause the eth0 driver to attach itself to the daemon correctly. - - - - 6.10. Slip - - Slip is another, less general, mechanism for a process to communicate - with the host networking. In contrast to the ethertap interface, - which exchanges ethernet frames with the host and can be used to - transport any higher-level protocol, it can only be used to transport - IP. - - - The general format of the command line switch is - - - - ethn=slip,slip IP - - - - - The slip IP argument is the IP address that will be assigned to the - host end of the slip device. If it is specified, the helper will run - and will set up the host so that the virtual machine can reach it and - the rest of the network. - - - There are some oddities with this interface that you should be aware - of. You should only specify one slip device on a given virtual - machine, and its name inside UML will be 'umn', not 'eth0' or whatever - you specified on the command line. These problems will be fixed at - some point. - - - - 6.11. Slirp - - slirp uses an external program, usually /usr/bin/slirp, to provide IP - only networking connectivity through the host. This is similar to IP - masquerading with a firewall, although the translation is performed in - user-space, rather than by the kernel. As slirp does not set up any - interfaces on the host, or changes routing, slirp does not require - root access or setuid binaries on the host. - - - The general format of the command line switch for slirp is: - - - - ethn=slirp,ethernet address,slirp path - - - - - The ethernet address is optional, as UML will set up the interface - with an ethernet address based upon the initial IP address of the - interface. The slirp path is generally /usr/bin/slirp, although it - will depend on distribution. - - - The slirp program can have a number of options passed to the command - line and we can't add them to the UML command line, as they will be - parsed incorrectly. Instead, a wrapper shell script can be written or - the options inserted into the /.slirprc file. More information on - all of the slirp options can be found in its man pages. - - - The eth0 interface on UML should be set up with the IP 10.2.0.15, - although you can use anything as long as it is not used by a network - you will be connecting to. The default route on UML should be set to - use - - - UML# - route add default dev eth0 - - - - - slirp provides a number of useful IP addresses which can be used by - UML, such as 10.0.2.3 which is an alias for the DNS server specified - in /etc/resolv.conf on the host or the IP given in the 'dns' option - for slirp. - - - Even with a baudrate setting higher than 115200, the slirp connection - is limited to 115200. If you need it to go faster, the slirp binary - needs to be compiled with FULL_BOLT defined in config.h. - - - - 6.12. pcap - - The pcap transport is attached to a UML ethernet device on the command - line or with uml_mconsole with the following syntax: - - - - ethn=pcap,host interface,filter - expression,option1,option2 - - - - - The expression and options are optional. - - - The interface is whatever network device on the host you want to - sniff. The expression is a pcap filter expression, which is also what - tcpdump uses, so if you know how to specify tcpdump filters, you will - use the same expressions here. The options are up to two of - 'promisc', control whether pcap puts the host interface into - promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap - expression optimizer is used. - - - Example: - - - - eth0=pcap,eth0,tcp - - eth1=pcap,eth0,!tcp - - - - will cause the UML eth0 to emit all tcp packets on the host eth0 and - the UML eth1 to emit all non-tcp packets on the host eth0. - - - - 6.13. Setting up the host yourself - - If you don't specify an address for the host side of the ethertap or - slip device, UML won't do any setup on the host. So this is what is - needed to get things working (the examples use a host-side IP of - 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your - own network): - - o The device needs to be configured with its IP address. Tap devices - are also configured with an mtu of 1484. Slip devices are - configured with a point-to-point address pointing at the UML ip - address. - - - host# ifconfig tap0 arp mtu 1484 192.168.0.251 up - - - - - - - host# - ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up - - - - - - o If a tap device is being set up, a route is set to the UML IP. - - - UML# route add -host 192.168.0.250 gw 192.168.0.251 - - - - - - o To allow other hosts on your network to see the virtual machine, - proxy arp is set up for it. - - - host# arp -Ds 192.168.0.250 eth0 pub - - - - - - o Finally, the host is set up to route packets. - - - host# echo 1 > /proc/sys/net/ipv4/ip_forward - - - - - - - - - - - 7. Sharing Filesystems between Virtual Machines - - - - - 7.1. A warning - - Don't attempt to share filesystems simply by booting two UMLs from the - same file. That's the same thing as booting two physical machines - from a shared disk. It will result in filesystem corruption. - - - - 7.2. Using layered block devices - - The way to share a filesystem between two virtual machines is to use - the copy-on-write (COW) layering capability of the ubd block driver. - As of 2.4.6-2um, the driver supports layering a read-write private - device over a read-only shared device. A machine's writes are stored - in the private device, while reads come from either device - the - private one if the requested block is valid in it, the shared one if - not. Using this scheme, the majority of data which is unchanged is - shared between an arbitrary number of virtual machines, each of which - has a much smaller file containing the changes that it has made. With - a large number of UMLs booting from a large root filesystem, this - leads to a huge disk space saving. It will also help performance, - since the host will be able to cache the shared data using a much - smaller amount of memory, so UML disk requests will be served from the - host's memory rather than its disks. - - - - - To add a copy-on-write layer to an existing block device file, simply - add the name of the COW file to the appropriate ubd switch: - - - ubd0=root_fs_cow,root_fs_debian_22 - - - - - where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is - the existing shared filesystem. The COW file need not exist. If it - doesn't, the driver will create and initialize it. Once the COW file - has been initialized, it can be used on its own on the command line: - - - ubd0=root_fs_cow - - - - - The name of the backing file is stored in the COW file header, so it - would be redundant to continue specifying it on the command line. - - - - 7.3. Note! - - When checking the size of the COW file in order to see the gobs of - space that you're saving, make sure you use 'ls -ls' to see the actual - disk consumption rather than the length of the file. The COW file is - sparse, so the length will be very different from the disk usage. - Here is a 'ls -l' of a COW file and backing file from one boot and - shutdown: - host% ls -l cow.debian debian2.2 - -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Doesn't look like much saved space, does it? Well, here's 'ls -ls': - - - host% ls -ls cow.debian debian2.2 - 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian - 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 - - - - - Now, you can see that the COW file has less than a meg of disk, rather - than 492 meg. - - - - 7.4. Another warning - - Once a filesystem is being used as a readonly backing file for a COW - file, do not boot directly from it or modify it in any way. Doing so - will invalidate any COW files that are using it. The mtime and size - of the backing file are stored in the COW file header at its creation, - and they must continue to match. If they don't, the driver will - refuse to use the COW file. - - - - - If you attempt to evade this restriction by changing either the - backing file or the COW header by hand, you will get a corrupted - filesystem. - - - - - Among other things, this means that upgrading the distribution in a - backing file and expecting that all of the COW files using it will see - the upgrade will not work. - - - - - 7.5. uml_moo : Merging a COW file with its backing file - - Depending on how you use UML and COW devices, it may be advisable to - merge the changes in the COW file into the backing file every once in - a while. - - - - - The utility that does this is uml_moo. Its usage is - - - host% uml_moo COW file new backing file - - - - - There's no need to specify the backing file since that information is - already in the COW file header. If you're paranoid, boot the new - merged file, and if you're happy with it, move it over the old backing - file. - - - - - uml_moo creates a new backing file by default as a safety measure. It - also has a destructive merge option which will merge the COW file - directly into its current backing file. This is really only usable - when the backing file only has one COW file associated with it. If - there are multiple COWs associated with a backing file, a -d merge of - one of them will invalidate all of the others. However, it is - convenient if you're short of disk space, and it should also be - noticeably faster than a non-destructive merge. - - - - - uml_moo is installed with the UML deb and RPM. If you didn't install - UML from one of those packages, you can also get it from the UML - utilities tar file in tools/moo. - - - - - - - - - 8. Creating filesystems - - - You may want to create and mount new UML filesystems, either because - your root filesystem isn't large enough or because you want to use a - filesystem other than ext2. - - - This was written on the occasion of reiserfs being included in the - 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will - talk about reiserfs. This information is generic, and the examples - should be easy to translate to the filesystem of your choice. - - - 8.1. Create the filesystem file - - dd is your friend. All you need to do is tell dd to create an empty - file of the appropriate size. I usually make it sparse to save time - and to avoid allocating disk space until it's actually used. For - example, the following command will create a sparse 100 meg file full - of zeroes. - - - host% - dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M - - - - - - - 8.2. Assign the file to a UML device - - Add an argument like the following to the UML command line: - - ubd4=new_filesystem - - - - - making sure that you use an unassigned ubd device number. - - - - 8.3. Creating and mounting the filesystem - - Make sure that the filesystem is available, either by being built into - the kernel, or available as a module, then boot up UML and log in. If - the root filesystem doesn't have the filesystem utilities (mkfs, fsck, - etc), then get them into UML by way of the net or hostfs. - - - Make the new filesystem on the device assigned to the new file: - - - host# mkreiserfs /dev/ubd/4 - - - <----------- MKREISERFSv2 -----------> - - ReiserFS version 3.6.25 - Block size 4096 bytes - Block count 25856 - Used blocks 8212 - Journal - 8192 blocks (18-8209), journal header is in block 8210 - Bitmaps: 17 - Root block 8211 - Hash function "r5" - ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y - journal size 8192 (from 18) - Initializing journal - 0%....20%....40%....60%....80%....100% - Syncing..done. - - - - - Now, mount it: - - - UML# - mount /dev/ubd/4 /mnt - - - - - and you're in business. - - - - - - - - - - 9. Host file access - - - If you want to access files on the host machine from inside UML, you - can treat it as a separate machine and either nfs mount directories - from the host or copy files into the virtual machine with scp or rcp. - However, since UML is running on the host, it can access those - files just like any other process and make them available inside the - virtual machine without needing to use the network. - - - This is now possible with the hostfs virtual filesystem. With it, you - can mount a host directory into the UML filesystem and access the - files contained in it just as you would on the host. - - - 9.1. Using hostfs - - To begin with, make sure that hostfs is available inside the virtual - machine with - - - UML# cat /proc/filesystems - - - - . hostfs should be listed. If it's not, either rebuild the kernel - with hostfs configured into it or make sure that hostfs is built as a - module and available inside the virtual machine, and insmod it. - - - Now all you need to do is run mount: - - - UML# mount none /mnt/host -t hostfs - - - - - will mount the host's / on the virtual machine's /mnt/host. - - - If you don't want to mount the host root directory, then you can - specify a subdirectory to mount with the -o switch to mount: - - - UML# mount none /mnt/home -t hostfs -o /home - - - - - will mount the hosts's /home on the virtual machine's /mnt/home. - - - - 9.2. hostfs as the root filesystem - - It's possible to boot from a directory hierarchy on the host using - hostfs rather than using the standard filesystem in a file. - - To start, you need that hierarchy. The easiest way is to loop mount - an existing root_fs file: - - - host# mount root_fs uml_root_dir -o loop - - - - - You need to change the filesystem type of / in etc/fstab to be - 'hostfs', so that line looks like this: - - /dev/ubd/0 / hostfs defaults 1 1 - - - - - Then you need to chown to yourself all the files in that directory - that are owned by root. This worked for me: - - - host# find . -uid 0 -exec chown jdike {} \; - - - - - Next, make sure that your UML kernel has hostfs compiled in, not as a - module. Then run UML with the boot device pointing at that directory: - - - ubd0=/path/to/uml/root/directory - - - - - UML should then boot as it does normally. - - - 9.3. Building hostfs - - If you need to build hostfs because it's not in your kernel, you have - two choices: - - - - o Compiling hostfs into the kernel: - - - Reconfigure the kernel and set the 'Host filesystem' option under - - - o Compiling hostfs as a module: - - - Reconfigure the kernel and set the 'Host filesystem' option under - be in arch/um/fs/hostfs/hostfs.o. Install that in - /lib/modules/`uname -r`/fs in the virtual machine, boot it up, and - - - UML# insmod hostfs - - - - - - - - - - - - - 10. The Management Console - - - - The UML management console is a low-level interface to the kernel, - somewhat like the i386 SysRq interface. Since there is a full-blown - operating system under UML, there is much greater flexibility possible - than with the SysRq mechanism. - - - There are a number of things you can do with the mconsole interface: - - o get the kernel version - - o add and remove devices - - o halt or reboot the machine - - o Send SysRq commands - - o Pause and resume the UML - - - You need the mconsole client (uml_mconsole) which is present in CVS - (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in - 2.4.6. - - - You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. - When you boot UML, you'll see a line like: - - - mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole - - - - - If you specify a unique machine id one the UML command line, i.e. - - - umid=debian - - - - - you'll see this - - - mconsole initialized on /home/jdike/.uml/debian/mconsole - - - - - That file is the socket that uml_mconsole will use to communicate with - UML. Run it with either the umid or the full path as its argument: - - - host% uml_mconsole debian - - - - - or - - - host% uml_mconsole /home/jdike/.uml/debian/mconsole - - - - - You'll get a prompt, at which you can run one of these commands: - - o version - - o halt - - o reboot - - o config - - o remove - - o sysrq - - o help - - o cad - - o stop - - o go - - - 10.1. version - - This takes no arguments. It prints the UML version. - - - (mconsole) version - OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 - - - - - There are a couple actual uses for this. It's a simple no-op which - can be used to check that a UML is running. It's also a way of - sending an interrupt to the UML. This is sometimes useful on SMP - hosts, where there's a bug which causes signals to UML to be lost, - often causing it to appear to hang. Sending such a UML the mconsole - version command is a good way to 'wake it up' before networking has - been enabled, as it does not do anything to the function of the UML. - - - - 10.2. halt and reboot - - These take no arguments. They shut the machine down immediately, with - no syncing of disks and no clean shutdown of userspace. So, they are - pretty close to crashing the machine. - - - (mconsole) halt - OK - - - - - - - 10.3. config - - "config" adds a new device to the virtual machine. Currently the ubd - and network drivers support this. It takes one argument, which is the - device to add, with the same syntax as the kernel command line. - - - - - (mconsole) - config ubd3=/home/jdike/incoming/roots/root_fs_debian22 - - OK - (mconsole) config eth1=mcast - OK - - - - - - - 10.4. remove - - "remove" deletes a device from the system. Its argument is just the - name of the device to be removed. The device must be idle in whatever - sense the driver considers necessary. In the case of the ubd driver, - the removed block device must not be mounted, swapped on, or otherwise - open, and in the case of the network driver, the device must be down. - - - (mconsole) remove ubd3 - OK - (mconsole) remove eth1 - OK - - - - - - - 10.5. sysrq - - This takes one argument, which is a single letter. It calls the - generic kernel's SysRq driver, which does whatever is called for by - that argument. See the SysRq documentation in - Documentation/admin-guide/sysrq.rst in your favorite kernel tree to - see what letters are valid and what they do. - - - - 10.6. help - - "help" returns a string listing the valid commands and what each one - does. - - - - 10.7. cad - - This invokes the Ctl-Alt-Del action on init. What exactly this ends - up doing is up to /etc/inittab. Normally, it reboots the machine. - With UML, this is usually not desired, so if a halt would be better, - then find the section of inittab that looks like this - - - # What to do when CTRL-ALT-DEL is pressed. - ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now - - - - - and change the command to halt. - - - - 10.8. stop - - This puts the UML in a loop reading mconsole requests until a 'go' - mconsole command is received. This is very useful for making backups - of UML filesystems, as the UML can be stopped, then synced via 'sysrq - s', so that everything is written to the filesystem. You can then copy - the filesystem and then send the UML 'go' via mconsole. - - - Note that a UML running with more than one CPU will have problems - after you send the 'stop' command, as only one CPU will be held in a - mconsole loop and all others will continue as normal. This is a bug, - and will be fixed. - - - - 10.9. go - - This resumes a UML after being paused by a 'stop' command. Note that - when the UML has resumed, TCP connections may have timed out and if - the UML is paused for a long period of time, crond might go a little - crazy, running all the jobs it didn't do earlier. - - - - - - - - - 11. Kernel debugging - - - Note: The interface that makes debugging, as described here, possible - is present in 2.4.0-test6 kernels and later. - - - Since the user-mode kernel runs as a normal Linux process, it is - possible to debug it with gdb almost like any other process. It is - slightly different because the kernel's threads are already being - ptraced for system call interception, so gdb can't ptrace them. - However, a mechanism has been added to work around that problem. - - - In order to debug the kernel, you need build it from source. See - ``Compiling the kernel and modules'' for information on doing that. - Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during - the config. These will compile the kernel with -g, and enable the - ptrace proxy so that gdb works with UML, respectively. - - - - - 11.1. Starting the kernel under gdb - - You can have the kernel running under the control of gdb from the - beginning by putting 'debug' on the command line. You will get an - xterm with gdb running inside it. The kernel will send some commands - to gdb which will leave it stopped at the beginning of start_kernel. - At this point, you can get things going with 'next', 'step', or - 'cont'. - - - There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an - interrupt handler. - 11.2. Examining sleeping processes - - Not every bug is evident in the currently running process. Sometimes, - processes hang in the kernel when they shouldn't because they've - deadlocked on a semaphore or something similar. In this case, when - you ^C gdb and get a backtrace, you will see the idle thread, which - isn't very relevant. - - - What you want is the stack of whatever process is sleeping when it - shouldn't be. You need to figure out which process that is, which is - generally fairly easy. Then you need to get its host process id, - which you can do either by looking at ps on the host or at - task.thread.extern_pid in gdb. - - - Now what you do is this: - - o detach from the current thread - - - (UML gdb) det - - - - - - o attach to the thread you are interested in - - - (UML gdb) att - - - - - - o look at its stack and anything else of interest - - - (UML gdb) bt - - - - - Note that you can't do anything at this point that requires that a - process execute, e.g. calling a function - - o when you're done looking at that process, reattach to the current - thread and continue it - - - (UML gdb) - att 1 - - - - - - - (UML gdb) - c - - - - - Here, specifying any pid which is not the process id of a UML thread - will cause gdb to reattach to the current thread. I commonly use 1, - but any other invalid pid would work. - - - - 11.3. Running ddd on UML - - ddd works on UML, but requires a special kludge. The process goes - like this: - - o Start ddd - - - host% ddd linux - - - - - - o With ps, get the pid of the gdb that ddd started. You can ask the - gdb to tell you, but for some reason that confuses things and - causes a hang. - - o run UML with 'debug=parent gdb-pid=' added to the command line - - it will just sit there after you hit return - - o type 'att 1' to the ddd gdb and you will see something like - - - 0xa013dc51 in __kill () - - - (gdb) - - - - - - o At this point, type 'c', UML will boot up, and you can use ddd just - as you do on any other process. - - - - 11.4. Debugging modules - - gdb has support for debugging code which is dynamically loaded into - the process. This support is what is needed to debug kernel modules - under UML. - - - Using that support is somewhat complicated. You have to tell gdb what - object file you just loaded into UML and where in memory it is. Then, - it can read the symbol table, and figure out where all the symbols are - from the load address that you provided. It gets more interesting - when you load the module again (i.e. after an rmmod). You have to - tell gdb to forget about all its symbols, including the main UML ones - for some reason, then load then all back in again. - - - There's an easy way and a hard way to do this. The easy way is to use - the umlgdb expect script written by Chandan Kudige. It basically - automates the process for you. - - - First, you must tell it where your modules are. There is a list in - the script that looks like this: - set MODULE_PATHS { - "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" - "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" - "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" - } - - - - - You change that to list the names and paths of the modules that you - are going to debug. Then you run it from the toplevel directory of - your UML pool and it basically tells you what to do: - - - - - ******** GDB pid is 21903 ******** - Start UML as: ./linux debug gdb-pid=21903 - - - - GNU gdb 5.0rh-5 Red Hat Linux 7.1 - Copyright 2001 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) b sys_init_module - Breakpoint 1 at 0xa0011923: file module.c, line 349. - (gdb) att 1 - - - - - After you run UML and it sits there doing nothing, you hit return at - the 'att 1' and continue it: - - - Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 - 0xa00f4221 in __kill () - (UML gdb) c - Continuing. - - - - - At this point, you debug normally. When you insmod something, the - expect magic will kick in and you'll see something like: - - - - - - - - - - - - - - - - - - *** Module hostfs loaded *** - Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 349 char *name, *n_name, *name_tmp = NULL; - (UML gdb) finish - Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", - mod_user=0x8070e00) at module.c:349 - 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 - 411 else res = EXECUTE_SYSCALL(syscall, regs); - Value returned is $1 = 0 - (UML gdb) - p/x (int)module_list + module_list->size_of_struct - - $2 = 0xa9021054 - (UML gdb) symbol-file ./linux - Load new symbol table from "./linux"? (y or n) y - Reading symbols from ./linux... - done. - (UML gdb) - add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 - - add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at - .text_addr = 0xa9021054 - (y or n) y - - Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... - done. - (UML gdb) p *module_list - $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", - size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, - nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, - init = 0xa90221f0 , cleanup = 0xa902222c , - ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, - persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, - kallsyms_end = 0x0, - archdata_start = 0x1b855
, - archdata_end = 0xe5890000
, - kernel_data = 0xf689c35d
} - >> Finished loading symbols for hostfs ... - - - - - That's the easy way. It's highly recommended. The hard way is - described below in case you're interested in what's going on. - - - Boot the kernel under the debugger and load the module with insmod or - modprobe. With gdb, do: - - - (UML gdb) p module_list - - - - - This is a list of modules that have been loaded into the kernel, with - the most recently loaded module first. Normally, the module you want - is at module_list. If it's not, walk down the next links, looking at - the name fields until find the module you want to debug. Take the - address of that structure, and add module.size_of_struct (which in - 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition - for you :-): - - - - (UML gdb) - printf "%#x\n", (int)module_list module_list->size_of_struct - - - - - The offset from the module start occasionally changes (before 2.4.0, - it was module.size_of_struct + 4), so it's a good idea to check the - init and cleanup addresses once in a while, as describe below. Now - do: - - - (UML gdb) - add-symbol-file /path/to/module/on/host that_address - - - - - Tell gdb you really want to do it, and you're in business. - - - If there's any doubt that you got the offset right, like breakpoints - appear not to work, or they're appearing in the wrong place, you can - check it by looking at the module structure. The init and cleanup - fields should look like: - - - init = 0x588066b0 , cleanup = 0x588066c0 - - - - - with no offsets on the symbol names. If the names are right, but they - are offset, then the offset tells you how much you need to add to the - address you gave to add-symbol-file. - - - When you want to load in a new version of the module, you need to get - gdb to forget about the old one. The only way I've found to do that - is to tell gdb to forget about all symbols that it knows about: - - - (UML gdb) symbol-file - - - - - Then reload the symbols from the kernel binary: - - - (UML gdb) symbol-file /path/to/kernel - - - - - and repeat the process above. You'll also need to re-enable break- - points. They were disabled when you dumped all the symbols because - gdb couldn't figure out where they should go. - - - - 11.5. Attaching gdb to the kernel - - If you don't have the kernel running under gdb, you can attach gdb to - it later by sending the tracing thread a SIGUSR1. The first line of - the console output identifies its pid: - tracing thread pid = 20093 - - - - - When you send it the signal: - - - host% kill -USR1 20093 - - - - - you will get an xterm with gdb running in it. - - - If you have the mconsole compiled into UML, then the mconsole client - can be used to start gdb: - - - (mconsole) (mconsole) config gdb=xterm - - - - - will fire up an xterm with gdb running in it. - - - - 11.6. Using alternate debuggers - - UML has support for attaching to an already running debugger rather - than starting gdb itself. This is present in CVS as of 17 Apr 2001. - I sent it to Alan for inclusion in the ac tree, and it will be in my - 2.4.4 release. - - - This is useful when gdb is a subprocess of some UI, such as emacs or - ddd. It can also be used to run debuggers other than gdb on UML. - Below is an example of using strace as an alternate debugger. - - - To do this, you need to get the pid of the debugger and pass it in - with the - - - If you are using gdb under some UI, then tell it to 'att 1', and - you'll find yourself attached to UML. - - - If you are using something other than gdb as your debugger, then - you'll need to get it to do the equivalent of 'att 1' if it doesn't do - it automatically. - - - An example of an alternate debugger is strace. You can strace the - actual kernel as follows: - - o Run the following in a shell - - - host% - sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' - - - - o Run UML with 'debug' and 'gdb-pid=' with the pid printed out - by the previous command - - o Hit return in the shell, and UML will start running, and strace - output will start accumulating in the output file. - - Note that this is different from running - - - host% strace ./linux - - - - - That will strace only the main UML thread, the tracing thread, which - doesn't do any of the actual kernel work. It just oversees the vir- - tual machine. In contrast, using strace as described above will show - you the low-level activity of the virtual machine. - - - - - - 12. Kernel debugging examples - - 12.1. The case of the hung fsck - - When booting up the kernel, fsck failed, and dropped me into a shell - to fix things up. I ran fsck -y, which hung: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 was not cleanly unmounted, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Inode 19780, i_blocks is 1548, should be 540. Fix? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - - - - - The standard drill in this sort of situation is to fire up gdb on the - signal thread, which, in this case, was pid 1935. In another window, - I run gdb and attach pid 1935. - - - - - ~/linux/2.3.26/um 1016: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - - (gdb) att 1935 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 - 0x100756d9 in __wait4 () - - - - - - - Let's see what's currently running: - - - - (gdb) p current_task.pid - $1 = 0 - - - - - - It's the idle thread, which means that fsck went to sleep for some - reason and never woke up. - - - Let's guess that the last process in the process list is fsck: - - - - (gdb) p current_task.prev_task.comm - $13 = "fsck.ext2\000\000\000\000\000\000" - - - - - - It is, so let's see what it thinks it's up to: - - - - (gdb) p current_task.prev_task.thread - $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, - kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { - 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, - request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { - regs = {1350467584, 2952789424, 0 }, sigstack = 0, - pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, - arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { - op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} - - - - - - The interesting things here are the fact that its .thread.syscall.id - is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or - the defines in include/asm-um/arch/unistd.h), and that it never - returned. Also, its .request.op is OP_SWITCH (see - arch/um/include/user_util.h). These mean that it went into a write, - and, for some reason, called schedule(). - - - The fact that it never returned from write means that its stack should - be fairly interesting. Its pid is 1980 (.thread.extern_pid). That - process is being ptraced by the signal thread, so it must be detached - before gdb can attach it: - - - - - - - - - - - (gdb) call detach(1980) - - Program received signal SIGSEGV, Segmentation fault. - - The program being debugged stopped while in a function called from GDB. - When the function (detach) is done executing, GDB will silently - stop (instead of continuing to evaluate the expression containing - the function call). - (gdb) call detach(1980) - $15 = 0 - - - - - - The first detach segfaults for some reason, and the second one - succeeds. - - - Now I detach from the signal thread, attach to the fsck thread, and - look at its stack: - - - (gdb) det - Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 - (gdb) att 1980 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 - 0x10070451 in __kill () - (gdb) bt - #0 0x10070451 in __kill () - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - #4 0x10001d12 in schedule () at core.c:777 - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #9 - #10 0x10155404 in errno () - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - #14 - #15 0xc0fd in ?? () - #16 0x10016647 in sys_write (fd=3, - buf=0x80b8800
, count=1024) - at read_write.c:159 - #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #19 - #20 0x400dc8b0 in ?? () - - - - - - The interesting things here are : - - o There are two segfaults on this stack (frames 9 and 14) - - o The first faulting address (frame 11) is 0x50000800 - - (gdb) p (void *)1342179328 - $16 = (void *) 0x50000800 - - - - - - The initial faulting address is interesting because it is on the idle - thread's stack. I had been seeing the idle thread segfault for no - apparent reason, and the cause looked like stack corruption. In hopes - of catching the culprit in the act, I had turned off all protections - to that stack while the idle thread wasn't running. This apparently - tripped that trap. - - - However, the more immediate problem is that second segfault and I'm - going to concentrate on that. First, I want to see where the fault - happened, so I have to go look at the sigcontent struct in frame 8: - - - - (gdb) up - #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 - 30 kill(pid, SIGUSR1); - (gdb) - #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) - at process_kern.c:156 - 156 usr1_pid(getpid()); - (gdb) - #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) - at process_kern.c:161 - 161 _switch_to(prev, next); - (gdb) - #4 0x10001d12 in schedule () at core.c:777 - 777 switch_to(prev, next, prev); - (gdb) - #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 - 71 schedule(); - (gdb) - #6 0x1006aa10 in __down_failed () at semaphore.c:157 - 157 } - (gdb) - #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) - #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 - 182 segv_handler(sc); - (gdb) p *sc - Cannot access memory at address 0x0. - - - - - That's not very useful, so I'll try a more manual method: - - - (gdb) p *((struct sigcontext *) (&sig + 1)) - $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, - esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, - trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, - esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1280} - - - - The ip is in handle_mm_fault: - - - (gdb) p (void *)268480945 - $20 = (void *) 0x1000b1b1 - (gdb) i sym $20 - handle_mm_fault + 57 in section .text - - - - - - Specifically, it's in pte_alloc: - - - (gdb) i line *$20 - Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b1 - and ends at 0x1000b1b7 . - - - - - - To find where in handle_mm_fault this is, I'll jump forward in the - code until I see an address in that procedure: - - - - (gdb) i line *0x1000b1c0 - Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1b7 - and ends at 0x1000b1c3 . - (gdb) i line *0x1000b1d0 - Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1d0 - and ends at 0x1000b1da . - (gdb) i line *0x1000b1e0 - Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1da - and ends at 0x1000b1e1 . - (gdb) i line *0x1000b1f0 - Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b1f0 - and ends at 0x1000b200 . - (gdb) i line *0x1000b200 - Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b200 - and ends at 0x1000b208 . - (gdb) i line *0x1000b210 - Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" - starts at address 0x1000b210 - and ends at 0x1000b219 . - (gdb) i line *0x1000b220 - Line 1168 of "memory.c" starts at address 0x1000b21e - and ends at 0x1000b222 . - - - - - - Something is apparently wrong with the page tables or vma_structs, so - lets go back to frame 11 and have a look at them: - - - - #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 - 50 handle_mm_fault(current, vma, address, is_write); - (gdb) call pgd_offset_proc(vma->vm_mm, address) - $22 = (pgd_t *) 0x80a548c - - - - - - That's pretty bogus. Page tables aren't supposed to be in process - text or data areas. Let's see what's in the vma: - - - (gdb) p *vma - $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, - vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, - vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, - vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, - vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, - vm_private_data = 0x62} - (gdb) p *vma.vm_mm - $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, - pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, - map_count = 134909076, mmap_sem = {count = {counter = 135073792}, - sleepers = -1342177872, wait = {lock = , - task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, - __magic = -1342177670, __creator = -1342177300}, __magic = 98}, - page_table_lock = {}, context = 138, start_code = 0, end_code = 0, - start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, - arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, - total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, - swap_address = 0, segments = 0x0} - - - - - - This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx - addresses, this is looking like a stack was plonked down on top of - these structures. Maybe it's a stack overflow from the next page: - - - - (gdb) p vma - $25 = (struct vm_area_struct *) 0x507d2434 - - - - - - That's towards the lower quarter of the page, so that would have to - have been pretty heavy stack overflow: - - - - - - - - - - - - - - - (gdb) x/100x $25 - 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c - 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 - 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a - 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 - 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 - 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 - 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 - 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 - - - - - - It's not stack overflow. The only "stack-like" piece of this data is - the vma_struct itself. - - - At this point, I don't see any avenues to pursue, so I just have to - admit that I have no idea what's going on. What I will do, though, is - stick a trap on the segfault handler which will stop if it sees any - writes to the idle thread's stack. That was the thing that happened - first, and it may be that if I can catch it immediately, what's going - on will be somewhat clearer. - - - 12.2. Episode 2: The case of the hung fsck - - After setting a trap in the SEGV handler for accesses to the signal - thread's stack, I reran the kernel. - - - fsck hung again, this time by hitting the trap: - - - - - - - - - - - - - - - - - Setting hostname uml [ OK ] - Checking root filesystem - /dev/fhd0 contains a file system with errors, check forced. - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. - - /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. - (i.e., without -a or -p options) - [ FAILED ] - - *** An error occurred during the file system check. - *** Dropping you to a shell; the system will reboot - *** when you leave the shell. - Give root password for maintenance - (or type Control-D for normal startup): - - [root@uml /root]# fsck -y /dev/fhd0 - fsck -y /dev/fhd0 - Parallelizing fsck version 1.14 (9-Jan-1999) - e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 - /dev/fhd0 contains a file system with errors, check forced. - Pass 1: Checking inodes, blocks, and sizes - Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes - - Pass 2: Checking directory structure - Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes - - Directory inode 11858, block 0, offset 0: directory corrupted - Salvage? yes - - Missing '.' in directory inode 11858. - Fix? yes - - Missing '..' in directory inode 11858. - Fix? yes - - Untested (4127) [100fe44c]: trap_kern.c line 31 - - - - - - I need to get the signal thread to detach from pid 4127 so that I can - attach to it with gdb. This is done by sending it a SIGUSR1, which is - caught by the signal thread, which detaches the process: - - - kill -USR1 4127 - - - - - - Now I can run gdb on it: - - - - - - - - - - - - - - ~/linux/2.3.26/um 1034: gdb linux - GNU gdb 4.17.0.11 with Linux support - Copyright 1998 Free Software Foundation, Inc. - GDB is free software, covered by the GNU General Public License, and you are - welcome to change it and/or distribute copies of it under certain conditions. - Type "show copying" to see the conditions. - There is absolutely no warranty for GDB. Type "show warranty" for details. - This GDB was configured as "i386-redhat-linux"... - (gdb) att 4127 - Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 - 0x10075891 in __libc_nanosleep () - - - - - - The backtrace shows that it was in a write and that the fault address - (address in frame 3) is 0x50000800, which is right in the middle of - the signal thread's stack page: - - - (gdb) bt - #0 0x10075891 in __libc_nanosleep () - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - #2 0x1006ce9a in stop () at user_util.c:191 - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 - #6 - #7 0xc0fd in ?? () - #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) - at read_write.c:159 - #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) - at syscall_kern.c:254 - #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 - #11 - #12 0x400dc8b0 in ?? () - #13 - #14 0x400dc8b0 in ?? () - #15 0x80545fd in ?? () - #16 0x804daae in ?? () - #17 0x8054334 in ?? () - #18 0x804d23e in ?? () - #19 0x8049632 in ?? () - #20 0x80491d2 in ?? () - #21 0x80596b5 in ?? () - (gdb) p (void *)1342179328 - $3 = (void *) 0x50000800 - - - - - - Going up the stack to the segv_handler frame and looking at where in - the code the access happened shows that it happened near line 110 of - block_dev.c: - - - - - - - - - - (gdb) up - #1 0x1007584d in __sleep (seconds=1000000) - at ../sysdeps/unix/sysv/linux/sleep.c:78 - ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. - (gdb) - #2 0x1006ce9a in stop () at user_util.c:191 - 191 while(1) sleep(1000000); - (gdb) - #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 - 31 KERN_UNTESTED(); - (gdb) - #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 - 174 segv(sc->cr2, sc->err & 2); - (gdb) p *sc - $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, - __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, - esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, - err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, - esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, - cr2 = 1342179328} - (gdb) p (void *)268550834 - $2 = (void *) 0x1001c2b2 - (gdb) i sym $2 - block_write + 1090 in section .text - (gdb) i line *$2 - Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" - starts at address 0x1001c2a1 - and ends at 0x1001c2bf . - (gdb) i line *0x1001c2c0 - Line 110 of "block_dev.c" starts at address 0x1001c2bf - and ends at 0x1001c2e3 . - - - - - - Looking at the source shows that the fault happened during a call to - copy_from_user to copy the data into the kernel: - - - 107 count -= chars; - 108 copy_from_user(p,buf,chars); - 109 p += chars; - 110 buf += chars; - - - - - - p is the pointer which must contain 0x50000800, since buf contains - 0x80b8800 (frame 8 above). It is defined as: - - - p = offset + bh->b_data; - - - - - - I need to figure out what bh is, and it just so happens that bh is - passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a - few lines later, so I do a little disassembly: - - - - - (gdb) disas 0x1001c2bf 0x1001c2e0 - Dump of assembler code from 0x1001c2bf to 0x1001c2d0: - 0x1001c2bf : addl %eax,0xc(%ebp) - 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx - 0x1001c2c8 : btsl $0x0,0x18(%edx) - 0x1001c2cd : btsl $0x1,0x18(%edx) - 0x1001c2d2 : sbbl %ecx,%ecx - 0x1001c2d4 : testl %ecx,%ecx - 0x1001c2d6 : jne 0x1001c2e3 - 0x1001c2d8 : pushl $0x0 - 0x1001c2da : pushl %edx - 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> - End of assembler dump. - - - - - - At that point, bh is in %edx (address 0x1001c2da), which is calculated - at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, - taking %ebp from the sigcontext_struct above: - - - (gdb) p (void *)1342631484 - $5 = (void *) 0x5006ee3c - (gdb) p 0x5006ee3c+0xfffffdd4 - $6 = 1342630928 - (gdb) p (void *)$6 - $7 = (void *) 0x5006ec10 - (gdb) p *((void **)$7) - $8 = (void *) 0x50100200 - - - - - - Now, I look at the structure to see what's in it, and particularly, - what its b_data field contains: - - - (gdb) p *((struct buffer_head *)0x50100200) - $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, - b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, - b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, - b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, - b_data = 0x50000800 "", b_page = 0x50004000, - b_end_io = 0x10017f60 , b_dev_id = 0x0, - b_rsector = 98810, b_wait = {lock = , - task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, - __creator = 0}, b_kiobuf = 0x0} - - - - - - The b_data field is indeed 0x50000800, so the question becomes how - that happened. The rest of the structure looks fine, so this probably - is not a case of data corruption. It happened on purpose somehow. - - - The b_page field is a pointer to the page_struct representing the - 0x50000000 page. Looking at it shows the kernel's idea of the state - of that page: - - - - (gdb) p *$13.b_page - $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, - index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { - next = 0x50008460, prev = 0x50019350}, wait = { - lock = , task_list = {next = 0x50004024, - prev = 0x50004024}, __magic = 1342193708, __creator = 0}, - pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, - zone = 0x100c5160} - - - - - - Some sanity-checking: the virtual field shows the "virtual" address of - this page, which in this kernel is the same as its "physical" address, - and the page_struct itself should be mem_map[0], since it represents - the first page of memory: - - - - (gdb) p (void *)1342177280 - $18 = (void *) 0x50000000 - (gdb) p mem_map - $19 = (mem_map_t *) 0x50004000 - - - - - - These check out fine. - - - Now to check out the page_struct itself. In particular, the flags - field shows whether the page is considered free or not: - - - (gdb) p (void *)132 - $21 = (void *) 0x84 - - - - - - The "reserved" bit is the high bit, which is definitely not set, so - the kernel considers the signal stack page to be free and available to - be used. - - - At this point, I jump to conclusions and start looking at my early - boot code, because that's where that page is supposed to be reserved. - - - In my setup_arch procedure, I have the following code which looks just - fine: - - - - bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); - free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); - - - - - - Two stack pages have already been allocated, and low_physmem points to - the third page, which is the beginning of free memory. - The init_bootmem call declares the entire memory to the boot memory - manager, which marks it all reserved. The free_bootmem call frees up - all of it, except for the first two pages. This looks correct to me. - - - So, I decide to see init_bootmem run and make sure that it is marking - those first two pages as reserved. I never get that far. - - - Stepping into init_bootmem, and looking at bootmem_map before looking - at what it contains shows the following: - - - - (gdb) p bootmem_map - $3 = (void *) 0x50000000 - - - - - - Aha! The light dawns. That first page is doing double duty as a - stack and as the boot memory map. The last thing that the boot memory - manager does is to free the pages used by its memory map, so this page - is getting freed even its marked as reserved. - - - The fix was to initialize the boot memory manager before allocating - those two stack pages, and then allocate them through the boot memory - manager. After doing this, and fixing a couple of subsequent buglets, - the stack corruption problem disappeared. - - - - - - 13. What to do when UML doesn't work - - - - - 13.1. Strange compilation errors when you build from source - - As of test11, it is necessary to have "ARCH=um" in the environment or - on the make command line for all steps in building UML, including - clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, - and linux. If you forget for any of them, the i386 build seems to - contaminate the UML build. If this happens, start from scratch with - - - host% - make mrproper ARCH=um - - - - - and repeat the build process with ARCH=um on all the steps. - - - See ``Compiling the kernel and modules'' for more details. - - - Another cause of strange compilation errors is building UML in - /usr/src/linux. If you do this, the first thing you need to do is - clean up the mess you made. The /usr/src/linux/asm link will now - point to /usr/src/linux/asm-um. Make it point back to - /usr/src/linux/asm-i386. Then, move your UML pool someplace else and - build it there. Also see below, where a more specific set of symptoms - is described. - - - - 13.3. A variety of panics and hangs with /tmp on a reiserfs filesys- - tem - - I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. - Panics preceded by - - - Detaching pid nnnn - - - - are diagnostic of this problem. This is a reiserfs bug which causes a - thread to occasionally read stale data from a mmapped page shared with - another thread. The fix is to upgrade the filesystem or to have /tmp - be an ext2 filesystem. - - - - 13.4. The compile fails with errors about conflicting types for - 'open', 'dup', and 'waitpid' - - This happens when you build in /usr/src/linux. The UML build makes - the include/asm link point to include/asm-um. /usr/include/asm points - to /usr/src/linux/include/asm, so when that link gets moved, files - which need to include the asm-i386 versions of headers get the - incompatible asm-um versions. The fix is to move the include/asm link - back to include/asm-i386 and to do UML builds someplace else. - - - - 13.5. UML doesn't work when /tmp is an NFS filesystem - - This seems to be a similar situation with the ReiserFS problem above. - Some versions of NFS seems not to handle mmap correctly, which UML - depends on. The workaround is have /tmp be a non-NFS directory. - - - 13.6. UML hangs on boot when compiled with gprof support - - If you build UML with gprof support and, early in the boot, it does - this - - - kernel BUG at page_alloc.c:100! - - - - - you have a buggy gcc. You can work around the problem by removing - UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up - another bug, but that one is fairly hard to reproduce. - - - - 13.7. syslogd dies with a SIGTERM on startup - - The exact boot error depends on the distribution that you're booting, - but Debian produces this: - - - /etc/rc2.d/S10sysklogd: line 49: 93 Terminated - start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD - - - - - This is a syslogd bug. There's a race between a parent process - installing a signal handler and its child sending the signal. See - this uml-devel post for the details. - - - - 13.8. TUN/TAP networking doesn't work on a 2.4 host - - There are a couple of problems which were - name="pointed - out"> by Tim Robinson - - o It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. - The fix is to upgrade to something more recent and then read the - next item. - - o If you see - - - File descriptor in bad state - - - - when you bring up the device inside UML, you have a header mismatch - between the original kernel and the upgraded one. Make /usr/src/linux - point at the new headers. This will only be a problem if you build - uml_net yourself. - - - - 13.9. You can network to the host but not to other machines on the - net - - If you can connect to the host, and the host can connect to UML, but - you cannot connect to any other machines, then you may need to enable - IP Masquerading on the host. Usually this is only experienced when - using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML - networking, rather than the public address space that your host is - connected to. UML does not enable IP Masquerading, so you will need - to create a static rule to enable it: - - - host% - iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE - - - - - Replace eth0 with the interface that you use to talk to the rest of - the world. - - - Documentation on IP Masquerading, and SNAT, can be found at - www.netfilter.org . - - - If you can reach the local net, but not the outside Internet, then - that is usually a routing problem. The UML needs a default route: - - - UML# - route add default gw gateway IP - - - - - The gateway IP can be any machine on the local net that knows how to - reach the outside world. Usually, this is the host or the local net- - work's gateway. - - - Occasionally, we hear from someone who can reach some machines, but - not others on the same net, or who can reach some ports on other - machines, but not others. These are usually caused by strange - firewalling somewhere between the UML and the other box. You track - this down by running tcpdump on every interface the packets travel - over and see where they disappear. When you find a machine that takes - the packets in, but does not send them onward, that's the culprit. - - - - 13.10. I have no root and I want to scream - - Thanks to Birgit Wahlich for telling me about this strange one. It - turns out that there's a limit of six environment variables on the - kernel command line. When that limit is reached or exceeded, argument - processing stops, which means that the 'root=' argument that UML - usually adds is not seen. So, the filesystem has no idea what the - root device is, so it panics. - - - The fix is to put less stuff on the command line. Glomming all your - setup variables into one is probably the best way to go. - - - - 13.11. UML build conflict between ptrace.h and ucontext.h - - On some older systems, /usr/include/asm/ptrace.h and - /usr/include/sys/ucontext.h define the same names. So, when they're - included together, the defines from one completely mess up the parsing - of the other, producing errors like: - /usr/include/sys/ucontext.h:47: parse error before - `10' - - - - - plus a pile of warnings. - - - This is a libc botch, which has since been fixed, and I don't see any - way around it besides upgrading. - - - - 13.12. The UML BogoMips is exactly half the host's BogoMips - - On i386 kernels, there are two ways of running the loop that is used - to calculate the BogoMips rating, using the TSC if it's there or using - a one-instruction loop. The TSC produces twice the BogoMips as the - loop. UML uses the loop, since it has nothing resembling a TSC, and - will get almost exactly the same BogoMips as a host using the loop. - However, on a host with a TSC, its BogoMips will be double the loop - BogoMips, and therefore double the UML BogoMips. - - - - 13.13. When you run UML, it immediately segfaults - - If the host is configured with the 2G/2G address space split, that's - why. See ``UML on 2G/2G hosts'' for the details on getting UML to - run on your host. - - - - 13.14. xterms appear, then immediately disappear - - If you're running an up to date kernel with an old release of - uml_utilities, the port-helper program will not work properly, so - xterms will exit straight after they appear. The solution is to - upgrade to the latest release of uml_utilities. Usually this problem - occurs when you have installed a packaged release of UML then compiled - your own development kernel without upgrading the uml_utilities from - the source distribution. - - - - 13.15. Any other panic, hang, or strange behavior - - If you're seeing truly strange behavior, such as hangs or panics that - happen in random places, or you try running the debugger to see what's - happening and it acts strangely, then it could be a problem in the - host kernel. If you're not running a stock Linus or -ac kernel, then - try that. An early version of the preemption patch and a 2.4.10 SuSE - kernel have caused very strange problems in UML. - - - Otherwise, let me know about it. Send a message to one of the UML - mailing lists - either the developer list - user-mode-linux-devel at - lists dot sourceforge dot net (subscription info) or the user list - - user-mode-linux-user at lists dot sourceforge do net (subscription - info), whichever you prefer. Don't assume that everyone knows about - it and that a fix is imminent. - - - If you want to be super-helpful, read ``Diagnosing Problems'' and - follow the instructions contained therein. - 14. Diagnosing Problems - - - If you get UML to crash, hang, or otherwise misbehave, you should - report this on one of the project mailing lists, either the developer - list - user-mode-linux-devel at lists dot sourceforge dot net - (subscription info) or the user list - user-mode-linux-user at lists - dot sourceforge dot net (subscription info). When you do, it is - likely that I will want more information. So, it would be helpful to - read the stuff below, do whatever is applicable in your case, and - report the results to the list. - - - For any diagnosis, you're going to need to build a debugging kernel. - The binaries from this site aren't debuggable. If you haven't done - this before, read about ``Compiling the kernel and modules'' and - ``Kernel debugging'' UML first. - - - 14.1. Case 1 : Normal kernel panics - - The most common case is for a normal thread to panic. To debug this, - you will need to run it under the debugger (add 'debug' to the command - line). An xterm will start up with gdb running inside it. Continue - it when it stops in start_kernel and make it crash. Now ^C gdb and - - - If the panic was a "Kernel mode fault", then there will be a segv - frame on the stack and I'm going to want some more information. The - stack might look something like this: - - - (UML gdb) backtrace - #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) - at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 - #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 - #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 - #3 0x1009bf38 in __restore () - at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 - #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) - at trap_kern.c:66 - #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 - #6 0x1009bf38 in __restore () - - - - - I'm going to want to see the symbol and line information for the value - of ip in the segv frame. In this case, you would do the following: - - - (UML gdb) i sym 268849158 - - - - - and - - - (UML gdb) i line *268849158 - - - - - The reason for this is the __restore frame right above the segv_han- - dler frame is hiding the frame that actually segfaulted. So, I have - to get that information from the faulting ip. - - - 14.2. Case 2 : Tracing thread panics - - The less common and more painful case is when the tracing thread - panics. In this case, the kernel debugger will be useless because it - needs a healthy tracing thread in order to work. The first thing to - do is get a backtrace from the tracing thread. This is done by - figuring out what its pid is, firing up gdb, and attaching it to that - pid. You can figure out the tracing thread pid by looking at the - first line of the console output, which will look like this: - - - tracing thread pid = 15851 - - - - - or by running ps on the host and finding the line that looks like - this: - - - jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] - - - - - If the panic was 'segfault in signals', then follow the instructions - above for collecting information about the location of the seg fault. - - - If the tracing thread flaked out all by itself, then send that - backtrace in and wait for our crack debugging team to fix the problem. - - - 14.3. Case 3 : Tracing thread panics caused by other threads - - However, there are cases where the misbehavior of another thread - caused the problem. The most common panic of this type is: - - - wait_for_stop failed to wait for to stop with - - - - - In this case, you'll need to get a backtrace from the process men- - tioned in the panic, which is complicated by the fact that the kernel - debugger is defunct and without some fancy footwork, another gdb can't - attach to it. So, this is how the fancy footwork goes: - - In a shell: - - - host% kill -STOP pid - - - - - Run gdb on the tracing thread as described in case 2 and do: - - - (host gdb) call detach(pid) - - - If you get a segfault, do it again. It always works the second time. - - Detach from the tracing thread and attach to that other thread: - - - (host gdb) detach - - - - - - - (host gdb) attach pid - - - - - If gdb hangs when attaching to that process, go back to a shell and - do: - - - host% - kill -CONT pid - - - - - And then get the backtrace: - - - (host gdb) backtrace - - - - - - 14.4. Case 4 : Hangs - - Hangs seem to be fairly rare, but they sometimes happen. When a hang - happens, we need a backtrace from the offending process. Run the - kernel debugger as described in case 1 and get a backtrace. If the - current process is not the idle thread, then send in the backtrace. - You can tell that it's the idle thread if the stack looks like this: - - - #0 0x100b1401 in __libc_nanosleep () - #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 - #2 0x100a546f in do_idle () at process_kern.c:445 - #3 0x100a5508 in cpu_idle () at process_kern.c:471 - #4 0x100ec18f in start_kernel () at init/main.c:592 - #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 - #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 - - - - - If this is the case, then some other process is at fault, and went to - sleep when it shouldn't have. Run ps on the host and figure out which - process should not have gone to sleep and stayed asleep. Then attach - to it with gdb and get a backtrace as described in case 3. - - - - - - - 15. Thanks - - - A number of people have helped this project in various ways, and this - page gives recognition where recognition is due. - - - If you're listed here and you would prefer a real link on your name, - or no link at all, instead of the despammed email address pseudo-link, - let me know. - - - If you're not listed here and you think maybe you should be, please - let me know that as well. I try to get everyone, but sometimes my - bookkeeping lapses and I forget about contributions. - - - 15.1. Code and Documentation - - Rusty Russell - - - o wrote the HOWTO - - o prodded me into making this project official and putting it on - SourceForge - - o came up with the way cool UML logo - - o redid the config process - - - Peter Moulder - Fixed my config and build - processes, and added some useful code to the block driver - - - Bill Stearns - - - o HOWTO updates - - o lots of bug reports - - o lots of testing - - o dedicated a box (uml.ists.dartmouth.edu) to support UML development - - o wrote the mkrootfs script, which allows bootable filesystems of - RPM-based distributions to be cranked out - - o cranked out a large number of filesystems with said script - - - Jim Leu - Wrote the virtual ethernet driver - and associated usermode tools - - Lars Brinkhoff - Contributed the ptrace - proxy from his own project to allow easier - kernel debugging - - - Andrea Arcangeli - Redid some of the early boot - code so that it would work on machines with Large File Support - - - Chris Emerson - Did - the first UML port to Linux/ppc - - - Harald Welte - Wrote the multicast - transport for the network driver - - - Jorgen Cederlof - Added special file support to hostfs - - - Greg Lonnon - Changed the ubd driver - to allow it to layer a COW file on a shared read-only filesystem and - wrote the iomem emulation support - - - Henrik Nordstrom - Provided a variety - of patches, fixes, and clues - - - Lennert Buytenhek - Contributed various patches, a rewrite of the - network driver, the first implementation of the mconsole driver, and - did the bulk of the work needed to get SMP working again. - - - Yon Uriarte - Fixed the TUN/TAP network backend while I slept. - - - Adam Heath - Made a bunch of nice cleanups to the initialization code, - plus various other small patches. - - - Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and - is doing a real nice job of it. He also noticed and fixed a number of - actually and potentially exploitable security holes in uml_net. Plus - the occasional patch. I like patches. - - - James McMechan - James seems to have taken over maintenance of the ubd - driver and is doing a nice job of it. - - - Chandan Kudige - wrote the umlgdb script which automates the reloading - of module symbols. - - - Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, - enabling UML processes to access audio devices on the host. He also - submitted patches for the slip transport and lots of other things. - - - David Coulson - - - o Set up the usermodelinux.org site, - which is a great way of keeping the UML user community on top of - UML goings-on. - - o Site documentation and updates - - o Nifty little UML management daemon UMLd - - - o Lots of testing and bug reports - - - - - 15.2. Flushing out bugs - - - - o Yuri Pudgorodsky - - o Gerald Britton - - o Ian Wehrman - - o Gord Lamb - - o Eugene Koontz - - o John H. Hartman - - o Anders Karlsson - - o Daniel Phillips - - o John Fremlin - - o Rainer Burgstaller - - o James Stevenson - - o Matt Clay - - o Cliff Jefferies - - o Geoff Hoff - - o Lennert Buytenhek - - o Al Viro - - o Frank Klingenhoefer - - o Livio Baldini Soares - - o Jon Burgess - - o Petru Paler - - o Paul - - o Chris Reahard - - o Sverker Nilsson - - o Gong Su - - o johan verrept - - o Bjorn Eriksson - - o Lorenzo Allegrucci - - o Muli Ben-Yehuda - - o David Mansfield - - o Howard Goff - - o Mike Anderson - - o John Byrne - - o Sapan J. Batia - - o Iris Huang - - o Jan Hudec - - o Voluspa - - - - - 15.3. Buglets and clean-ups - - - - o Dave Zarzycki - - o Adam Lazur - - o Boria Feigin - - o Brian J. Murrell - - o JS - - o Roman Zippel - - o Wil Cooley - - o Ayelet Shemesh - - o Will Dyson - - o Sverker Nilsson - - o dvorak - - o v.naga srinivas - - o Shlomi Fish - - o Roger Binns - - o johan verrept - - o MrChuoi - - o Peter Cleve - - o Vincent Guffens - - o Nathan Scott - - o Patrick Caulfield - - o jbearce - - o Catalin Marinas - - o Shane Spencer - - o Zou Min - - - o Ryan Boder - - o Lorenzo Colitti - - o Gwendal Grignou - - o Andre' Breiler - - o Tsutomu Yasuda - - - - 15.4. Case Studies - - - o Jon Wright - - o William McEwan - - o Michael Richardson - - - - 15.5. Other contributions - - - Bill Carr made the Red Hat mkrootfs script - work with RH 6.2. - - Michael Jennings sent in some material which - is now gracing the top of the index page of this site. - - SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com - . The bandwidth there made it possible to - produce most of the filesystems available on the project download - page. - - Laurent Bonnaud took the old grotty - Debian filesystem that I've been distributing and updated it to 2.2. - It is now available by itself here. - - Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make - releases even when Sourceforge is broken. - - Rodrigo de Castro looked at my broken pte code and told me what was - wrong with it, letting me fix a long-standing (several weeks) and - serious set of bugs. - - Chris Reahard built a specialized root filesystem for running a DNS - server jailed inside UML. It's available from the download - page in the Jail - Filesystems section. - - - - - - - - - - - - diff --git a/Documentation/virt/uml/user_mode_linux.rst b/Documentation/virt/uml/user_mode_linux.rst new file mode 100644 index 000000000000..6085d2c0f8a8 --- /dev/null +++ b/Documentation/virt/uml/user_mode_linux.rst @@ -0,0 +1,4460 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +User Mode Linux HOWTO +===================== + +:Author: User Mode Linux Core Team +:Last-updated: Mon Nov 18 14:16:16 EST 2002 + +This document describes the use and abuse of Jeff Dike's User Mode +Linux: a port of the Linux kernel as a normal Intel Linux process. + + +.. Table of Contents + + 1. Introduction + + 1.1 How is User Mode Linux Different? + 1.2 Why Would I Want User Mode Linux? + + 2. Compiling the kernel and modules + + 2.1 Compiling the kernel + 2.2 Compiling and installing kernel modules + 2.3 Compiling and installing uml_utilities + + 3. Running UML and logging in + + 3.1 Running UML + 3.2 Logging in + 3.3 Examples + + 4. UML on 2G/2G hosts + + 4.1 Introduction + 4.2 The problem + 4.3 The solution + + 5. Setting up serial lines and consoles + + 5.1 Specifying the device + 5.2 Specifying the channel + 5.3 Examples + + 6. Setting up the network + + 6.1 General setup + 6.2 Userspace daemons + 6.3 Specifying ethernet addresses + 6.4 UML interface setup + 6.5 Multicast + 6.6 TUN/TAP with the uml_net helper + 6.7 TUN/TAP with a preconfigured tap device + 6.8 Ethertap + 6.9 The switch daemon + 6.10 Slip + 6.11 Slirp + 6.12 pcap + 6.13 Setting up the host yourself + + 7. Sharing Filesystems between Virtual Machines + + 7.1 A warning + 7.2 Using layered block devices + 7.3 Note! + 7.4 Another warning + 7.5 uml_moo : Merging a COW file with its backing file + + 8. Creating filesystems + + 8.1 Create the filesystem file + 8.2 Assign the file to a UML device + 8.3 Creating and mounting the filesystem + + 9. Host file access + + 9.1 Using hostfs + 9.2 hostfs as the root filesystem + 9.3 Building hostfs + + 10. The Management Console + 10.1 version + 10.2 halt and reboot + 10.3 config + 10.4 remove + 10.5 sysrq + 10.6 help + 10.7 cad + 10.8 stop + 10.9 go + + 11. Kernel debugging + + 11.1 Starting the kernel under gdb + 11.2 Examining sleeping processes + 11.3 Running ddd on UML + 11.4 Debugging modules + 11.5 Attaching gdb to the kernel + 11.6 Using alternate debuggers + + 12. Kernel debugging examples + + 12.1 The case of the hung fsck + 12.2 Episode 2: The case of the hung fsck + + 13. What to do when UML doesn't work + + 13.1 Strange compilation errors when you build from source + 13.2 (obsolete) + 13.3 A variety of panics and hangs with /tmp on a reiserfs filesystem + 13.4 The compile fails with errors about conflicting types for 'open', 'dup', and 'waitpid' + 13.5 UML doesn't work when /tmp is an NFS filesystem + 13.6 UML hangs on boot when compiled with gprof support + 13.7 syslogd dies with a SIGTERM on startup + 13.8 TUN/TAP networking doesn't work on a 2.4 host + 13.9 You can network to the host but not to other machines on the net + 13.10 I have no root and I want to scream + 13.11 UML build conflict between ptrace.h and ucontext.h + 13.12 The UML BogoMips is exactly half the host's BogoMips + 13.13 When you run UML, it immediately segfaults + 13.14 xterms appear, then immediately disappear + 13.15 Any other panic, hang, or strange behavior + + 14. Diagnosing Problems + + 14.1 Case 1 : Normal kernel panics + 14.2 Case 2 : Tracing thread panics + 14.3 Case 3 : Tracing thread panics caused by other threads + 14.4 Case 4 : Hangs + + 15. Thanks + + 15.1 Code and Documentation + 15.2 Flushing out bugs + 15.3 Buglets and clean-ups + 15.4 Case Studies + 15.5 Other contributions + + +1. Introduction +================ + + Welcome to User Mode Linux. It's going to be fun. + + + +1.1. How is User Mode Linux Different? +--------------------------------------- + + Normally, the Linux Kernel talks straight to your hardware (video + card, keyboard, hard drives, etc), and any programs which run ask the + kernel to operate the hardware, like so:: + + + + +-----------+-----------+----+ + | Process 1 | Process 2 | ...| + +-----------+-----------+----+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + The User Mode Linux Kernel is different; instead of talking to the + hardware, it talks to a `real` Linux kernel (called the `host kernel` + from now on), like any other program. Programs can then run inside + User-Mode Linux as if they were running under a normal kernel, like + so:: + + + + +----------------+ + | Process 2 | ...| + +-----------+----------------+ + | Process 1 | User-Mode Linux| + +----------------------------+ + | Linux Kernel | + +----------------------------+ + | Hardware | + +----------------------------+ + + + + + +1.2. Why Would I Want User Mode Linux? +--------------------------------------- + + + 1. If User Mode Linux crashes, your host kernel is still fine. + + 2. You can run a usermode kernel as a non-root user. + + 3. You can debug the User Mode Linux like any normal process. + + 4. You can run gprof (profiling) and gcov (coverage testing). + + 5. You can play with your kernel without breaking things. + + 6. You can use it as a sandbox for testing new apps. + + 7. You can try new development kernels safely. + + 8. You can run different distributions simultaneously. + + 9. It's extremely fun. + + + +.. _Compiling_the_kernel_and_modules: + +2. Compiling the kernel and modules +==================================== + + + + +2.1. Compiling the kernel +-------------------------- + + + Compiling the user mode kernel is just like compiling any other + kernel. Let's go through the steps, using 2.4.0-prerelease (current + as of this writing) as an example: + + + 1. Download the latest UML patch from + the download page stop compiling. + + The sources are also available from cvs at the project's cvs page, + which has directions on getting the sources. You can also browse the + CVS pool from there. + + If you get the CVS sources, you will have to check them out into an + empty directory. You will then have to copy each file into the + corresponding directory in the appropriate kernel pool. + + If you don't have the latest kernel pool, you can get the + corresponding user-mode sources with:: + + + host% cvs co -r v_2_3_x linux + + + + + where 'x' is the version in your pool. Note that you will not get the + bug fixes and enhancements that have gone into subsequent releases. + + +2.2. Compiling and installing kernel modules +--------------------------------------------- + + UML modules are built in the same way as the native kernel (with the + exception of the 'ARCH=um' that you always need for UML):: + + + host% make modules ARCH=um + + + + + Any modules that you want to load into this kernel need to be built in + the user-mode pool. Modules from the native kernel won't work. + + You can install them by using ftp or something to copy them into the + virtual machine and dropping them into ``/lib/modules/$(uname -r)``. + + You can also get the kernel build process to install them as follows: + + 1. with the kernel not booted, mount the root filesystem in the top + level of the kernel pool:: + + + host% mount root_fs mnt -o loop + + + + + + + 2. run:: + + + host% + make modules_install INSTALL_MOD_PATH=`pwd`/mnt ARCH=um + + + + + + + 3. unmount the filesystem:: + + + host% umount mnt + + + + + + + 4. boot the kernel on it + + + When the system is booted, you can use insmod as usual to get the + modules into the kernel. A number of things have been loaded into UML + as modules, especially filesystems and network protocols and filters, + so most symbols which need to be exported probably already are. + However, if you do find symbols that need exporting, let us + know, and + they'll be "taken care of". + + + +2.3. Compiling and installing uml_utilities +-------------------------------------------- + + Many features of the UML kernel require a user-space helper program, + so a uml_utilities package is distributed separately from the kernel + patch which provides these helpers. Included within this is: + + - port-helper - Used by consoles which connect to xterms or ports + + - tunctl - Configuration tool to create and delete tap devices + + - uml_net - Setuid binary for automatic tap device configuration + + - uml_switch - User-space virtual switch required for daemon + transport + + The uml_utilities tree is compiled with:: + + + host# + make && make install + + + + + Note that UML kernel patches may require a specific version of the + uml_utilities distribution. If you don't keep up with the mailing + lists, ensure that you have the latest release of uml_utilities if you + are experiencing problems with your UML kernel, particularly when + dealing with consoles or command-line switches to the helper programs + + + + + + + + +3. Running UML and logging in +============================== + + + +3.1. Running UML +----------------- + + It runs on 2.2.15 or later, and all 2.4 kernels. + + + Booting UML is straightforward. Simply run 'linux': it will try to + mount the file ``root_fs`` in the current directory. You do not need to + run it as root. If your root filesystem is not named ``root_fs``, then + you need to put a ``ubd0=root_fs_whatever`` switch on the linux command + line. + + + You will need a filesystem to boot UML from. There are a number + available for download from here . There are also several tools + which can be + used to generate UML-compatible filesystem images from media. + The kernel will boot up and present you with a login prompt. + + +Note: + If the host is configured with a 2G/2G address space split + rather than the usual 3G/1G split, then the packaged UML binaries will + not run. They will immediately segfault. See :ref:`UML_on_2G/2G_hosts` + for the scoop on running UML on your system. + + + +3.2. Logging in +---------------- + + + + The prepackaged filesystems have a root account with password 'root' + and a user account with password 'user'. The login banner will + generally tell you how to log in. So, you log in and you will find + yourself inside a little virtual machine. Our filesystems have a + variety of commands and utilities installed (and it is fairly easy to + add more), so you will have a lot of tools with which to poke around + the system. + + There are a couple of other ways to log in: + + - On a virtual console + + + + Each virtual console that is configured (i.e. the device exists in + /dev and /etc/inittab runs a getty on it) will come up in its own + xterm. If you get tired of the xterms, read + :ref:`setting_up_serial_lines_and_consoles` to see how to attach + the consoles to something else, like host ptys. + + + + - Over the serial line + + + In the boot output, find a line that looks like:: + + + + serial line 0 assigned pty /dev/ptyp1 + + + + + Attach your favorite terminal program to the corresponding tty. I.e. + for minicom, the command would be:: + + + host% minicom -o -p /dev/ttyp1 + + + + + + + - Over the net + + + If the network is running, then you can telnet to the virtual + machine and log in to it. See :ref:`Setting_up_the_network` to learn + about setting up a virtual network. + + When you're done using it, run halt, and the kernel will bring itself + down and the process will exit. + + +3.3. Examples +-------------- + + Here are some examples of UML in action: + + - A login session + + - A virtual network + + + + + + +.. _UML_on_2G/2G_hosts: + +4. UML on 2G/2G hosts +====================== + + + + +4.1. Introduction +------------------ + + + Most Linux machines are configured so that the kernel occupies the + upper 1G (0xc0000000 - 0xffffffff) of the 4G address space and + processes use the lower 3G (0x00000000 - 0xbfffffff). However, some + machine are configured with a 2G/2G split, with the kernel occupying + the upper 2G (0x80000000 - 0xffffffff) and processes using the lower + 2G (0x00000000 - 0x7fffffff). + + + + +4.2. The problem +----------------- + + + The prebuilt UML binaries on this site will not run on 2G/2G hosts + because UML occupies the upper .5G of the 3G process address space + (0xa0000000 - 0xbfffffff). Obviously, on 2G/2G hosts, this is right + in the middle of the kernel address space, so UML won't even load - it + will immediately segfault. + + + + +4.3. The solution +------------------ + + + The fix for this is to rebuild UML from source after enabling + CONFIG_HOST_2G_2G (under 'General Setup'). This will cause UML to + load itself in the top .5G of that smaller process address space, + where it will run fine. See :ref:`Compiling_the_kernel_and_modules` if + you need help building UML from source. + + + + + + + +.. _setting_up_serial_lines_and_consoles: + + +5. Setting up serial lines and consoles +======================================== + + + It is possible to attach UML serial lines and consoles to many types + of host I/O channels by specifying them on the command line. + + + You can attach them to host ptys, ttys, file descriptors, and ports. + This allows you to do things like: + + - have a UML console appear on an unused host console, + + - hook two virtual machines together by having one attach to a pty + and having the other attach to the corresponding tty + + - make a virtual machine accessible from the net by attaching a + console to a port on the host. + + + The general format of the command line option is ``device=channel``. + + + +5.1. Specifying the device +--------------------------- + + Devices are specified with "con" or "ssl" (console or serial line, + respectively), optionally with a device number if you are talking + about a specific device. + + + Using just "con" or "ssl" describes all of the consoles or serial + lines. If you want to talk about console #3 or serial line #10, they + would be "con3" and "ssl10", respectively. + + + A specific device name will override a less general "con=" or "ssl=". + So, for example, you can assign a pty to each of the serial lines + except for the first two like this:: + + + ssl=pty ssl0=tty:/dev/tty0 ssl1=tty:/dev/tty1 + + + + + The specificity of the device name is all that matters; order on the + command line is irrelevant. + + + +5.2. Specifying the channel +---------------------------- + + There are a number of different types of channels to attach a UML + device to, each with a different way of specifying exactly what to + attach to. + + - pseudo-terminals - device=pty pts terminals - device=pts + + + This will cause UML to allocate a free host pseudo-terminal for the + device. The terminal that it got will be announced in the boot + log. You access it by attaching a terminal program to the + corresponding tty: + + - screen /dev/pts/n + + - screen /dev/ttyxx + + - minicom -o -p /dev/ttyxx - minicom seems not able to handle pts + devices + + - kermit - start it up, 'open' the device, then 'connect' + + + + + + - terminals - device=tty:tty device file + + + This will make UML attach the device to the specified tty (i.e:: + + + con1=tty:/dev/tty3 + + + + + will attach UML's console 1 to the host's /dev/tty3). If the tty that + you specify is the slave end of a tty/pty pair, something else must + have already opened the corresponding pty in order for this to work. + + + + + + - xterms - device=xterm + + + UML will run an xterm and the device will be attached to it. + + + + + + - Port - device=port:port number + + + This will attach the UML devices to the specified host port. + Attaching console 1 to the host's port 9000 would be done like + this:: + + + con1=port:9000 + + + + + Attaching all the serial lines to that port would be done similarly:: + + + ssl=port:9000 + + + + + You access these devices by telnetting to that port. Each active + telnet session gets a different device. If there are more telnets to a + port than UML devices attached to it, then the extra telnet sessions + will block until an existing telnet detaches, or until another device + becomes active (i.e. by being activated in /etc/inittab). + + This channel has the advantage that you can both attach multiple UML + devices to it and know how to access them without reading the UML boot + log. It is also unique in allowing access to a UML from remote + machines without requiring that the UML be networked. This could be + useful in allowing public access to UMLs because they would be + accessible from the net, but wouldn't need any kind of network + filtering or access control because they would have no network access. + + + If you attach the main console to a portal, then the UML boot will + appear to hang. In reality, it's waiting for a telnet to connect, at + which point the boot will proceed. + + + + + + - already-existing file descriptors - device=file descriptor + + + If you set up a file descriptor on the UML command line, you can + attach a UML device to it. This is most commonly used to put the + main console back on stdin and stdout after assigning all the other + consoles to something else:: + + + con0=fd:0,fd:1 con=pts + + + + + + + + + - Nothing - device=null + + + This allows the device to be opened, in contrast to 'none', but + reads will block, and writes will succeed and the data will be + thrown out. + + + + + + - None - device=none + + + This causes the device to disappear. + + + + You can also specify different input and output channels for a device + by putting a comma between them:: + + + ssl3=tty:/dev/tty2,xterm + + + + + will cause serial line 3 to accept input on the host's /dev/tty2 and + display output on an xterm. That's a silly example - the most common + use of this syntax is to reattach the main console to stdin and stdout + as shown above. + + + If you decide to move the main console away from stdin/stdout, the + initial boot output will appear in the terminal that you're running + UML in. However, once the console driver has been officially + initialized, then the boot output will start appearing wherever you + specified that console 0 should be. That device will receive all + subsequent output. + + + +5.3. Examples +-------------- + + There are a number of interesting things you can do with this + capability. + + + First, this is how you get rid of those bleeding console xterms by + attaching them to host ptys:: + + + con=pty con0=fd:0,fd:1 + + + + + This will make a UML console take over an unused host virtual console, + so that when you switch to it, you will see the UML login prompt + rather than the host login prompt:: + + + con1=tty:/dev/tty6 + + + + + You can attach two virtual machines together with what amounts to a + serial line as follows: + + Run one UML with a serial line attached to a pty:: + + + ssl1=pty + + + + + Look at the boot log to see what pty it got (this example will assume + that it got /dev/ptyp1). + + Boot the other UML with a serial line attached to the corresponding + tty:: + + + ssl1=tty:/dev/ttyp1 + + + + + Log in, make sure that it has no getty on that serial line, attach a + terminal program like minicom to it, and you should see the login + prompt of the other virtual machine. + + +.. _setting_up_the_network: + +6. Setting up the network +========================== + + + + This page describes how to set up the various transports and to + provide a UML instance with network access to the host, other machines + on the local net, and the rest of the net. + + + As of 2.4.5, UML networking has been completely redone to make it much + easier to set up, fix bugs, and add new features. + + + There is a new helper, uml_net, which does the host setup that + requires root privileges. + + + There are currently five transport types available for a UML virtual + machine to exchange packets with other hosts: + + - ethertap + + - TUN/TAP + + - Multicast + + - a switch daemon + + - slip + + - slirp + + - pcap + + The TUN/TAP, ethertap, slip, and slirp transports allow a UML + instance to exchange packets with the host. They may be directed + to the host or the host may just act as a router to provide access + to other physical or virtual machines. + + + The pcap transport is a synthetic read-only interface, using the + libpcap binary to collect packets from interfaces on the host and + filter them. This is useful for building preconfigured traffic + monitors or sniffers. + + + The daemon and multicast transports provide a completely virtual + network to other virtual machines. This network is completely + disconnected from the physical network unless one of the virtual + machines on it is acting as a gateway. + + + With so many host transports, which one should you use? Here's when + you should use each one: + + - ethertap - if you want access to the host networking and it is + running 2.2 + + - TUN/TAP - if you want access to the host networking and it is + running 2.4. Also, the TUN/TAP transport is able to use a + preconfigured device, allowing it to avoid using the setuid uml_net + helper, which is a security advantage. + + - Multicast - if you want a purely virtual network and you don't want + to set up anything but the UML + + - a switch daemon - if you want a purely virtual network and you + don't mind running the daemon in order to get somewhat better + performance + + - slip - there is no particular reason to run the slip backend unless + ethertap and TUN/TAP are just not available for some reason + + - slirp - if you don't have root access on the host to setup + networking, or if you don't want to allocate an IP to your UML + + - pcap - not much use for actual network connectivity, but great for + monitoring traffic on the host + + Ethertap is available on 2.4 and works fine. TUN/TAP is preferred + to it because it has better performance and ethertap is officially + considered obsolete in 2.4. Also, the root helper only needs to + run occasionally for TUN/TAP, rather than handling every packet, as + it does with ethertap. This is a slight security advantage since + it provides fewer opportunities for a nasty UML user to somehow + exploit the helper's root privileges. + + +6.1. General setup +------------------- + + First, you must have the virtual network enabled in your UML. If are + running a prebuilt kernel from this site, everything is already + enabled. If you build the kernel yourself, under the "Network device + support" menu, enable "Network device support", and then the three + transports. + + + The next step is to provide a network device to the virtual machine. + This is done by describing it on the kernel command line. + + The general format is:: + + + eth = , + + + + + For example, a virtual ethernet device may be attached to a host + ethertap device as follows:: + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + This sets up eth0 inside the virtual machine to attach itself to the + host /dev/tap0, assigns it an ethernet address, and assigns the host + tap0 interface an IP address. + + + + Note that the IP address you assign to the host end of the tap device + must be different than the IP you assign to the eth device inside UML. + If you are short on IPs and don't want to consume two per UML, then + you can reuse the host's eth IP address for the host ends of the tap + devices. Internally, the UMLs must still get unique IPs for their eth + devices. You can also give the UMLs non-routable IPs (192.168.x.x or + 10.x.x.x) and have the host masquerade them. This will let outgoing + connections work, but incoming connections won't without more work, + such as port forwarding from the host. + Also note that when you configure the host side of an interface, it is + only acting as a gateway. It will respond to pings sent to it + locally, but is not useful to do that since it's a host interface. + You are not talking to the UML when you ping that interface and get a + response. + + + You can also add devices to a UML and remove them at runtime. See the + :ref:`The_Management_Console` page for details. + + + The sections below describe this in more detail. + + + Once you've decided how you're going to set up the devices, you boot + UML, log in, configure the UML side of the devices, and set up routes + to the outside world. At that point, you will be able to talk to any + other machines, physical or virtual, on the net. + + + If ifconfig inside UML fails and the network refuses to come up, run + tell you what went wrong. + + + +6.2. Userspace daemons +----------------------- + + You will likely need the setuid helper, or the switch daemon, or both. + They are both installed with the RPM and deb, so if you've installed + either, you can skip the rest of this section. + + + If not, then you need to check them out of CVS, build them, and + install them. The helper is uml_net, in CVS /tools/uml_net, and the + daemon is uml_switch, in CVS /tools/uml_router. They are both built + with a plain 'make'. Both need to be installed in a directory that's + in your path - /usr/bin is recommend. On top of that, uml_net needs + to be setuid root. + + + +6.3. Specifying ethernet addresses +----------------------------------- + + Below, you will see that the TUN/TAP, ethertap, and daemon interfaces + allow you to specify hardware addresses for the virtual ethernet + devices. This is generally not necessary. If you don't have a + specific reason to do it, you probably shouldn't. If one is not + specified on the command line, the driver will assign one based on the + device IP address. It will provide the address fe:fd:nn:nn:nn:nn + where nn.nn.nn.nn is the device IP address. This is nearly always + sufficient to guarantee a unique hardware address for the device. A + couple of exceptions are: + + - Another set of virtual ethernet devices are on the same network and + they are assigned hardware addresses using a different scheme which + may conflict with the UML IP address-based scheme + + - You aren't going to use the device for IP networking, so you don't + assign the device an IP address + + If you let the driver provide the hardware address, you should make + sure that the device IP address is known before the interface is + brought up. So, inside UML, this will guarantee that:: + + + + UML# + ifconfig eth0 192.168.0.250 up + + + + + If you decide to assign the hardware address yourself, make sure that + the first byte of the address is even. Addresses with an odd first + byte are broadcast addresses, which you don't want assigned to a + device. + + + +6.4. UML interface setup +------------------------- + + Once the network devices have been described on the command line, you + should boot UML and log in. + + + The first thing to do is bring the interface up:: + + + UML# ifconfig ethn ip-address up + + + + + You should be able to ping the host at this point. + + + To reach the rest of the world, you should set a default route to the + host:: + + + UML# route add default gw host ip + + + + + Again, with host ip of 192.168.0.4:: + + + UML# route add default gw 192.168.0.4 + + + + + This page used to recommend setting a network route to your local net. + This is wrong, because it will cause UML to try to figure out hardware + addresses of the local machines by arping on the interface to the + host. Since that interface is basically a single strand of ethernet + with two nodes on it (UML and the host) and arp requests don't cross + networks, they will fail to elicit any responses. So, what you want + is for UML to just blindly throw all packets at the host and let it + figure out what to do with them, which is what leaving out the network + route and adding the default route does. + + + Note: If you can't communicate with other hosts on your physical + ethernet, it's probably because of a network route that's + automatically set up. If you run 'route -n' and see a route that + looks like this:: + + + + + Destination Gateway Genmask Flags Metric Ref Use Iface + 192.168.0.0 0.0.0.0 255.255.255.0 U 0 0 0 eth0 + + + + + with a mask that's not 255.255.255.255, then replace it with a route + to your host:: + + + UML# + route del -net 192.168.0.0 dev eth0 netmask 255.255.255.0 + + + UML# + route add -host 192.168.0.4 dev eth0 + + + + + This, plus the default route to the host, will allow UML to exchange + packets with any machine on your ethernet. + + + +6.5. Multicast +--------------- + + The simplest way to set up a virtual network between multiple UMLs is + to use the mcast transport. This was written by Harald Welte and is + present in UML version 2.4.5-5um and later. Your system must have + multicast enabled in the kernel and there must be a multicast-capable + network device on the host. Normally, this is eth0, but if there is + no ethernet card on the host, then you will likely get strange error + messages when you bring the device up inside UML. + + + To use it, run two UMLs with:: + + + eth0=mcast + + + + + on their command lines. Log in, configure the ethernet device in each + machine with different IP addresses:: + + + UML1# ifconfig eth0 192.168.0.254 + + + UML2# ifconfig eth0 192.168.0.253 + + + + + and they should be able to talk to each other. + + The full set of command line options for this transport are:: + + + + ethn=mcast,ethernet address,multicast + address,multicast port,ttl + + + + + Harald's original README is here and explains these in detail, as well as + some other issues. + + There is also a related point-to-point only "ucast" transport. + This is useful when your network does not support multicast, and + all network connections are simple point to point links. + + The full set of command line options for this transport are:: + + + ethn=ucast,ethernet address,remote address,listen port,remote port + + + + +6.6. TUN/TAP with the uml_net helper +------------------------------------- + + TUN/TAP is the preferred mechanism on 2.4 to exchange packets with the + host. The TUN/TAP backend has been in UML since 2.4.9-3um. + + + The easiest way to get up and running is to let the setuid uml_net + helper do the host setup for you. This involves insmod-ing the tun.o + module if necessary, configuring the device, and setting up IP + forwarding, routing, and proxy arp. If you are new to UML networking, + do this first. If you're concerned about the security implications of + the setuid helper, use it to get up and running, then read the next + section to see how to have UML use a preconfigured tap device, which + avoids the use of uml_net. + + + If you specify an IP address for the host side of the device, the + uml_net helper will do all necessary setup on the host - the only + requirement is that TUN/TAP be available, either built in to the host + kernel or as the tun.o module. + + The format of the command line switch to attach a device to a TUN/TAP + device is:: + + + eth =tuntap,,, + + + + + For example, this argument will attach the UML's eth0 to the next + available tap device and assign an ethernet address to it based on its + IP address:: + + + eth0=tuntap,,,192.168.0.254 + + + + + + + Note that the IP address that must be used for the eth device inside + UML is fixed by the routing and proxy arp that is set up on the + TUN/TAP device on the host. You can use a different one, but it won't + work because reply packets won't reach the UML. This is a feature. + It prevents a nasty UML user from doing things like setting the UML IP + to the same as the network's nameserver or mail server. + + + There are a couple potential problems with running the TUN/TAP + transport on a 2.4 host kernel + + - TUN/TAP seems not to work on 2.4.3 and earlier. Upgrade the host + kernel or use the ethertap transport. + + - With an upgraded kernel, TUN/TAP may fail with:: + + + File descriptor in bad state + + + + + This is due to a header mismatch between the upgraded kernel and the + kernel that was originally installed on the machine. The fix is to + make sure that /usr/src/linux points to the headers for the running + kernel. + + These were pointed out by Tim Robinson in + name="this uml-user post"> . + + + +6.7. TUN/TAP with a preconfigured tap device +--------------------------------------------- + + If you prefer not to have UML use uml_net (which is somewhat + insecure), with UML 2.4.17-11, you can set up a TUN/TAP device + beforehand. The setup needs to be done as root, but once that's done, + there is no need for root assistance. Setting up the device is done + as follows: + + - Create the device with tunctl (available from the UML utilities + tarball):: + + + + + host# tunctl -u uid + + + + + where uid is the user id or username that UML will be run as. This + will tell you what device was created. + + - Configure the device IP (change IP addresses and device name to + suit):: + + + + + host# ifconfig tap0 192.168.0.254 up + + + + + + - Set up routing and arping if desired - this is my recipe, there are + other ways of doing the same thing:: + + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/ip_forward' + + host# + route add -host 192.168.0.253 dev tap0 + + host# + bash -c 'echo 1 > /proc/sys/net/ipv4/conf/tap0/proxy_arp' + + host# + arp -Ds 192.168.0.253 eth0 pub + + + + + Note that this must be done every time the host boots - this configu- + ration is not stored across host reboots. So, it's probably a good + idea to stick it in an rc file. An even better idea would be a little + utility which reads the information from a config file and sets up + devices at boot time. + + - Rather than using up two IPs and ARPing for one of them, you can + also provide direct access to your LAN by the UML by using a + bridge:: + + + host# + brctl addbr br0 + + + host# + ifconfig eth0 0.0.0.0 promisc up + + + host# + ifconfig tap0 0.0.0.0 promisc up + + + host# + ifconfig br0 192.168.0.1 netmask 255.255.255.0 up + + + host# + brctl stp br0 off + + + host# + brctl setfd br0 1 + + + host# + brctl sethello br0 1 + + + host# + brctl addif br0 eth0 + + + host# + brctl addif br0 tap0 + + + + + Note that 'br0' should be setup using ifconfig with the existing IP + address of eth0, as eth0 no longer has its own IP. + + - + + + Also, the /dev/net/tun device must be writable by the user running + UML in order for the UML to use the device that's been configured + for it. The simplest thing to do is:: + + + host# chmod 666 /dev/net/tun + + + + + Making it world-writable looks bad, but it seems not to be + exploitable as a security hole. However, it does allow anyone to cre- + ate useless tap devices (useless because they can't configure them), + which is a DOS attack. A somewhat more secure alternative would to be + to create a group containing all the users who have preconfigured tap + devices and chgrp /dev/net/tun to that group with mode 664 or 660. + + + - Once the device is set up, run UML with 'eth0=tuntap,device name' + (i.e. 'eth0=tuntap,tap0') on the command line (or do it with the + mconsole config command). + + - Bring the eth device up in UML and you're in business. + + If you don't want that tap device any more, you can make it non- + persistent with:: + + + host# tunctl -d tap device + + + + + Finally, tunctl has a -b (for brief mode) switch which causes it to + output only the name of the tap device it created. This makes it + suitable for capture by a script:: + + + host# TAP=`tunctl -u 1000 -b` + + + + + + +6.8. Ethertap +-------------- + + Ethertap is the general mechanism on 2.2 for userspace processes to + exchange packets with the kernel. + + + + To use this transport, you need to describe the virtual network device + on the UML command line. The general format for this is:: + + + eth =ethertap, , , + + + + + So, the previous example:: + + + eth0=ethertap,tap0,fe:fd:0:0:0:1,192.168.0.254 + + + + + attaches the UML eth0 device to the host /dev/tap0, assigns it the + ethernet address fe:fd:0:0:0:1, and assigns the IP address + 192.168.0.254 to the tap device. + + + + The tap device is mandatory, but the others are optional. If the + ethernet address is omitted, one will be assigned to it. + + + The presence of the tap IP address will cause the helper to run and do + whatever host setup is needed to allow the virtual machine to + communicate with the outside world. If you're not sure you know what + you're doing, this is the way to go. + + + If it is absent, then you must configure the tap device and whatever + arping and routing you will need on the host. However, even in this + case, the uml_net helper still needs to be in your path and it must be + setuid root if you're not running UML as root. This is because the + tap device doesn't support SIGIO, which UML needs in order to use + something as a source of input. So, the helper is used as a + convenient asynchronous IO thread. + + If you're using the uml_net helper, you can ignore the following host + setup - uml_net will do it for you. You just need to make sure you + have ethertap available, either built in to the host kernel or + available as a module. + + + If you want to set things up yourself, you need to make sure that the + appropriate /dev entry exists. If it doesn't, become root and create + it as follows:: + + + mknod /dev/tap c 36 + 16 + + + + + For example, this is how to create /dev/tap0:: + + + mknod /dev/tap0 c 36 0 + 16 + + + + + You also need to make sure that the host kernel has ethertap support. + If ethertap is enabled as a module, you apparently need to insmod + ethertap once for each ethertap device you want to enable. So,:: + + + host# + insmod ethertap + + + + + will give you the tap0 interface. To get the tap1 interface, you need + to run:: + + + host# + insmod ethertap unit=1 -o ethertap1 + + + + + + + +6.9. The switch daemon +----------------------- + + Note: This is the daemon formerly known as uml_router, but which was + renamed so the network weenies of the world would stop growling at me. + + + The switch daemon, uml_switch, provides a mechanism for creating a + totally virtual network. By default, it provides no connection to the + host network (but see -tap, below). + + + The first thing you need to do is run the daemon. Running it with no + arguments will make it listen on a default pair of unix domain + sockets. + + + If you want it to listen on a different pair of sockets, use:: + + + -unix control socket data socket + + + + + + If you want it to act as a hub rather than a switch, use:: + + + -hub + + + + + + If you want the switch to be connected to host networking (allowing + the umls to get access to the outside world through the host), use:: + + + -tap tap0 + + + + + + Note that the tap device must be preconfigured (see "TUN/TAP with a + preconfigured tap device", above). If you're using a different tap + device than tap0, specify that instead of tap0. + + + uml_switch can be backgrounded as follows:: + + + host% + uml_switch [ options ] < /dev/null > /dev/null + + + + + The reason it doesn't background by default is that it listens to + stdin for EOF. When it sees that, it exits. + + + The general format of the kernel command line switch is:: + + + + ethn=daemon,ethernet address,socket + type,control socket,data socket + + + + + You can leave off everything except the 'daemon'. You only need to + specify the ethernet address if the one that will be assigned to it + isn't acceptable for some reason. The rest of the arguments describe + how to communicate with the daemon. You should only specify them if + you told the daemon to use different sockets than the default. So, if + you ran the daemon with no arguments, running the UML on the same + machine with:: + + eth0=daemon + + + + + will cause the eth0 driver to attach itself to the daemon correctly. + + + +6.10. Slip +----------- + + Slip is another, less general, mechanism for a process to communicate + with the host networking. In contrast to the ethertap interface, + which exchanges ethernet frames with the host and can be used to + transport any higher-level protocol, it can only be used to transport + IP. + + + The general format of the command line switch is:: + + + + ethn=slip,slip IP + + + + + The slip IP argument is the IP address that will be assigned to the + host end of the slip device. If it is specified, the helper will run + and will set up the host so that the virtual machine can reach it and + the rest of the network. + + + There are some oddities with this interface that you should be aware + of. You should only specify one slip device on a given virtual + machine, and its name inside UML will be 'umn', not 'eth0' or whatever + you specified on the command line. These problems will be fixed at + some point. + + + +6.11. Slirp +------------ + + slirp uses an external program, usually /usr/bin/slirp, to provide IP + only networking connectivity through the host. This is similar to IP + masquerading with a firewall, although the translation is performed in + user-space, rather than by the kernel. As slirp does not set up any + interfaces on the host, or changes routing, slirp does not require + root access or setuid binaries on the host. + + + The general format of the command line switch for slirp is:: + + + + ethn=slirp,ethernet address,slirp path + + + + + The ethernet address is optional, as UML will set up the interface + with an ethernet address based upon the initial IP address of the + interface. The slirp path is generally /usr/bin/slirp, although it + will depend on distribution. + + + The slirp program can have a number of options passed to the command + line and we can't add them to the UML command line, as they will be + parsed incorrectly. Instead, a wrapper shell script can be written or + the options inserted into the /.slirprc file. More information on + all of the slirp options can be found in its man pages. + + + The eth0 interface on UML should be set up with the IP 10.2.0.15, + although you can use anything as long as it is not used by a network + you will be connecting to. The default route on UML should be set to + use:: + + + UML# + route add default dev eth0 + + + + + slirp provides a number of useful IP addresses which can be used by + UML, such as 10.0.2.3 which is an alias for the DNS server specified + in /etc/resolv.conf on the host or the IP given in the 'dns' option + for slirp. + + + Even with a baudrate setting higher than 115200, the slirp connection + is limited to 115200. If you need it to go faster, the slirp binary + needs to be compiled with FULL_BOLT defined in config.h. + + + +6.12. pcap +----------- + + The pcap transport is attached to a UML ethernet device on the command + line or with uml_mconsole with the following syntax:: + + + + ethn=pcap,host interface,filter + expression,option1,option2 + + + + + The expression and options are optional. + + + The interface is whatever network device on the host you want to + sniff. The expression is a pcap filter expression, which is also what + tcpdump uses, so if you know how to specify tcpdump filters, you will + use the same expressions here. The options are up to two of + 'promisc', control whether pcap puts the host interface into + promiscuous mode. 'optimize' and 'nooptimize' control whether the pcap + expression optimizer is used. + + + Example:: + + + + eth0=pcap,eth0,tcp + + eth1=pcap,eth0,!tcp + + + + will cause the UML eth0 to emit all tcp packets on the host eth0 and + the UML eth1 to emit all non-tcp packets on the host eth0. + + + +6.13. Setting up the host yourself +----------------------------------- + + If you don't specify an address for the host side of the ethertap or + slip device, UML won't do any setup on the host. So this is what is + needed to get things working (the examples use a host-side IP of + 192.168.0.251 and a UML-side IP of 192.168.0.250 - adjust to suit your + own network): + + - The device needs to be configured with its IP address. Tap devices + are also configured with an mtu of 1484. Slip devices are + configured with a point-to-point address pointing at the UML ip + address:: + + + host# ifconfig tap0 arp mtu 1484 192.168.0.251 up + + + host# + ifconfig sl0 192.168.0.251 pointopoint 192.168.0.250 up + + + + + + - If a tap device is being set up, a route is set to the UML IP:: + + + UML# route add -host 192.168.0.250 gw 192.168.0.251 + + + + + + - To allow other hosts on your network to see the virtual machine, + proxy arp is set up for it:: + + + host# arp -Ds 192.168.0.250 eth0 pub + + + + + + - Finally, the host is set up to route packets:: + + + host# echo 1 > /proc/sys/net/ipv4/ip_forward + + + + + + + + + + +7. Sharing Filesystems between Virtual Machines +================================================ + + + + +7.1. A warning +--------------- + + Don't attempt to share filesystems simply by booting two UMLs from the + same file. That's the same thing as booting two physical machines + from a shared disk. It will result in filesystem corruption. + + + +7.2. Using layered block devices +--------------------------------- + + The way to share a filesystem between two virtual machines is to use + the copy-on-write (COW) layering capability of the ubd block driver. + As of 2.4.6-2um, the driver supports layering a read-write private + device over a read-only shared device. A machine's writes are stored + in the private device, while reads come from either device - the + private one if the requested block is valid in it, the shared one if + not. Using this scheme, the majority of data which is unchanged is + shared between an arbitrary number of virtual machines, each of which + has a much smaller file containing the changes that it has made. With + a large number of UMLs booting from a large root filesystem, this + leads to a huge disk space saving. It will also help performance, + since the host will be able to cache the shared data using a much + smaller amount of memory, so UML disk requests will be served from the + host's memory rather than its disks. + + + + + To add a copy-on-write layer to an existing block device file, simply + add the name of the COW file to the appropriate ubd switch:: + + + ubd0=root_fs_cow,root_fs_debian_22 + + + + + where 'root_fs_cow' is the private COW file and 'root_fs_debian_22' is + the existing shared filesystem. The COW file need not exist. If it + doesn't, the driver will create and initialize it. Once the COW file + has been initialized, it can be used on its own on the command line:: + + + ubd0=root_fs_cow + + + + + The name of the backing file is stored in the COW file header, so it + would be redundant to continue specifying it on the command line. + + + +7.3. Note! +----------- + + When checking the size of the COW file in order to see the gobs of + space that you're saving, make sure you use 'ls -ls' to see the actual + disk consumption rather than the length of the file. The COW file is + sparse, so the length will be very different from the disk usage. + Here is a 'ls -l' of a COW file and backing file from one boot and + shutdown:: + + host% ls -l cow.debian debian2.2 + -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Doesn't look like much saved space, does it? Well, here's 'ls -ls':: + + + host% ls -ls cow.debian debian2.2 + 880 -rw-r--r-- 1 jdike jdike 492504064 Aug 6 21:16 cow.debian + 525832 -rwxrw-rw- 1 jdike jdike 537919488 Aug 6 20:42 debian2.2 + + + + + Now, you can see that the COW file has less than a meg of disk, rather + than 492 meg. + + + +7.4. Another warning +--------------------- + + Once a filesystem is being used as a readonly backing file for a COW + file, do not boot directly from it or modify it in any way. Doing so + will invalidate any COW files that are using it. The mtime and size + of the backing file are stored in the COW file header at its creation, + and they must continue to match. If they don't, the driver will + refuse to use the COW file. + + + + + If you attempt to evade this restriction by changing either the + backing file or the COW header by hand, you will get a corrupted + filesystem. + + + + + Among other things, this means that upgrading the distribution in a + backing file and expecting that all of the COW files using it will see + the upgrade will not work. + + + + +7.5. uml_moo : Merging a COW file with its backing file +-------------------------------------------------------- + + Depending on how you use UML and COW devices, it may be advisable to + merge the changes in the COW file into the backing file every once in + a while. + + + + + The utility that does this is uml_moo. Its usage is:: + + + host% uml_moo COW file new backing file + + + + + There's no need to specify the backing file since that information is + already in the COW file header. If you're paranoid, boot the new + merged file, and if you're happy with it, move it over the old backing + file. + + + + + uml_moo creates a new backing file by default as a safety measure. It + also has a destructive merge option which will merge the COW file + directly into its current backing file. This is really only usable + when the backing file only has one COW file associated with it. If + there are multiple COWs associated with a backing file, a -d merge of + one of them will invalidate all of the others. However, it is + convenient if you're short of disk space, and it should also be + noticeably faster than a non-destructive merge. + + + + + uml_moo is installed with the UML deb and RPM. If you didn't install + UML from one of those packages, you can also get it from the UML + utilities tar file in tools/moo. + + + + + + + + +8. Creating filesystems +======================== + + + You may want to create and mount new UML filesystems, either because + your root filesystem isn't large enough or because you want to use a + filesystem other than ext2. + + + This was written on the occasion of reiserfs being included in the + 2.4.1 kernel pool, and therefore the 2.4.1 UML, so the examples will + talk about reiserfs. This information is generic, and the examples + should be easy to translate to the filesystem of your choice. + + +8.1. Create the filesystem file +================================ + + dd is your friend. All you need to do is tell dd to create an empty + file of the appropriate size. I usually make it sparse to save time + and to avoid allocating disk space until it's actually used. For + example, the following command will create a sparse 100 meg file full + of zeroes:: + + + host% + dd if=/dev/zero of=new_filesystem seek=100 count=1 bs=1M + + + + + + + 8.2. Assign the file to a UML device + + Add an argument like the following to the UML command line:: + + ubd4=new_filesystem + + + + + making sure that you use an unassigned ubd device number. + + + + 8.3. Creating and mounting the filesystem + + Make sure that the filesystem is available, either by being built into + the kernel, or available as a module, then boot up UML and log in. If + the root filesystem doesn't have the filesystem utilities (mkfs, fsck, + etc), then get them into UML by way of the net or hostfs. + + + Make the new filesystem on the device assigned to the new file:: + + + host# mkreiserfs /dev/ubd/4 + + + <----------- MKREISERFSv2 -----------> + + ReiserFS version 3.6.25 + Block size 4096 bytes + Block count 25856 + Used blocks 8212 + Journal - 8192 blocks (18-8209), journal header is in block 8210 + Bitmaps: 17 + Root block 8211 + Hash function "r5" + ATTENTION: ALL DATA WILL BE LOST ON '/dev/ubd/4'! (y/n)y + journal size 8192 (from 18) + Initializing journal - 0%....20%....40%....60%....80%....100% + Syncing..done. + + + + + Now, mount it:: + + + UML# + mount /dev/ubd/4 /mnt + + + + + and you're in business. + + + + + + + + + +9. Host file access +==================== + + + If you want to access files on the host machine from inside UML, you + can treat it as a separate machine and either nfs mount directories + from the host or copy files into the virtual machine with scp or rcp. + However, since UML is running on the host, it can access those + files just like any other process and make them available inside the + virtual machine without needing to use the network. + + + This is now possible with the hostfs virtual filesystem. With it, you + can mount a host directory into the UML filesystem and access the + files contained in it just as you would on the host. + + +9.1. Using hostfs +------------------ + + To begin with, make sure that hostfs is available inside the virtual + machine with:: + + + UML# cat /proc/filesystems + + + + . hostfs should be listed. If it's not, either rebuild the kernel + with hostfs configured into it or make sure that hostfs is built as a + module and available inside the virtual machine, and insmod it. + + + Now all you need to do is run mount:: + + + UML# mount none /mnt/host -t hostfs + + + + + will mount the host's / on the virtual machine's /mnt/host. + + + If you don't want to mount the host root directory, then you can + specify a subdirectory to mount with the -o switch to mount:: + + + UML# mount none /mnt/home -t hostfs -o /home + + + + + will mount the hosts's /home on the virtual machine's /mnt/home. + + + +9.2. hostfs as the root filesystem +----------------------------------- + + It's possible to boot from a directory hierarchy on the host using + hostfs rather than using the standard filesystem in a file. + + To start, you need that hierarchy. The easiest way is to loop mount + an existing root_fs file:: + + + host# mount root_fs uml_root_dir -o loop + + + + + You need to change the filesystem type of / in etc/fstab to be + 'hostfs', so that line looks like this:: + + /dev/ubd/0 / hostfs defaults 1 1 + + + + + Then you need to chown to yourself all the files in that directory + that are owned by root. This worked for me:: + + + host# find . -uid 0 -exec chown jdike {} \; + + + + + Next, make sure that your UML kernel has hostfs compiled in, not as a + module. Then run UML with the boot device pointing at that directory:: + + + ubd0=/path/to/uml/root/directory + + + + + UML should then boot as it does normally. + + +9.3. Building hostfs +--------------------- + + If you need to build hostfs because it's not in your kernel, you have + two choices: + + + + - Compiling hostfs into the kernel: + + + Reconfigure the kernel and set the 'Host filesystem' option under + + + - Compiling hostfs as a module: + + + Reconfigure the kernel and set the 'Host filesystem' option under + be in arch/um/fs/hostfs/hostfs.o. Install that in + ``/lib/modules/$(uname -r)/fs`` in the virtual machine, boot it up, and:: + + + UML# insmod hostfs + + +.. _The_Management_Console: + +10. The Management Console +=========================== + + + + The UML management console is a low-level interface to the kernel, + somewhat like the i386 SysRq interface. Since there is a full-blown + operating system under UML, there is much greater flexibility possible + than with the SysRq mechanism. + + + There are a number of things you can do with the mconsole interface: + + - get the kernel version + + - add and remove devices + + - halt or reboot the machine + + - Send SysRq commands + + - Pause and resume the UML + + + You need the mconsole client (uml_mconsole) which is present in CVS + (/tools/mconsole) in 2.4.5-9um and later, and will be in the RPM in + 2.4.6. + + + You also need CONFIG_MCONSOLE (under 'General Setup') enabled in UML. + When you boot UML, you'll see a line like:: + + + mconsole initialized on /home/jdike/.uml/umlNJ32yL/mconsole + + + + + If you specify a unique machine id one the UML command line, i.e.:: + + + umid=debian + + + + + you'll see this:: + + + mconsole initialized on /home/jdike/.uml/debian/mconsole + + + + + That file is the socket that uml_mconsole will use to communicate with + UML. Run it with either the umid or the full path as its argument:: + + + host% uml_mconsole debian + + + + + or:: + + + host% uml_mconsole /home/jdike/.uml/debian/mconsole + + + + + You'll get a prompt, at which you can run one of these commands: + + - version + + - halt + + - reboot + + - config + + - remove + + - sysrq + + - help + + - cad + + - stop + + - go + + +10.1. version +-------------- + + This takes no arguments. It prints the UML version:: + + + (mconsole) version + OK Linux usermode 2.4.5-9um #1 Wed Jun 20 22:47:08 EDT 2001 i686 + + + + + There are a couple actual uses for this. It's a simple no-op which + can be used to check that a UML is running. It's also a way of + sending an interrupt to the UML. This is sometimes useful on SMP + hosts, where there's a bug which causes signals to UML to be lost, + often causing it to appear to hang. Sending such a UML the mconsole + version command is a good way to 'wake it up' before networking has + been enabled, as it does not do anything to the function of the UML. + + + +10.2. halt and reboot +---------------------- + + These take no arguments. They shut the machine down immediately, with + no syncing of disks and no clean shutdown of userspace. So, they are + pretty close to crashing the machine:: + + + (mconsole) halt + OK + + + + + + +10.3. config +------------- + + "config" adds a new device to the virtual machine. Currently the ubd + and network drivers support this. It takes one argument, which is the + device to add, with the same syntax as the kernel command line:: + + + + + (mconsole) + config ubd3=/home/jdike/incoming/roots/root_fs_debian22 + + OK + (mconsole) config eth1=mcast + OK + + + + + + +10.4. remove +------------- + + "remove" deletes a device from the system. Its argument is just the + name of the device to be removed. The device must be idle in whatever + sense the driver considers necessary. In the case of the ubd driver, + the removed block device must not be mounted, swapped on, or otherwise + open, and in the case of the network driver, the device must be down:: + + + (mconsole) remove ubd3 + OK + (mconsole) remove eth1 + OK + + + + + + +10.5. sysrq +------------ + + This takes one argument, which is a single letter. It calls the + generic kernel's SysRq driver, which does whatever is called for by + that argument. See the SysRq documentation in + Documentation/admin-guide/sysrq.rst in your favorite kernel tree to + see what letters are valid and what they do. + + + +10.6. help +----------- + + "help" returns a string listing the valid commands and what each one + does. + + + +10.7. cad +---------- + + This invokes the Ctl-Alt-Del action on init. What exactly this ends + up doing is up to /etc/inittab. Normally, it reboots the machine. + With UML, this is usually not desired, so if a halt would be better, + then find the section of inittab that looks like this:: + + + # What to do when CTRL-ALT-DEL is pressed. + ca:12345:ctrlaltdel:/sbin/shutdown -t1 -a -r now + + + + + and change the command to halt. + + + +10.8. stop +----------- + + This puts the UML in a loop reading mconsole requests until a 'go' + mconsole command is received. This is very useful for making backups + of UML filesystems, as the UML can be stopped, then synced via 'sysrq + s', so that everything is written to the filesystem. You can then copy + the filesystem and then send the UML 'go' via mconsole. + + + Note that a UML running with more than one CPU will have problems + after you send the 'stop' command, as only one CPU will be held in a + mconsole loop and all others will continue as normal. This is a bug, + and will be fixed. + + + +10.9. go +--------- + + This resumes a UML after being paused by a 'stop' command. Note that + when the UML has resumed, TCP connections may have timed out and if + the UML is paused for a long period of time, crond might go a little + crazy, running all the jobs it didn't do earlier. + + + + + + +.. _Kernel_debugging: + +11. Kernel debugging +===================== + + + Note: The interface that makes debugging, as described here, possible + is present in 2.4.0-test6 kernels and later. + + + Since the user-mode kernel runs as a normal Linux process, it is + possible to debug it with gdb almost like any other process. It is + slightly different because the kernel's threads are already being + ptraced for system call interception, so gdb can't ptrace them. + However, a mechanism has been added to work around that problem. + + + In order to debug the kernel, you need build it from source. See + :ref:`Compiling_the_kernel_and_modules` for information on doing that. + Make sure that you enable CONFIG_DEBUGSYM and CONFIG_PT_PROXY during + the config. These will compile the kernel with ``-g``, and enable the + ptrace proxy so that gdb works with UML, respectively. + + + + +11.1. Starting the kernel under gdb +------------------------------------ + + You can have the kernel running under the control of gdb from the + beginning by putting 'debug' on the command line. You will get an + xterm with gdb running inside it. The kernel will send some commands + to gdb which will leave it stopped at the beginning of start_kernel. + At this point, you can get things going with 'next', 'step', or + 'cont'. + + + There is a transcript of a debugging session here , with breakpoints being set in the scheduler and in an + interrupt handler. + + +11.2. Examining sleeping processes +----------------------------------- + + + Not every bug is evident in the currently running process. Sometimes, + processes hang in the kernel when they shouldn't because they've + deadlocked on a semaphore or something similar. In this case, when + you ^C gdb and get a backtrace, you will see the idle thread, which + isn't very relevant. + + + What you want is the stack of whatever process is sleeping when it + shouldn't be. You need to figure out which process that is, which is + generally fairly easy. Then you need to get its host process id, + which you can do either by looking at ps on the host or at + task.thread.extern_pid in gdb. + + + Now what you do is this: + + - detach from the current thread:: + + + (UML gdb) det + + + + + + - attach to the thread you are interested in:: + + + (UML gdb) att + + + + + + - look at its stack and anything else of interest:: + + + (UML gdb) bt + + + + + Note that you can't do anything at this point that requires that a + process execute, e.g. calling a function + + - when you're done looking at that process, reattach to the current + thread and continue it:: + + + (UML gdb) + att 1 + + + (UML gdb) + c + + + + + Here, specifying any pid which is not the process id of a UML thread + will cause gdb to reattach to the current thread. I commonly use 1, + but any other invalid pid would work. + + + +11.3. Running ddd on UML +------------------------- + + ddd works on UML, but requires a special kludge. The process goes + like this: + + - Start ddd:: + + + host% ddd linux + + + + + + - With ps, get the pid of the gdb that ddd started. You can ask the + gdb to tell you, but for some reason that confuses things and + causes a hang. + + - run UML with 'debug=parent gdb-pid=' added to the command line + - it will just sit there after you hit return + + - type 'att 1' to the ddd gdb and you will see something like:: + + + 0xa013dc51 in __kill () + + + (gdb) + + + + + + - At this point, type 'c', UML will boot up, and you can use ddd just + as you do on any other process. + + + +11.4. Debugging modules +------------------------ + + + gdb has support for debugging code which is dynamically loaded into + the process. This support is what is needed to debug kernel modules + under UML. + + + Using that support is somewhat complicated. You have to tell gdb what + object file you just loaded into UML and where in memory it is. Then, + it can read the symbol table, and figure out where all the symbols are + from the load address that you provided. It gets more interesting + when you load the module again (i.e. after an rmmod). You have to + tell gdb to forget about all its symbols, including the main UML ones + for some reason, then load then all back in again. + + + There's an easy way and a hard way to do this. The easy way is to use + the umlgdb expect script written by Chandan Kudige. It basically + automates the process for you. + + + First, you must tell it where your modules are. There is a list in + the script that looks like this:: + + set MODULE_PATHS { + "fat" "/usr/src/uml/linux-2.4.18/fs/fat/fat.o" + "isofs" "/usr/src/uml/linux-2.4.18/fs/isofs/isofs.o" + "minix" "/usr/src/uml/linux-2.4.18/fs/minix/minix.o" + } + + + + + You change that to list the names and paths of the modules that you + are going to debug. Then you run it from the toplevel directory of + your UML pool and it basically tells you what to do:: + + + ******** GDB pid is 21903 ******** + Start UML as: ./linux debug gdb-pid=21903 + + + + GNU gdb 5.0rh-5 Red Hat Linux 7.1 + Copyright 2001 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) b sys_init_module + Breakpoint 1 at 0xa0011923: file module.c, line 349. + (gdb) att 1 + + + + + After you run UML and it sits there doing nothing, you hit return at + the 'att 1' and continue it:: + + + Attaching to program: /home/jdike/linux/2.4/um/./linux, process 1 + 0xa00f4221 in __kill () + (UML gdb) c + Continuing. + + + + + At this point, you debug normally. When you insmod something, the + expect magic will kick in and you'll see something like:: + + + *** Module hostfs loaded *** + Breakpoint 1, sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 349 char *name, *n_name, *name_tmp = NULL; + (UML gdb) finish + Run till exit from #0 sys_init_module (name_user=0x805abb0 "hostfs", + mod_user=0x8070e00) at module.c:349 + 0xa00e2e23 in execute_syscall (r=0xa8140284) at syscall_kern.c:411 + 411 else res = EXECUTE_SYSCALL(syscall, regs); + Value returned is $1 = 0 + (UML gdb) + p/x (int)module_list + module_list->size_of_struct + + $2 = 0xa9021054 + (UML gdb) symbol-file ./linux + Load new symbol table from "./linux"? (y or n) y + Reading symbols from ./linux... + done. + (UML gdb) + add-symbol-file /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o 0xa9021054 + + add symbol table from file "/home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o" at + .text_addr = 0xa9021054 + (y or n) y + + Reading symbols from /home/jdike/linux/2.4/um/arch/um/fs/hostfs/hostfs.o... + done. + (UML gdb) p *module_list + $1 = {size_of_struct = 84, next = 0xa0178720, name = 0xa9022de0 "hostfs", + size = 9016, uc = {usecount = {counter = 0}, pad = 0}, flags = 1, + nsyms = 57, ndeps = 0, syms = 0xa9023170, deps = 0x0, refs = 0x0, + init = 0xa90221f0 , cleanup = 0xa902222c , + ex_table_start = 0x0, ex_table_end = 0x0, persist_start = 0x0, + persist_end = 0x0, can_unload = 0, runsize = 0, kallsyms_start = 0x0, + kallsyms_end = 0x0, + archdata_start = 0x1b855
, + archdata_end = 0xe5890000
, + kernel_data = 0xf689c35d
} + >> Finished loading symbols for hostfs ... + + + + + That's the easy way. It's highly recommended. The hard way is + described below in case you're interested in what's going on. + + + Boot the kernel under the debugger and load the module with insmod or + modprobe. With gdb, do:: + + + (UML gdb) p module_list + + + + + This is a list of modules that have been loaded into the kernel, with + the most recently loaded module first. Normally, the module you want + is at module_list. If it's not, walk down the next links, looking at + the name fields until find the module you want to debug. Take the + address of that structure, and add module.size_of_struct (which in + 2.4.10 kernels is 96 (0x60)) to it. Gdb can make this hard addition + for you :-):: + + + + (UML gdb) + printf "%#x\n", (int)module_list module_list->size_of_struct + + + + + The offset from the module start occasionally changes (before 2.4.0, + it was module.size_of_struct + 4), so it's a good idea to check the + init and cleanup addresses once in a while, as describe below. Now + do:: + + + (UML gdb) + add-symbol-file /path/to/module/on/host that_address + + + + + Tell gdb you really want to do it, and you're in business. + + + If there's any doubt that you got the offset right, like breakpoints + appear not to work, or they're appearing in the wrong place, you can + check it by looking at the module structure. The init and cleanup + fields should look like:: + + + init = 0x588066b0 , cleanup = 0x588066c0 + + + + + with no offsets on the symbol names. If the names are right, but they + are offset, then the offset tells you how much you need to add to the + address you gave to add-symbol-file. + + + When you want to load in a new version of the module, you need to get + gdb to forget about the old one. The only way I've found to do that + is to tell gdb to forget about all symbols that it knows about:: + + + (UML gdb) symbol-file + + + + + Then reload the symbols from the kernel binary:: + + + (UML gdb) symbol-file /path/to/kernel + + + + + and repeat the process above. You'll also need to re-enable break- + points. They were disabled when you dumped all the symbols because + gdb couldn't figure out where they should go. + + + +11.5. Attaching gdb to the kernel +---------------------------------- + + If you don't have the kernel running under gdb, you can attach gdb to + it later by sending the tracing thread a SIGUSR1. The first line of + the console output identifies its pid:: + + tracing thread pid = 20093 + + + + + When you send it the signal:: + + + host% kill -USR1 20093 + + + + + you will get an xterm with gdb running in it. + + + If you have the mconsole compiled into UML, then the mconsole client + can be used to start gdb:: + + + (mconsole) (mconsole) config gdb=xterm + + + + + will fire up an xterm with gdb running in it. + + + +11.6. Using alternate debuggers +-------------------------------- + + UML has support for attaching to an already running debugger rather + than starting gdb itself. This is present in CVS as of 17 Apr 2001. + I sent it to Alan for inclusion in the ac tree, and it will be in my + 2.4.4 release. + + + This is useful when gdb is a subprocess of some UI, such as emacs or + ddd. It can also be used to run debuggers other than gdb on UML. + Below is an example of using strace as an alternate debugger. + + + To do this, you need to get the pid of the debugger and pass it in + with the + + + If you are using gdb under some UI, then tell it to 'att 1', and + you'll find yourself attached to UML. + + + If you are using something other than gdb as your debugger, then + you'll need to get it to do the equivalent of 'att 1' if it doesn't do + it automatically. + + + An example of an alternate debugger is strace. You can strace the + actual kernel as follows: + + - Run the following in a shell:: + + + host% + sh -c 'echo pid=$$; echo -n hit return; read x; exec strace -p 1 -o strace.out' + + + + - Run UML with 'debug' and 'gdb-pid=' with the pid printed out + by the previous command + + - Hit return in the shell, and UML will start running, and strace + output will start accumulating in the output file. + + Note that this is different from running:: + + + host% strace ./linux + + + + + That will strace only the main UML thread, the tracing thread, which + doesn't do any of the actual kernel work. It just oversees the vir- + tual machine. In contrast, using strace as described above will show + you the low-level activity of the virtual machine. + + + + + +12. Kernel debugging examples +============================== + +12.1. The case of the hung fsck +-------------------------------- + + When booting up the kernel, fsck failed, and dropped me into a shell + to fix things up. I ran fsck -y, which hung:: + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 was not cleanly unmounted, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Inode 19780, i_blocks is 1548, should be 540. Fix? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + + The standard drill in this sort of situation is to fire up gdb on the + signal thread, which, in this case, was pid 1935. In another window, + I run gdb and attach pid 1935:: + + + ~/linux/2.3.26/um 1016: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + + (gdb) att 1935 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1935 + 0x100756d9 in __wait4 () + + + Let's see what's currently running:: + + + + (gdb) p current_task.pid + $1 = 0 + + + + + + It's the idle thread, which means that fsck went to sleep for some + reason and never woke up. + + + Let's guess that the last process in the process list is fsck:: + + + + (gdb) p current_task.prev_task.comm + $13 = "fsck.ext2\000\000\000\000\000\000" + + + + + + It is, so let's see what it thinks it's up to:: + + + + (gdb) p current_task.prev_task.thread + $14 = {extern_pid = 1980, tracing = 0, want_tracing = 0, forking = 0, + kernel_stack_page = 0, signal_stack = 1342627840, syscall = {id = 4, args = { + 3, 134973440, 1024, 0, 1024}, have_result = 0, result = 50590720}, + request = {op = 2, u = {exec = {ip = 1350467584, sp = 2952789424}, fork = { + regs = {1350467584, 2952789424, 0 }, sigstack = 0, + pid = 0}, switch_to = 0x507e8000, thread = {proc = 0x507e8000, + arg = 0xaffffdb0, flags = 0, new_pid = 0}, input_request = { + op = 1350467584, fd = -1342177872, proc = 0, pid = 0}}}} + + + + The interesting things here are the fact that its .thread.syscall.id + is __NR_write (see the big switch in arch/um/kernel/syscall_kern.c or + the defines in include/asm-um/arch/unistd.h), and that it never + returned. Also, its .request.op is OP_SWITCH (see + arch/um/include/user_util.h). These mean that it went into a write, + and, for some reason, called schedule(). + + + The fact that it never returned from write means that its stack should + be fairly interesting. Its pid is 1980 (.thread.extern_pid). That + process is being ptraced by the signal thread, so it must be detached + before gdb can attach it:: + + + + (gdb) call detach(1980) + + Program received signal SIGSEGV, Segmentation fault. + + The program being debugged stopped while in a function called from GDB. + When the function (detach) is done executing, GDB will silently + stop (instead of continuing to evaluate the expression containing + the function call). + (gdb) call detach(1980) + $15 = 0 + + + The first detach segfaults for some reason, and the second one + succeeds. + + + Now I detach from the signal thread, attach to the fsck thread, and + look at its stack:: + + + (gdb) det + Detaching from program: /home/dike/linux/2.3.26/um/linux Pid 1935 + (gdb) att 1980 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 1980 + 0x10070451 in __kill () + (gdb) bt + #0 0x10070451 in __kill () + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + #4 0x10001d12 in schedule () at core.c:777 + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #9 + #10 0x10155404 in errno () + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + #12 0x1006c5d8 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #13 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + #14 + #15 0xc0fd in ?? () + #16 0x10016647 in sys_write (fd=3, + buf=0x80b8800
, count=1024) + at read_write.c:159 + #17 0x1006d5b3 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #18 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #19 + #20 0x400dc8b0 in ?? () + + + + + + The interesting things here are: + + - There are two segfaults on this stack (frames 9 and 14) + + - The first faulting address (frame 11) is 0x50000800:: + + (gdb) p (void *)1342179328 + $16 = (void *) 0x50000800 + + + + + + The initial faulting address is interesting because it is on the idle + thread's stack. I had been seeing the idle thread segfault for no + apparent reason, and the cause looked like stack corruption. In hopes + of catching the culprit in the act, I had turned off all protections + to that stack while the idle thread wasn't running. This apparently + tripped that trap. + + + However, the more immediate problem is that second segfault and I'm + going to concentrate on that. First, I want to see where the fault + happened, so I have to go look at the sigcontent struct in frame 8:: + + + + (gdb) up + #1 0x10068ccd in usr1_pid (pid=1980) at process.c:30 + 30 kill(pid, SIGUSR1); + (gdb) + #2 0x1006a03f in _switch_to (prev=0x50072000, next=0x507e8000) + at process_kern.c:156 + 156 usr1_pid(getpid()); + (gdb) + #3 0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000) + at process_kern.c:161 + 161 _switch_to(prev, next); + (gdb) + #4 0x10001d12 in schedule () at core.c:777 + 777 switch_to(prev, next, prev); + (gdb) + #5 0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71 + 71 schedule(); + (gdb) + #6 0x1006aa10 in __down_failed () at semaphore.c:157 + 157 } + (gdb) + #7 0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) + #8 0x1006c5ec in kern_segv_handler (sig=11) at trap_user.c:182 + 182 segv_handler(sc); + (gdb) p *sc + Cannot access memory at address 0x0. + + + + + That's not very useful, so I'll try a more manual method:: + + + (gdb) p *((struct sigcontext *) (&sig + 1)) + $19 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 1350378548, ebp = 1342630440, + esp = 1342630420, ebx = 1348150624, edx = 1280, ecx = 0, eax = 0, + trapno = 14, err = 4, eip = 268480945, cs = 35, __csh = 0, eflags = 66118, + esp_at_signal = 1342630420, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1280} + + + + The ip is in handle_mm_fault:: + + + (gdb) p (void *)268480945 + $20 = (void *) 0x1000b1b1 + (gdb) i sym $20 + handle_mm_fault + 57 in section .text + + + + + + Specifically, it's in pte_alloc:: + + + (gdb) i line *$20 + Line 124 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b1 + and ends at 0x1000b1b7 . + + + + + + To find where in handle_mm_fault this is, I'll jump forward in the + code until I see an address in that procedure:: + + + + (gdb) i line *0x1000b1c0 + Line 126 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1b7 + and ends at 0x1000b1c3 . + (gdb) i line *0x1000b1d0 + Line 131 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1d0 + and ends at 0x1000b1da . + (gdb) i line *0x1000b1e0 + Line 61 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1da + and ends at 0x1000b1e1 . + (gdb) i line *0x1000b1f0 + Line 134 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b1f0 + and ends at 0x1000b200 . + (gdb) i line *0x1000b200 + Line 135 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b200 + and ends at 0x1000b208 . + (gdb) i line *0x1000b210 + Line 139 of "/home/dike/linux/2.3.26/um/include/asm/pgalloc.h" + starts at address 0x1000b210 + and ends at 0x1000b219 . + (gdb) i line *0x1000b220 + Line 1168 of "memory.c" starts at address 0x1000b21e + and ends at 0x1000b222 . + + + + + + Something is apparently wrong with the page tables or vma_structs, so + lets go back to frame 11 and have a look at them:: + + + + #11 0x1006c0aa in segv (address=1342179328, is_write=2) at trap_kern.c:50 + 50 handle_mm_fault(current, vma, address, is_write); + (gdb) call pgd_offset_proc(vma->vm_mm, address) + $22 = (pgd_t *) 0x80a548c + + + + + + That's pretty bogus. Page tables aren't supposed to be in process + text or data areas. Let's see what's in the vma:: + + + (gdb) p *vma + $23 = {vm_mm = 0x507d2434, vm_start = 0, vm_end = 134512640, + vm_next = 0x80a4f8c, vm_page_prot = {pgprot = 0}, vm_flags = 31200, + vm_avl_height = 2058, vm_avl_left = 0x80a8c94, vm_avl_right = 0x80d1000, + vm_next_share = 0xaffffdb0, vm_pprev_share = 0xaffffe63, + vm_ops = 0xaffffe7a, vm_pgoff = 2952789626, vm_file = 0xafffffec, + vm_private_data = 0x62} + (gdb) p *vma.vm_mm + $24 = {mmap = 0x507d2434, mmap_avl = 0x0, mmap_cache = 0x8048000, + pgd = 0x80a4f8c, mm_users = {counter = 0}, mm_count = {counter = 134904288}, + map_count = 134909076, mmap_sem = {count = {counter = 135073792}, + sleepers = -1342177872, wait = {lock = , + task_list = {next = 0xaffffe63, prev = 0xaffffe7a}, + __magic = -1342177670, __creator = -1342177300}, __magic = 98}, + page_table_lock = {}, context = 138, start_code = 0, end_code = 0, + start_data = 0, end_data = 0, start_brk = 0, brk = 0, start_stack = 0, + arg_start = 0, arg_end = 0, env_start = 0, env_end = 0, rss = 1350381536, + total_vm = 0, locked_vm = 0, def_flags = 0, cpu_vm_mask = 0, swap_cnt = 0, + swap_address = 0, segments = 0x0} + + + + This also pretty bogus. With all of the 0x80xxxxx and 0xaffffxxx + addresses, this is looking like a stack was plonked down on top of + these structures. Maybe it's a stack overflow from the next page:: + + + (gdb) p vma + $25 = (struct vm_area_struct *) 0x507d2434 + + + + That's towards the lower quarter of the page, so that would have to + have been pretty heavy stack overflow:: + + + (gdb) x/100x $25 + 0x507d2434: 0x507d2434 0x00000000 0x08048000 0x080a4f8c + 0x507d2444: 0x00000000 0x080a79e0 0x080a8c94 0x080d1000 + 0x507d2454: 0xaffffdb0 0xaffffe63 0xaffffe7a 0xaffffe7a + 0x507d2464: 0xafffffec 0x00000062 0x0000008a 0x00000000 + 0x507d2474: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2484: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2494: 0x00000000 0x00000000 0x507d2fe0 0x00000000 + 0x507d24a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24b4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24c4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24d4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24e4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d24f4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2504: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2514: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2524: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2534: 0x00000000 0x00000000 0x507d25dc 0x00000000 + 0x507d2544: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2554: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2564: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2574: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2584: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d2594: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25a4: 0x00000000 0x00000000 0x00000000 0x00000000 + 0x507d25b4: 0x00000000 0x00000000 0x00000000 0x00000000 + + + + It's not stack overflow. The only "stack-like" piece of this data is + the vma_struct itself. + + + At this point, I don't see any avenues to pursue, so I just have to + admit that I have no idea what's going on. What I will do, though, is + stick a trap on the segfault handler which will stop if it sees any + writes to the idle thread's stack. That was the thing that happened + first, and it may be that if I can catch it immediately, what's going + on will be somewhat clearer. + + +12.2. Episode 2: The case of the hung fsck +------------------------------------------- + + After setting a trap in the SEGV handler for accesses to the signal + thread's stack, I reran the kernel. + + + fsck hung again, this time by hitting the trap:: + + + + Setting hostname uml [ OK ] + Checking root filesystem + /dev/fhd0 contains a file system with errors, check forced. + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. + + /dev/fhd0: UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY. + (i.e., without -a or -p options) + [ FAILED ] + + *** An error occurred during the file system check. + *** Dropping you to a shell; the system will reboot + *** when you leave the shell. + Give root password for maintenance + (or type Control-D for normal startup): + + [root@uml /root]# fsck -y /dev/fhd0 + fsck -y /dev/fhd0 + Parallelizing fsck version 1.14 (9-Jan-1999) + e2fsck 1.14, 9-Jan-1999 for EXT2 FS 0.5b, 95/08/09 + /dev/fhd0 contains a file system with errors, check forced. + Pass 1: Checking inodes, blocks, and sizes + Error reading block 86894 (Attempt to read block from filesystem resulted in short read) while reading indirect blocks of inode 19780. Ignore error? yes + + Pass 2: Checking directory structure + Error reading block 49405 (Attempt to read block from filesystem resulted in short read). Ignore error? yes + + Directory inode 11858, block 0, offset 0: directory corrupted + Salvage? yes + + Missing '.' in directory inode 11858. + Fix? yes + + Missing '..' in directory inode 11858. + Fix? yes + + Untested (4127) [100fe44c]: trap_kern.c line 31 + + + + + + I need to get the signal thread to detach from pid 4127 so that I can + attach to it with gdb. This is done by sending it a SIGUSR1, which is + caught by the signal thread, which detaches the process:: + + + kill -USR1 4127 + + + + + + Now I can run gdb on it:: + + + ~/linux/2.3.26/um 1034: gdb linux + GNU gdb 4.17.0.11 with Linux support + Copyright 1998 Free Software Foundation, Inc. + GDB is free software, covered by the GNU General Public License, and you are + welcome to change it and/or distribute copies of it under certain conditions. + Type "show copying" to see the conditions. + There is absolutely no warranty for GDB. Type "show warranty" for details. + This GDB was configured as "i386-redhat-linux"... + (gdb) att 4127 + Attaching to program `/home/dike/linux/2.3.26/um/linux', Pid 4127 + 0x10075891 in __libc_nanosleep () + + + + + + The backtrace shows that it was in a write and that the fault address + (address in frame 3) is 0x50000800, which is right in the middle of + the signal thread's stack page:: + + + (gdb) bt + #0 0x10075891 in __libc_nanosleep () + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + #2 0x1006ce9a in stop () at user_util.c:191 + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + #5 0x1006c63c in kern_segv_handler (sig=11) at trap_user.c:182 + #6 + #7 0xc0fd in ?? () + #8 0x10016647 in sys_write (fd=3, buf=0x80b8800 "R.", count=1024) + at read_write.c:159 + #9 0x1006d603 in execute_syscall (syscall=4, args=0x5006ef08) + at syscall_kern.c:254 + #10 0x1006af87 in really_do_syscall (sig=12) at syscall_user.c:35 + #11 + #12 0x400dc8b0 in ?? () + #13 + #14 0x400dc8b0 in ?? () + #15 0x80545fd in ?? () + #16 0x804daae in ?? () + #17 0x8054334 in ?? () + #18 0x804d23e in ?? () + #19 0x8049632 in ?? () + #20 0x80491d2 in ?? () + #21 0x80596b5 in ?? () + (gdb) p (void *)1342179328 + $3 = (void *) 0x50000800 + + + + Going up the stack to the segv_handler frame and looking at where in + the code the access happened shows that it happened near line 110 of + block_dev.c:: + + + + (gdb) up + #1 0x1007584d in __sleep (seconds=1000000) + at ../sysdeps/unix/sysv/linux/sleep.c:78 + ../sysdeps/unix/sysv/linux/sleep.c:78: No such file or directory. + (gdb) + #2 0x1006ce9a in stop () at user_util.c:191 + 191 while(1) sleep(1000000); + (gdb) + #3 0x1006bf88 in segv (address=1342179328, is_write=2) at trap_kern.c:31 + 31 KERN_UNTESTED(); + (gdb) + #4 0x1006c628 in segv_handler (sc=0x5006eaf8) at trap_user.c:174 + 174 segv(sc->cr2, sc->err & 2); + (gdb) p *sc + $1 = {gs = 0, __gsh = 0, fs = 0, __fsh = 0, es = 43, __esh = 0, ds = 43, + __dsh = 0, edi = 1342179328, esi = 134973440, ebp = 1342631484, + esp = 1342630864, ebx = 256, edx = 0, ecx = 256, eax = 1024, trapno = 14, + err = 6, eip = 268550834, cs = 35, __csh = 0, eflags = 66070, + esp_at_signal = 1342630864, ss = 43, __ssh = 0, fpstate = 0x0, oldmask = 0, + cr2 = 1342179328} + (gdb) p (void *)268550834 + $2 = (void *) 0x1001c2b2 + (gdb) i sym $2 + block_write + 1090 in section .text + (gdb) i line *$2 + Line 209 of "/home/dike/linux/2.3.26/um/include/asm/arch/string.h" + starts at address 0x1001c2a1 + and ends at 0x1001c2bf . + (gdb) i line *0x1001c2c0 + Line 110 of "block_dev.c" starts at address 0x1001c2bf + and ends at 0x1001c2e3 . + + + + Looking at the source shows that the fault happened during a call to + copy_from_user to copy the data into the kernel:: + + + 107 count -= chars; + 108 copy_from_user(p,buf,chars); + 109 p += chars; + 110 buf += chars; + + + + p is the pointer which must contain 0x50000800, since buf contains + 0x80b8800 (frame 8 above). It is defined as:: + + + p = offset + bh->b_data; + + + + + + I need to figure out what bh is, and it just so happens that bh is + passed as an argument to mark_buffer_uptodate and mark_buffer_dirty a + few lines later, so I do a little disassembly:: + + + (gdb) disas 0x1001c2bf 0x1001c2e0 + Dump of assembler code from 0x1001c2bf to 0x1001c2d0: + 0x1001c2bf : addl %eax,0xc(%ebp) + 0x1001c2c2 : movl 0xfffffdd4(%ebp),%edx + 0x1001c2c8 : btsl $0x0,0x18(%edx) + 0x1001c2cd : btsl $0x1,0x18(%edx) + 0x1001c2d2 : sbbl %ecx,%ecx + 0x1001c2d4 : testl %ecx,%ecx + 0x1001c2d6 : jne 0x1001c2e3 + 0x1001c2d8 : pushl $0x0 + 0x1001c2da : pushl %edx + 0x1001c2db : call 0x1001819c <__mark_buffer_dirty> + End of assembler dump. + + + + + + At that point, bh is in %edx (address 0x1001c2da), which is calculated + at 0x1001c2c2 as %ebp + 0xfffffdd4, so I figure exactly what that is, + taking %ebp from the sigcontext_struct above:: + + + (gdb) p (void *)1342631484 + $5 = (void *) 0x5006ee3c + (gdb) p 0x5006ee3c+0xfffffdd4 + $6 = 1342630928 + (gdb) p (void *)$6 + $7 = (void *) 0x5006ec10 + (gdb) p *((void **)$7) + $8 = (void *) 0x50100200 + + + + + + Now, I look at the structure to see what's in it, and particularly, + what its b_data field contains:: + + + (gdb) p *((struct buffer_head *)0x50100200) + $13 = {b_next = 0x50289380, b_blocknr = 49405, b_size = 1024, b_list = 0, + b_dev = 15872, b_count = {counter = 1}, b_rdev = 15872, b_state = 24, + b_flushtime = 0, b_next_free = 0x501001a0, b_prev_free = 0x50100260, + b_this_page = 0x501001a0, b_reqnext = 0x0, b_pprev = 0x507fcf58, + b_data = 0x50000800 "", b_page = 0x50004000, + b_end_io = 0x10017f60 , b_dev_id = 0x0, + b_rsector = 98810, b_wait = {lock = , + task_list = {next = 0x50100248, prev = 0x50100248}, __magic = 1343226448, + __creator = 0}, b_kiobuf = 0x0} + + + + + + The b_data field is indeed 0x50000800, so the question becomes how + that happened. The rest of the structure looks fine, so this probably + is not a case of data corruption. It happened on purpose somehow. + + + The b_page field is a pointer to the page_struct representing the + 0x50000000 page. Looking at it shows the kernel's idea of the state + of that page:: + + + + (gdb) p *$13.b_page + $17 = {list = {next = 0x50004a5c, prev = 0x100c5174}, mapping = 0x0, + index = 0, next_hash = 0x0, count = {counter = 1}, flags = 132, lru = { + next = 0x50008460, prev = 0x50019350}, wait = { + lock = , task_list = {next = 0x50004024, + prev = 0x50004024}, __magic = 1342193708, __creator = 0}, + pprev_hash = 0x0, buffers = 0x501002c0, virtual = 1342177280, + zone = 0x100c5160} + + + + + + Some sanity-checking: the virtual field shows the "virtual" address of + this page, which in this kernel is the same as its "physical" address, + and the page_struct itself should be mem_map[0], since it represents + the first page of memory:: + + + + (gdb) p (void *)1342177280 + $18 = (void *) 0x50000000 + (gdb) p mem_map + $19 = (mem_map_t *) 0x50004000 + + + + + + These check out fine. + + + Now to check out the page_struct itself. In particular, the flags + field shows whether the page is considered free or not:: + + + (gdb) p (void *)132 + $21 = (void *) 0x84 + + + + + + The "reserved" bit is the high bit, which is definitely not set, so + the kernel considers the signal stack page to be free and available to + be used. + + + At this point, I jump to conclusions and start looking at my early + boot code, because that's where that page is supposed to be reserved. + + + In my setup_arch procedure, I have the following code which looks just + fine:: + + + + bootmap_size = init_bootmem(start_pfn, end_pfn - start_pfn); + free_bootmem(__pa(low_physmem) + bootmap_size, high_physmem - low_physmem); + + + + + + Two stack pages have already been allocated, and low_physmem points to + the third page, which is the beginning of free memory. + The init_bootmem call declares the entire memory to the boot memory + manager, which marks it all reserved. The free_bootmem call frees up + all of it, except for the first two pages. This looks correct to me. + + + So, I decide to see init_bootmem run and make sure that it is marking + those first two pages as reserved. I never get that far. + + + Stepping into init_bootmem, and looking at bootmem_map before looking + at what it contains shows the following:: + + + + (gdb) p bootmem_map + $3 = (void *) 0x50000000 + + + + + + Aha! The light dawns. That first page is doing double duty as a + stack and as the boot memory map. The last thing that the boot memory + manager does is to free the pages used by its memory map, so this page + is getting freed even its marked as reserved. + + + The fix was to initialize the boot memory manager before allocating + those two stack pages, and then allocate them through the boot memory + manager. After doing this, and fixing a couple of subsequent buglets, + the stack corruption problem disappeared. + + + + + +13. What to do when UML doesn't work +===================================== + + + + +13.1. Strange compilation errors when you build from source +------------------------------------------------------------ + + As of test11, it is necessary to have "ARCH=um" in the environment or + on the make command line for all steps in building UML, including + clean, distclean, or mrproper, config, menuconfig, or xconfig, dep, + and linux. If you forget for any of them, the i386 build seems to + contaminate the UML build. If this happens, start from scratch with:: + + + host% + make mrproper ARCH=um + + + + + and repeat the build process with ARCH=um on all the steps. + + + See :ref:`Compiling_the_kernel_and_modules` for more details. + + + Another cause of strange compilation errors is building UML in + /usr/src/linux. If you do this, the first thing you need to do is + clean up the mess you made. The /usr/src/linux/asm link will now + point to /usr/src/linux/asm-um. Make it point back to + /usr/src/linux/asm-i386. Then, move your UML pool someplace else and + build it there. Also see below, where a more specific set of symptoms + is described. + + + +13.3. A variety of panics and hangs with /tmp on a reiserfs filesystem +----------------------------------------------------------------------- + + I saw this on reiserfs 3.5.21 and it seems to be fixed in 3.5.27. + Panics preceded by:: + + + Detaching pid nnnn + + + + are diagnostic of this problem. This is a reiserfs bug which causes a + thread to occasionally read stale data from a mmapped page shared with + another thread. The fix is to upgrade the filesystem or to have /tmp + be an ext2 filesystem. + + + + 13.4. The compile fails with errors about conflicting types for + 'open', 'dup', and 'waitpid' + + This happens when you build in /usr/src/linux. The UML build makes + the include/asm link point to include/asm-um. /usr/include/asm points + to /usr/src/linux/include/asm, so when that link gets moved, files + which need to include the asm-i386 versions of headers get the + incompatible asm-um versions. The fix is to move the include/asm link + back to include/asm-i386 and to do UML builds someplace else. + + + +13.5. UML doesn't work when /tmp is an NFS filesystem +------------------------------------------------------ + + This seems to be a similar situation with the ReiserFS problem above. + Some versions of NFS seems not to handle mmap correctly, which UML + depends on. The workaround is have /tmp be a non-NFS directory. + + +13.6. UML hangs on boot when compiled with gprof support +--------------------------------------------------------- + + If you build UML with gprof support and, early in the boot, it does + this:: + + + kernel BUG at page_alloc.c:100! + + + + + you have a buggy gcc. You can work around the problem by removing + UM_FASTCALL from CFLAGS in arch/um/Makefile-i386. This will open up + another bug, but that one is fairly hard to reproduce. + + + +13.7. syslogd dies with a SIGTERM on startup +--------------------------------------------- + + The exact boot error depends on the distribution that you're booting, + but Debian produces this:: + + + /etc/rc2.d/S10sysklogd: line 49: 93 Terminated + start-stop-daemon --start --quiet --exec /sbin/syslogd -- $SYSLOGD + + + + + This is a syslogd bug. There's a race between a parent process + installing a signal handler and its child sending the signal. See + this uml-devel post for the details. + + + +13.8. TUN/TAP networking doesn't work on a 2.4 host +---------------------------------------------------- + + There are a couple of problems which were + name="pointed + out"> by Tim Robinson + + - It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. + The fix is to upgrade to something more recent and then read the + next item. + + - If you see:: + + + File descriptor in bad state + + + + when you bring up the device inside UML, you have a header mismatch + between the original kernel and the upgraded one. Make /usr/src/linux + point at the new headers. This will only be a problem if you build + uml_net yourself. + + + +13.9. You can network to the host but not to other machines on the net +======================================================================= + + If you can connect to the host, and the host can connect to UML, but + you cannot connect to any other machines, then you may need to enable + IP Masquerading on the host. Usually this is only experienced when + using private IP addresses (192.168.x.x or 10.x.x.x) for host/UML + networking, rather than the public address space that your host is + connected to. UML does not enable IP Masquerading, so you will need + to create a static rule to enable it:: + + + host% + iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE + + + + + Replace eth0 with the interface that you use to talk to the rest of + the world. + + + Documentation on IP Masquerading, and SNAT, can be found at + www.netfilter.org . + + + If you can reach the local net, but not the outside Internet, then + that is usually a routing problem. The UML needs a default route:: + + + UML# + route add default gw gateway IP + + + + + The gateway IP can be any machine on the local net that knows how to + reach the outside world. Usually, this is the host or the local net- + work's gateway. + + + Occasionally, we hear from someone who can reach some machines, but + not others on the same net, or who can reach some ports on other + machines, but not others. These are usually caused by strange + firewalling somewhere between the UML and the other box. You track + this down by running tcpdump on every interface the packets travel + over and see where they disappear. When you find a machine that takes + the packets in, but does not send them onward, that's the culprit. + + + +13.10. I have no root and I want to scream +=========================================== + + Thanks to Birgit Wahlich for telling me about this strange one. It + turns out that there's a limit of six environment variables on the + kernel command line. When that limit is reached or exceeded, argument + processing stops, which means that the 'root=' argument that UML + usually adds is not seen. So, the filesystem has no idea what the + root device is, so it panics. + + + The fix is to put less stuff on the command line. Glomming all your + setup variables into one is probably the best way to go. + + + +13.11. UML build conflict between ptrace.h and ucontext.h +========================================================== + + On some older systems, /usr/include/asm/ptrace.h and + /usr/include/sys/ucontext.h define the same names. So, when they're + included together, the defines from one completely mess up the parsing + of the other, producing errors like:: + + /usr/include/sys/ucontext.h:47: parse error before + `10` + + + + + plus a pile of warnings. + + + This is a libc botch, which has since been fixed, and I don't see any + way around it besides upgrading. + + + +13.12. The UML BogoMips is exactly half the host's BogoMips +------------------------------------------------------------ + + On i386 kernels, there are two ways of running the loop that is used + to calculate the BogoMips rating, using the TSC if it's there or using + a one-instruction loop. The TSC produces twice the BogoMips as the + loop. UML uses the loop, since it has nothing resembling a TSC, and + will get almost exactly the same BogoMips as a host using the loop. + However, on a host with a TSC, its BogoMips will be double the loop + BogoMips, and therefore double the UML BogoMips. + + + +13.13. When you run UML, it immediately segfaults +-------------------------------------------------- + + If the host is configured with the 2G/2G address space split, that's + why. See ref:`UML_on_2G/2G_hosts` for the details on getting UML to + run on your host. + + + +13.14. xterms appear, then immediately disappear +------------------------------------------------- + + If you're running an up to date kernel with an old release of + uml_utilities, the port-helper program will not work properly, so + xterms will exit straight after they appear. The solution is to + upgrade to the latest release of uml_utilities. Usually this problem + occurs when you have installed a packaged release of UML then compiled + your own development kernel without upgrading the uml_utilities from + the source distribution. + + + +13.15. Any other panic, hang, or strange behavior +-------------------------------------------------- + + If you're seeing truly strange behavior, such as hangs or panics that + happen in random places, or you try running the debugger to see what's + happening and it acts strangely, then it could be a problem in the + host kernel. If you're not running a stock Linus or -ac kernel, then + try that. An early version of the preemption patch and a 2.4.10 SuSE + kernel have caused very strange problems in UML. + + + Otherwise, let me know about it. Send a message to one of the UML + mailing lists - either the developer list - user-mode-linux-devel at + lists dot sourceforge dot net (subscription info) or the user list - + user-mode-linux-user at lists dot sourceforge do net (subscription + info), whichever you prefer. Don't assume that everyone knows about + it and that a fix is imminent. + + + If you want to be super-helpful, read :ref:`Diagnosing_Problems` and + follow the instructions contained therein. + +.. _Diagnosing_Problems: + +14. Diagnosing Problems +======================== + + + If you get UML to crash, hang, or otherwise misbehave, you should + report this on one of the project mailing lists, either the developer + list - user-mode-linux-devel at lists dot sourceforge dot net + (subscription info) or the user list - user-mode-linux-user at lists + dot sourceforge dot net (subscription info). When you do, it is + likely that I will want more information. So, it would be helpful to + read the stuff below, do whatever is applicable in your case, and + report the results to the list. + + + For any diagnosis, you're going to need to build a debugging kernel. + The binaries from this site aren't debuggable. If you haven't done + this before, read about :ref:`Compiling_the_kernel_and_modules` and + :ref:`Kernel_debugging` UML first. + + +14.1. Case 1 : Normal kernel panics +------------------------------------ + + The most common case is for a normal thread to panic. To debug this, + you will need to run it under the debugger (add 'debug' to the command + line). An xterm will start up with gdb running inside it. Continue + it when it stops in start_kernel and make it crash. Now ``^C gdb`` and + + + If the panic was a "Kernel mode fault", then there will be a segv + frame on the stack and I'm going to want some more information. The + stack might look something like this:: + + + (UML gdb) backtrace + #0 0x1009bf76 in __sigprocmask (how=1, set=0x5f347940, oset=0x0) + at ../sysdeps/unix/sysv/linux/sigprocmask.c:49 + #1 0x10091411 in change_sig (signal=10, on=1) at process.c:218 + #2 0x10094785 in timer_handler (sig=26) at time_kern.c:32 + #3 0x1009bf38 in __restore () + at ../sysdeps/unix/sysv/linux/i386/sigaction.c:125 + #4 0x1009534c in segv (address=8, ip=268849158, is_write=2, is_user=0) + at trap_kern.c:66 + #5 0x10095c04 in segv_handler (sig=11) at trap_user.c:285 + #6 0x1009bf38 in __restore () + + + + + I'm going to want to see the symbol and line information for the value + of ip in the segv frame. In this case, you would do the following:: + + + (UML gdb) i sym 268849158 + + + + + and:: + + + (UML gdb) i line *268849158 + + + + + The reason for this is the __restore frame right above the segv_han- + dler frame is hiding the frame that actually segfaulted. So, I have + to get that information from the faulting ip. + + +14.2. Case 2 : Tracing thread panics +------------------------------------- + + The less common and more painful case is when the tracing thread + panics. In this case, the kernel debugger will be useless because it + needs a healthy tracing thread in order to work. The first thing to + do is get a backtrace from the tracing thread. This is done by + figuring out what its pid is, firing up gdb, and attaching it to that + pid. You can figure out the tracing thread pid by looking at the + first line of the console output, which will look like this:: + + + tracing thread pid = 15851 + + + + + or by running ps on the host and finding the line that looks like + this:: + + + jdike 15851 4.5 0.4 132568 1104 pts/0 S 21:34 0:05 ./linux [(tracing thread)] + + + + + If the panic was 'segfault in signals', then follow the instructions + above for collecting information about the location of the seg fault. + + + If the tracing thread flaked out all by itself, then send that + backtrace in and wait for our crack debugging team to fix the problem. + + + 14.3. Case 3 : Tracing thread panics caused by other threads + + However, there are cases where the misbehavior of another thread + caused the problem. The most common panic of this type is:: + + + wait_for_stop failed to wait for to stop with + + + + + In this case, you'll need to get a backtrace from the process men- + tioned in the panic, which is complicated by the fact that the kernel + debugger is defunct and without some fancy footwork, another gdb can't + attach to it. So, this is how the fancy footwork goes: + + In a shell:: + + + host% kill -STOP pid + + + + + Run gdb on the tracing thread as described in case 2 and do:: + + + (host gdb) call detach(pid) + + + If you get a segfault, do it again. It always works the second time. + + Detach from the tracing thread and attach to that other thread:: + + + (host gdb) detach + + + + + + + (host gdb) attach pid + + + + + If gdb hangs when attaching to that process, go back to a shell and + do:: + + + host% + kill -CONT pid + + + + + And then get the backtrace:: + + + (host gdb) backtrace + + + + + +14.4. Case 4 : Hangs +--------------------- + + Hangs seem to be fairly rare, but they sometimes happen. When a hang + happens, we need a backtrace from the offending process. Run the + kernel debugger as described in case 1 and get a backtrace. If the + current process is not the idle thread, then send in the backtrace. + You can tell that it's the idle thread if the stack looks like this:: + + + #0 0x100b1401 in __libc_nanosleep () + #1 0x100a2885 in idle_sleep (secs=10) at time.c:122 + #2 0x100a546f in do_idle () at process_kern.c:445 + #3 0x100a5508 in cpu_idle () at process_kern.c:471 + #4 0x100ec18f in start_kernel () at init/main.c:592 + #5 0x100a3e10 in start_kernel_proc (unused=0x0) at um_arch.c:71 + #6 0x100a383f in signal_tramp (arg=0x100a3dd8) at trap_user.c:50 + + + + + If this is the case, then some other process is at fault, and went to + sleep when it shouldn't have. Run ps on the host and figure out which + process should not have gone to sleep and stayed asleep. Then attach + to it with gdb and get a backtrace as described in case 3. + + + + + + +15. Thanks +=========== + + + A number of people have helped this project in various ways, and this + page gives recognition where recognition is due. + + + If you're listed here and you would prefer a real link on your name, + or no link at all, instead of the despammed email address pseudo-link, + let me know. + + + If you're not listed here and you think maybe you should be, please + let me know that as well. I try to get everyone, but sometimes my + bookkeeping lapses and I forget about contributions. + + +15.1. Code and Documentation +----------------------------- + + Rusty Russell - + + - wrote the HOWTO + + - prodded me into making this project official and putting it on + SourceForge + + - came up with the way cool UML logo + + - redid the config process + + + Peter Moulder - Fixed my config and build + processes, and added some useful code to the block driver + + + Bill Stearns - + + - HOWTO updates + + - lots of bug reports + + - lots of testing + + - dedicated a box (uml.ists.dartmouth.edu) to support UML development + + - wrote the mkrootfs script, which allows bootable filesystems of + RPM-based distributions to be cranked out + + - cranked out a large number of filesystems with said script + + + Jim Leu - Wrote the virtual ethernet driver + and associated usermode tools + + Lars Brinkhoff - Contributed the ptrace + proxy from his own project to allow easier + kernel debugging + + + Andrea Arcangeli - Redid some of the early boot + code so that it would work on machines with Large File Support + + + Chris Emerson - Did + the first UML port to Linux/ppc + + + Harald Welte - Wrote the multicast + transport for the network driver + + + Jorgen Cederlof - Added special file support to hostfs + + + Greg Lonnon - Changed the ubd driver + to allow it to layer a COW file on a shared read-only filesystem and + wrote the iomem emulation support + + + Henrik Nordstrom - Provided a variety + of patches, fixes, and clues + + + Lennert Buytenhek - Contributed various patches, a rewrite of the + network driver, the first implementation of the mconsole driver, and + did the bulk of the work needed to get SMP working again. + + + Yon Uriarte - Fixed the TUN/TAP network backend while I slept. + + + Adam Heath - Made a bunch of nice cleanups to the initialization code, + plus various other small patches. + + + Matt Zimmerman - Matt volunteered to be the UML Debian maintainer and + is doing a real nice job of it. He also noticed and fixed a number of + actually and potentially exploitable security holes in uml_net. Plus + the occasional patch. I like patches. + + + James McMechan - James seems to have taken over maintenance of the ubd + driver and is doing a nice job of it. + + + Chandan Kudige - wrote the umlgdb script which automates the reloading + of module symbols. + + + Steve Schmidtke - wrote the UML slirp transport and hostaudio drivers, + enabling UML processes to access audio devices on the host. He also + submitted patches for the slip transport and lots of other things. + + + David Coulson - + + - Set up the usermodelinux.org site, + which is a great way of keeping the UML user community on top of + UML goings-on. + + - Site documentation and updates + + - Nifty little UML management daemon UMLd + + + - Lots of testing and bug reports + + + + +15.2. Flushing out bugs +------------------------ + + + + - Yuri Pudgorodsky + + - Gerald Britton + + - Ian Wehrman + + - Gord Lamb + + - Eugene Koontz + + - John H. Hartman + + - Anders Karlsson + + - Daniel Phillips + + - John Fremlin + + - Rainer Burgstaller + + - James Stevenson + + - Matt Clay + + - Cliff Jefferies + + - Geoff Hoff + + - Lennert Buytenhek + + - Al Viro + + - Frank Klingenhoefer + + - Livio Baldini Soares + + - Jon Burgess + + - Petru Paler + + - Paul + + - Chris Reahard + + - Sverker Nilsson + + - Gong Su + + - johan verrept + + - Bjorn Eriksson + + - Lorenzo Allegrucci + + - Muli Ben-Yehuda + + - David Mansfield + + - Howard Goff + + - Mike Anderson + + - John Byrne + + - Sapan J. Batia + + - Iris Huang + + - Jan Hudec + + - Voluspa + + + + +15.3. Buglets and clean-ups +---------------------------- + + + + - Dave Zarzycki + + - Adam Lazur + + - Boria Feigin + + - Brian J. Murrell + + - JS + + - Roman Zippel + + - Wil Cooley + + - Ayelet Shemesh + + - Will Dyson + + - Sverker Nilsson + + - dvorak + + - v.naga srinivas + + - Shlomi Fish + + - Roger Binns + + - johan verrept + + - MrChuoi + + - Peter Cleve + + - Vincent Guffens + + - Nathan Scott + + - Patrick Caulfield + + - jbearce + + - Catalin Marinas + + - Shane Spencer + + - Zou Min + + + - Ryan Boder + + - Lorenzo Colitti + + - Gwendal Grignou + + - Andre' Breiler + + - Tsutomu Yasuda + + + +15.4. Case Studies +------------------- + + + - Jon Wright + + - William McEwan + + - Michael Richardson + + + +15.5. Other contributions +-------------------------- + + + Bill Carr made the Red Hat mkrootfs script + work with RH 6.2. + + Michael Jennings sent in some material which + is now gracing the top of the index page of this site. + + SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com + . The bandwidth there made it possible to + produce most of the filesystems available on the project download + page. + + Laurent Bonnaud took the old grotty + Debian filesystem that I've been distributing and updated it to 2.2. + It is now available by itself here. + + Rik van Riel gave me some ftp space on ftp.nl.linux.org so I can make + releases even when Sourceforge is broken. + + Rodrigo de Castro looked at my broken pte code and told me what was + wrong with it, letting me fix a long-standing (several weeks) and + serious set of bugs. + + Chris Reahard built a specialized root filesystem for running a DNS + server jailed inside UML. It's available from the download + page in the Jail + Filesystems section. -- cgit v1.2.3 From 72f8a49dc8b9e97a7986b0e6eced00a1a2e28996 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:41 +0100 Subject: docs: virt: user_mode_linux.rst: update compiling instructions Instead of pointing for a pre-2.4 and a seaparate patch, update it to match current upstream, as UML was merged a long time ago. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/uml/user_mode_linux.rst | 62 +++++------------------------- 1 file changed, 9 insertions(+), 53 deletions(-) diff --git a/Documentation/virt/uml/user_mode_linux.rst b/Documentation/virt/uml/user_mode_linux.rst index 6085d2c0f8a8..e0632d80753e 100644 --- a/Documentation/virt/uml/user_mode_linux.rst +++ b/Documentation/virt/uml/user_mode_linux.rst @@ -5,7 +5,7 @@ User Mode Linux HOWTO ===================== :Author: User Mode Linux Core Team -:Last-updated: Mon Nov 18 14:16:16 EST 2002 +:Last-updated: Sat Jan 25 16:07:55 CET 2020 This document describes the use and abuse of Jeff Dike's User Mode Linux: a port of the Linux kernel as a normal Intel Linux process. @@ -223,23 +223,15 @@ Linux: a port of the Linux kernel as a normal Intel Linux process. Compiling the user mode kernel is just like compiling any other - kernel. Let's go through the steps, using 2.4.0-prerelease (current - as of this writing) as an example: - - - 1. Download the latest UML patch from - the download page stop compiling. - - The sources are also available from cvs at the project's cvs page, - which has directions on getting the sources. You can also browse the - CVS pool from there. - - If you get the CVS sources, you will have to check them out into an - empty directory. You will then have to copy each file into the - corresponding directory in the appropriate kernel pool. - - If you don't have the latest kernel pool, you can get the - corresponding user-mode sources with:: - - - host% cvs co -r v_2_3_x linux - - - - - where 'x' is the version in your pool. Note that you will not get the - bug fixes and enhancements that have gone into subsequent releases. - 2.2. Compiling and installing kernel modules --------------------------------------------- @@ -416,7 +372,7 @@ Linux: a port of the Linux kernel as a normal Intel Linux process. 3.1. Running UML ----------------- - It runs on 2.2.15 or later, and all 2.4 kernels. + It runs on 2.2.15 or later, and all kernel versions since 2.4. Booting UML is straightforward. Simply run 'linux': it will try to -- cgit v1.2.3 From c09708ccb4612890407c10a408624c229bf91b76 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:42 +0100 Subject: docs: virt: user_mode_linux.rst: fix URL references Several URLs are pointing to outdated places. Update the references for the URLs whose contents still exists, removing the others. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/uml/user_mode_linux.rst | 71 ++++++++++++------------------ 1 file changed, 29 insertions(+), 42 deletions(-) diff --git a/Documentation/virt/uml/user_mode_linux.rst b/Documentation/virt/uml/user_mode_linux.rst index e0632d80753e..de0f0b2c9d5b 100644 --- a/Documentation/virt/uml/user_mode_linux.rst +++ b/Documentation/virt/uml/user_mode_linux.rst @@ -321,7 +321,7 @@ Linux: a port of the Linux kernel as a normal Intel Linux process. as modules, especially filesystems and network protocols and filters, so most symbols which need to be exported probably already are. However, if you do find symbols that need exporting, let us - know, and + know at http://user-mode-linux.sourceforge.net/, and they'll be "taken care of". @@ -383,9 +383,9 @@ Linux: a port of the Linux kernel as a normal Intel Linux process. You will need a filesystem to boot UML from. There are a number - available for download from here . There are also several tools - which can be + available for download from http://user-mode-linux.sourceforge.net. + There are also several tools at + http://user-mode-linux.sourceforge.net/ which can be used to generate UML-compatible filesystem images from media. The kernel will boot up and present you with a login prompt. @@ -464,10 +464,9 @@ Note: Here are some examples of UML in action: - - A login session - - - A virtual network + - A login session http://user-mode-linux.sourceforge.net/old/login.html + - A virtual network http://user-mode-linux.sourceforge.net/old/net.html @@ -1132,11 +1131,6 @@ Note: - - Harald's original README is here and explains these in detail, as well as - some other issues. - There is also a related point-to-point only "ucast" transport. This is useful when your network does not support multicast, and all network connections are simple point to point links. @@ -1219,8 +1213,7 @@ Note: make sure that /usr/src/linux points to the headers for the running kernel. - These were pointed out by Tim Robinson in - name="this uml-user post"> . + These were pointed out by Tim Robinson in the past. @@ -1914,8 +1907,8 @@ Note: uml_moo is installed with the UML deb and RPM. If you didn't install UML from one of those packages, you can also get it from the UML - utilities tar file in tools/moo. + utilities http://user-mode-linux.sourceforge.net/utilities tar file + in tools/moo. @@ -3709,18 +3702,15 @@ Note: This is a syslogd bug. There's a race between a parent process - installing a signal handler and its child sending the signal. See - this uml-devel post for the details. + installing a signal handler and its child sending the signal. 13.8. TUN/TAP networking doesn't work on a 2.4 host ---------------------------------------------------- - There are a couple of problems which were - name="pointed - out"> by Tim Robinson + There are a couple of problems which were reported by + Tim Robinson - It doesn't work on hosts running 2.4.7 (or thereabouts) or earlier. The fix is to upgrade to something more recent and then read the @@ -3763,7 +3753,7 @@ Note: Documentation on IP Masquerading, and SNAT, can be found at - www.netfilter.org . + http://www.netfilter.org. If you can reach the local net, but not the outside Internet, then @@ -4113,14 +4103,14 @@ Note: Rusty Russell - - - wrote the HOWTO + - wrote the HOWTO + http://user-mode-linux.sourceforge.net/old/UserModeLinux-HOWTO.html - prodded me into making this project official and putting it on SourceForge - - came up with the way cool UML logo + - came up with the way cool UML logo + http://user-mode-linux.sourceforge.net/uml-small.png - redid the config process @@ -4148,17 +4138,15 @@ Note: Jim Leu - Wrote the virtual ethernet driver and associated usermode tools - Lars Brinkhoff - Contributed the ptrace - proxy from his own project to allow easier - kernel debugging + Lars Brinkhoff http://lars.nocrew.org/ - Contributed the ptrace + proxy from his own project to allow easier kernel debugging Andrea Arcangeli - Redid some of the early boot code so that it would work on machines with Large File Support - Chris Emerson - Did - the first UML port to Linux/ppc + Chris Emerson - Did the first UML port to Linux/ppc Harald Welte - Wrote the multicast @@ -4173,7 +4161,7 @@ Note: wrote the iomem emulation support - Henrik Nordstrom - Provided a variety + Henrik Nordstrom http://hem.passagen.se/hno/ - Provided a variety of patches, fixes, and clues @@ -4208,16 +4196,15 @@ Note: submitted patches for the slip transport and lots of other things. - David Coulson - + David Coulson http://davidcoulson.net - - - Set up the usermodelinux.org site, + - Set up the http://usermodelinux.org site, which is a great way of keeping the UML user community on top of UML goings-on. - Site documentation and updates - Nifty little UML management daemon UMLd - - Lots of testing and bug reports @@ -4390,12 +4377,12 @@ Note: work with RH 6.2. Michael Jennings sent in some material which - is now gracing the top of the index page of this site. + is now gracing the top of the index page + http://user-mode-linux.sourceforge.net/ of this site. - SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com - . The bandwidth there made it possible to + SGI (and more specifically Ralf Baechle ) gave me an account on oss.sgi.com. + The bandwidth there made it possible to produce most of the filesystems available on the project download page. @@ -4412,5 +4399,5 @@ Note: Chris Reahard built a specialized root filesystem for running a DNS server jailed inside UML. It's available from the download - page in the Jail + http://user-mode-linux.sourceforge.net/old/dl-sf.html page in the Jail Filesystems section. -- cgit v1.2.3 From 2756df60d09748b1ccdc568690f9c4112353c920 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:43 +0100 Subject: docs: virt: convert halt-polling.txt to ReST format - Fix document title to match ReST format - Convert the table to be properly recognized - Some indentation fixes to match ReST syntax. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/halt-polling.rst | 140 ++++++++++++++++++++++++++++++++ Documentation/virt/kvm/halt-polling.txt | 136 ------------------------------- Documentation/virt/kvm/index.rst | 1 + 3 files changed, 141 insertions(+), 136 deletions(-) create mode 100644 Documentation/virt/kvm/halt-polling.rst delete mode 100644 Documentation/virt/kvm/halt-polling.txt diff --git a/Documentation/virt/kvm/halt-polling.rst b/Documentation/virt/kvm/halt-polling.rst new file mode 100644 index 000000000000..4922e4a15f18 --- /dev/null +++ b/Documentation/virt/kvm/halt-polling.rst @@ -0,0 +1,140 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +The KVM halt polling system +=========================== + +The KVM halt polling system provides a feature within KVM whereby the latency +of a guest can, under some circumstances, be reduced by polling in the host +for some time period after the guest has elected to no longer run by cedeing. +That is, when a guest vcpu has ceded, or in the case of powerpc when all of the +vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions +before giving up the cpu to the scheduler in order to let something else run. + +Polling provides a latency advantage in cases where the guest can be run again +very quickly by at least saving us a trip through the scheduler, normally on +the order of a few micro-seconds, although performance benefits are workload +dependant. In the event that no wakeup source arrives during the polling +interval or some other task on the runqueue is runnable the scheduler is +invoked. Thus halt polling is especially useful on workloads with very short +wakeup periods where the time spent halt polling is minimised and the time +savings of not invoking the scheduler are distinguishable. + +The generic halt polling code is implemented in: + + virt/kvm/kvm_main.c: kvm_vcpu_block() + +The powerpc kvm-hv specific case is implemented in: + + arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked() + +Halt Polling Interval +===================== + +The maximum time for which to poll before invoking the scheduler, referred to +as the halt polling interval, is increased and decreased based on the perceived +effectiveness of the polling in an attempt to limit pointless polling. +This value is stored in either the vcpu struct: + + kvm_vcpu->halt_poll_ns + +or in the case of powerpc kvm-hv, in the vcore struct: + + kvmppc_vcore->halt_poll_ns + +Thus this is a per vcpu (or vcore) value. + +During polling if a wakeup source is received within the halt polling interval, +the interval is left unchanged. In the event that a wakeup source isn't +received during the polling interval (and thus schedule is invoked) there are +two options, either the polling interval and total block time[0] were less than +the global max polling interval (see module params below), or the total block +time was greater than the global max polling interval. + +In the event that both the polling interval and total block time were less than +the global max polling interval then the polling interval can be increased in +the hope that next time during the longer polling interval the wake up source +will be received while the host is polling and the latency benefits will be +received. The polling interval is grown in the function grow_halt_poll_ns() and +is multiplied by the module parameters halt_poll_ns_grow and +halt_poll_ns_grow_start. + +In the event that the total block time was greater than the global max polling +interval then the host will never poll for long enough (limited by the global +max) to wakeup during the polling interval so it may as well be shrunk in order +to avoid pointless polling. The polling interval is shrunk in the function +shrink_halt_poll_ns() and is divided by the module parameter +halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0. + +It is worth noting that this adjustment process attempts to hone in on some +steady state polling interval but will only really do a good job for wakeups +which come at an approximately constant rate, otherwise there will be constant +adjustment of the polling interval. + +[0] total block time: + the time between when the halt polling function is + invoked and a wakeup source received (irrespective of + whether the scheduler is invoked within that function). + +Module Parameters +================= + +The kvm module has 3 tuneable module parameters to adjust the global max +polling interval as well as the rate at which the polling interval is grown and +shrunk. These variables are defined in include/linux/kvm_host.h and as module +parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the +powerpc kvm-hv case. + ++-----------------------+---------------------------+-------------------------+ +|Module Parameter | Description | Default Value | ++-----------------------+---------------------------+-------------------------+ +|halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT| +| | interval which defines | | +| | the ceiling value of the | | +| | polling interval for | (per arch value) | +| | each vcpu. | | ++-----------------------+---------------------------+-------------------------+ +|halt_poll_ns_grow | The value by which the | 2 | +| | halt polling interval is | | +| | multiplied in the | | +| | grow_halt_poll_ns() | | +| | function. | | ++-----------------------+---------------------------+-------------------------+ +|halt_poll_ns_grow_start| The initial value to grow | 10000 | +| | to from zero in the | | +| | grow_halt_poll_ns() | | +| | function. | | ++-----------------------+---------------------------+-------------------------+ +|halt_poll_ns_shrink | The value by which the | 0 | +| | halt polling interval is | | +| | divided in the | | +| | shrink_halt_poll_ns() | | +| | function. | | ++-----------------------+---------------------------+-------------------------+ + +These module parameters can be set from the debugfs files in: + + /sys/module/kvm/parameters/ + +Note: that these module parameters are system wide values and are not able to + be tuned on a per vm basis. + +Further Notes +============= + +- Care should be taken when setting the halt_poll_ns module parameter as a large value + has the potential to drive the cpu usage to 100% on a machine which would be almost + entirely idle otherwise. This is because even if a guest has wakeups during which very + little work is done and which are quite far apart, if the period is shorter than the + global max polling interval (halt_poll_ns) then the host will always poll for the + entire block time and thus cpu utilisation will go to 100%. + +- Halt polling essentially presents a trade off between power usage and latency and + the module parameters should be used to tune the affinity for this. Idle cpu time is + essentially converted to host kernel time with the aim of decreasing latency when + entering the guest. + +- Halt polling will only be conducted by the host when no other tasks are runnable on + that cpu, otherwise the polling will cease immediately and schedule will be invoked to + allow that other task to run. Thus this doesn't allow a guest to denial of service the + cpu. diff --git a/Documentation/virt/kvm/halt-polling.txt b/Documentation/virt/kvm/halt-polling.txt deleted file mode 100644 index 4f791b128dd2..000000000000 --- a/Documentation/virt/kvm/halt-polling.txt +++ /dev/null @@ -1,136 +0,0 @@ -The KVM halt polling system -=========================== - -The KVM halt polling system provides a feature within KVM whereby the latency -of a guest can, under some circumstances, be reduced by polling in the host -for some time period after the guest has elected to no longer run by cedeing. -That is, when a guest vcpu has ceded, or in the case of powerpc when all of the -vcpus of a single vcore have ceded, the host kernel polls for wakeup conditions -before giving up the cpu to the scheduler in order to let something else run. - -Polling provides a latency advantage in cases where the guest can be run again -very quickly by at least saving us a trip through the scheduler, normally on -the order of a few micro-seconds, although performance benefits are workload -dependant. In the event that no wakeup source arrives during the polling -interval or some other task on the runqueue is runnable the scheduler is -invoked. Thus halt polling is especially useful on workloads with very short -wakeup periods where the time spent halt polling is minimised and the time -savings of not invoking the scheduler are distinguishable. - -The generic halt polling code is implemented in: - - virt/kvm/kvm_main.c: kvm_vcpu_block() - -The powerpc kvm-hv specific case is implemented in: - - arch/powerpc/kvm/book3s_hv.c: kvmppc_vcore_blocked() - -Halt Polling Interval -===================== - -The maximum time for which to poll before invoking the scheduler, referred to -as the halt polling interval, is increased and decreased based on the perceived -effectiveness of the polling in an attempt to limit pointless polling. -This value is stored in either the vcpu struct: - - kvm_vcpu->halt_poll_ns - -or in the case of powerpc kvm-hv, in the vcore struct: - - kvmppc_vcore->halt_poll_ns - -Thus this is a per vcpu (or vcore) value. - -During polling if a wakeup source is received within the halt polling interval, -the interval is left unchanged. In the event that a wakeup source isn't -received during the polling interval (and thus schedule is invoked) there are -two options, either the polling interval and total block time[0] were less than -the global max polling interval (see module params below), or the total block -time was greater than the global max polling interval. - -In the event that both the polling interval and total block time were less than -the global max polling interval then the polling interval can be increased in -the hope that next time during the longer polling interval the wake up source -will be received while the host is polling and the latency benefits will be -received. The polling interval is grown in the function grow_halt_poll_ns() and -is multiplied by the module parameters halt_poll_ns_grow and -halt_poll_ns_grow_start. - -In the event that the total block time was greater than the global max polling -interval then the host will never poll for long enough (limited by the global -max) to wakeup during the polling interval so it may as well be shrunk in order -to avoid pointless polling. The polling interval is shrunk in the function -shrink_halt_poll_ns() and is divided by the module parameter -halt_poll_ns_shrink, or set to 0 iff halt_poll_ns_shrink == 0. - -It is worth noting that this adjustment process attempts to hone in on some -steady state polling interval but will only really do a good job for wakeups -which come at an approximately constant rate, otherwise there will be constant -adjustment of the polling interval. - -[0] total block time: the time between when the halt polling function is - invoked and a wakeup source received (irrespective of - whether the scheduler is invoked within that function). - -Module Parameters -================= - -The kvm module has 3 tuneable module parameters to adjust the global max -polling interval as well as the rate at which the polling interval is grown and -shrunk. These variables are defined in include/linux/kvm_host.h and as module -parameters in virt/kvm/kvm_main.c, or arch/powerpc/kvm/book3s_hv.c in the -powerpc kvm-hv case. - -Module Parameter | Description | Default Value --------------------------------------------------------------------------------- -halt_poll_ns | The global max polling | KVM_HALT_POLL_NS_DEFAULT - | interval which defines | - | the ceiling value of the | - | polling interval for | (per arch value) - | each vcpu. | --------------------------------------------------------------------------------- -halt_poll_ns_grow | The value by which the | 2 - | halt polling interval is | - | multiplied in the | - | grow_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- -halt_poll_ns_grow_start | The initial value to grow | 10000 - | to from zero in the | - | grow_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- -halt_poll_ns_shrink | The value by which the | 0 - | halt polling interval is | - | divided in the | - | shrink_halt_poll_ns() | - | function. | --------------------------------------------------------------------------------- - -These module parameters can be set from the debugfs files in: - - /sys/module/kvm/parameters/ - -Note: that these module parameters are system wide values and are not able to - be tuned on a per vm basis. - -Further Notes -============= - -- Care should be taken when setting the halt_poll_ns module parameter as a -large value has the potential to drive the cpu usage to 100% on a machine which -would be almost entirely idle otherwise. This is because even if a guest has -wakeups during which very little work is done and which are quite far apart, if -the period is shorter than the global max polling interval (halt_poll_ns) then -the host will always poll for the entire block time and thus cpu utilisation -will go to 100%. - -- Halt polling essentially presents a trade off between power usage and latency -and the module parameters should be used to tune the affinity for this. Idle -cpu time is essentially converted to host kernel time with the aim of decreasing -latency when entering the guest. - -- Halt polling will only be conducted by the host when no other tasks are -runnable on that cpu, otherwise the polling will cease immediately and -schedule will be invoked to allow that other task to run. Thus this doesn't -allow a guest to denial of service the cpu. diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index 488c6370a447..b39f4894b61d 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -9,6 +9,7 @@ KVM amd-memory-encryption cpuid + halt-polling vcpu-requests arm/index -- cgit v1.2.3 From 263a19ff21c4a10f0a2d77c21feb3a641e5127f0 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:44 +0100 Subject: docs: virt: Convert msr.txt to ReST format - Use document title markup; - Convert tables; - Add blank lines and adjust indentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 1 + Documentation/virt/kvm/msr.rst | 321 +++++++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/msr.txt | 284 ---------------------------------- 3 files changed, 322 insertions(+), 284 deletions(-) create mode 100644 Documentation/virt/kvm/msr.rst delete mode 100644 Documentation/virt/kvm/msr.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index b39f4894b61d..cc6dde47b267 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -10,6 +10,7 @@ KVM amd-memory-encryption cpuid halt-polling + msr vcpu-requests arm/index diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst new file mode 100644 index 000000000000..33892036672d --- /dev/null +++ b/Documentation/virt/kvm/msr.rst @@ -0,0 +1,321 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +KVM-specific MSRs +================= + +:Author: Glauber Costa , Red Hat Inc, 2010 + +KVM makes use of some custom MSRs to service some requests. + +Custom MSRs have a range reserved for them, that goes from +0x4b564d00 to 0x4b564dff. There are MSRs outside this area, +but they are deprecated and their use is discouraged. + +Custom MSR list +--------------- + +The current supported Custom MSR list is: + +MSR_KVM_WALL_CLOCK_NEW: + 0x4b564d00 + +data: + 4-byte alignment physical address of a memory area which must be + in guest RAM. This memory is expected to hold a copy of the following + structure:: + + struct pvclock_wall_clock { + u32 version; + u32 sec; + u32 nsec; + } __attribute__((__packed__)); + + whose data will be filled in by the hypervisor. The hypervisor is only + guaranteed to update this data at the moment of MSR write. + Users that want to reliably query this information more than once have + to write more than once to this MSR. Fields have the following meanings: + + version: + guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + sec: + number of seconds for wallclock at time of boot. + + nsec: + number of nanoseconds for wallclock at time of boot. + + In order to get the current wallclock time, the system_time from + MSR_KVM_SYSTEM_TIME_NEW needs to be added. + + Note that although MSRs are per-CPU entities, the effect of this + particular MSR is global. + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME_NEW: + 0x4b564d01 + +data: + 4-byte aligned physical address of a memory area which must be in + guest RAM, plus an enable bit in bit 0. This memory is expected to hold + a copy of the following structure:: + + struct pvclock_vcpu_time_info { + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; + } __attribute__((__packed__)); /* 32 bytes */ + + whose data will be filled in by the hypervisor periodically. Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent. + The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. + + Fields have the following meanings: + + version: + guest has to check version before and after grabbing + time information and check that they are both equal and even. + An odd version indicates an in-progress update. + + tsc_timestamp: + the tsc value at the current VCPU at the time + of the update of this structure. Guests can subtract this value + from current tsc to derive a notion of elapsed time since the + structure update. + + system_time: + a host notion of monotonic time, including sleep + time at the time this structure was last updated. Unit is + nanoseconds. + + tsc_to_system_mul: + multiplier to be used when converting + tsc-related quantity to nanoseconds + + tsc_shift: + shift to be used when converting tsc-related + quantity to nanoseconds. This shift will ensure that + multiplication with tsc_to_system_mul does not overflow. + A positive value denotes a left shift, a negative value + a right shift. + + The conversion from tsc to nanoseconds involves an additional + right shift by 32 bits. With this information, guests can + derive per-CPU time by doing:: + + time = (current_tsc - tsc_timestamp) + if (tsc_shift >= 0) + time <<= tsc_shift; + else + time >>= -tsc_shift; + time = (time * tsc_to_system_mul) >> 32 + time = time + system_time + + flags: + bits in this field indicate extended capabilities + coordinated between the guest and the hypervisor. Availability + of specific flags has to be checked in 0x40000001 cpuid leaf. + Current flags are: + + + +-----------+--------------+----------------------------------+ + | flag bit | cpuid bit | meaning | + +-----------+--------------+----------------------------------+ + | | | time measures taken across | + | 0 | 24 | multiple cpus are guaranteed to | + | | | be monotonic | + +-----------+--------------+----------------------------------+ + | | | guest vcpu has been paused by | + | 1 | N/A | the host | + | | | See 4.70 in api.txt | + +-----------+--------------+----------------------------------+ + + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid + leaf prior to usage. + + +MSR_KVM_WALL_CLOCK: + 0x11 + +data and functioning: + same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + +MSR_KVM_SYSTEM_TIME: + 0x12 + +data and functioning: + same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. + + This MSR falls outside the reserved KVM range and may be removed in the + future. Its usage is deprecated. + + Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid + leaf prior to usage. + + The suggested algorithm for detecting kvmclock presence is then:: + + if (!kvm_para_available()) /* refer to cpuid.txt */ + return NON_PRESENT; + + flags = cpuid_eax(0x40000001); + if (flags & 3) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + return PRESENT; + } else if (flags & 0) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + return PRESENT; + } else + return NON_PRESENT; + +MSR_KVM_ASYNC_PF_EN: + 0x4b564d02 + +data: + Bits 63-6 hold 64-byte aligned physical address of a + 64 byte memory area which must be in guest RAM and must be + zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 + when asynchronous page faults are enabled on the vcpu 0 when + disabled. Bit 1 is 1 if asynchronous page faults can be injected + when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults + are delivered to L1 as #PF vmexits. Bit 2 can be set only if + KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. + + First 4 byte of 64 byte memory location will be written to by + the hypervisor at the time of asynchronous page fault (APF) + injection to indicate type of asynchronous page fault. Value + of 1 means that the page referred to by the page fault is not + present. Value 2 means that the page is now available. Disabling + interrupt inhibits APFs. Guest must not enable interrupt + before the reason is read, or it may be overwritten by another + APF. Since APF uses the same exception vector as regular page + fault guest must reset the reason to 0 before it does + something that can generate normal page fault. If during page + fault APF reason is 0 it means that this is regular page + fault. + + During delivery of type 1 APF cr2 contains a token that will + be used to notify a guest when missing page becomes + available. When page becomes available type 2 APF is sent with + cr2 set to the token associated with the page. There is special + kind of token 0xffffffff which tells vcpu that it should wake + up all processes waiting for APFs and no individual type 2 APFs + will be sent. + + If APF is disabled while there are outstanding APFs, they will + not be delivered. + + Currently type 2 APF will be always delivered on the same vcpu as + type 1 was, but guest should not rely on that. + +MSR_KVM_STEAL_TIME: + 0x4b564d03 + +data: + 64-byte alignment physical address of a memory area which must be + in guest RAM, plus an enable bit in bit 0. This memory is expected to + hold a copy of the following structure:: + + struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u8 preempted; + __u8 u8_pad[3]; + __u32 pad[11]; + } + + whose data will be filled in by the hypervisor periodically. Only one + write, or registration, is needed for each VCPU. The interval between + updates of this structure is arbitrary and implementation-dependent. + The hypervisor may update this structure at any time it sees fit until + anything with bit0 == 0 is written to it. Guest is required to make sure + this structure is initialized to zero. + + Fields have the following meanings: + + version: + a sequence counter. In other words, guest has to check + this field before and after grabbing time information and make + sure they are both equal and even. An odd version indicates an + in-progress update. + + flags: + At this point, always zero. May be used to indicate + changes in this structure in the future. + + steal: + the amount of time in which this vCPU did not run, in + nanoseconds. Time during which the vcpu is idle, will not be + reported as steal time. + + preempted: + indicate the vCPU who owns this struct is running or + not. Non-zero values mean the vCPU has been preempted. Zero + means the vCPU is not preempted. NOTE, it is always zero if the + the hypervisor doesn't support this field. + +MSR_KVM_EOI_EN: + 0x4b564d04 + +data: + Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 + when disabled. Bit 1 is reserved and must be zero. When PV end of + interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned + physical address of a 4 byte memory area which must be in guest RAM and + must be zeroed. + + The first, least significant bit of 4 byte memory location will be + written to by the hypervisor, typically at the time of interrupt + injection. Value of 1 means that guest can skip writing EOI to the apic + (using MSR or MMIO write); instead, it is sufficient to signal + EOI by clearing the bit in guest memory - this location will + later be polled by the hypervisor. + Value of 0 means that the EOI write is required. + + It is always safe for the guest to ignore the optimization and perform + the APIC EOI write anyway. + + Hypervisor is guaranteed to only modify this least + significant bit while in the current VCPU context, this means that + guest does not need to use either lock prefix or memory ordering + primitives to synchronise with the hypervisor. + + However, hypervisor can set and clear this memory bit at any time: + therefore to make sure hypervisor does not interrupt the + guest and clear the least significant bit in the memory area + in the window between guest testing it to detect + whether it can skip EOI apic write and between guest + clearing it to signal EOI to the hypervisor, + guest must both read the least significant bit in the memory area and + clear it using a single CPU instruction, such as test and clear, or + compare and exchange. + +MSR_KVM_POLL_CONTROL: + 0x4b564d05 + + Control host-side polling. + +data: + Bit 0 enables (1) or disables (0) host-side HLT polling logic. + + KVM guests can request the host not to poll on HLT, for example if + they are performing polling themselves. diff --git a/Documentation/virt/kvm/msr.txt b/Documentation/virt/kvm/msr.txt deleted file mode 100644 index df1f4338b3ca..000000000000 --- a/Documentation/virt/kvm/msr.txt +++ /dev/null @@ -1,284 +0,0 @@ -KVM-specific MSRs. -Glauber Costa , Red Hat Inc, 2010 -===================================================== - -KVM makes use of some custom MSRs to service some requests. - -Custom MSRs have a range reserved for them, that goes from -0x4b564d00 to 0x4b564dff. There are MSRs outside this area, -but they are deprecated and their use is discouraged. - -Custom MSR list --------- - -The current supported Custom MSR list is: - -MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 - - data: 4-byte alignment physical address of a memory area which must be - in guest RAM. This memory is expected to hold a copy of the following - structure: - - struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; - } __attribute__((__packed__)); - - whose data will be filled in by the hypervisor. The hypervisor is only - guaranteed to update this data at the moment of MSR write. - Users that want to reliably query this information more than once have - to write more than once to this MSR. Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - sec: number of seconds for wallclock at time of boot. - - nsec: number of nanoseconds for wallclock at time of boot. - - In order to get the current wallclock time, the system_time from - MSR_KVM_SYSTEM_TIME_NEW needs to be added. - - Note that although MSRs are per-CPU entities, the effect of this - particular MSR is global. - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 - - data: 4-byte aligned physical address of a memory area which must be in - guest RAM, plus an enable bit in bit 0. This memory is expected to hold - a copy of the following structure: - - struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; - } __attribute__((__packed__)); /* 32 bytes */ - - whose data will be filled in by the hypervisor periodically. Only one - write, or registration, is needed for each VCPU. The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. - - Fields have the following meanings: - - version: guest has to check version before and after grabbing - time information and check that they are both equal and even. - An odd version indicates an in-progress update. - - tsc_timestamp: the tsc value at the current VCPU at the time - of the update of this structure. Guests can subtract this value - from current tsc to derive a notion of elapsed time since the - structure update. - - system_time: a host notion of monotonic time, including sleep - time at the time this structure was last updated. Unit is - nanoseconds. - - tsc_to_system_mul: multiplier to be used when converting - tsc-related quantity to nanoseconds - - tsc_shift: shift to be used when converting tsc-related - quantity to nanoseconds. This shift will ensure that - multiplication with tsc_to_system_mul does not overflow. - A positive value denotes a left shift, a negative value - a right shift. - - The conversion from tsc to nanoseconds involves an additional - right shift by 32 bits. With this information, guests can - derive per-CPU time by doing: - - time = (current_tsc - tsc_timestamp) - if (tsc_shift >= 0) - time <<= tsc_shift; - else - time >>= -tsc_shift; - time = (time * tsc_to_system_mul) >> 32 - time = time + system_time - - flags: bits in this field indicate extended capabilities - coordinated between the guest and the hypervisor. Availability - of specific flags has to be checked in 0x40000001 cpuid leaf. - Current flags are: - - flag bit | cpuid bit | meaning - ------------------------------------------------------------- - | | time measures taken across - 0 | 24 | multiple cpus are guaranteed to - | | be monotonic - ------------------------------------------------------------- - | | guest vcpu has been paused by - 1 | N/A | the host - | | See 4.70 in api.txt - ------------------------------------------------------------- - - Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid - leaf prior to usage. - - -MSR_KVM_WALL_CLOCK: 0x11 - - data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - -MSR_KVM_SYSTEM_TIME: 0x12 - - data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. - - This MSR falls outside the reserved KVM range and may be removed in the - future. Its usage is deprecated. - - Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid - leaf prior to usage. - - The suggested algorithm for detecting kvmclock presence is then: - - if (!kvm_para_available()) /* refer to cpuid.txt */ - return NON_PRESENT; - - flags = cpuid_eax(0x40000001); - if (flags & 3) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - return PRESENT; - } else if (flags & 0) { - msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; - msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; - return PRESENT; - } else - return NON_PRESENT; - -MSR_KVM_ASYNC_PF_EN: 0x4b564d02 - data: Bits 63-6 hold 64-byte aligned physical address of a - 64 byte memory area which must be in guest RAM and must be - zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 - when asynchronous page faults are enabled on the vcpu 0 when - disabled. Bit 1 is 1 if asynchronous page faults can be injected - when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults - are delivered to L1 as #PF vmexits. Bit 2 can be set only if - KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. - - First 4 byte of 64 byte memory location will be written to by - the hypervisor at the time of asynchronous page fault (APF) - injection to indicate type of asynchronous page fault. Value - of 1 means that the page referred to by the page fault is not - present. Value 2 means that the page is now available. Disabling - interrupt inhibits APFs. Guest must not enable interrupt - before the reason is read, or it may be overwritten by another - APF. Since APF uses the same exception vector as regular page - fault guest must reset the reason to 0 before it does - something that can generate normal page fault. If during page - fault APF reason is 0 it means that this is regular page - fault. - - During delivery of type 1 APF cr2 contains a token that will - be used to notify a guest when missing page becomes - available. When page becomes available type 2 APF is sent with - cr2 set to the token associated with the page. There is special - kind of token 0xffffffff which tells vcpu that it should wake - up all processes waiting for APFs and no individual type 2 APFs - will be sent. - - If APF is disabled while there are outstanding APFs, they will - not be delivered. - - Currently type 2 APF will be always delivered on the same vcpu as - type 1 was, but guest should not rely on that. - -MSR_KVM_STEAL_TIME: 0x4b564d03 - - data: 64-byte alignment physical address of a memory area which must be - in guest RAM, plus an enable bit in bit 0. This memory is expected to - hold a copy of the following structure: - - struct kvm_steal_time { - __u64 steal; - __u32 version; - __u32 flags; - __u8 preempted; - __u8 u8_pad[3]; - __u32 pad[11]; - } - - whose data will be filled in by the hypervisor periodically. Only one - write, or registration, is needed for each VCPU. The interval between - updates of this structure is arbitrary and implementation-dependent. - The hypervisor may update this structure at any time it sees fit until - anything with bit0 == 0 is written to it. Guest is required to make sure - this structure is initialized to zero. - - Fields have the following meanings: - - version: a sequence counter. In other words, guest has to check - this field before and after grabbing time information and make - sure they are both equal and even. An odd version indicates an - in-progress update. - - flags: At this point, always zero. May be used to indicate - changes in this structure in the future. - - steal: the amount of time in which this vCPU did not run, in - nanoseconds. Time during which the vcpu is idle, will not be - reported as steal time. - - preempted: indicate the vCPU who owns this struct is running or - not. Non-zero values mean the vCPU has been preempted. Zero - means the vCPU is not preempted. NOTE, it is always zero if the - the hypervisor doesn't support this field. - -MSR_KVM_EOI_EN: 0x4b564d04 - data: Bit 0 is 1 when PV end of interrupt is enabled on the vcpu; 0 - when disabled. Bit 1 is reserved and must be zero. When PV end of - interrupt is enabled (bit 0 set), bits 63-2 hold a 4-byte aligned - physical address of a 4 byte memory area which must be in guest RAM and - must be zeroed. - - The first, least significant bit of 4 byte memory location will be - written to by the hypervisor, typically at the time of interrupt - injection. Value of 1 means that guest can skip writing EOI to the apic - (using MSR or MMIO write); instead, it is sufficient to signal - EOI by clearing the bit in guest memory - this location will - later be polled by the hypervisor. - Value of 0 means that the EOI write is required. - - It is always safe for the guest to ignore the optimization and perform - the APIC EOI write anyway. - - Hypervisor is guaranteed to only modify this least - significant bit while in the current VCPU context, this means that - guest does not need to use either lock prefix or memory ordering - primitives to synchronise with the hypervisor. - - However, hypervisor can set and clear this memory bit at any time: - therefore to make sure hypervisor does not interrupt the - guest and clear the least significant bit in the memory area - in the window between guest testing it to detect - whether it can skip EOI apic write and between guest - clearing it to signal EOI to the hypervisor, - guest must both read the least significant bit in the memory area and - clear it using a single CPU instruction, such as test and clear, or - compare and exchange. - -MSR_KVM_POLL_CONTROL: 0x4b564d05 - Control host-side polling. - - data: Bit 0 enables (1) or disables (0) host-side HLT polling logic. - - KVM guests can request the host not to poll on HLT, for example if - they are performing polling themselves. - -- cgit v1.2.3 From d371c011fc5e16bc50985bab94b7141204c52153 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:45 +0100 Subject: docs: kvm: devices/arm-vgic-its.txt to ReST format - Fix document title to match ReST format - Convert the table to be properly recognized - use proper markups for literal blocks - Some indentation fixes to match ReST While here, add an index for kvm devices. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/arm-vgic-its.rst | 209 ++++++++++++++++++++++++ Documentation/virt/kvm/devices/arm-vgic-its.txt | 181 -------------------- Documentation/virt/kvm/devices/index.rst | 10 ++ Documentation/virt/kvm/index.rst | 1 + 4 files changed, 220 insertions(+), 181 deletions(-) create mode 100644 Documentation/virt/kvm/devices/arm-vgic-its.rst delete mode 100644 Documentation/virt/kvm/devices/arm-vgic-its.txt create mode 100644 Documentation/virt/kvm/devices/index.rst diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.rst b/Documentation/virt/kvm/devices/arm-vgic-its.rst new file mode 100644 index 000000000000..6c304fd2b1b4 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic-its.rst @@ -0,0 +1,209 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +ARM Virtual Interrupt Translation Service (ITS) +=============================================== + +Device types supported: + KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller + +The ITS allows MSI(-X) interrupts to be injected into guests. This extension is +optional. Creating a virtual ITS controller also requires a host GICv3 (see +arm-vgic-v3.txt), but does not depend on having physical ITS controllers. + +There can be multiple ITS controllers per guest, each of them has to have +a separate, non-overlapping MMIO region. + + +Groups +====== + +KVM_DEV_ARM_VGIC_GRP_ADDR +------------------------- + + Attributes: + KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit) + Base address in the guest physical address space of the GICv3 ITS + control register frame. + This address needs to be 64K aligned and the region covers 128K. + + Errors: + + ======= ================================================= + -E2BIG Address outside of addressable IPA range + -EINVAL Incorrectly aligned address + -EEXIST Address already configured + -EFAULT Invalid user pointer for attr->addr. + -ENODEV Incorrect attribute or the ITS is not supported. + ======= ================================================= + + +KVM_DEV_ARM_VGIC_GRP_CTRL +------------------------- + + Attributes: + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the ITS, no additional parameter in + kvm_device_attr.addr. + + KVM_DEV_ARM_ITS_CTRL_RESET + reset the ITS, no additional parameter in kvm_device_attr.addr. + See "ITS Reset State" section. + + KVM_DEV_ARM_ITS_SAVE_TABLES + save the ITS table data into guest RAM, at the location provisioned + by the guest in corresponding registers/table entries. + + The layout of the tables in guest memory defines an ABI. The entries + are laid out in little endian format as described in the last paragraph. + + KVM_DEV_ARM_ITS_RESTORE_TABLES + restore the ITS tables from guest RAM to ITS internal structures. + + The GICV3 must be restored before the ITS and all ITS registers but + the GITS_CTLR must be restored before restoring the ITS tables. + + The GITS_IIDR read-only register must also be restored before + calling KVM_DEV_ARM_ITS_RESTORE_TABLES as the IIDR revision field + encodes the ABI revision. + + The expected ordering when restoring the GICv3/ITS is described in section + "ITS Restore Sequence". + + Errors: + + ======= ========================================================== + -ENXIO ITS not properly configured as required prior to setting + this attribute + -ENOMEM Memory shortage when allocating ITS internal data + -EINVAL Inconsistent restored data + -EFAULT Invalid guest ram access + -EBUSY One or more VCPUS are running + -EACCES The virtual ITS is backed by a physical GICv4 ITS, and the + state is not available + ======= ========================================================== + +KVM_DEV_ARM_VGIC_GRP_ITS_REGS +----------------------------- + + Attributes: + The attr field of kvm_device_attr encodes the offset of the + ITS register, relative to the ITS control frame base address + (ITS_base). + + kvm_device_attr.addr points to a __u64 value whatever the width + of the addressed register (32/64 bits). 64 bit registers can only + be accessed with full length. + + Writes to read-only registers are ignored by the kernel except for: + + - GITS_CREADR. It must be restored otherwise commands in the queue + will be re-executed after restoring CWRITER. GITS_CREADR must be + restored before restoring the GITS_CTLR which is likely to enable the + ITS. Also it must be restored after GITS_CBASER since a write to + GITS_CBASER resets GITS_CREADR. + - GITS_IIDR. The Revision field encodes the table layout ABI revision. + In the future we might implement direct injection of virtual LPIs. + This will require an upgrade of the table layout and an evolution of + the ABI. GITS_IIDR must be restored before calling + KVM_DEV_ARM_ITS_RESTORE_TABLES. + + For other registers, getting or setting a register has the same + effect as reading/writing the register on real hardware. + + Errors: + + ======= ==================================================== + -ENXIO Offset does not correspond to any supported register + -EFAULT Invalid user pointer for attr->addr + -EINVAL Offset is not 64-bit aligned + -EBUSY one or more VCPUS are running + ======= ==================================================== + +ITS Restore Sequence: +--------------------- + +The following ordering must be followed when restoring the GIC and the ITS: + +a) restore all guest memory and create vcpus +b) restore all redistributors +c) provide the ITS base address + (KVM_DEV_ARM_VGIC_GRP_ADDR) +d) restore the ITS in the following order: + + 1. Restore GITS_CBASER + 2. Restore all other ``GITS_`` registers, except GITS_CTLR! + 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES) + 4. Restore GITS_CTLR + +Then vcpus can be started. + +ITS Table ABI REV0: +------------------- + + Revision 0 of the ABI only supports the features of a virtual GICv3, and does + not support a virtual GICv4 with support for direct injection of virtual + interrupts for nested hypervisors. + + The device table and ITT are indexed by the DeviceID and EventID, + respectively. The collection table is not indexed by CollectionID, and the + entries in the collection are listed in no particular order. + All entries are 8 bytes. + + Device Table Entry (DTE):: + + bits: | 63| 62 ... 49 | 48 ... 5 | 4 ... 0 | + values: | V | next | ITT_addr | Size | + + where: + + - V indicates whether the entry is valid. If not, other fields + are not meaningful. + - next: equals to 0 if this entry is the last one; otherwise it + corresponds to the DeviceID offset to the next DTE, capped by + 2^14 -1. + - ITT_addr matches bits [51:8] of the ITT address (256 Byte aligned). + - Size specifies the supported number of bits for the EventID, + minus one + + Collection Table Entry (CTE):: + + bits: | 63| 62 .. 52 | 51 ... 16 | 15 ... 0 | + values: | V | RES0 | RDBase | ICID | + + where: + + - V indicates whether the entry is valid. If not, other fields are + not meaningful. + - RES0: reserved field with Should-Be-Zero-or-Preserved behavior. + - RDBase is the PE number (GICR_TYPER.Processor_Number semantic), + - ICID is the collection ID + + Interrupt Translation Entry (ITE):: + + bits: | 63 ... 48 | 47 ... 16 | 15 ... 0 | + values: | next | pINTID | ICID | + + where: + + - next: equals to 0 if this entry is the last one; otherwise it corresponds + to the EventID offset to the next ITE capped by 2^16 -1. + - pINTID is the physical LPI ID; if zero, it means the entry is not valid + and other fields are not meaningful. + - ICID is the collection ID + +ITS Reset State: +---------------- + +RESET returns the ITS to the same state that it was when first created and +initialized. When the RESET command returns, the following things are +guaranteed: + +- The ITS is not enabled and quiescent + GITS_CTLR.Enabled = 0 .Quiescent=1 +- There is no internally cached state +- No collection or device table are used + GITS_BASER.Valid = 0 +- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0 +- The ABI version is unchanged and remains the one set when the ITS + device was first created. diff --git a/Documentation/virt/kvm/devices/arm-vgic-its.txt b/Documentation/virt/kvm/devices/arm-vgic-its.txt deleted file mode 100644 index eeaa95b893a8..000000000000 --- a/Documentation/virt/kvm/devices/arm-vgic-its.txt +++ /dev/null @@ -1,181 +0,0 @@ -ARM Virtual Interrupt Translation Service (ITS) -=============================================== - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller - -The ITS allows MSI(-X) interrupts to be injected into guests. This extension is -optional. Creating a virtual ITS controller also requires a host GICv3 (see -arm-vgic-v3.txt), but does not depend on having physical ITS controllers. - -There can be multiple ITS controllers per guest, each of them has to have -a separate, non-overlapping MMIO region. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_ITS_ADDR_TYPE (rw, 64-bit) - Base address in the guest physical address space of the GICv3 ITS - control register frame. - This address needs to be 64K aligned and the region covers 128K. - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address - -EEXIST: Address already configured - -EFAULT: Invalid user pointer for attr->addr. - -ENODEV: Incorrect attribute or the ITS is not supported. - - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the ITS, no additional parameter in - kvm_device_attr.addr. - - KVM_DEV_ARM_ITS_CTRL_RESET - reset the ITS, no additional parameter in kvm_device_attr.addr. - See "ITS Reset State" section. - - KVM_DEV_ARM_ITS_SAVE_TABLES - save the ITS table data into guest RAM, at the location provisioned - by the guest in corresponding registers/table entries. - - The layout of the tables in guest memory defines an ABI. The entries - are laid out in little endian format as described in the last paragraph. - - KVM_DEV_ARM_ITS_RESTORE_TABLES - restore the ITS tables from guest RAM to ITS internal structures. - - The GICV3 must be restored before the ITS and all ITS registers but - the GITS_CTLR must be restored before restoring the ITS tables. - - The GITS_IIDR read-only register must also be restored before - calling KVM_DEV_ARM_ITS_RESTORE_TABLES as the IIDR revision field - encodes the ABI revision. - - The expected ordering when restoring the GICv3/ITS is described in section - "ITS Restore Sequence". - - Errors: - -ENXIO: ITS not properly configured as required prior to setting - this attribute - -ENOMEM: Memory shortage when allocating ITS internal data - -EINVAL: Inconsistent restored data - -EFAULT: Invalid guest ram access - -EBUSY: One or more VCPUS are running - -EACCES: The virtual ITS is backed by a physical GICv4 ITS, and the - state is not available - - KVM_DEV_ARM_VGIC_GRP_ITS_REGS - Attributes: - The attr field of kvm_device_attr encodes the offset of the - ITS register, relative to the ITS control frame base address - (ITS_base). - - kvm_device_attr.addr points to a __u64 value whatever the width - of the addressed register (32/64 bits). 64 bit registers can only - be accessed with full length. - - Writes to read-only registers are ignored by the kernel except for: - - GITS_CREADR. It must be restored otherwise commands in the queue - will be re-executed after restoring CWRITER. GITS_CREADR must be - restored before restoring the GITS_CTLR which is likely to enable the - ITS. Also it must be restored after GITS_CBASER since a write to - GITS_CBASER resets GITS_CREADR. - - GITS_IIDR. The Revision field encodes the table layout ABI revision. - In the future we might implement direct injection of virtual LPIs. - This will require an upgrade of the table layout and an evolution of - the ABI. GITS_IIDR must be restored before calling - KVM_DEV_ARM_ITS_RESTORE_TABLES. - - For other registers, getting or setting a register has the same - effect as reading/writing the register on real hardware. - Errors: - -ENXIO: Offset does not correspond to any supported register - -EFAULT: Invalid user pointer for attr->addr - -EINVAL: Offset is not 64-bit aligned - -EBUSY: one or more VCPUS are running - - ITS Restore Sequence: - ------------------------- - -The following ordering must be followed when restoring the GIC and the ITS: -a) restore all guest memory and create vcpus -b) restore all redistributors -c) provide the ITS base address - (KVM_DEV_ARM_VGIC_GRP_ADDR) -d) restore the ITS in the following order: - 1. Restore GITS_CBASER - 2. Restore all other GITS_ registers, except GITS_CTLR! - 3. Load the ITS table data (KVM_DEV_ARM_ITS_RESTORE_TABLES) - 4. Restore GITS_CTLR - -Then vcpus can be started. - - ITS Table ABI REV0: - ------------------- - - Revision 0 of the ABI only supports the features of a virtual GICv3, and does - not support a virtual GICv4 with support for direct injection of virtual - interrupts for nested hypervisors. - - The device table and ITT are indexed by the DeviceID and EventID, - respectively. The collection table is not indexed by CollectionID, and the - entries in the collection are listed in no particular order. - All entries are 8 bytes. - - Device Table Entry (DTE): - - bits: | 63| 62 ... 49 | 48 ... 5 | 4 ... 0 | - values: | V | next | ITT_addr | Size | - - where; - - V indicates whether the entry is valid. If not, other fields - are not meaningful. - - next: equals to 0 if this entry is the last one; otherwise it - corresponds to the DeviceID offset to the next DTE, capped by - 2^14 -1. - - ITT_addr matches bits [51:8] of the ITT address (256 Byte aligned). - - Size specifies the supported number of bits for the EventID, - minus one - - Collection Table Entry (CTE): - - bits: | 63| 62 .. 52 | 51 ... 16 | 15 ... 0 | - values: | V | RES0 | RDBase | ICID | - - where: - - V indicates whether the entry is valid. If not, other fields are - not meaningful. - - RES0: reserved field with Should-Be-Zero-or-Preserved behavior. - - RDBase is the PE number (GICR_TYPER.Processor_Number semantic), - - ICID is the collection ID - - Interrupt Translation Entry (ITE): - - bits: | 63 ... 48 | 47 ... 16 | 15 ... 0 | - values: | next | pINTID | ICID | - - where: - - next: equals to 0 if this entry is the last one; otherwise it corresponds - to the EventID offset to the next ITE capped by 2^16 -1. - - pINTID is the physical LPI ID; if zero, it means the entry is not valid - and other fields are not meaningful. - - ICID is the collection ID - - ITS Reset State: - ---------------- - -RESET returns the ITS to the same state that it was when first created and -initialized. When the RESET command returns, the following things are -guaranteed: - -- The ITS is not enabled and quiescent - GITS_CTLR.Enabled = 0 .Quiescent=1 -- There is no internally cached state -- No collection or device table are used - GITS_BASER.Valid = 0 -- GITS_CBASER = 0, GITS_CREADR = 0, GITS_CWRITER = 0 -- The ABI version is unchanged and remains the one set when the ITS - device was first created. diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst new file mode 100644 index 000000000000..2aad8d426097 --- /dev/null +++ b/Documentation/virt/kvm/devices/index.rst @@ -0,0 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======= +Devices +======= + +.. toctree:: + :maxdepth: 2 + + arm-vgic-its diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index cc6dde47b267..24d1076ec680 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -14,3 +14,4 @@ KVM vcpu-requests arm/index + devices/index -- cgit v1.2.3 From c0d1c8a0af591c139fe7339bf6cdf0e766037cd4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:46 +0100 Subject: docs: kvm: devices/arm-vgit-v3.txt to ReST - Use title markups; - change indent to match ReST syntax; - use proper table markups; - use literal block markups. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/arm-vgic-v3.rst | 291 +++++++++++++++++++++++++ Documentation/virt/kvm/devices/arm-vgic-v3.txt | 251 --------------------- Documentation/virt/kvm/devices/index.rst | 1 + 3 files changed, 292 insertions(+), 251 deletions(-) create mode 100644 Documentation/virt/kvm/devices/arm-vgic-v3.rst delete mode 100644 Documentation/virt/kvm/devices/arm-vgic-v3.txt diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.rst b/Documentation/virt/kvm/devices/arm-vgic-v3.rst new file mode 100644 index 000000000000..5dd3bff51978 --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic-v3.rst @@ -0,0 +1,291 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================== +ARM Virtual Generic Interrupt Controller v3 and later (VGICv3) +============================================================== + + +Device types supported: + - KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 + +Only one VGIC instance may be instantiated through this API. The created VGIC +will act as the VM interrupt controller, requiring emulated user-space devices +to inject interrupts to the VGIC instead of directly to CPUs. It is not +possible to create both a GICv3 and GICv2 on the same VM. + +Creating a guest GICv3 device requires a host GICv3 as well. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + + KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 distributor + register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned and the region covers 64 KByte. + + KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) + Base address in the guest physical address space of the GICv3 + redistributor register mappings. There are two 64K pages for each + VCPU and all of the redistributor pages are contiguous. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + This address needs to be 64K aligned. + + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (rw, 64-bit) + The attribute data pointed to by kvm_device_attr.addr is a __u64 value:: + + bits: | 63 .... 52 | 51 .... 16 | 15 - 12 |11 - 0 + values: | count | base | flags | index + + - index encodes the unique redistributor region index + - flags: reserved for future use, currently 0 + - base field encodes bits [51:16] of the guest physical base address + of the first redistributor in the region. + - count encodes the number of redistributors in the region. Must be + greater than 0. + + There are two 64K pages for each redistributor in the region and + redistributors are laid out contiguously within the region. Regions + are filled with redistributors in the index order. The sum of all + region count fields must be greater than or equal to the number of + VCPUs. Redistributor regions must be registered in the incremental + index order, starting from index 0. + + The characteristics of a specific redistributor region can be read + by presetting the index field in the attr data. + Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. + + It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes. + + Errors: + + ======= ============================================================= + -E2BIG Address outside of addressable IPA range + -EINVAL Incorrectly aligned address, bad redistributor region + count/index, mixed redistributor region attribute usage + -EEXIST Address already configured + -ENOENT Attempt to read the characteristics of a non existing + redistributor region + -ENXIO The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT Invalid user pointer for attr->addr. + ======= ============================================================= + + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS + Attributes: + + The attr field of kvm_device_attr encodes two values:: + + bits: | 63 .... 32 | 31 .... 0 | + values: | mpidr | offset | + + All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a + __u32 value. 64-bit registers must be accessed by separately accessing the + lower and higher word. + + Writes to read-only registers are ignored by the kernel. + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers. + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU + specified by the mpidr. + + The offset is relative to the "[Re]Distributor base address" as defined + in the GICv3/4 specs. Getting or setting such a register has the same + effect as reading or writing the register on real hardware, except for the + following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR, + GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. These registers behave + differently when accessed via this interface compared to their + architecturally defined behavior to allow software a full view of the + VGIC's internal state. + + The mpidr field is used to specify which + redistributor is accessed. The mpidr is ignored for the distributor. + + The mpidr encoding is based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows:: + + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + Note that distributor fields are not banked, but return the same value + regardless of the mpidr used to access the register. + + GICD_IIDR.Revision is updated when the KVM implementation is changed in a + way directly observable by the guest or userspace. Userspace should read + GICD_IIDR from KVM and write back the read value to confirm its expected + behavior is aligned with the KVM implementation. Userspace should set + GICD_IIDR before setting any other registers to ensure the expected + behavior. + + + The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such + that a write of a clear bit has no effect, whereas a write with a set bit + clears that value. To allow userspace to freely set the values of these two + registers, setting the attributes with the register offsets for these two + registers simply sets the non-reserved bits to the value written. + + + Accesses (reads and writes) to the GICD_ISPENDR register region and + GICR_ISPENDR0 registers get/set the value of the latched pending state for + the interrupts. + + This is identical to the value returned by a guest read from ISPENDR for an + edge triggered interrupt, but may differ for level triggered interrupts. + For edge triggered interrupts, once an interrupt becomes pending (whether + because of an edge detected on the input line or because of a guest write + to ISPENDR) this state is "latched", and only cleared when either the + interrupt is activated or when the guest writes to ICPENDR. A level + triggered interrupt may be pending either because the level input is held + high by a device, or because of a guest write to the ISPENDR register. Only + ISPENDR writes are latched; if the device lowers the line level then the + interrupt is no longer pending unless the guest also wrote to ISPENDR, and + conversely writes to ICPENDR or activations of the interrupt do not clear + the pending status if the line level is still being held high. (These + rules are documented in the GICv3 specification descriptions of the ICPENDR + and ISPENDR registers.) For a level triggered interrupt the value accessed + here is that of the latch which is set by ISPENDR and cleared by ICPENDR or + interrupt activation, whereas the value returned by a guest read from + ISPENDR is the logical OR of the latch value and the input line level. + + Raw access to the latch state is provided to userspace so that it can save + and restore the entire GIC internal state (which is defined by the + combination of the current input line level and the latch state, and cannot + be deduced from purely the line level and the value of the ISPENDR + registers). + + Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have + RAZ/WI semantics, meaning that reads always return 0 and writes are always + ignored. + + Errors: + + ====== ===================================================== + -ENXIO Getting or setting this register is not yet supported + -EBUSY One or more VCPUs are running + ====== ===================================================== + + + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS + Attributes: + + The attr field of kvm_device_attr encodes two values:: + + bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | + values: | mpidr | RES | instr | + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows:: + + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + The instr field encodes the system register to access based on the fields + defined in the A64 instruction set encoding for system register access + (RES means the bits are reserved for future use and should be zero):: + + | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 | + | Op 0 | Op1 | CRn | CRm | Op2 | + + All system regs accessed through this API are (rw, 64-bit) and + kvm_device_attr.addr points to a __u64 value. + + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the + CPU specified by the mpidr field. + + CPU interface registers access is not implemented for AArch32 mode. + Error -ENXIO is returned when accessed in AArch32 mode. + + Errors: + + ======= ===================================================== + -ENXIO Getting or setting this register is not yet supported + -EBUSY VCPU is running + -EINVAL Invalid mpidr or register value supplied + ======= ===================================================== + + + KVM_DEV_ARM_VGIC_GRP_NR_IRQS + Attributes: + + A value describing the number of interrupts (SGI, PPI and SPI) for + this GIC instance, ranging from 64 to 1024, in increments of 32. + + kvm_device_attr.addr points to a __u32 value. + + Errors: + + ======= ====================================== + -EINVAL Value set is out of the expected range + -EBUSY Value has already be set. + ======= ====================================== + + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC, no additional parameter in + kvm_device_attr.addr. + KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES + save all LPI pending bits into guest RAM pending tables. + + The first kB of the pending table is not altered by this operation. + + Errors: + + ======= ======================================================== + -ENXIO VGIC not properly configured as required prior to calling + this attribute + -ENODEV no online VCPU + -ENOMEM memory shortage when allocating vgic internal data + -EFAULT Invalid guest ram access + -EBUSY One or more VCPUS are running + ======= ======================================================== + + + KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO + Attributes: + + The attr field of kvm_device_attr encodes the following values:: + + bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 | + values: | mpidr | info | vINTID | + + The vINTID specifies which set of IRQs is reported on. + + The info field specifies which information userspace wants to get or set + using this interface. Currently we support the following info values: + + VGIC_LEVEL_INFO_LINE_LEVEL: + Get/Set the input level of the IRQ line for a set of 32 contiguously + numbered interrupts. + + vINTID must be a multiple of 32. + + kvm_device_attr.addr points to a __u32 value which will contain a + bitmap where a set bit means the interrupt level is asserted. + + Bit[n] indicates the status for interrupt vINTID + n. + + SGIs and any interrupt with a higher ID than the number of interrupts + supported, will be RAZ/WI. LPIs are always edge-triggered and are + therefore not supported by this interface. + + PPIs are reported per VCPU as specified in the mpidr field, and SPIs are + reported with the same value regardless of the mpidr specified. + + The mpidr field encodes the CPU ID based on the affinity information in the + architecture defined MPIDR, and the field is encoded as follows:: + + | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | + | Aff3 | Aff2 | Aff1 | Aff0 | + + Errors: + + ======= ============================================= + -EINVAL vINTID is not multiple of 32 or info field is + not VGIC_LEVEL_INFO_LINE_LEVEL + ======= ============================================= diff --git a/Documentation/virt/kvm/devices/arm-vgic-v3.txt b/Documentation/virt/kvm/devices/arm-vgic-v3.txt deleted file mode 100644 index ff290b43c8e5..000000000000 --- a/Documentation/virt/kvm/devices/arm-vgic-v3.txt +++ /dev/null @@ -1,251 +0,0 @@ -ARM Virtual Generic Interrupt Controller v3 and later (VGICv3) -============================================================== - - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0 - -Only one VGIC instance may be instantiated through this API. The created VGIC -will act as the VM interrupt controller, requiring emulated user-space devices -to inject interrupts to the VGIC instead of directly to CPUs. It is not -possible to create both a GICv3 and GICv2 on the same VM. - -Creating a guest GICv3 device requires a host GICv3 as well. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_V3_ADDR_TYPE_DIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 distributor - register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned and the region covers 64 KByte. - - KVM_VGIC_V3_ADDR_TYPE_REDIST (rw, 64-bit) - Base address in the guest physical address space of the GICv3 - redistributor register mappings. There are two 64K pages for each - VCPU and all of the redistributor pages are contiguous. - Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - This address needs to be 64K aligned. - - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (rw, 64-bit) - The attribute data pointed to by kvm_device_attr.addr is a __u64 value: - bits: | 63 .... 52 | 51 .... 16 | 15 - 12 |11 - 0 - values: | count | base | flags | index - - index encodes the unique redistributor region index - - flags: reserved for future use, currently 0 - - base field encodes bits [51:16] of the guest physical base address - of the first redistributor in the region. - - count encodes the number of redistributors in the region. Must be - greater than 0. - There are two 64K pages for each redistributor in the region and - redistributors are laid out contiguously within the region. Regions - are filled with redistributors in the index order. The sum of all - region count fields must be greater than or equal to the number of - VCPUs. Redistributor regions must be registered in the incremental - index order, starting from index 0. - The characteristics of a specific redistributor region can be read - by presetting the index field in the attr data. - Only valid for KVM_DEV_TYPE_ARM_VGIC_V3. - - It is invalid to mix calls with KVM_VGIC_V3_ADDR_TYPE_REDIST and - KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attributes. - - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address, bad redistributor region - count/index, mixed redistributor region attribute usage - -EEXIST: Address already configured - -ENOENT: Attempt to read the characteristics of a non existing - redistributor region - -ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -EFAULT: Invalid user pointer for attr->addr. - - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS - KVM_DEV_ARM_VGIC_GRP_REDIST_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 32 | 31 .... 0 | - values: | mpidr | offset | - - All distributor regs are (rw, 32-bit) and kvm_device_attr.addr points to a - __u32 value. 64-bit registers must be accessed by separately accessing the - lower and higher word. - - Writes to read-only registers are ignored by the kernel. - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS accesses the main distributor registers. - KVM_DEV_ARM_VGIC_GRP_REDIST_REGS accesses the redistributor of the CPU - specified by the mpidr. - - The offset is relative to the "[Re]Distributor base address" as defined - in the GICv3/4 specs. Getting or setting such a register has the same - effect as reading or writing the register on real hardware, except for the - following registers: GICD_STATUSR, GICR_STATUSR, GICD_ISPENDR, - GICR_ISPENDR0, GICD_ICPENDR, and GICR_ICPENDR0. These registers behave - differently when accessed via this interface compared to their - architecturally defined behavior to allow software a full view of the - VGIC's internal state. - - The mpidr field is used to specify which - redistributor is accessed. The mpidr is ignored for the distributor. - - The mpidr encoding is based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - - Note that distributor fields are not banked, but return the same value - regardless of the mpidr used to access the register. - - GICD_IIDR.Revision is updated when the KVM implementation is changed in a - way directly observable by the guest or userspace. Userspace should read - GICD_IIDR from KVM and write back the read value to confirm its expected - behavior is aligned with the KVM implementation. Userspace should set - GICD_IIDR before setting any other registers to ensure the expected - behavior. - - - The GICD_STATUSR and GICR_STATUSR registers are architecturally defined such - that a write of a clear bit has no effect, whereas a write with a set bit - clears that value. To allow userspace to freely set the values of these two - registers, setting the attributes with the register offsets for these two - registers simply sets the non-reserved bits to the value written. - - - Accesses (reads and writes) to the GICD_ISPENDR register region and - GICR_ISPENDR0 registers get/set the value of the latched pending state for - the interrupts. - - This is identical to the value returned by a guest read from ISPENDR for an - edge triggered interrupt, but may differ for level triggered interrupts. - For edge triggered interrupts, once an interrupt becomes pending (whether - because of an edge detected on the input line or because of a guest write - to ISPENDR) this state is "latched", and only cleared when either the - interrupt is activated or when the guest writes to ICPENDR. A level - triggered interrupt may be pending either because the level input is held - high by a device, or because of a guest write to the ISPENDR register. Only - ISPENDR writes are latched; if the device lowers the line level then the - interrupt is no longer pending unless the guest also wrote to ISPENDR, and - conversely writes to ICPENDR or activations of the interrupt do not clear - the pending status if the line level is still being held high. (These - rules are documented in the GICv3 specification descriptions of the ICPENDR - and ISPENDR registers.) For a level triggered interrupt the value accessed - here is that of the latch which is set by ISPENDR and cleared by ICPENDR or - interrupt activation, whereas the value returned by a guest read from - ISPENDR is the logical OR of the latch value and the input line level. - - Raw access to the latch state is provided to userspace so that it can save - and restore the entire GIC internal state (which is defined by the - combination of the current input line level and the latch state, and cannot - be deduced from purely the line level and the value of the ISPENDR - registers). - - Accesses to GICD_ICPENDR register region and GICR_ICPENDR0 registers have - RAZ/WI semantics, meaning that reads always return 0 and writes are always - ignored. - - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - - - KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 32 | 31 .... 16 | 15 .... 0 | - values: | mpidr | RES | instr | - - The mpidr field encodes the CPU ID based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - - The instr field encodes the system register to access based on the fields - defined in the A64 instruction set encoding for system register access - (RES means the bits are reserved for future use and should be zero): - - | 15 ... 14 | 13 ... 11 | 10 ... 7 | 6 ... 3 | 2 ... 0 | - | Op 0 | Op1 | CRn | CRm | Op2 | - - All system regs accessed through this API are (rw, 64-bit) and - kvm_device_attr.addr points to a __u64 value. - - KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS accesses the CPU interface registers for the - CPU specified by the mpidr field. - - CPU interface registers access is not implemented for AArch32 mode. - Error -ENXIO is returned when accessed in AArch32 mode. - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: VCPU is running - -EINVAL: Invalid mpidr or register value supplied - - - KVM_DEV_ARM_VGIC_GRP_NR_IRQS - Attributes: - A value describing the number of interrupts (SGI, PPI and SPI) for - this GIC instance, ranging from 64 to 1024, in increments of 32. - - kvm_device_attr.addr points to a __u32 value. - - Errors: - -EINVAL: Value set is out of the expected range - -EBUSY: Value has already be set. - - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC, no additional parameter in - kvm_device_attr.addr. - KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES - save all LPI pending bits into guest RAM pending tables. - - The first kB of the pending table is not altered by this operation. - Errors: - -ENXIO: VGIC not properly configured as required prior to calling - this attribute - -ENODEV: no online VCPU - -ENOMEM: memory shortage when allocating vgic internal data - -EFAULT: Invalid guest ram access - -EBUSY: One or more VCPUS are running - - - KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO - Attributes: - The attr field of kvm_device_attr encodes the following values: - bits: | 63 .... 32 | 31 .... 10 | 9 .... 0 | - values: | mpidr | info | vINTID | - - The vINTID specifies which set of IRQs is reported on. - - The info field specifies which information userspace wants to get or set - using this interface. Currently we support the following info values: - - VGIC_LEVEL_INFO_LINE_LEVEL: - Get/Set the input level of the IRQ line for a set of 32 contiguously - numbered interrupts. - vINTID must be a multiple of 32. - - kvm_device_attr.addr points to a __u32 value which will contain a - bitmap where a set bit means the interrupt level is asserted. - - Bit[n] indicates the status for interrupt vINTID + n. - - SGIs and any interrupt with a higher ID than the number of interrupts - supported, will be RAZ/WI. LPIs are always edge-triggered and are - therefore not supported by this interface. - - PPIs are reported per VCPU as specified in the mpidr field, and SPIs are - reported with the same value regardless of the mpidr specified. - - The mpidr field encodes the CPU ID based on the affinity information in the - architecture defined MPIDR, and the field is encoded as follows: - | 63 .... 56 | 55 .... 48 | 47 .... 40 | 39 .... 32 | - | Aff3 | Aff2 | Aff1 | Aff0 | - Errors: - -EINVAL: vINTID is not multiple of 32 or - info field is not VGIC_LEVEL_INFO_LINE_LEVEL diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 2aad8d426097..80c1e0e225f4 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -8,3 +8,4 @@ Devices :maxdepth: 2 arm-vgic-its + arm-vgic-v3 -- cgit v1.2.3 From bf6154dba0a7d4defd3e8c9c85d1933f442ef01b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:47 +0100 Subject: docs: kvm: convert devices/arm-vgit.txt to ReST - Use title markups; - change indent to match ReST syntax; - use proper table markups; - use literal block markups. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/arm-vgic.rst | 156 ++++++++++++++++++++++++++++ Documentation/virt/kvm/devices/arm-vgic.txt | 127 ---------------------- Documentation/virt/kvm/devices/index.rst | 1 + 3 files changed, 157 insertions(+), 127 deletions(-) create mode 100644 Documentation/virt/kvm/devices/arm-vgic.rst delete mode 100644 Documentation/virt/kvm/devices/arm-vgic.txt diff --git a/Documentation/virt/kvm/devices/arm-vgic.rst b/Documentation/virt/kvm/devices/arm-vgic.rst new file mode 100644 index 000000000000..40bdeea1d86e --- /dev/null +++ b/Documentation/virt/kvm/devices/arm-vgic.rst @@ -0,0 +1,156 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================================== +ARM Virtual Generic Interrupt Controller v2 (VGIC) +================================================== + +Device types supported: + + - KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 + +Only one VGIC instance may be instantiated through either this API or the +legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt +controller, requiring emulated user-space devices to inject interrupts to the +VGIC instead of directly to CPUs. + +GICv3 implementations with hardware compatibility support allow creating a +guest GICv2 through this interface. For information on creating a guest GICv3 +device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to +create both a GICv3 and GICv2 device on the same VM. + + +Groups: + KVM_DEV_ARM_VGIC_GRP_ADDR + Attributes: + + KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit) + Base address in the guest physical address space of the GIC distributor + register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. + This address needs to be 4K aligned and the region covers 4 KByte. + + KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) + Base address in the guest physical address space of the GIC virtual cpu + interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. + This address needs to be 4K aligned and the region covers 4 KByte. + + Errors: + + ======= ============================================================= + -E2BIG Address outside of addressable IPA range + -EINVAL Incorrectly aligned address + -EEXIST Address already configured + -ENXIO The group or attribute is unknown/unsupported for this device + or hardware support is missing. + -EFAULT Invalid user pointer for attr->addr. + ======= ============================================================= + + KVM_DEV_ARM_VGIC_GRP_DIST_REGS + Attributes: + + The attr field of kvm_device_attr encodes two values:: + + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | vcpu_index | offset | + + All distributor regs are (rw, 32-bit) + + The offset is relative to the "Distributor base address" as defined in the + GICv2 specs. Getting or setting such a register has the same effect as + reading or writing the register on the actual hardware from the cpu whose + index is specified with the vcpu_index field. Note that most distributor + fields are not banked, but return the same value regardless of the + vcpu_index used to access the register. + + GICD_IIDR.Revision is updated when the KVM implementation of an emulated + GICv2 is changed in a way directly observable by the guest or userspace. + Userspace should read GICD_IIDR from KVM and write back the read value to + confirm its expected behavior is aligned with the KVM implementation. + Userspace should set GICD_IIDR before setting any other registers (both + KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS) to ensure + the expected behavior. Unless GICD_IIDR has been set from userspace, writes + to the interrupt group registers (GICD_IGROUPR) are ignored. + + Errors: + + ======= ===================================================== + -ENXIO Getting or setting this register is not yet supported + -EBUSY One or more VCPUs are running + -EINVAL Invalid vcpu_index supplied + ======= ===================================================== + + KVM_DEV_ARM_VGIC_GRP_CPU_REGS + Attributes: + + The attr field of kvm_device_attr encodes two values:: + + bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | + values: | reserved | vcpu_index | offset | + + All CPU interface regs are (rw, 32-bit) + + The offset specifies the offset from the "CPU interface base address" as + defined in the GICv2 specs. Getting or setting such a register has the + same effect as reading or writing the register on the actual hardware. + + The Active Priorities Registers APRn are implementation defined, so we set a + fixed format for our implementation that fits with the model of a "GICv2 + implementation without the security extensions" which we present to the + guest. This interface always exposes four register APR[0-3] describing the + maximum possible 128 preemption levels. The semantics of the register + indicate if any interrupts in a given preemption level are in the active + state by setting the corresponding bit. + + Thus, preemption level X has one or more active interrupts if and only if: + + APRn[X mod 32] == 0b1, where n = X / 32 + + Bits for undefined preemption levels are RAZ/WI. + + Note that this differs from a CPU's view of the APRs on hardware in which + a GIC without the security extensions expose group 0 and group 1 active + priorities in separate register groups, whereas we show a combined view + similar to GICv2's GICH_APR. + + For historical reasons and to provide ABI compatibility with userspace we + export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask + field in the lower 5 bits of a word, meaning that userspace must always + use the lower 5 bits to communicate with the KVM device and must shift the + value left by 3 places to obtain the actual priority mask level. + + Errors: + + ======= ===================================================== + -ENXIO Getting or setting this register is not yet supported + -EBUSY One or more VCPUs are running + -EINVAL Invalid vcpu_index supplied + ======= ===================================================== + + KVM_DEV_ARM_VGIC_GRP_NR_IRQS + Attributes: + + A value describing the number of interrupts (SGI, PPI and SPI) for + this GIC instance, ranging from 64 to 1024, in increments of 32. + + Errors: + + ======= ============================================================= + -EINVAL Value set is out of the expected range + -EBUSY Value has already be set, or GIC has already been initialized + with default values. + ======= ============================================================= + + KVM_DEV_ARM_VGIC_GRP_CTRL + Attributes: + + KVM_DEV_ARM_VGIC_CTRL_INIT + request the initialization of the VGIC or ITS, no additional parameter + in kvm_device_attr.addr. + + Errors: + + ======= ========================================================= + -ENXIO VGIC not properly configured as required prior to calling + this attribute + -ENODEV no online VCPU + -ENOMEM memory shortage when allocating vgic internal data + ======= ========================================================= diff --git a/Documentation/virt/kvm/devices/arm-vgic.txt b/Documentation/virt/kvm/devices/arm-vgic.txt deleted file mode 100644 index 97b6518148f8..000000000000 --- a/Documentation/virt/kvm/devices/arm-vgic.txt +++ /dev/null @@ -1,127 +0,0 @@ -ARM Virtual Generic Interrupt Controller v2 (VGIC) -================================================== - -Device types supported: - KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0 - -Only one VGIC instance may be instantiated through either this API or the -legacy KVM_CREATE_IRQCHIP API. The created VGIC will act as the VM interrupt -controller, requiring emulated user-space devices to inject interrupts to the -VGIC instead of directly to CPUs. - -GICv3 implementations with hardware compatibility support allow creating a -guest GICv2 through this interface. For information on creating a guest GICv3 -device and guest ITS devices, see arm-vgic-v3.txt. It is not possible to -create both a GICv3 and GICv2 device on the same VM. - - -Groups: - KVM_DEV_ARM_VGIC_GRP_ADDR - Attributes: - KVM_VGIC_V2_ADDR_TYPE_DIST (rw, 64-bit) - Base address in the guest physical address space of the GIC distributor - register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. - This address needs to be 4K aligned and the region covers 4 KByte. - - KVM_VGIC_V2_ADDR_TYPE_CPU (rw, 64-bit) - Base address in the guest physical address space of the GIC virtual cpu - interface register mappings. Only valid for KVM_DEV_TYPE_ARM_VGIC_V2. - This address needs to be 4K aligned and the region covers 4 KByte. - Errors: - -E2BIG: Address outside of addressable IPA range - -EINVAL: Incorrectly aligned address - -EEXIST: Address already configured - -ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -EFAULT: Invalid user pointer for attr->addr. - - KVM_DEV_ARM_VGIC_GRP_DIST_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | vcpu_index | offset | - - All distributor regs are (rw, 32-bit) - - The offset is relative to the "Distributor base address" as defined in the - GICv2 specs. Getting or setting such a register has the same effect as - reading or writing the register on the actual hardware from the cpu whose - index is specified with the vcpu_index field. Note that most distributor - fields are not banked, but return the same value regardless of the - vcpu_index used to access the register. - - GICD_IIDR.Revision is updated when the KVM implementation of an emulated - GICv2 is changed in a way directly observable by the guest or userspace. - Userspace should read GICD_IIDR from KVM and write back the read value to - confirm its expected behavior is aligned with the KVM implementation. - Userspace should set GICD_IIDR before setting any other registers (both - KVM_DEV_ARM_VGIC_GRP_DIST_REGS and KVM_DEV_ARM_VGIC_GRP_CPU_REGS) to ensure - the expected behavior. Unless GICD_IIDR has been set from userspace, writes - to the interrupt group registers (GICD_IGROUPR) are ignored. - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - -EINVAL: Invalid vcpu_index supplied - - KVM_DEV_ARM_VGIC_GRP_CPU_REGS - Attributes: - The attr field of kvm_device_attr encodes two values: - bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | vcpu_index | offset | - - All CPU interface regs are (rw, 32-bit) - - The offset specifies the offset from the "CPU interface base address" as - defined in the GICv2 specs. Getting or setting such a register has the - same effect as reading or writing the register on the actual hardware. - - The Active Priorities Registers APRn are implementation defined, so we set a - fixed format for our implementation that fits with the model of a "GICv2 - implementation without the security extensions" which we present to the - guest. This interface always exposes four register APR[0-3] describing the - maximum possible 128 preemption levels. The semantics of the register - indicate if any interrupts in a given preemption level are in the active - state by setting the corresponding bit. - - Thus, preemption level X has one or more active interrupts if and only if: - - APRn[X mod 32] == 0b1, where n = X / 32 - - Bits for undefined preemption levels are RAZ/WI. - - Note that this differs from a CPU's view of the APRs on hardware in which - a GIC without the security extensions expose group 0 and group 1 active - priorities in separate register groups, whereas we show a combined view - similar to GICv2's GICH_APR. - - For historical reasons and to provide ABI compatibility with userspace we - export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask - field in the lower 5 bits of a word, meaning that userspace must always - use the lower 5 bits to communicate with the KVM device and must shift the - value left by 3 places to obtain the actual priority mask level. - - Errors: - -ENXIO: Getting or setting this register is not yet supported - -EBUSY: One or more VCPUs are running - -EINVAL: Invalid vcpu_index supplied - - KVM_DEV_ARM_VGIC_GRP_NR_IRQS - Attributes: - A value describing the number of interrupts (SGI, PPI and SPI) for - this GIC instance, ranging from 64 to 1024, in increments of 32. - - Errors: - -EINVAL: Value set is out of the expected range - -EBUSY: Value has already be set, or GIC has already been initialized - with default values. - - KVM_DEV_ARM_VGIC_GRP_CTRL - Attributes: - KVM_DEV_ARM_VGIC_CTRL_INIT - request the initialization of the VGIC or ITS, no additional parameter - in kvm_device_attr.addr. - Errors: - -ENXIO: VGIC not properly configured as required prior to calling - this attribute - -ENODEV: no online VCPU - -ENOMEM: memory shortage when allocating vgic internal data diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 80c1e0e225f4..7eabce80c61e 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -8,4 +8,5 @@ Devices :maxdepth: 2 arm-vgic-its + arm-vgic arm-vgic-v3 -- cgit v1.2.3 From 05c47036c62ea65a8f8aeaef5021c7220488a664 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:48 +0100 Subject: docs: kvm: convert devices/mpic.txt to ReST This document is almost in ReST format. The only thing needed is to mark a list as such and to add an extra whitespace. Yet, let's also use the standard document title markup, as it makes easier if anyone wants later to add sessions to it. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/index.rst | 1 + Documentation/virt/kvm/devices/mpic.rst | 58 ++++++++++++++++++++++++++++++++ Documentation/virt/kvm/devices/mpic.txt | 53 ----------------------------- 3 files changed, 59 insertions(+), 53 deletions(-) create mode 100644 Documentation/virt/kvm/devices/mpic.rst delete mode 100644 Documentation/virt/kvm/devices/mpic.txt diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 7eabce80c61e..9e5586e371de 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -10,3 +10,4 @@ Devices arm-vgic-its arm-vgic arm-vgic-v3 + mpic diff --git a/Documentation/virt/kvm/devices/mpic.rst b/Documentation/virt/kvm/devices/mpic.rst new file mode 100644 index 000000000000..55cefe030d41 --- /dev/null +++ b/Documentation/virt/kvm/devices/mpic.rst @@ -0,0 +1,58 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +MPIC interrupt controller +========================= + +Device types supported: + + - KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0 + - KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2 + +Only one MPIC instance, of any type, may be instantiated. The created +MPIC will act as the system interrupt controller, connecting to each +vcpu's interrupt inputs. + +Groups: + KVM_DEV_MPIC_GRP_MISC + Attributes: + + KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit) + Base address of the 256 KiB MPIC register space. Must be + naturally aligned. A value of zero disables the mapping. + Reset value is zero. + + KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit) + Access an MPIC register, as if the access were made from the guest. + "attr" is the byte offset into the MPIC register space. Accesses + must be 4-byte aligned. + + MSIs may be signaled by using this attribute group to write + to the relevant MSIIR. + + KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit) + IRQ input line for each standard openpic source. 0 is inactive and 1 + is active, regardless of interrupt sense. + + For edge-triggered interrupts: Writing 1 is considered an activating + edge, and writing 0 is ignored. Reading returns 1 if a previously + signaled edge has not been acknowledged, and 0 otherwise. + + "attr" is the IRQ number. IRQ numbers for standard sources are the + byte offset of the relevant IVPR from EIVPR0, divided by 32. + +IRQ Routing: + + The MPIC emulation supports IRQ routing. Only a single MPIC device can + be instantiated. Once that device has been created, it's available as + irqchip id 0. + + This irqchip 0 has 256 interrupt pins, which expose the interrupts in + the main array of interrupt sources (a.k.a. "SRC" interrupts). + + The numbering is the same as the MPIC device tree binding -- based on + the register offset from the beginning of the sources array, without + regard to any subdivisions in chip documentation such as "internal" + or "external" interrupts. + + Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. diff --git a/Documentation/virt/kvm/devices/mpic.txt b/Documentation/virt/kvm/devices/mpic.txt deleted file mode 100644 index 8257397adc3c..000000000000 --- a/Documentation/virt/kvm/devices/mpic.txt +++ /dev/null @@ -1,53 +0,0 @@ -MPIC interrupt controller -========================= - -Device types supported: - KVM_DEV_TYPE_FSL_MPIC_20 Freescale MPIC v2.0 - KVM_DEV_TYPE_FSL_MPIC_42 Freescale MPIC v4.2 - -Only one MPIC instance, of any type, may be instantiated. The created -MPIC will act as the system interrupt controller, connecting to each -vcpu's interrupt inputs. - -Groups: - KVM_DEV_MPIC_GRP_MISC - Attributes: - KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit) - Base address of the 256 KiB MPIC register space. Must be - naturally aligned. A value of zero disables the mapping. - Reset value is zero. - - KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit) - Access an MPIC register, as if the access were made from the guest. - "attr" is the byte offset into the MPIC register space. Accesses - must be 4-byte aligned. - - MSIs may be signaled by using this attribute group to write - to the relevant MSIIR. - - KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit) - IRQ input line for each standard openpic source. 0 is inactive and 1 - is active, regardless of interrupt sense. - - For edge-triggered interrupts: Writing 1 is considered an activating - edge, and writing 0 is ignored. Reading returns 1 if a previously - signaled edge has not been acknowledged, and 0 otherwise. - - "attr" is the IRQ number. IRQ numbers for standard sources are the - byte offset of the relevant IVPR from EIVPR0, divided by 32. - -IRQ Routing: - - The MPIC emulation supports IRQ routing. Only a single MPIC device can - be instantiated. Once that device has been created, it's available as - irqchip id 0. - - This irqchip 0 has 256 interrupt pins, which expose the interrupts in - the main array of interrupt sources (a.k.a. "SRC" interrupts). - - The numbering is the same as the MPIC device tree binding -- based on - the register offset from the beginning of the sources array, without - regard to any subdivisions in chip documentation such as "internal" - or "external" interrupts. - - Access to non-SRC interrupts is not implemented through IRQ routing mechanisms. -- cgit v1.2.3 From e944743003617aeaebebc33adef5de093e701766 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:49 +0100 Subject: docs: kvm: convert devices/s390_flic.txt to ReST - Use standard markup for document title; - Adjust indentation and add blank lines as needed; - use the notes markup; - mark code blocks as such. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/index.rst | 1 + Documentation/virt/kvm/devices/s390_flic.rst | 173 +++++++++++++++++++++++++++ Documentation/virt/kvm/devices/s390_flic.txt | 163 ------------------------- 3 files changed, 174 insertions(+), 163 deletions(-) create mode 100644 Documentation/virt/kvm/devices/s390_flic.rst delete mode 100644 Documentation/virt/kvm/devices/s390_flic.txt diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 9e5586e371de..e6caccc36623 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -11,3 +11,4 @@ Devices arm-vgic arm-vgic-v3 mpic + s390_flic diff --git a/Documentation/virt/kvm/devices/s390_flic.rst b/Documentation/virt/kvm/devices/s390_flic.rst new file mode 100644 index 000000000000..954190da7d04 --- /dev/null +++ b/Documentation/virt/kvm/devices/s390_flic.rst @@ -0,0 +1,173 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================== +FLIC (floating interrupt controller) +==================================== + +FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some +machine check interruptions. All interrupts are stored in a per-vm list of +pending interrupts. FLIC performs operations on this list. + +Only one FLIC instance may be instantiated. + +FLIC provides support to +- add interrupts (KVM_DEV_FLIC_ENQUEUE) +- inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) +- purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) +- enable/disable for the guest transparent async page faults +- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) +- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) +- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) +- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL) + +Groups: + KVM_DEV_FLIC_ENQUEUE + Passes a buffer and length into the kernel which are then injected into + the list of pending interrupts. + attr->addr contains the pointer to the buffer and attr->attr contains + the length of the buffer. + The format of the data structure kvm_s390_irq as it is copied from userspace + is defined in usr/include/linux/kvm.h. + + KVM_DEV_FLIC_GET_ALL_IRQS + Copies all floating interrupts into a buffer provided by userspace. + When the buffer is too small it returns -ENOMEM, which is the indication + for userspace to try again with a bigger buffer. + + -ENOBUFS is returned when the allocation of a kernelspace buffer has + failed. + + -EFAULT is returned when copying data to userspace failed. + All interrupts remain pending, i.e. are not deleted from the list of + currently pending interrupts. + attr->addr contains the userspace address of the buffer into which all + interrupt data will be copied. + attr->attr contains the size of the buffer in bytes. + + KVM_DEV_FLIC_CLEAR_IRQS + Simply deletes all elements from the list of currently pending floating + interrupts. No interrupts are injected into the guest. + + KVM_DEV_FLIC_CLEAR_IO_IRQ + Deletes one (if any) I/O interrupt for a subchannel identified by the + subsystem identification word passed via the buffer specified by + attr->addr (address) and attr->attr (length). + + KVM_DEV_FLIC_APF_ENABLE + Enables async page faults for the guest. So in case of a major page fault + the host is allowed to handle this async and continues the guest. + + KVM_DEV_FLIC_APF_DISABLE_WAIT + Disables async page faults for the guest and waits until already pending + async page faults are done. This is necessary to trigger a completion interrupt + for every init interrupt before migrating the interrupt list. + + KVM_DEV_FLIC_ADAPTER_REGISTER + Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter + describing the adapter to register:: + + struct kvm_s390_io_adapter { + __u32 id; + __u8 isc; + __u8 maskable; + __u8 swap; + __u8 flags; + }; + + id contains the unique id for the adapter, isc the I/O interruption subclass + to use, maskable whether this adapter may be masked (interrupts turned off), + swap whether the indicators need to be byte swapped, and flags contains + further characteristics of the adapter. + + Currently defined values for 'flags' are: + + - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS + (adapter-interrupt-suppression) facility. This flag only has an effect if + the AIS capability is enabled. + + Unknown flag values are ignored. + + + KVM_DEV_FLIC_ADAPTER_MODIFY + Modifies attributes of an existing I/O adapter interrupt source. Takes + a kvm_s390_io_adapter_req specifying the adapter and the operation:: + + struct kvm_s390_io_adapter_req { + __u32 id; + __u8 type; + __u8 mask; + __u16 pad0; + __u64 addr; + }; + + id specifies the adapter and type the operation. The supported operations + are: + + KVM_S390_IO_ADAPTER_MASK + mask or unmask the adapter, as specified in mask + + KVM_S390_IO_ADAPTER_MAP + perform a gmap translation for the guest address provided in addr, + pin a userspace page for the translated address and add it to the + list of mappings + + .. note:: A new mapping will be created unconditionally; therefore, + the calling code should avoid making duplicate mappings. + + KVM_S390_IO_ADAPTER_UNMAP + release a userspace page for the translated address specified in addr + from the list of mappings + + KVM_DEV_FLIC_AISM + modify the adapter-interruption-suppression mode for a given isc if the + AIS capability is enabled. Takes a kvm_s390_ais_req describing:: + + struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; + }; + + isc contains the target I/O interruption subclass, mode the target + adapter-interruption-suppression mode. The following modes are + currently supported: + + - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection + is always allowed; + - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq + injection is only allowed once and the following adapter interrupts + will be suppressed until the mode is set again to ALL-Interruptions + or SINGLE-Interruption mode. + + KVM_DEV_FLIC_AIRQ_INJECT + Inject adapter interrupts on a specified adapter. + attr->attr contains the unique id for the adapter, which allows for + adapter-specific checks and actions. + For adapters subject to AIS, handle the airq injection suppression for + an isc according to the adapter-interruption-suppression mode on condition + that the AIS capability is enabled. + + KVM_DEV_FLIC_AISM_ALL + Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes + a kvm_s390_ais_all describing:: + + struct kvm_s390_ais_all { + __u8 simm; /* Single-Interruption-Mode mask */ + __u8 nimm; /* No-Interruption-Mode mask * + }; + + simm contains Single-Interruption-Mode mask for all ISCs, nimm contains + No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds + to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and + nimm bit presents AIS mode for a ISC. + + KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION. + +Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on +FLIC with an unknown group or attribute gives the error code EINVAL (instead of +ENXIO, as specified in the API documentation). It is not possible to conclude +that a FLIC operation is unavailable based on the error code resulting from a +usage attempt. + +.. note:: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a + zero schid is specified. diff --git a/Documentation/virt/kvm/devices/s390_flic.txt b/Documentation/virt/kvm/devices/s390_flic.txt deleted file mode 100644 index a4e20a090174..000000000000 --- a/Documentation/virt/kvm/devices/s390_flic.txt +++ /dev/null @@ -1,163 +0,0 @@ -FLIC (floating interrupt controller) -==================================== - -FLIC handles floating (non per-cpu) interrupts, i.e. I/O, service and some -machine check interruptions. All interrupts are stored in a per-vm list of -pending interrupts. FLIC performs operations on this list. - -Only one FLIC instance may be instantiated. - -FLIC provides support to -- add interrupts (KVM_DEV_FLIC_ENQUEUE) -- inspect currently pending interrupts (KVM_FLIC_GET_ALL_IRQS) -- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS) -- purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) -- enable/disable for the guest transparent async page faults -- register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) -- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) -- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) -- get/set all AIS mode states (KVM_DEV_FLIC_AISM_ALL) - -Groups: - KVM_DEV_FLIC_ENQUEUE - Passes a buffer and length into the kernel which are then injected into - the list of pending interrupts. - attr->addr contains the pointer to the buffer and attr->attr contains - the length of the buffer. - The format of the data structure kvm_s390_irq as it is copied from userspace - is defined in usr/include/linux/kvm.h. - - KVM_DEV_FLIC_GET_ALL_IRQS - Copies all floating interrupts into a buffer provided by userspace. - When the buffer is too small it returns -ENOMEM, which is the indication - for userspace to try again with a bigger buffer. - -ENOBUFS is returned when the allocation of a kernelspace buffer has - failed. - -EFAULT is returned when copying data to userspace failed. - All interrupts remain pending, i.e. are not deleted from the list of - currently pending interrupts. - attr->addr contains the userspace address of the buffer into which all - interrupt data will be copied. - attr->attr contains the size of the buffer in bytes. - - KVM_DEV_FLIC_CLEAR_IRQS - Simply deletes all elements from the list of currently pending floating - interrupts. No interrupts are injected into the guest. - - KVM_DEV_FLIC_CLEAR_IO_IRQ - Deletes one (if any) I/O interrupt for a subchannel identified by the - subsystem identification word passed via the buffer specified by - attr->addr (address) and attr->attr (length). - - KVM_DEV_FLIC_APF_ENABLE - Enables async page faults for the guest. So in case of a major page fault - the host is allowed to handle this async and continues the guest. - - KVM_DEV_FLIC_APF_DISABLE_WAIT - Disables async page faults for the guest and waits until already pending - async page faults are done. This is necessary to trigger a completion interrupt - for every init interrupt before migrating the interrupt list. - - KVM_DEV_FLIC_ADAPTER_REGISTER - Register an I/O adapter interrupt source. Takes a kvm_s390_io_adapter - describing the adapter to register: - -struct kvm_s390_io_adapter { - __u32 id; - __u8 isc; - __u8 maskable; - __u8 swap; - __u8 flags; -}; - - id contains the unique id for the adapter, isc the I/O interruption subclass - to use, maskable whether this adapter may be masked (interrupts turned off), - swap whether the indicators need to be byte swapped, and flags contains - further characteristics of the adapter. - Currently defined values for 'flags' are: - - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS - (adapter-interrupt-suppression) facility. This flag only has an effect if - the AIS capability is enabled. - Unknown flag values are ignored. - - - KVM_DEV_FLIC_ADAPTER_MODIFY - Modifies attributes of an existing I/O adapter interrupt source. Takes - a kvm_s390_io_adapter_req specifying the adapter and the operation: - -struct kvm_s390_io_adapter_req { - __u32 id; - __u8 type; - __u8 mask; - __u16 pad0; - __u64 addr; -}; - - id specifies the adapter and type the operation. The supported operations - are: - - KVM_S390_IO_ADAPTER_MASK - mask or unmask the adapter, as specified in mask - - KVM_S390_IO_ADAPTER_MAP - perform a gmap translation for the guest address provided in addr, - pin a userspace page for the translated address and add it to the - list of mappings - Note: A new mapping will be created unconditionally; therefore, - the calling code should avoid making duplicate mappings. - - KVM_S390_IO_ADAPTER_UNMAP - release a userspace page for the translated address specified in addr - from the list of mappings - - KVM_DEV_FLIC_AISM - modify the adapter-interruption-suppression mode for a given isc if the - AIS capability is enabled. Takes a kvm_s390_ais_req describing: - -struct kvm_s390_ais_req { - __u8 isc; - __u16 mode; -}; - - isc contains the target I/O interruption subclass, mode the target - adapter-interruption-suppression mode. The following modes are - currently supported: - - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection - is always allowed; - - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq - injection is only allowed once and the following adapter interrupts - will be suppressed until the mode is set again to ALL-Interruptions - or SINGLE-Interruption mode. - - KVM_DEV_FLIC_AIRQ_INJECT - Inject adapter interrupts on a specified adapter. - attr->attr contains the unique id for the adapter, which allows for - adapter-specific checks and actions. - For adapters subject to AIS, handle the airq injection suppression for - an isc according to the adapter-interruption-suppression mode on condition - that the AIS capability is enabled. - - KVM_DEV_FLIC_AISM_ALL - Gets or sets the adapter-interruption-suppression mode for all ISCs. Takes - a kvm_s390_ais_all describing: - -struct kvm_s390_ais_all { - __u8 simm; /* Single-Interruption-Mode mask */ - __u8 nimm; /* No-Interruption-Mode mask * -}; - - simm contains Single-Interruption-Mode mask for all ISCs, nimm contains - No-Interruption-Mode mask for all ISCs. Each bit in simm and nimm corresponds - to an ISC (MSB0 bit 0 to ISC 0 and so on). The combination of simm bit and - nimm bit presents AIS mode for a ISC. - - KVM_DEV_FLIC_AISM_ALL is indicated by KVM_CAP_S390_AIS_MIGRATION. - -Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on -FLIC with an unknown group or attribute gives the error code EINVAL (instead of -ENXIO, as specified in the API documentation). It is not possible to conclude -that a FLIC operation is unavailable based on the error code resulting from a -usage attempt. - -Note: The KVM_DEV_FLIC_CLEAR_IO_IRQ ioctl will return EINVAL in case a zero -schid is specified. -- cgit v1.2.3 From e777a5bd98c689f1ee15ebdbce739497e7d92f70 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:50 +0100 Subject: docs: kvm: convert devices/vcpu.txt to ReST - Use title markups; - adjust indentation and add blank lines as needed; - adjust tables to match ReST accepted formats; - use :field: markups; - mark code blocks as such. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/index.rst | 1 + Documentation/virt/kvm/devices/vcpu.rst | 114 +++++++++++++++++++++++++++++++ Documentation/virt/kvm/devices/vcpu.txt | 76 --------------------- 3 files changed, 115 insertions(+), 76 deletions(-) create mode 100644 Documentation/virt/kvm/devices/vcpu.rst delete mode 100644 Documentation/virt/kvm/devices/vcpu.txt diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index e6caccc36623..5a61838f0e61 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -12,3 +12,4 @@ Devices arm-vgic-v3 mpic s390_flic + vcpu diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst new file mode 100644 index 000000000000..9963e680770a --- /dev/null +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -0,0 +1,114 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +Generic vcpu interface +====================== + +The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, +KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct +kvm_device_attr as other devices, but targets VCPU-wide settings and controls. + +The groups and attributes per virtual cpu, if any, are architecture specific. + +1. GROUP: KVM_ARM_VCPU_PMU_V3_CTRL +================================== + +:Architectures: ARM64 + +1.1. ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_IRQ +--------------------------------------- + +:Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a + pointer to an int + +Returns: + + ======= ======================================================== + -EBUSY The PMU overflow interrupt is already set + -ENXIO The overflow interrupt not set when attempting to get it + -ENODEV PMUv3 not supported + -EINVAL Invalid PMU overflow interrupt number supplied or + trying to set the IRQ number without using an in-kernel + irqchip. + ======= ======================================================== + +A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt +number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt +type must be same for each vcpu. As a PPI, the interrupt number is the same for +all vcpus, while as an SPI it must be a separate number per vcpu. + +1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT +--------------------------------------- + +:Parameters: no additional parameter in kvm_device_attr.addr + +Returns: + + ======= ====================================================== + -ENODEV PMUv3 not supported or GIC not initialized + -ENXIO PMUv3 not properly configured or in-kernel irqchip not + configured as required prior to calling this attribute + -EBUSY PMUv3 already initialized + ======= ====================================================== + +Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel +virtual GIC implementation, this must be done after initializing the in-kernel +irqchip. + + +2. GROUP: KVM_ARM_VCPU_TIMER_CTRL +================================= + +:Architectures: ARM, ARM64 + +2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER +----------------------------------------------------------------------------- + +:Parameters: in kvm_device_attr.addr the address for the timer interrupt is a + pointer to an int + +Returns: + + ======= ================================= + -EINVAL Invalid timer interrupt number + -EBUSY One or more VCPUs has already run + ======= ================================= + +A value describing the architected timer interrupt number when connected to an +in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the +attribute overrides the default values (see below). + +============================= ========================================== +KVM_ARM_VCPU_TIMER_IRQ_VTIMER The EL1 virtual timer intid (default: 27) +KVM_ARM_VCPU_TIMER_IRQ_PTIMER The EL1 physical timer intid (default: 30) +============================= ========================================== + +Setting the same PPI for different timers will prevent the VCPUs from running. +Setting the interrupt number on a VCPU configures all VCPUs created at that +time to use the number provided for a given timer, overwriting any previously +configured values on other VCPUs. Userspace should configure the interrupt +numbers on at least one VCPU after creating all VCPUs and before running any +VCPUs. + +3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL +================================== + +:Architectures: ARM64 + +3.1 ATTRIBUTE: KVM_ARM_VCPU_PVTIME_IPA +-------------------------------------- + +:Parameters: 64-bit base address + +Returns: + + ======= ====================================== + -ENXIO Stolen time not implemented + -EEXIST Base address already set for this VCPU + -EINVAL Base address not 64 byte aligned + ======= ====================================== + +Specifies the base address of the stolen time structure for this VCPU. The +base address must be 64 byte aligned and exist within a valid guest memory +region. See Documentation/virt/kvm/arm/pvtime.txt for more information +including the layout of the stolen time structure. diff --git a/Documentation/virt/kvm/devices/vcpu.txt b/Documentation/virt/kvm/devices/vcpu.txt deleted file mode 100644 index 6f3bd64a05b0..000000000000 --- a/Documentation/virt/kvm/devices/vcpu.txt +++ /dev/null @@ -1,76 +0,0 @@ -Generic vcpu interface -==================================== - -The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, -KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct -kvm_device_attr as other devices, but targets VCPU-wide settings and controls. - -The groups and attributes per virtual cpu, if any, are architecture specific. - -1. GROUP: KVM_ARM_VCPU_PMU_V3_CTRL -Architectures: ARM64 - -1.1. ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_IRQ -Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a - pointer to an int -Returns: -EBUSY: The PMU overflow interrupt is already set - -ENXIO: The overflow interrupt not set when attempting to get it - -ENODEV: PMUv3 not supported - -EINVAL: Invalid PMU overflow interrupt number supplied or - trying to set the IRQ number without using an in-kernel - irqchip. - -A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt -number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt -type must be same for each vcpu. As a PPI, the interrupt number is the same for -all vcpus, while as an SPI it must be a separate number per vcpu. - -1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT -Parameters: no additional parameter in kvm_device_attr.addr -Returns: -ENODEV: PMUv3 not supported or GIC not initialized - -ENXIO: PMUv3 not properly configured or in-kernel irqchip not - configured as required prior to calling this attribute - -EBUSY: PMUv3 already initialized - -Request the initialization of the PMUv3. If using the PMUv3 with an in-kernel -virtual GIC implementation, this must be done after initializing the in-kernel -irqchip. - - -2. GROUP: KVM_ARM_VCPU_TIMER_CTRL -Architectures: ARM,ARM64 - -2.1. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_VTIMER -2.2. ATTRIBUTE: KVM_ARM_VCPU_TIMER_IRQ_PTIMER -Parameters: in kvm_device_attr.addr the address for the timer interrupt is a - pointer to an int -Returns: -EINVAL: Invalid timer interrupt number - -EBUSY: One or more VCPUs has already run - -A value describing the architected timer interrupt number when connected to an -in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the -attribute overrides the default values (see below). - -KVM_ARM_VCPU_TIMER_IRQ_VTIMER: The EL1 virtual timer intid (default: 27) -KVM_ARM_VCPU_TIMER_IRQ_PTIMER: The EL1 physical timer intid (default: 30) - -Setting the same PPI for different timers will prevent the VCPUs from running. -Setting the interrupt number on a VCPU configures all VCPUs created at that -time to use the number provided for a given timer, overwriting any previously -configured values on other VCPUs. Userspace should configure the interrupt -numbers on at least one VCPU after creating all VCPUs and before running any -VCPUs. - -3. GROUP: KVM_ARM_VCPU_PVTIME_CTRL -Architectures: ARM64 - -3.1 ATTRIBUTE: KVM_ARM_VCPU_PVTIME_IPA -Parameters: 64-bit base address -Returns: -ENXIO: Stolen time not implemented - -EEXIST: Base address already set for this VCPU - -EINVAL: Base address not 64 byte aligned - -Specifies the base address of the stolen time structure for this VCPU. The -base address must be 64 byte aligned and exist within a valid guest memory -region. See Documentation/virt/kvm/arm/pvtime.txt for more information -including the layout of the stolen time structure. -- cgit v1.2.3 From aff7aeea548312cacd146e80efb944bd8f2c0faa Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:51 +0100 Subject: docs: kvm: convert devices/vfio.txt to ReST - Use standard title markup; - adjust lists; - mark code blocks as such. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/index.rst | 1 + Documentation/virt/kvm/devices/vfio.rst | 41 ++++++++++++++++++++++++++++++++ Documentation/virt/kvm/devices/vfio.txt | 36 ---------------------------- 3 files changed, 42 insertions(+), 36 deletions(-) create mode 100644 Documentation/virt/kvm/devices/vfio.rst delete mode 100644 Documentation/virt/kvm/devices/vfio.txt diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 5a61838f0e61..cbbadda080d0 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -13,3 +13,4 @@ Devices mpic s390_flic vcpu + vfio diff --git a/Documentation/virt/kvm/devices/vfio.rst b/Documentation/virt/kvm/devices/vfio.rst new file mode 100644 index 000000000000..2d20dc561069 --- /dev/null +++ b/Documentation/virt/kvm/devices/vfio.rst @@ -0,0 +1,41 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +VFIO virtual device +=================== + +Device types supported: + + - KVM_DEV_TYPE_VFIO + +Only one VFIO instance may be created per VM. The created device +tracks VFIO groups in use by the VM and features of those groups +important to the correctness and acceleration of the VM. As groups +are enabled and disabled for use by the VM, KVM should be updated +about their presence. When registered with KVM, a reference to the +VFIO-group is held by KVM. + +Groups: + KVM_DEV_VFIO_GROUP + +KVM_DEV_VFIO_GROUP attributes: + KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct:: + + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where: + + - @groupfd is a file descriptor for a VFIO group; + - @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. diff --git a/Documentation/virt/kvm/devices/vfio.txt b/Documentation/virt/kvm/devices/vfio.txt deleted file mode 100644 index 528c77c8022c..000000000000 --- a/Documentation/virt/kvm/devices/vfio.txt +++ /dev/null @@ -1,36 +0,0 @@ -VFIO virtual device -=================== - -Device types supported: - KVM_DEV_TYPE_VFIO - -Only one VFIO instance may be created per VM. The created device -tracks VFIO groups in use by the VM and features of those groups -important to the correctness and acceleration of the VM. As groups -are enabled and disabled for use by the VM, KVM should be updated -about their presence. When registered with KVM, a reference to the -VFIO-group is held by KVM. - -Groups: - KVM_DEV_VFIO_GROUP - -KVM_DEV_VFIO_GROUP attributes: - KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. - KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table - allocated by sPAPR KVM. - kvm_device_attr.addr points to a struct: - - struct kvm_vfio_spapr_tce { - __s32 groupfd; - __s32 tablefd; - }; - - where - @groupfd is a file descriptor for a VFIO group; - @tablefd is a file descriptor for a TCE table allocated via - KVM_CREATE_SPAPR_TCE. -- cgit v1.2.3 From 6c972ba685d5849009e0747cf8799adc3b8d5f11 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:52 +0100 Subject: docs: kvm: convert devices/vm.txt to ReST - Use title markups; - adjust indentation and add blank lines as needed; - use :field: markups; - Use cross-references; - mark code blocks as such. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/index.rst | 1 + Documentation/virt/kvm/devices/vm.rst | 316 +++++++++++++++++++++++++++++++ Documentation/virt/kvm/devices/vm.txt | 270 -------------------------- 3 files changed, 317 insertions(+), 270 deletions(-) create mode 100644 Documentation/virt/kvm/devices/vm.rst delete mode 100644 Documentation/virt/kvm/devices/vm.txt diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index cbbadda080d0..29f8ecdf7fa0 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -14,3 +14,4 @@ Devices s390_flic vcpu vfio + vm diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst new file mode 100644 index 000000000000..0aa5b1cfd700 --- /dev/null +++ b/Documentation/virt/kvm/devices/vm.rst @@ -0,0 +1,316 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Generic vm interface +==================== + +The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, +KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same +struct kvm_device_attr as other devices, but targets VM-wide settings +and controls. + +The groups and attributes per virtual machine, if any, are architecture +specific. + +1. GROUP: KVM_S390_VM_MEM_CTRL +============================== + +:Architectures: s390 + +1.1. ATTRIBUTE: KVM_S390_VM_MEM_ENABLE_CMMA +------------------------------------------- + +:Parameters: none +:Returns: -EBUSY if a vcpu is already defined, otherwise 0 + +Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. + +1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA +---------------------------------------- + +:Parameters: none +:Returns: -EINVAL if CMMA was not enabled; + 0 otherwise + +Clear the CMMA status for all guest pages, so any pages the guest marked +as unused are again used any may not be reclaimed by the host. + +1.3. ATTRIBUTE KVM_S390_VM_MEM_LIMIT_SIZE +----------------------------------------- + +:Parameters: in attr->addr the address for the new limit of guest memory +:Returns: -EFAULT if the given address is not accessible; + -EINVAL if the virtual machine is of type UCONTROL; + -E2BIG if the given guest memory is to big for that machine; + -EBUSY if a vcpu is already defined; + -ENOMEM if not enough memory is available for a new shadow guest mapping; + 0 otherwise. + +Allows userspace to query the actual limit and set a new limit for +the maximum guest memory size. The limit will be rounded up to +2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by +the number of page table levels. In the case that there is no limit we will set +the limit to KVM_S390_NO_MEM_LIMIT (U64_MAX). + +2. GROUP: KVM_S390_VM_CPU_MODEL +=============================== + +:Architectures: s390 + +2.1. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE (r/o) +--------------------------------------------- + +Allows user space to retrieve machine and kvm specific cpu related information:: + + struct kvm_s390_vm_cpu_machine { + __u64 cpuid; # CPUID of host + __u32 ibc; # IBC level range offered by host + __u8 pad[4]; + __u64 fac_mask[256]; # set of cpu facilities enabled by KVM + __u64 fac_list[256]; # set of cpu facilities offered by host + } + +:Parameters: address of buffer to store the machine related cpu data + of type struct kvm_s390_vm_cpu_machine* +:Returns: -EFAULT if the given address is not accessible from kernel space; + -ENOMEM if not enough memory is available to process the ioctl; + 0 in case of success. + +2.2. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR (r/w) +=============================================== + +Allows user space to retrieve or request to change cpu related information for a vcpu:: + + struct kvm_s390_vm_cpu_processor { + __u64 cpuid; # CPUID currently (to be) used by this vcpu + __u16 ibc; # IBC level currently (to be) used by this vcpu + __u8 pad[6]; + __u64 fac_list[256]; # set of cpu facilities currently (to be) used + # by this vcpu + } + +KVM does not enforce or limit the cpu model data in any form. Take the information +retrieved by means of KVM_S390_VM_CPU_MACHINE as hint for reasonable configuration +setups. Instruction interceptions triggered by additionally set facility bits that +are not handled by KVM need to by imlemented in the VM driver code. + +:Parameters: address of buffer to store/set the processor related cpu + data of type struct kvm_s390_vm_cpu_processor*. +:Returns: -EBUSY in case 1 or more vcpus are already activated (only in write case); + -EFAULT if the given address is not accessible from kernel space; + -ENOMEM if not enough memory is available to process the ioctl; + 0 in case of success. + +.. _KVM_S390_VM_CPU_MACHINE_FEAT: + +2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) +-------------------------------------------------- + +Allows user space to retrieve available cpu features. A feature is available if +provided by the hardware and supported by kvm. In theory, cpu features could +even be completely emulated by kvm. + +:: + + struct kvm_s390_vm_cpu_feat { + __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering + }; + +:Parameters: address of a buffer to load the feature list from. +:Returns: -EFAULT if the given address is not accessible from kernel space; + 0 in case of success. + +2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) +---------------------------------------------------- + +Allows user space to retrieve or change enabled cpu features for all VCPUs of a +VM. Features that are not available cannot be enabled. + +See :ref:`KVM_S390_VM_CPU_MACHINE_FEAT` for +a description of the parameter struct. + +:Parameters: address of a buffer to store/load the feature list from. +:Returns: -EFAULT if the given address is not accessible from kernel space; + -EINVAL if a cpu feature that is not available is to be enabled; + -EBUSY if at least one VCPU has already been defined; + 0 in case of success. + +.. _KVM_S390_VM_CPU_MACHINE_SUBFUNC: + +2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) +----------------------------------------------------- + +Allows user space to retrieve available cpu subfunctions without any filtering +done by a set IBC. These subfunctions are indicated to the guest VCPU via +query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff. + +A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the +STFL(E) bit introducing the affected instruction. If the affected instruction +indicates subfunctions via a "query subfunction", the response block is +contained in the returned struct. If the affected instruction +indicates subfunctions via a "test bit" mechanism, the subfunction codes are +contained in the returned struct in MSB 0 bit numbering. + +:: + + struct kvm_s390_vm_cpu_subfunc { + u8 plo[32]; # always valid (ESA/390 feature) + u8 ptff[16]; # valid with TOD-clock steering + u8 kmac[16]; # valid with Message-Security-Assist + u8 kmc[16]; # valid with Message-Security-Assist + u8 km[16]; # valid with Message-Security-Assist + u8 kimd[16]; # valid with Message-Security-Assist + u8 klmd[16]; # valid with Message-Security-Assist + u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 + u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 + u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 + u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 + u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 + u8 kma[16]; # valid with Message-Security-Assist-Extension 8 + u8 kdsa[16]; # valid with Message-Security-Assist-Extension 9 + u8 reserved[1792]; # reserved for future instructions + }; + +:Parameters: address of a buffer to load the subfunction blocks from. +:Returns: -EFAULT if the given address is not accessible from kernel space; + 0 in case of success. + +2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) +------------------------------------------------------- + +Allows user space to retrieve or change cpu subfunctions to be indicated for +all VCPUs of a VM. This attribute will only be available if kernel and +hardware support are in place. + +The kernel uses the configured subfunction blocks for indication to +the guest. A subfunction block will only be used if the associated STFL(E) bit +has not been disabled by user space (so the instruction to be queried is +actually available for the guest). + +As long as no data has been written, a read will fail. The IBC will be used +to determine available subfunctions in this case, this will guarantee backward +compatibility. + +See :ref:`KVM_S390_VM_CPU_MACHINE_SUBFUNC` for a +description of the parameter struct. + +:Parameters: address of a buffer to store/load the subfunction blocks from. +:Returns: -EFAULT if the given address is not accessible from kernel space; + -EINVAL when reading, if there was no write yet; + -EBUSY if at least one VCPU has already been defined; + 0 in case of success. + +3. GROUP: KVM_S390_VM_TOD +========================= + +:Architectures: s390 + +3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH +------------------------------------ + +Allows user space to set/get the TOD clock extension (u8) (superseded by +KVM_S390_VM_TOD_EXT). + +:Parameters: address of a buffer in user space to store the data (u8) to +:Returns: -EFAULT if the given address is not accessible from kernel space; + -EINVAL if setting the TOD clock extension to != 0 is not supported + +3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW +----------------------------------- + +Allows user space to set/get bits 0-63 of the TOD clock register as defined in +the POP (u64). + +:Parameters: address of a buffer in user space to store the data (u64) to +:Returns: -EFAULT if the given address is not accessible from kernel space + +3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT +----------------------------------- + +Allows user space to set/get bits 0-63 of the TOD clock register as defined in +the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it +also allows user space to get/set it. If the guest CPU model does not support +it, it is stored as 0 and not allowed to be set to a value != 0. + +:Parameters: address of a buffer in user space to store the data + (kvm_s390_vm_tod_clock) to +:Returns: -EFAULT if the given address is not accessible from kernel space; + -EINVAL if setting the TOD clock extension to != 0 is not supported + +4. GROUP: KVM_S390_VM_CRYPTO +============================ + +:Architectures: s390 + +4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o) +------------------------------------------------------ + +Allows user space to enable aes key wrapping, including generating a new +wrapping key. + +:Parameters: none +:Returns: 0 + +4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o) +------------------------------------------------------ + +Allows user space to enable dea key wrapping, including generating a new +wrapping key. + +:Parameters: none +:Returns: 0 + +4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o) +------------------------------------------------------- + +Allows user space to disable aes key wrapping, clearing the wrapping key. + +:Parameters: none +:Returns: 0 + +4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o) +------------------------------------------------------- + +Allows user space to disable dea key wrapping, clearing the wrapping key. + +:Parameters: none +:Returns: 0 + +5. GROUP: KVM_S390_VM_MIGRATION +=============================== + +:Architectures: s390 + +5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) +------------------------------------------------ + +Allows userspace to stop migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is not active will have no +effects. + +:Parameters: none +:Returns: 0 + +5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) +------------------------------------------------- + +Allows userspace to start migration mode, needed for PGSTE migration. +Setting this attribute when migration mode is already active will have +no effects. + +:Parameters: none +:Returns: -ENOMEM if there is not enough free memory to start migration mode; + -EINVAL if the state of the VM is invalid (e.g. no memory defined); + 0 in case of success. + +5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) +-------------------------------------------------- + +Allows userspace to query the status of migration mode. + +:Parameters: address of a buffer in user space to store the data (u64) to; + the data itself is either 0 if migration mode is disabled or 1 + if it is enabled +:Returns: -EFAULT if the given address is not accessible from kernel space; + 0 in case of success. diff --git a/Documentation/virt/kvm/devices/vm.txt b/Documentation/virt/kvm/devices/vm.txt deleted file mode 100644 index 4ffb82b02468..000000000000 --- a/Documentation/virt/kvm/devices/vm.txt +++ /dev/null @@ -1,270 +0,0 @@ -Generic vm interface -==================================== - -The virtual machine "device" also accepts the ioctls KVM_SET_DEVICE_ATTR, -KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same -struct kvm_device_attr as other devices, but targets VM-wide settings -and controls. - -The groups and attributes per virtual machine, if any, are architecture -specific. - -1. GROUP: KVM_S390_VM_MEM_CTRL -Architectures: s390 - -1.1. ATTRIBUTE: KVM_S390_VM_MEM_ENABLE_CMMA -Parameters: none -Returns: -EBUSY if a vcpu is already defined, otherwise 0 - -Enables Collaborative Memory Management Assist (CMMA) for the virtual machine. - -1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA -Parameters: none -Returns: -EINVAL if CMMA was not enabled - 0 otherwise - -Clear the CMMA status for all guest pages, so any pages the guest marked -as unused are again used any may not be reclaimed by the host. - -1.3. ATTRIBUTE KVM_S390_VM_MEM_LIMIT_SIZE -Parameters: in attr->addr the address for the new limit of guest memory -Returns: -EFAULT if the given address is not accessible - -EINVAL if the virtual machine is of type UCONTROL - -E2BIG if the given guest memory is to big for that machine - -EBUSY if a vcpu is already defined - -ENOMEM if not enough memory is available for a new shadow guest mapping - 0 otherwise - -Allows userspace to query the actual limit and set a new limit for -the maximum guest memory size. The limit will be rounded up to -2048 MB, 4096 GB, 8192 TB respectively, as this limit is governed by -the number of page table levels. In the case that there is no limit we will set -the limit to KVM_S390_NO_MEM_LIMIT (U64_MAX). - -2. GROUP: KVM_S390_VM_CPU_MODEL -Architectures: s390 - -2.1. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE (r/o) - -Allows user space to retrieve machine and kvm specific cpu related information: - -struct kvm_s390_vm_cpu_machine { - __u64 cpuid; # CPUID of host - __u32 ibc; # IBC level range offered by host - __u8 pad[4]; - __u64 fac_mask[256]; # set of cpu facilities enabled by KVM - __u64 fac_list[256]; # set of cpu facilities offered by host -} - -Parameters: address of buffer to store the machine related cpu data - of type struct kvm_s390_vm_cpu_machine* -Returns: -EFAULT if the given address is not accessible from kernel space - -ENOMEM if not enough memory is available to process the ioctl - 0 in case of success - -2.2. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR (r/w) - -Allows user space to retrieve or request to change cpu related information for a vcpu: - -struct kvm_s390_vm_cpu_processor { - __u64 cpuid; # CPUID currently (to be) used by this vcpu - __u16 ibc; # IBC level currently (to be) used by this vcpu - __u8 pad[6]; - __u64 fac_list[256]; # set of cpu facilities currently (to be) used - # by this vcpu -} - -KVM does not enforce or limit the cpu model data in any form. Take the information -retrieved by means of KVM_S390_VM_CPU_MACHINE as hint for reasonable configuration -setups. Instruction interceptions triggered by additionally set facility bits that -are not handled by KVM need to by imlemented in the VM driver code. - -Parameters: address of buffer to store/set the processor related cpu - data of type struct kvm_s390_vm_cpu_processor*. -Returns: -EBUSY in case 1 or more vcpus are already activated (only in write case) - -EFAULT if the given address is not accessible from kernel space - -ENOMEM if not enough memory is available to process the ioctl - 0 in case of success - -2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o) - -Allows user space to retrieve available cpu features. A feature is available if -provided by the hardware and supported by kvm. In theory, cpu features could -even be completely emulated by kvm. - -struct kvm_s390_vm_cpu_feat { - __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering -}; - -Parameters: address of a buffer to load the feature list from. -Returns: -EFAULT if the given address is not accessible from kernel space. - 0 in case of success. - -2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w) - -Allows user space to retrieve or change enabled cpu features for all VCPUs of a -VM. Features that are not available cannot be enabled. - -See 2.3. for a description of the parameter struct. - -Parameters: address of a buffer to store/load the feature list from. -Returns: -EFAULT if the given address is not accessible from kernel space. - -EINVAL if a cpu feature that is not available is to be enabled. - -EBUSY if at least one VCPU has already been defined. - 0 in case of success. - -2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o) - -Allows user space to retrieve available cpu subfunctions without any filtering -done by a set IBC. These subfunctions are indicated to the guest VCPU via -query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff. - -A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the -STFL(E) bit introducing the affected instruction. If the affected instruction -indicates subfunctions via a "query subfunction", the response block is -contained in the returned struct. If the affected instruction -indicates subfunctions via a "test bit" mechanism, the subfunction codes are -contained in the returned struct in MSB 0 bit numbering. - -struct kvm_s390_vm_cpu_subfunc { - u8 plo[32]; # always valid (ESA/390 feature) - u8 ptff[16]; # valid with TOD-clock steering - u8 kmac[16]; # valid with Message-Security-Assist - u8 kmc[16]; # valid with Message-Security-Assist - u8 km[16]; # valid with Message-Security-Assist - u8 kimd[16]; # valid with Message-Security-Assist - u8 klmd[16]; # valid with Message-Security-Assist - u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3 - u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4 - u8 kmf[16]; # valid with Message-Security-Assist-Extension 4 - u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 - u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 - u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 - u8 kma[16]; # valid with Message-Security-Assist-Extension 8 - u8 kdsa[16]; # valid with Message-Security-Assist-Extension 9 - u8 reserved[1792]; # reserved for future instructions -}; - -Parameters: address of a buffer to load the subfunction blocks from. -Returns: -EFAULT if the given address is not accessible from kernel space. - 0 in case of success. - -2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w) - -Allows user space to retrieve or change cpu subfunctions to be indicated for -all VCPUs of a VM. This attribute will only be available if kernel and -hardware support are in place. - -The kernel uses the configured subfunction blocks for indication to -the guest. A subfunction block will only be used if the associated STFL(E) bit -has not been disabled by user space (so the instruction to be queried is -actually available for the guest). - -As long as no data has been written, a read will fail. The IBC will be used -to determine available subfunctions in this case, this will guarantee backward -compatibility. - -See 2.5. for a description of the parameter struct. - -Parameters: address of a buffer to store/load the subfunction blocks from. -Returns: -EFAULT if the given address is not accessible from kernel space. - -EINVAL when reading, if there was no write yet. - -EBUSY if at least one VCPU has already been defined. - 0 in case of success. - -3. GROUP: KVM_S390_VM_TOD -Architectures: s390 - -3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH - -Allows user space to set/get the TOD clock extension (u8) (superseded by -KVM_S390_VM_TOD_EXT). - -Parameters: address of a buffer in user space to store the data (u8) to -Returns: -EFAULT if the given address is not accessible from kernel space - -EINVAL if setting the TOD clock extension to != 0 is not supported - -3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW - -Allows user space to set/get bits 0-63 of the TOD clock register as defined in -the POP (u64). - -Parameters: address of a buffer in user space to store the data (u64) to -Returns: -EFAULT if the given address is not accessible from kernel space - -3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT -Allows user space to set/get bits 0-63 of the TOD clock register as defined in -the POP (u64). If the guest CPU model supports the TOD clock extension (u8), it -also allows user space to get/set it. If the guest CPU model does not support -it, it is stored as 0 and not allowed to be set to a value != 0. - -Parameters: address of a buffer in user space to store the data - (kvm_s390_vm_tod_clock) to -Returns: -EFAULT if the given address is not accessible from kernel space - -EINVAL if setting the TOD clock extension to != 0 is not supported - -4. GROUP: KVM_S390_VM_CRYPTO -Architectures: s390 - -4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o) - -Allows user space to enable aes key wrapping, including generating a new -wrapping key. - -Parameters: none -Returns: 0 - -4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o) - -Allows user space to enable dea key wrapping, including generating a new -wrapping key. - -Parameters: none -Returns: 0 - -4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o) - -Allows user space to disable aes key wrapping, clearing the wrapping key. - -Parameters: none -Returns: 0 - -4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o) - -Allows user space to disable dea key wrapping, clearing the wrapping key. - -Parameters: none -Returns: 0 - -5. GROUP: KVM_S390_VM_MIGRATION -Architectures: s390 - -5.1. ATTRIBUTE: KVM_S390_VM_MIGRATION_STOP (w/o) - -Allows userspace to stop migration mode, needed for PGSTE migration. -Setting this attribute when migration mode is not active will have no -effects. - -Parameters: none -Returns: 0 - -5.2. ATTRIBUTE: KVM_S390_VM_MIGRATION_START (w/o) - -Allows userspace to start migration mode, needed for PGSTE migration. -Setting this attribute when migration mode is already active will have -no effects. - -Parameters: none -Returns: -ENOMEM if there is not enough free memory to start migration mode - -EINVAL if the state of the VM is invalid (e.g. no memory defined) - 0 in case of success. - -5.3. ATTRIBUTE: KVM_S390_VM_MIGRATION_STATUS (r/o) - -Allows userspace to query the status of migration mode. - -Parameters: address of a buffer in user space to store the data (u64) to; - the data itself is either 0 if migration mode is disabled or 1 - if it is enabled -Returns: -EFAULT if the given address is not accessible from kernel space - 0 in case of success. -- cgit v1.2.3 From 5cccf3797435008b7cd8d9d98d37db3962368710 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:53 +0100 Subject: docs: kvm: convert devices/xics.txt to ReST - Use title markups; - adjust indentation and add blank lines as needed; - adjust tables to match ReST accepted formats; - use :field: markups. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/index.rst | 1 + Documentation/virt/kvm/devices/xics.rst | 92 ++++++++++++++++++++++++++++++++ Documentation/virt/kvm/devices/xics.txt | 76 -------------------------- 3 files changed, 93 insertions(+), 76 deletions(-) create mode 100644 Documentation/virt/kvm/devices/xics.rst delete mode 100644 Documentation/virt/kvm/devices/xics.txt diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 29f8ecdf7fa0..63b61369d09b 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -15,3 +15,4 @@ Devices vcpu vfio vm + xics diff --git a/Documentation/virt/kvm/devices/xics.rst b/Documentation/virt/kvm/devices/xics.rst new file mode 100644 index 000000000000..2d6927e0b776 --- /dev/null +++ b/Documentation/virt/kvm/devices/xics.rst @@ -0,0 +1,92 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================= +XICS interrupt controller +========================= + +Device type supported: KVM_DEV_TYPE_XICS + +Groups: + 1. KVM_DEV_XICS_GRP_SOURCES + Attributes: + + One per interrupt source, indexed by the source number. + 2. KVM_DEV_XICS_GRP_CTRL + Attributes: + + 2.1 KVM_DEV_XICS_NR_SERVERS (write only) + + The kvm_device_attr.addr points to a __u32 value which is the number of + interrupt server numbers (ie, highest possible vcpu id plus one). + + Errors: + + ======= ========================================== + -EINVAL Value greater than KVM_MAX_VCPU_ID. + -EFAULT Invalid user pointer for attr->addr. + -EBUSY A vcpu is already connected to the device. + ======= ========================================== + +This device emulates the XICS (eXternal Interrupt Controller +Specification) defined in PAPR. The XICS has a set of interrupt +sources, each identified by a 20-bit source number, and a set of +Interrupt Control Presentation (ICP) entities, also called "servers", +each associated with a virtual CPU. + +The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH +capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and +the interrupt server number (i.e. the vcpu number from the XICS's +point of view) in args[1] of the kvm_enable_cap struct. Each ICP has +64 bits of state which can be read and written using the +KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit +state word has the following bitfields, starting at the +least-significant end of the word: + +* Unused, 16 bits + +* Pending interrupt priority, 8 bits + Zero is the highest priority, 255 means no interrupt is pending. + +* Pending IPI (inter-processor interrupt) priority, 8 bits + Zero is the highest priority, 255 means no IPI is pending. + +* Pending interrupt source number, 24 bits + Zero means no interrupt pending, 2 means an IPI is pending + +* Current processor priority, 8 bits + Zero is the highest priority, meaning no interrupts can be + delivered, and 255 is the lowest priority. + +Each source has 64 bits of state that can be read and written using +the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the +KVM_DEV_XICS_GRP_SOURCES attribute group, with the attribute number being +the interrupt source number. The 64 bit state word has the following +bitfields, starting from the least-significant end of the word: + +* Destination (server number), 32 bits + + This specifies where the interrupt should be sent, and is the + interrupt server number specified for the destination vcpu. + +* Priority, 8 bits + + This is the priority specified for this interrupt source, where 0 is + the highest priority and 255 is the lowest. An interrupt with a + priority of 255 will never be delivered. + +* Level sensitive flag, 1 bit + + This bit is 1 for a level-sensitive interrupt source, or 0 for + edge-sensitive (or MSI). + +* Masked flag, 1 bit + + This bit is set to 1 if the interrupt is masked (cannot be delivered + regardless of its priority), for example by the ibm,int-off RTAS + call, or 0 if it is not masked. + +* Pending flag, 1 bit + + This bit is 1 if the source has a pending interrupt, otherwise 0. + +Only one XICS instance may be created per VM. diff --git a/Documentation/virt/kvm/devices/xics.txt b/Documentation/virt/kvm/devices/xics.txt deleted file mode 100644 index 423332dda7bc..000000000000 --- a/Documentation/virt/kvm/devices/xics.txt +++ /dev/null @@ -1,76 +0,0 @@ -XICS interrupt controller - -Device type supported: KVM_DEV_TYPE_XICS - -Groups: - 1. KVM_DEV_XICS_GRP_SOURCES - Attributes: One per interrupt source, indexed by the source number. - - 2. KVM_DEV_XICS_GRP_CTRL - Attributes: - 2.1 KVM_DEV_XICS_NR_SERVERS (write only) - The kvm_device_attr.addr points to a __u32 value which is the number of - interrupt server numbers (ie, highest possible vcpu id plus one). - Errors: - -EINVAL: Value greater than KVM_MAX_VCPU_ID. - -EFAULT: Invalid user pointer for attr->addr. - -EBUSY: A vcpu is already connected to the device. - -This device emulates the XICS (eXternal Interrupt Controller -Specification) defined in PAPR. The XICS has a set of interrupt -sources, each identified by a 20-bit source number, and a set of -Interrupt Control Presentation (ICP) entities, also called "servers", -each associated with a virtual CPU. - -The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH -capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and -the interrupt server number (i.e. the vcpu number from the XICS's -point of view) in args[1] of the kvm_enable_cap struct. Each ICP has -64 bits of state which can be read and written using the -KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu. The 64 bit -state word has the following bitfields, starting at the -least-significant end of the word: - -* Unused, 16 bits - -* Pending interrupt priority, 8 bits - Zero is the highest priority, 255 means no interrupt is pending. - -* Pending IPI (inter-processor interrupt) priority, 8 bits - Zero is the highest priority, 255 means no IPI is pending. - -* Pending interrupt source number, 24 bits - Zero means no interrupt pending, 2 means an IPI is pending - -* Current processor priority, 8 bits - Zero is the highest priority, meaning no interrupts can be - delivered, and 255 is the lowest priority. - -Each source has 64 bits of state that can be read and written using -the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the -KVM_DEV_XICS_GRP_SOURCES attribute group, with the attribute number being -the interrupt source number. The 64 bit state word has the following -bitfields, starting from the least-significant end of the word: - -* Destination (server number), 32 bits - This specifies where the interrupt should be sent, and is the - interrupt server number specified for the destination vcpu. - -* Priority, 8 bits - This is the priority specified for this interrupt source, where 0 is - the highest priority and 255 is the lowest. An interrupt with a - priority of 255 will never be delivered. - -* Level sensitive flag, 1 bit - This bit is 1 for a level-sensitive interrupt source, or 0 for - edge-sensitive (or MSI). - -* Masked flag, 1 bit - This bit is set to 1 if the interrupt is masked (cannot be delivered - regardless of its priority), for example by the ibm,int-off RTAS - call, or 0 if it is not masked. - -* Pending flag, 1 bit - This bit is 1 if the source has a pending interrupt, otherwise 0. - -Only one XICS instance may be created per VM. -- cgit v1.2.3 From d3b52e4976cec9c830bc14bd38f043e8ca00ba68 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:54 +0100 Subject: docs: kvm: convert devices/xive.txt to ReST - Use title markups; - adjust indentation and add blank lines as needed; - adjust tables to match ReST accepted formats; - mark code blocks as such. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/devices/index.rst | 1 + Documentation/virt/kvm/devices/xive.rst | 247 +++++++++++++++++++++++++++++++ Documentation/virt/kvm/devices/xive.txt | 205 ------------------------- 3 files changed, 248 insertions(+), 205 deletions(-) create mode 100644 Documentation/virt/kvm/devices/xive.rst delete mode 100644 Documentation/virt/kvm/devices/xive.txt diff --git a/Documentation/virt/kvm/devices/index.rst b/Documentation/virt/kvm/devices/index.rst index 63b61369d09b..192cda7405c8 100644 --- a/Documentation/virt/kvm/devices/index.rst +++ b/Documentation/virt/kvm/devices/index.rst @@ -16,3 +16,4 @@ Devices vfio vm xics + xive diff --git a/Documentation/virt/kvm/devices/xive.rst b/Documentation/virt/kvm/devices/xive.rst new file mode 100644 index 000000000000..8bdf3dc38f01 --- /dev/null +++ b/Documentation/virt/kvm/devices/xive.rst @@ -0,0 +1,247 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================================================== +POWER9 eXternal Interrupt Virtualization Engine (XIVE Gen1) +=========================================================== + +Device types supported: + - KVM_DEV_TYPE_XIVE POWER9 XIVE Interrupt Controller generation 1 + +This device acts as a VM interrupt controller. It provides the KVM +interface to configure the interrupt sources of a VM in the underlying +POWER9 XIVE interrupt controller. + +Only one XIVE instance may be instantiated. A guest XIVE device +requires a POWER9 host and the guest OS should have support for the +XIVE native exploitation interrupt mode. If not, it should run using +the legacy interrupt mode, referred as XICS (POWER7/8). + +* Device Mappings + + The KVM device exposes different MMIO ranges of the XIVE HW which + are required for interrupt management. These are exposed to the + guest in VMAs populated with a custom VM fault handler. + + 1. Thread Interrupt Management Area (TIMA) + + Each thread has an associated Thread Interrupt Management context + composed of a set of registers. These registers let the thread + handle priority management and interrupt acknowledgment. The most + important are : + + - Interrupt Pending Buffer (IPB) + - Current Processor Priority (CPPR) + - Notification Source Register (NSR) + + They are exposed to software in four different pages each proposing + a view with a different privilege. The first page is for the + physical thread context and the second for the hypervisor. Only the + third (operating system) and the fourth (user level) are exposed the + guest. + + 2. Event State Buffer (ESB) + + Each source is associated with an Event State Buffer (ESB) with + either a pair of even/odd pair of pages which provides commands to + manage the source: to trigger, to EOI, to turn off the source for + instance. + + 3. Device pass-through + + When a device is passed-through into the guest, the source + interrupts are from a different HW controller (PHB4) and the ESB + pages exposed to the guest should accommadate this change. + + The passthru_irq helpers, kvmppc_xive_set_mapped() and + kvmppc_xive_clr_mapped() are called when the device HW irqs are + mapped into or unmapped from the guest IRQ number space. The KVM + device extends these helpers to clear the ESB pages of the guest IRQ + number being mapped and then lets the VM fault handler repopulate. + The handler will insert the ESB page corresponding to the HW + interrupt of the device being passed-through or the initial IPI ESB + page if the device has being removed. + + The ESB remapping is fully transparent to the guest and the OS + device driver. All handling is done within VFIO and the above + helpers in KVM-PPC. + +* Groups: + +1. KVM_DEV_XIVE_GRP_CTRL + Provides global controls on the device + + Attributes: + 1.1 KVM_DEV_XIVE_RESET (write only) + Resets the interrupt controller configuration for sources and event + queues. To be used by kexec and kdump. + + Errors: none + + 1.2 KVM_DEV_XIVE_EQ_SYNC (write only) + Sync all the sources and queues and mark the EQ pages dirty. This + to make sure that a consistent memory state is captured when + migrating the VM. + + Errors: none + + 1.3 KVM_DEV_XIVE_NR_SERVERS (write only) + The kvm_device_attr.addr points to a __u32 value which is the number of + interrupt server numbers (ie, highest possible vcpu id plus one). + + Errors: + + ======= ========================================== + -EINVAL Value greater than KVM_MAX_VCPU_ID. + -EFAULT Invalid user pointer for attr->addr. + -EBUSY A vCPU is already connected to the device. + ======= ========================================== + +2. KVM_DEV_XIVE_GRP_SOURCE (write only) + Initializes a new source in the XIVE device and mask it. + + Attributes: + Interrupt source number (64-bit) + + The kvm_device_attr.addr points to a __u64 value:: + + bits: | 63 .... 2 | 1 | 0 + values: | unused | level | type + + - type: 0:MSI 1:LSI + - level: assertion level in case of an LSI. + + Errors: + + ======= ========================================== + -E2BIG Interrupt source number is out of range + -ENOMEM Could not create a new source block + -EFAULT Invalid user pointer for attr->addr. + -ENXIO Could not allocate underlying HW interrupt + ======= ========================================== + +3. KVM_DEV_XIVE_GRP_SOURCE_CONFIG (write only) + Configures source targeting + + Attributes: + Interrupt source number (64-bit) + + The kvm_device_attr.addr points to a __u64 value:: + + bits: | 63 .... 33 | 32 | 31 .. 3 | 2 .. 0 + values: | eisn | mask | server | priority + + - priority: 0-7 interrupt priority level + - server: CPU number chosen to handle the interrupt + - mask: mask flag (unused) + - eisn: Effective Interrupt Source Number + + Errors: + + ======= ======================================================= + -ENOENT Unknown source number + -EINVAL Not initialized source number + -EINVAL Invalid priority + -EINVAL Invalid CPU number. + -EFAULT Invalid user pointer for attr->addr. + -ENXIO CPU event queues not configured or configuration of the + underlying HW interrupt failed + -EBUSY No CPU available to serve interrupt + ======= ======================================================= + +4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write) + Configures an event queue of a CPU + + Attributes: + EQ descriptor identifier (64-bit) + + The EQ descriptor identifier is a tuple (server, priority):: + + bits: | 63 .... 32 | 31 .. 3 | 2 .. 0 + values: | unused | server | priority + + The kvm_device_attr.addr points to:: + + struct kvm_ppc_xive_eq { + __u32 flags; + __u32 qshift; + __u64 qaddr; + __u32 qtoggle; + __u32 qindex; + __u8 pad[40]; + }; + + - flags: queue flags + KVM_XIVE_EQ_ALWAYS_NOTIFY (required) + forces notification without using the coalescing mechanism + provided by the XIVE END ESBs. + - qshift: queue size (power of 2) + - qaddr: real address of queue + - qtoggle: current queue toggle bit + - qindex: current queue index + - pad: reserved for future use + + Errors: + + ======= ========================================= + -ENOENT Invalid CPU number + -EINVAL Invalid priority + -EINVAL Invalid flags + -EINVAL Invalid queue size + -EINVAL Invalid queue address + -EFAULT Invalid user pointer for attr->addr. + -EIO Configuration of the underlying HW failed + ======= ========================================= + +5. KVM_DEV_XIVE_GRP_SOURCE_SYNC (write only) + Synchronize the source to flush event notifications + + Attributes: + Interrupt source number (64-bit) + + Errors: + + ======= ============================= + -ENOENT Unknown source number + -EINVAL Not initialized source number + ======= ============================= + +* VCPU state + + The XIVE IC maintains VP interrupt state in an internal structure + called the NVT. When a VP is not dispatched on a HW processor + thread, this structure can be updated by HW if the VP is the target + of an event notification. + + It is important for migration to capture the cached IPB from the NVT + as it synthesizes the priorities of the pending interrupts. We + capture a bit more to report debug information. + + KVM_REG_PPC_VP_STATE (2 * 64bits):: + + bits: | 63 .... 32 | 31 .... 0 | + values: | TIMA word0 | TIMA word1 | + bits: | 127 .......... 64 | + values: | unused | + +* Migration: + + Saving the state of a VM using the XIVE native exploitation mode + should follow a specific sequence. When the VM is stopped : + + 1. Mask all sources (PQ=01) to stop the flow of events. + + 2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to + flush any in-flight event notification and to stabilize the EQs. At + this stage, the EQ pages are marked dirty to make sure they are + transferred in the migration sequence. + + 3. Capture the state of the source targeting, the EQs configuration + and the state of thread interrupt context registers. + + Restore is similar: + + 1. Restore the EQ configuration. As targeting depends on it. + 2. Restore targeting + 3. Restore the thread interrupt contexts + 4. Restore the source states + 5. Let the vCPU run diff --git a/Documentation/virt/kvm/devices/xive.txt b/Documentation/virt/kvm/devices/xive.txt deleted file mode 100644 index f5d1d6b5af61..000000000000 --- a/Documentation/virt/kvm/devices/xive.txt +++ /dev/null @@ -1,205 +0,0 @@ -POWER9 eXternal Interrupt Virtualization Engine (XIVE Gen1) -========================================================== - -Device types supported: - KVM_DEV_TYPE_XIVE POWER9 XIVE Interrupt Controller generation 1 - -This device acts as a VM interrupt controller. It provides the KVM -interface to configure the interrupt sources of a VM in the underlying -POWER9 XIVE interrupt controller. - -Only one XIVE instance may be instantiated. A guest XIVE device -requires a POWER9 host and the guest OS should have support for the -XIVE native exploitation interrupt mode. If not, it should run using -the legacy interrupt mode, referred as XICS (POWER7/8). - -* Device Mappings - - The KVM device exposes different MMIO ranges of the XIVE HW which - are required for interrupt management. These are exposed to the - guest in VMAs populated with a custom VM fault handler. - - 1. Thread Interrupt Management Area (TIMA) - - Each thread has an associated Thread Interrupt Management context - composed of a set of registers. These registers let the thread - handle priority management and interrupt acknowledgment. The most - important are : - - - Interrupt Pending Buffer (IPB) - - Current Processor Priority (CPPR) - - Notification Source Register (NSR) - - They are exposed to software in four different pages each proposing - a view with a different privilege. The first page is for the - physical thread context and the second for the hypervisor. Only the - third (operating system) and the fourth (user level) are exposed the - guest. - - 2. Event State Buffer (ESB) - - Each source is associated with an Event State Buffer (ESB) with - either a pair of even/odd pair of pages which provides commands to - manage the source: to trigger, to EOI, to turn off the source for - instance. - - 3. Device pass-through - - When a device is passed-through into the guest, the source - interrupts are from a different HW controller (PHB4) and the ESB - pages exposed to the guest should accommadate this change. - - The passthru_irq helpers, kvmppc_xive_set_mapped() and - kvmppc_xive_clr_mapped() are called when the device HW irqs are - mapped into or unmapped from the guest IRQ number space. The KVM - device extends these helpers to clear the ESB pages of the guest IRQ - number being mapped and then lets the VM fault handler repopulate. - The handler will insert the ESB page corresponding to the HW - interrupt of the device being passed-through or the initial IPI ESB - page if the device has being removed. - - The ESB remapping is fully transparent to the guest and the OS - device driver. All handling is done within VFIO and the above - helpers in KVM-PPC. - -* Groups: - - 1. KVM_DEV_XIVE_GRP_CTRL - Provides global controls on the device - Attributes: - 1.1 KVM_DEV_XIVE_RESET (write only) - Resets the interrupt controller configuration for sources and event - queues. To be used by kexec and kdump. - Errors: none - - 1.2 KVM_DEV_XIVE_EQ_SYNC (write only) - Sync all the sources and queues and mark the EQ pages dirty. This - to make sure that a consistent memory state is captured when - migrating the VM. - Errors: none - - 1.3 KVM_DEV_XIVE_NR_SERVERS (write only) - The kvm_device_attr.addr points to a __u32 value which is the number of - interrupt server numbers (ie, highest possible vcpu id plus one). - Errors: - -EINVAL: Value greater than KVM_MAX_VCPU_ID. - -EFAULT: Invalid user pointer for attr->addr. - -EBUSY: A vCPU is already connected to the device. - - 2. KVM_DEV_XIVE_GRP_SOURCE (write only) - Initializes a new source in the XIVE device and mask it. - Attributes: - Interrupt source number (64-bit) - The kvm_device_attr.addr points to a __u64 value: - bits: | 63 .... 2 | 1 | 0 - values: | unused | level | type - - type: 0:MSI 1:LSI - - level: assertion level in case of an LSI. - Errors: - -E2BIG: Interrupt source number is out of range - -ENOMEM: Could not create a new source block - -EFAULT: Invalid user pointer for attr->addr. - -ENXIO: Could not allocate underlying HW interrupt - - 3. KVM_DEV_XIVE_GRP_SOURCE_CONFIG (write only) - Configures source targeting - Attributes: - Interrupt source number (64-bit) - The kvm_device_attr.addr points to a __u64 value: - bits: | 63 .... 33 | 32 | 31 .. 3 | 2 .. 0 - values: | eisn | mask | server | priority - - priority: 0-7 interrupt priority level - - server: CPU number chosen to handle the interrupt - - mask: mask flag (unused) - - eisn: Effective Interrupt Source Number - Errors: - -ENOENT: Unknown source number - -EINVAL: Not initialized source number - -EINVAL: Invalid priority - -EINVAL: Invalid CPU number. - -EFAULT: Invalid user pointer for attr->addr. - -ENXIO: CPU event queues not configured or configuration of the - underlying HW interrupt failed - -EBUSY: No CPU available to serve interrupt - - 4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write) - Configures an event queue of a CPU - Attributes: - EQ descriptor identifier (64-bit) - The EQ descriptor identifier is a tuple (server, priority) : - bits: | 63 .... 32 | 31 .. 3 | 2 .. 0 - values: | unused | server | priority - The kvm_device_attr.addr points to : - struct kvm_ppc_xive_eq { - __u32 flags; - __u32 qshift; - __u64 qaddr; - __u32 qtoggle; - __u32 qindex; - __u8 pad[40]; - }; - - flags: queue flags - KVM_XIVE_EQ_ALWAYS_NOTIFY (required) - forces notification without using the coalescing mechanism - provided by the XIVE END ESBs. - - qshift: queue size (power of 2) - - qaddr: real address of queue - - qtoggle: current queue toggle bit - - qindex: current queue index - - pad: reserved for future use - Errors: - -ENOENT: Invalid CPU number - -EINVAL: Invalid priority - -EINVAL: Invalid flags - -EINVAL: Invalid queue size - -EINVAL: Invalid queue address - -EFAULT: Invalid user pointer for attr->addr. - -EIO: Configuration of the underlying HW failed - - 5. KVM_DEV_XIVE_GRP_SOURCE_SYNC (write only) - Synchronize the source to flush event notifications - Attributes: - Interrupt source number (64-bit) - Errors: - -ENOENT: Unknown source number - -EINVAL: Not initialized source number - -* VCPU state - - The XIVE IC maintains VP interrupt state in an internal structure - called the NVT. When a VP is not dispatched on a HW processor - thread, this structure can be updated by HW if the VP is the target - of an event notification. - - It is important for migration to capture the cached IPB from the NVT - as it synthesizes the priorities of the pending interrupts. We - capture a bit more to report debug information. - - KVM_REG_PPC_VP_STATE (2 * 64bits) - bits: | 63 .... 32 | 31 .... 0 | - values: | TIMA word0 | TIMA word1 | - bits: | 127 .......... 64 | - values: | unused | - -* Migration: - - Saving the state of a VM using the XIVE native exploitation mode - should follow a specific sequence. When the VM is stopped : - - 1. Mask all sources (PQ=01) to stop the flow of events. - - 2. Sync the XIVE device with the KVM control KVM_DEV_XIVE_EQ_SYNC to - flush any in-flight event notification and to stabilize the EQs. At - this stage, the EQ pages are marked dirty to make sure they are - transferred in the migration sequence. - - 3. Capture the state of the source targeting, the EQs configuration - and the state of thread interrupt context registers. - - Restore is similar : - - 1. Restore the EQ configuration. As targeting depends on it. - 2. Restore targeting - 3. Restore the thread interrupt contexts - 4. Restore the source states - 5. Let the vCPU run -- cgit v1.2.3 From 106ee47dc633a930bb61290713217803aee194e7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:55 +0100 Subject: docs: kvm: Convert api.txt to ReST format convert api.txt document to ReST format while trying to keep its format as close as possible with the authors intent, and avoid adding uneeded markups. - Use document title and chapter markups; - Convert tables; - Add markups for literal blocks; - use :field: for field descriptions; - Add blank lines and adjust indentation Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6026 ++++++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/api.txt | 5450 ---------------------------------- Documentation/virt/kvm/index.rst | 1 + 3 files changed, 6027 insertions(+), 5450 deletions(-) create mode 100644 Documentation/virt/kvm/api.rst delete mode 100644 Documentation/virt/kvm/api.txt diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst new file mode 100644 index 000000000000..97a72a53fa4b --- /dev/null +++ b/Documentation/virt/kvm/api.rst @@ -0,0 +1,6026 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================================================== +The Definitive KVM (Kernel-based Virtual Machine) API Documentation +=================================================================== + +1. General description +====================== + +The kvm API is a set of ioctls that are issued to control various aspects +of a virtual machine. The ioctls belong to the following classes: + + - System ioctls: These query and set global attributes which affect the + whole kvm subsystem. In addition a system ioctl is used to create + virtual machines. + + - VM ioctls: These query and set attributes that affect an entire virtual + machine, for example memory layout. In addition a VM ioctl is used to + create virtual cpus (vcpus) and devices. + + VM ioctls must be issued from the same process (address space) that was + used to create the VM. + + - vcpu ioctls: These query and set attributes that control the operation + of a single virtual cpu. + + vcpu ioctls should be issued from the same thread that was used to create + the vcpu, except for asynchronous vcpu ioctl that are marked as such in + the documentation. Otherwise, the first ioctl after switching threads + could see a performance impact. + + - device ioctls: These query and set attributes that control the operation + of a single device. + + device ioctls must be issued from the same process (address space) that + was used to create the VM. + +2. File descriptors +=================== + +The kvm API is centered around file descriptors. An initial +open("/dev/kvm") obtains a handle to the kvm subsystem; this handle +can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this +handle will create a VM file descriptor which can be used to issue VM +ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will +create a virtual cpu or device and return a file descriptor pointing to +the new resource. Finally, ioctls on a vcpu or device fd can be used +to control the vcpu or device. For vcpus, this includes the important +task of actually running guest code. + +In general file descriptors can be migrated among processes by means +of fork() and the SCM_RIGHTS facility of unix domain socket. These +kinds of tricks are explicitly not supported by kvm. While they will +not cause harm to the host, their actual behavior is not guaranteed by +the API. See "General description" for details on the ioctl usage +model that is supported by KVM. + +It is important to note that althought VM ioctls may only be issued from +the process that created the VM, a VM's lifecycle is associated with its +file descriptor, not its creator (process). In other words, the VM and +its resources, *including the associated address space*, are not freed +until the last reference to the VM's file descriptor has been released. +For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will +not be freed until both the parent (original) process and its child have +put their references to the VM's file descriptor. + +Because a VM's resources are not freed until the last reference to its +file descriptor is released, creating additional references to a VM via +via fork(), dup(), etc... without careful consideration is strongly +discouraged and may have unwanted side effects, e.g. memory allocated +by and on behalf of the VM's process may not be freed/unaccounted when +the VM is shut down. + + +3. Extensions +============= + +As of Linux 2.6.22, the KVM ABI has been stabilized: no backward +incompatible change are allowed. However, there is an extension +facility that allows backward-compatible extensions to the API to be +queried and used. + +The extension mechanism is not based on the Linux version number. +Instead, kvm defines extension identifiers and a facility to query +whether a particular extension identifier is available. If it is, a +set of ioctls is available for application use. + + +4. API description +================== + +This section describes ioctls that can be used to control kvm guests. +For each ioctl, the following information is provided along with a +description: + + Capability: + which KVM extension provides this ioctl. Can be 'basic', + which means that is will be provided by any kernel that supports + API version 12 (see section 4.1), a KVM_CAP_xyz constant, which + means availability needs to be checked with KVM_CHECK_EXTENSION + (see section 4.4), or 'none' which means that while not all kernels + support this ioctl, there's no capability bit to check its + availability: for kernels that don't support the ioctl, + the ioctl returns -ENOTTY. + + Architectures: + which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Type: + system, vm, or vcpu. + + Parameters: + what parameters are accepted by the ioctl. + + Returns: + the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +4.1 KVM_GET_API_VERSION +----------------------- + +:Capability: basic +:Architectures: all +:Type: system ioctl +:Parameters: none +:Returns: the constant KVM_API_VERSION (=12) + +This identifies the API version as the stable kvm API. It is not +expected that this number will change. However, Linux 2.6.20 and +2.6.21 report earlier versions; these are not documented and not +supported. Applications should refuse to run if KVM_GET_API_VERSION +returns a value other than 12. If this check passes, all ioctls +described as 'basic' will be available. + + +4.2 KVM_CREATE_VM +----------------- + +:Capability: basic +:Architectures: all +:Type: system ioctl +:Parameters: machine type identifier (KVM_VM_*) +:Returns: a VM fd that can be used to control the new virtual machine. + +The new VM has no virtual cpus and no memory. +You probably want to use 0 as machine type. + +In order to create user controlled virtual machines on S390, check +KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as +privileged user (CAP_SYS_ADMIN). + +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + + +On arm64, the physical address size for a VM (IPA Size limit) is limited +to 40bits by default. The limit can be configured if the host supports the +extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use +KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type +identifier, where IPA_Bits is the maximum width of any physical +address used by the VM. The IPA_Bits is encoded in bits[7-0] of the +machine type identifier. + +e.g, to configure a guest to use 48bit physical address size:: + + vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); + +The requested size (IPA_Bits) must be: + + == ========================================================= + 0 Implies default size, 40bits (for backward compatibility) + N Implies N bits, where N is a positive integer such that, + 32 <= N <= Host_IPA_Limit + == ========================================================= + +Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and +is dependent on the CPU capability and the kernel configuration. The limit can +be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION +ioctl() at run-time. + +Please note that configuring the IPA size does not affect the capability +exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects +size of the address translated by the stage2 level (guest physical to +host physical address translations). + + +4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST +---------------------------------------------------------- + +:Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST +:Architectures: x86 +:Type: system ioctl +:Parameters: struct kvm_msr_list (in/out) +:Returns: 0 on success; -1 on error + +Errors: + + ====== ============================================================ + EFAULT the msr index list cannot be read from or written to + E2BIG the msr index list is to be to fit in the array specified by + the user. + ====== ============================================================ + +:: + + struct kvm_msr_list { + __u32 nmsrs; /* number of msrs in entries */ + __u32 indices[0]; + }; + +The user fills in the size of the indices array in nmsrs, and in return +kvm adjusts nmsrs to reflect the actual number of msrs and fills in the +indices array with their numbers. + +KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list +varies by kvm version and host processor, but does not change otherwise. + +Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are +not returned in the MSR list, as different vcpus can have a different number +of banks, as set via the KVM_X86_SETUP_MCE ioctl. + +KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed +to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities +and processor features that are exposed via MSRs (e.g., VMX capabilities). +This list also varies by kvm version and host processor, but does not change +otherwise. + + +4.4 KVM_CHECK_EXTENSION +----------------------- + +:Capability: basic, KVM_CAP_CHECK_EXTENSION_VM for vm ioctl +:Architectures: all +:Type: system ioctl, vm ioctl +:Parameters: extension identifier (KVM_CAP_*) +:Returns: 0 if unsupported; 1 (or some other positive integer) if supported + +The API allows the application to query about extensions to the core +kvm API. Userspace passes an extension identifier (an integer) and +receives an integer that describes the extension availability. +Generally 0 means no and 1 means yes, but some extensions may report +additional information in the integer return value. + +Based on their initialization different VMs may have different capabilities. +It is thus encouraged to use the vm ioctl to query for capabilities (available +with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) + +4.5 KVM_GET_VCPU_MMAP_SIZE +-------------------------- + +:Capability: basic +:Architectures: all +:Type: system ioctl +:Parameters: none +:Returns: size of vcpu mmap area, in bytes + +The KVM_RUN ioctl (cf.) communicates with userspace via a shared +memory region. This ioctl returns the size of that region. See the +KVM_RUN documentation for details. + + +4.6 KVM_SET_MEMORY_REGION +------------------------- + +:Capability: basic +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_memory_region (in) +:Returns: 0 on success, -1 on error + +This ioctl is obsolete and has been removed. + + +4.7 KVM_CREATE_VCPU +------------------- + +:Capability: basic +:Architectures: all +:Type: vm ioctl +:Parameters: vcpu id (apic id on x86) +:Returns: vcpu fd on success, -1 on error + +This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. +The vcpu id is an integer in the range [0, max_vcpu_id). + +The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of +the KVM_CHECK_EXTENSION ioctl() at run-time. +The maximum possible value for max_vcpus can be retrieved using the +KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 +cpus max. +If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is +same as the value returned from KVM_CAP_NR_VCPUS. + +The maximum possible value for max_vcpu_id can be retrieved using the +KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. + +If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id +is the same as the value returned from KVM_CAP_MAX_VCPUS. + +On powerpc using book3s_hv mode, the vcpus are mapped onto virtual +threads in one or more virtual CPU cores. (This is because the +hardware requires all the hardware threads in a CPU core to be in the +same partition.) The KVM_CAP_PPC_SMT capability indicates the number +of vcpus per virtual core (vcore). The vcore id is obtained by +dividing the vcpu id by the number of vcpus per vcore. The vcpus in a +given vcore will always be in the same physical core as each other +(though that might be a different physical core from time to time). +Userspace can control the threading (SMT) mode of the guest by its +allocation of vcpu ids. For example, if userspace wants +single-threaded guest vcpus, it should make all vcpu ids be a multiple +of the number of vcpus per vcore. + +For virtual cpus that have been created with S390 user controlled virtual +machines, the resulting vcpu fd can be memory mapped at page offset +KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual +cpu's hardware control block. + + +4.8 KVM_GET_DIRTY_LOG (vm ioctl) +-------------------------------- + +:Capability: basic +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_dirty_log (in/out) +:Returns: 0 on success, -1 on error + +:: + + /* for KVM_GET_DIRTY_LOG */ + struct kvm_dirty_log { + __u32 slot; + __u32 padding; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; + }; + +Given a memory slot, return a bitmap containing any pages dirtied +since the last call to this ioctl. Bit 0 is the first page in the +memory slot. Ensure the entire structure is cleared to avoid padding +issues. + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +The bits in the dirty bitmap are cleared before the ioctl returns, unless +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, +see the description of the capability. + +4.9 KVM_SET_MEMORY_ALIAS +------------------------ + +:Capability: basic +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_memory_alias (in) +:Returns: 0 (success), -1 (error) + +This ioctl is obsolete and has been removed. + + +4.10 KVM_RUN +------------ + +:Capability: basic +:Architectures: all +:Type: vcpu ioctl +:Parameters: none +:Returns: 0 on success, -1 on error + +Errors: + + ===== ============================= + EINTR an unmasked signal is pending + ===== ============================= + +This ioctl is used to run a guest virtual cpu. While there are no +explicit parameters, there is an implicit parameter block that can be +obtained by mmap()ing the vcpu fd at offset 0, with the size given by +KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct +kvm_run' (see below). + + +4.11 KVM_GET_REGS +----------------- + +:Capability: basic +:Architectures: all except ARM, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_regs (out) +:Returns: 0 on success, -1 on error + +Reads the general purpose registers from the vcpu. + +:: + + /* x86 */ + struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 rax, rbx, rcx, rdx; + __u64 rsi, rdi, rsp, rbp; + __u64 r8, r9, r10, r11; + __u64 r12, r13, r14, r15; + __u64 rip, rflags; + }; + + /* mips */ + struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 hi; + __u64 lo; + __u64 pc; + }; + + +4.12 KVM_SET_REGS +----------------- + +:Capability: basic +:Architectures: all except ARM, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_regs (in) +:Returns: 0 on success, -1 on error + +Writes the general purpose registers into the vcpu. + +See KVM_GET_REGS for the data structure. + + +4.13 KVM_GET_SREGS +------------------ + +:Capability: basic +:Architectures: x86, ppc +:Type: vcpu ioctl +:Parameters: struct kvm_sregs (out) +:Returns: 0 on success, -1 on error + +Reads special registers from the vcpu. + +:: + + /* x86 */ + struct kvm_sregs { + struct kvm_segment cs, ds, es, fs, gs, ss; + struct kvm_segment tr, ldt; + struct kvm_dtable gdt, idt; + __u64 cr0, cr2, cr3, cr4, cr8; + __u64 efer; + __u64 apic_base; + __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; + }; + + /* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ + +interrupt_bitmap is a bitmap of pending external interrupts. At most +one bit may be set. This interrupt has been acknowledged by the APIC +but not yet injected into the cpu core. + + +4.14 KVM_SET_SREGS +------------------ + +:Capability: basic +:Architectures: x86, ppc +:Type: vcpu ioctl +:Parameters: struct kvm_sregs (in) +:Returns: 0 on success, -1 on error + +Writes special registers into the vcpu. See KVM_GET_SREGS for the +data structures. + + +4.15 KVM_TRANSLATE +------------------ + +:Capability: basic +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_translation (in/out) +:Returns: 0 on success, -1 on error + +Translates a virtual address according to the vcpu's current address +translation mode. + +:: + + struct kvm_translation { + /* in */ + __u64 linear_address; + + /* out */ + __u64 physical_address; + __u8 valid; + __u8 writeable; + __u8 usermode; + __u8 pad[5]; + }; + + +4.16 KVM_INTERRUPT +------------------ + +:Capability: basic +:Architectures: x86, ppc, mips +:Type: vcpu ioctl +:Parameters: struct kvm_interrupt (in) +:Returns: 0 on success, negative on failure. + +Queues a hardware interrupt vector to be injected. + +:: + + /* for KVM_INTERRUPT */ + struct kvm_interrupt { + /* in */ + __u32 irq; + }; + +X86: +^^^^ + +:Returns: + + ========= =================================== + 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL the the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid + ========= =================================== + +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. + +PPC: +^^^^ + +Queues an external interrupt to be injected. This ioctl is overleaded +with 3 different irq values: + +a) KVM_INTERRUPT_SET + + This injects an edge type external interrupt into the guest once it's ready + to receive interrupts. When injected, the interrupt is done. + +b) KVM_INTERRUPT_UNSET + + This unsets any pending interrupt. + + Only available with KVM_CAP_PPC_UNSET_IRQ. + +c) KVM_INTERRUPT_SET_LEVEL + + This injects a level type external interrupt into the guest context. The + interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET + is triggered. + + Only available with KVM_CAP_PPC_IRQ_LEVEL. + +Note that any value for 'irq' other than the ones stated above is invalid +and incurs unexpected behavior. + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +MIPS: +^^^^^ + +Queues an external interrupt to be injected into the virtual CPU. A negative +interrupt number dequeues the interrupt. + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + + +4.17 KVM_DEBUG_GUEST +-------------------- + +:Capability: basic +:Architectures: none +:Type: vcpu ioctl +:Parameters: none) +:Returns: -1 on error + +Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. + + +4.18 KVM_GET_MSRS +----------------- + +:Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system) +:Architectures: x86 +:Type: system ioctl, vcpu ioctl +:Parameters: struct kvm_msrs (in/out) +:Returns: number of msrs successfully returned; + -1 on error + +When used as a system ioctl: +Reads the values of MSR-based features that are available for the VM. This +is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values. +The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST +in a system ioctl. + +When used as a vcpu ioctl: +Reads model-specific registers from the vcpu. Supported msr indices can +be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl. + +:: + + struct kvm_msrs { + __u32 nmsrs; /* number of msrs in entries */ + __u32 pad; + + struct kvm_msr_entry entries[0]; + }; + + struct kvm_msr_entry { + __u32 index; + __u32 reserved; + __u64 data; + }; + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array) and the 'index' member of each array entry. +kvm will fill in the 'data' member. + + +4.19 KVM_SET_MSRS +----------------- + +:Capability: basic +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_msrs (in) +:Returns: number of msrs successfully set (see below), -1 on error + +Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the +data structures. + +Application code should set the 'nmsrs' member (which indicates the +size of the entries array), and the 'index' and 'data' members of each +array entry. + +It tries to set the MSRs in array entries[] one by one. If setting an MSR +fails, e.g., due to setting reserved bits, the MSR isn't supported/emulated +by KVM, etc..., it stops processing the MSR list and returns the number of +MSRs that have been set successfully. + + +4.20 KVM_SET_CPUID +------------------ + +:Capability: basic +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_cpuid (in) +:Returns: 0 on success, -1 on error + +Defines the vcpu responses to the cpuid instruction. Applications +should use the KVM_SET_CPUID2 ioctl if available. + +:: + + struct kvm_cpuid_entry { + __u32 function; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding; + }; + + /* for KVM_SET_CPUID */ + struct kvm_cpuid { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry entries[0]; + }; + + +4.21 KVM_SET_SIGNAL_MASK +------------------------ + +:Capability: basic +:Architectures: all +:Type: vcpu ioctl +:Parameters: struct kvm_signal_mask (in) +:Returns: 0 on success, -1 on error + +Defines which signals are blocked during execution of KVM_RUN. This +signal mask temporarily overrides the threads signal mask. Any +unblocked signal received (except SIGKILL and SIGSTOP, which retain +their traditional behaviour) will cause KVM_RUN to return with -EINTR. + +Note the signal will only be delivered if not blocked by the original +signal mask. + +:: + + /* for KVM_SET_SIGNAL_MASK */ + struct kvm_signal_mask { + __u32 len; + __u8 sigset[0]; + }; + + +4.22 KVM_GET_FPU +---------------- + +:Capability: basic +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_fpu (out) +:Returns: 0 on success, -1 on error + +Reads the floating point state from the vcpu. + +:: + + /* for KVM_GET_FPU and KVM_SET_FPU */ + struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; + }; + + +4.23 KVM_SET_FPU +---------------- + +:Capability: basic +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_fpu (in) +:Returns: 0 on success, -1 on error + +Writes the floating point state to the vcpu. + +:: + + /* for KVM_GET_FPU and KVM_SET_FPU */ + struct kvm_fpu { + __u8 fpr[8][16]; + __u16 fcw; + __u16 fsw; + __u8 ftwx; /* in fxsave format */ + __u8 pad1; + __u16 last_opcode; + __u64 last_ip; + __u64 last_dp; + __u8 xmm[16][16]; + __u32 mxcsr; + __u32 pad2; + }; + + +4.24 KVM_CREATE_IRQCHIP +----------------------- + +:Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) +:Architectures: x86, ARM, arm64, s390 +:Type: vm ioctl +:Parameters: none +:Returns: 0 on success, -1 on error + +Creates an interrupt controller model in the kernel. +On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up +future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both +PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. +On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of +KVM_CREATE_DEVICE, which also supports creating a GICv2. Using +KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. +On s390, a dummy irq routing table is created. + +Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled +before KVM_CREATE_IRQCHIP can be used. + + +4.25 KVM_IRQ_LINE +----------------- + +:Capability: KVM_CAP_IRQCHIP +:Architectures: x86, arm, arm64 +:Type: vm ioctl +:Parameters: struct kvm_irq_level +:Returns: 0 on success, -1 on error + +Sets the level of a GSI input to the interrupt controller model in the kernel. +On some architectures it is required that an interrupt controller model has +been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered +interrupts require the level to be set to 1 and then back to 0. + +On real hardware, interrupt pins can be active-low or active-high. This +does not matter for the level field of struct kvm_irq_level: 1 always +means active (asserted), 0 means inactive (deasserted). + +x86 allows the operating system to program the interrupt polarity +(active-low/active-high) for level-triggered interrupts, and KVM used +to consider the polarity. However, due to bitrot in the handling of +active-low interrupts, the above convention is now valid on x86 too. +This is signaled by KVM_CAP_X86_IOAPIC_POLARITY_IGNORED. Userspace +should not present interrupts to the guest as active-low unless this +capability is present (or unless it is not using the in-kernel irqchip, +of course). + + +ARM/arm64 can signal an interrupt either at the CPU level, or at the +in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to +use PPIs designated for specific cpus. The irq field is interpreted +like this:: + +  bits: | 31 ... 28 | 27 ... 24 | 23 ... 16 | 15 ... 0 | + field: | vcpu2_index | irq_type | vcpu_index | irq_id | + +The irq_type field has the following values: + +- irq_type[0]: + out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ +- irq_type[1]: + in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.) + (the vcpu_index field is ignored) +- irq_type[2]: + in-kernel GIC: PPI, irq_id between 16 and 31 (incl.) + +(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) + +In both cases, level is used to assert/deassert the line. + +When KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 is supported, the target vcpu is +identified as (256 * vcpu2_index + vcpu_index). Otherwise, vcpu2_index +must be zero. + +Note that on arm/arm64, the KVM_CAP_IRQCHIP capability only conditions +injection of interrupts for the in-kernel irqchip. KVM_IRQ_LINE can always +be used for a userspace interrupt controller. + +:: + + struct kvm_irq_level { + union { + __u32 irq; /* GSI */ + __s32 status; /* not used for KVM_IRQ_LEVEL */ + }; + __u32 level; /* 0 or 1 */ + }; + + +4.26 KVM_GET_IRQCHIP +-------------------- + +:Capability: KVM_CAP_IRQCHIP +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_irqchip (in/out) +:Returns: 0 on success, -1 on error + +Reads the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP into a buffer provided by the caller. + +:: + + struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; + }; + + +4.27 KVM_SET_IRQCHIP +-------------------- + +:Capability: KVM_CAP_IRQCHIP +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_irqchip (in) +:Returns: 0 on success, -1 on error + +Sets the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP from a buffer provided by the caller. + +:: + + struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; + union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; + }; + + +4.28 KVM_XEN_HVM_CONFIG +----------------------- + +:Capability: KVM_CAP_XEN_HVM +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_xen_hvm_config (in) +:Returns: 0 on success, -1 on error + +Sets the MSR that the Xen HVM guest uses to initialize its hypercall +page, and provides the starting address and size of the hypercall +blobs in userspace. When the guest writes the MSR, kvm copies one +page of a blob (32- or 64-bit, depending on the vcpu mode) to guest +memory. + +:: + + struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; + }; + + +4.29 KVM_GET_CLOCK +------------------ + +:Capability: KVM_CAP_ADJUST_CLOCK +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_clock_data (out) +:Returns: 0 on success, -1 on error + +Gets the current timestamp of kvmclock as seen by the current guest. In +conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the +set of bits that KVM can return in struct kvm_clock_data's flag member. + +The only flag defined now is KVM_CLOCK_TSC_STABLE. If set, the returned +value is the exact kvmclock value seen by all VCPUs at the instant +when KVM_GET_CLOCK was called. If clear, the returned value is simply +CLOCK_MONOTONIC plus a constant offset; the offset can be modified +with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock, +but the exact value read by each VCPU could differ, because the host +TSC is not stable. + +:: + + struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; + }; + + +4.30 KVM_SET_CLOCK +------------------ + +:Capability: KVM_CAP_ADJUST_CLOCK +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_clock_data (in) +:Returns: 0 on success, -1 on error + +Sets the current timestamp of kvmclock to the value specified in its parameter. +In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios +such as migration. + +:: + + struct kvm_clock_data { + __u64 clock; /* kvmclock current value */ + __u32 flags; + __u32 pad[9]; + }; + + +4.31 KVM_GET_VCPU_EVENTS +------------------------ + +:Capability: KVM_CAP_VCPU_EVENTS +:Extended by: KVM_CAP_INTR_SHADOW +:Architectures: x86, arm, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_vcpu_event (out) +:Returns: 0 on success, -1 on error + +X86: +^^^^ + +Gets currently pending exceptions, interrupts, and NMIs as well as related +states of the vcpu. + +:: + + struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pending; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 shadow; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + struct { + __u8 smm; + __u8 pending; + __u8 smm_inside_nmi; + __u8 latched_init; + } smi; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; + }; + +The following bits are defined in the flags field: + +- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that + interrupt.shadow contains a valid state. + +- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a + valid state. + +- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the + exception_has_payload, exception_payload, and exception.pending + fields contain a valid state. This bit will be set whenever + KVM_CAP_EXCEPTION_PAYLOAD is enabled. + +ARM/ARM64: +^^^^^^^^^^ + +If the guest accesses a device that is being emulated by the host kernel in +such a way that a real device would generate a physical SError, KVM may make +a virtual SError pending for that VCPU. This system error interrupt remains +pending until the guest takes the exception by unmasking PSTATE.A. + +Running the VCPU may cause it to take a pending SError, or make an access that +causes an SError to become pending. The event's description is only valid while +the VPCU is not running. + +This API provides a way to read and write the pending 'event' state that is not +visible to the guest. To save, restore or migrate a VCPU the struct representing +the state can be read then written using this GET/SET API, along with the other +guest-visible registers. It is not possible to 'cancel' an SError that has been +made pending. + +A device being emulated in user-space may also wish to generate an SError. To do +this the events structure can be populated by user-space. The current state +should be read first, to ensure no existing SError is pending. If an existing +SError is pending, the architecture's 'Multiple SError interrupts' rules should +be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and +Serviceability (RAS) Specification"). + +SError exceptions always have an ESR value. Some CPUs have the ability to +specify what the virtual SError's ESR value should be. These systems will +advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.has_esr will +always have a non-zero value when read, and the agent making an SError pending +should specify the ISS field in the lower 24 bits of exception.serror_esr. If +the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the events +with exception.has_esr as zero, KVM will choose an ESR. + +Specifying exception.has_esr on a system that does not support it will return +-EINVAL. Setting anything other than the lower 24bits of exception.serror_esr +will return -EINVAL. + +It is not possible to read back a pending external abort (injected via +KVM_SET_VCPU_EVENTS or otherwise) because such an exception is always delivered +directly to the virtual CPU). + +:: + + struct kvm_vcpu_events { + struct { + __u8 serror_pending; + __u8 serror_has_esr; + __u8 ext_dabt_pending; + /* Align it to 8 bytes */ + __u8 pad[5]; + __u64 serror_esr; + } exception; + __u32 reserved[12]; + }; + +4.32 KVM_SET_VCPU_EVENTS +------------------------ + +:Capability: KVM_CAP_VCPU_EVENTS +:Extended by: KVM_CAP_INTR_SHADOW +:Architectures: x86, arm, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_vcpu_event (in) +:Returns: 0 on success, -1 on error + +X86: +^^^^ + +Set pending exceptions, interrupts, and NMIs as well as related states of the +vcpu. + +See KVM_GET_VCPU_EVENTS for the data structure. + +Fields that may be modified asynchronously by running VCPUs can be excluded +from the update. These fields are nmi.pending, sipi_vector, smi.smm, +smi.pending. Keep the corresponding bits in the flags field cleared to +suppress overwriting the current in-kernel state. The bits are: + +=============================== ================================== +KVM_VCPUEVENT_VALID_NMI_PENDING transfer nmi.pending to the kernel +KVM_VCPUEVENT_VALID_SIPI_VECTOR transfer sipi_vector +KVM_VCPUEVENT_VALID_SMM transfer the smi sub-struct. +=============================== ================================== + +If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in +the flags field to signal that interrupt.shadow contains a valid state and +shall be written into the VCPU. + +KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. + +If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD +can be set in the flags field to signal that the +exception_has_payload, exception_payload, and exception.pending fields +contain a valid state and shall be written into the VCPU. + +ARM/ARM64: +^^^^^^^^^^ + +User space may need to inject several types of events to the guest. + +Set the pending SError exception state for this VCPU. It is not possible to +'cancel' an Serror that has been made pending. + +If the guest performed an access to I/O memory which could not be handled by +userspace, for example because of missing instruction syndrome decode +information or because there is no device mapped at the accessed IPA, then +userspace can ask the kernel to inject an external abort using the address +from the exiting fault on the VCPU. It is a programming error to set +ext_dabt_pending after an exit which was not either KVM_EXIT_MMIO or +KVM_EXIT_ARM_NISV. This feature is only available if the system supports +KVM_CAP_ARM_INJECT_EXT_DABT. This is a helper which provides commonality in +how userspace reports accesses for the above cases to guests, across different +userspace implementations. Nevertheless, userspace can still emulate all Arm +exceptions by manipulating individual registers using the KVM_SET_ONE_REG API. + +See KVM_GET_VCPU_EVENTS for the data structure. + + +4.33 KVM_GET_DEBUGREGS +---------------------- + +:Capability: KVM_CAP_DEBUGREGS +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_debugregs (out) +:Returns: 0 on success, -1 on error + +Reads debug registers from the vcpu. + +:: + + struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; + }; + + +4.34 KVM_SET_DEBUGREGS +---------------------- + +:Capability: KVM_CAP_DEBUGREGS +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_debugregs (in) +:Returns: 0 on success, -1 on error + +Writes debug registers into the vcpu. + +See KVM_GET_DEBUGREGS for the data structure. The flags field is unused +yet and must be cleared on entry. + + +4.35 KVM_SET_USER_MEMORY_REGION +------------------------------- + +:Capability: KVM_CAP_USER_MEMORY +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_userspace_memory_region (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_userspace_memory_region { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ + }; + + /* for kvm_memory_region::flags */ + #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) + #define KVM_MEM_READONLY (1UL << 1) + +This ioctl allows the user to create, modify or delete a guest physical +memory slot. Bits 0-15 of "slot" specify the slot id and this value +should be less than the maximum number of user memory slots supported per +VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS. +Slots may not overlap in guest physical address space. + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" +specifies the address space which is being modified. They must be +less than the value that KVM_CHECK_EXTENSION returns for the +KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces +are unrelated; the restriction on overlapping slots only applies within +each address space. + +Deleting a slot is done by passing zero for memory_size. When changing +an existing slot, it may be moved in the guest physical memory space, +or its flags may be modified, but it may not be resized. + +Memory for the region is taken starting at the address denoted by the +field userspace_addr, which must point at user addressable memory for +the entire memory slot size. Any object may back this memory, including +anonymous memory, ordinary files, and hugetlbfs. + +It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr +be identical. This allows large pages in the guest to be backed by large +pages in the host. + +The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and +KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of +writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to +use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, +to make a new slot read-only. In this case, writes to this memory will be +posted to userspace as KVM_EXIT_MMIO exits. + +When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of +the memory region are automatically reflected into the guest. For example, an +mmap() that affects the region will be made visible immediately. Another +example is madvise(MADV_DROP). + +It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. +The KVM_SET_MEMORY_REGION does not allow fine grained control over memory +allocation and is deprecated. + + +4.36 KVM_SET_TSS_ADDR +--------------------- + +:Capability: KVM_CAP_SET_TSS_ADDR +:Architectures: x86 +:Type: vm ioctl +:Parameters: unsigned long tss_address (in) +:Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a three-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + + +4.37 KVM_ENABLE_CAP +------------------- + +:Capability: KVM_CAP_ENABLE_CAP +:Architectures: mips, ppc, s390 +:Type: vcpu ioctl +:Parameters: struct kvm_enable_cap (in) +:Returns: 0 on success; -1 on error + +:Capability: KVM_CAP_ENABLE_CAP_VM +:Architectures: all +:Type: vcpu ioctl +:Parameters: struct kvm_enable_cap (in) +:Returns: 0 on success; -1 on error + +.. note:: + + Not all extensions are enabled by default. Using this ioctl the application + can enable an extension, making it available to the guest. + +On systems that do not support this ioctl, it always fails. On systems that +do support it, it only works for extensions that are supported for enablement. + +To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should +be used. + +:: + + struct kvm_enable_cap { + /* in */ + __u32 cap; + +The capability that is supposed to get enabled. + +:: + + __u32 flags; + +A bitfield indicating future enhancements. Has to be 0 for now. + +:: + + __u64 args[4]; + +Arguments for enabling a feature. If a feature needs initial values to +function properly, this is the place to put them. + +:: + + __u8 pad[64]; + }; + +The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl +for vm-wide capabilities. + +4.38 KVM_GET_MP_STATE +--------------------- + +:Capability: KVM_CAP_MP_STATE +:Architectures: x86, s390, arm, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_mp_state (out) +:Returns: 0 on success; -1 on error + +:: + + struct kvm_mp_state { + __u32 mp_state; + }; + +Returns the vcpu's current "multiprocessing state" (though also valid on +uniprocessor guests). + +Possible values are: + + ========================== =============================================== + KVM_MP_STATE_RUNNABLE the vcpu is currently running [x86,arm/arm64] + KVM_MP_STATE_UNINITIALIZED the vcpu is an application processor (AP) + which has not yet received an INIT signal [x86] + KVM_MP_STATE_INIT_RECEIVED the vcpu has received an INIT signal, and is + now ready for a SIPI [x86] + KVM_MP_STATE_HALTED the vcpu has executed a HLT instruction and + is waiting for an interrupt [x86] + KVM_MP_STATE_SIPI_RECEIVED the vcpu has just received a SIPI (vector + accessible via KVM_GET_VCPU_EVENTS) [x86] + KVM_MP_STATE_STOPPED the vcpu is stopped [s390,arm/arm64] + KVM_MP_STATE_CHECK_STOP the vcpu is in a special error state [s390] + KVM_MP_STATE_OPERATING the vcpu is operating (running or halted) + [s390] + KVM_MP_STATE_LOAD the vcpu is in a special load/startup state + [s390] + ========================== =============================================== + +On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an +in-kernel irqchip, the multiprocessing state must be maintained by userspace on +these architectures. + +For arm/arm64: +^^^^^^^^^^^^^^ + +The only states that are valid are KVM_MP_STATE_STOPPED and +KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. + +4.39 KVM_SET_MP_STATE +--------------------- + +:Capability: KVM_CAP_MP_STATE +:Architectures: x86, s390, arm, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_mp_state (in) +:Returns: 0 on success; -1 on error + +Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for +arguments. + +On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an +in-kernel irqchip, the multiprocessing state must be maintained by userspace on +these architectures. + +For arm/arm64: +^^^^^^^^^^^^^^ + +The only states that are valid are KVM_MP_STATE_STOPPED and +KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. + +4.40 KVM_SET_IDENTITY_MAP_ADDR +------------------------------ + +:Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR +:Architectures: x86 +:Type: vm ioctl +:Parameters: unsigned long identity (in) +:Returns: 0 on success, -1 on error + +This ioctl defines the physical address of a one-page region in the guest +physical address space. The region must be within the first 4GB of the +guest physical address space and must not conflict with any memory slot +or any mmio address. The guest may malfunction if it accesses this memory +region. + +Setting the address to 0 will result in resetting the address to its default +(0xfffbc000). + +This ioctl is required on Intel-based hosts. This is needed on Intel hardware +because of a quirk in the virtualization implementation (see the internals +documentation when it pops into existence). + +Fails if any VCPU has already been created. + +4.41 KVM_SET_BOOT_CPU_ID +------------------------ + +:Capability: KVM_CAP_SET_BOOT_CPU_ID +:Architectures: x86 +:Type: vm ioctl +:Parameters: unsigned long vcpu_id +:Returns: 0 on success, -1 on error + +Define which vcpu is the Bootstrap Processor (BSP). Values are the same +as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default +is vcpu 0. + + +4.42 KVM_GET_XSAVE +------------------ + +:Capability: KVM_CAP_XSAVE +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xsave (out) +:Returns: 0 on success, -1 on error + + +:: + + struct kvm_xsave { + __u32 region[1024]; + }; + +This ioctl would copy current vcpu's xsave struct to the userspace. + + +4.43 KVM_SET_XSAVE +------------------ + +:Capability: KVM_CAP_XSAVE +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xsave (in) +:Returns: 0 on success, -1 on error + +:: + + + struct kvm_xsave { + __u32 region[1024]; + }; + +This ioctl would copy userspace's xsave struct to the kernel. + + +4.44 KVM_GET_XCRS +----------------- + +:Capability: KVM_CAP_XCRS +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xcrs (out) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; + }; + + struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; + }; + +This ioctl would copy current vcpu's xcrs to the userspace. + + +4.45 KVM_SET_XCRS +----------------- + +:Capability: KVM_CAP_XCRS +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_xcrs (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; + }; + + struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; + }; + +This ioctl would set vcpu's xcr to the value userspace specified. + + +4.46 KVM_GET_SUPPORTED_CPUID +---------------------------- + +:Capability: KVM_CAP_EXT_CPUID +:Architectures: x86 +:Type: system ioctl +:Parameters: struct kvm_cpuid2 (in/out) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; + }; + + #define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) + #define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) + #define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + + struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; + }; + +This ioctl returns x86 cpuid features which are supported by both the +hardware and kvm in its default configuration. Userspace can use the +information returned by this ioctl to construct cpuid information (for +KVM_SET_CPUID2) that is consistent with hardware, kernel, and +userspace capabilities, and with user requirements (for example, the +user may wish to constrain cpuid to emulate older hardware, or for +feature consistency across a cluster). + +Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may +expose cpuid features (e.g. MONITOR) which are not supported by kvm in +its default configuration. If userspace enables such capabilities, it +is responsible for modifying the results of this ioctl appropriately. + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe the cpu +capabilities, an error (E2BIG) is returned. If the number is too high, +the 'nent' field is adjusted and an error (ENOMEM) is returned. If the +number is just right, the 'nent' field is adjusted to the number of valid +entries in the 'entries' array, which is then filled. + +The entries returned are the host cpuid as returned by the cpuid instruction, +with unknown or unsupported features masked out. Some features (for example, +x2apic), may not be present in the host cpu, but are exposed by kvm if it can +emulate them efficiently. The fields in each entry are defined as follows: + + function: + the eax value used to obtain the entry + + index: + the ecx value used to obtain the entry (for entries that are + affected by ecx) + + flags: + an OR of zero or more of the following: + + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + + eax, ebx, ecx, edx: + the values returned by the cpuid instruction for + this function/index combination + +The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned +as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC +support. Instead it is reported via:: + + ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) + +if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the +feature in userspace, then you can enable the feature for KVM_SET_CPUID2. + + +4.47 KVM_PPC_GET_PVINFO +----------------------- + +:Capability: KVM_CAP_PPC_GET_PVINFO +:Architectures: ppc +:Type: vm ioctl +:Parameters: struct kvm_ppc_pvinfo (out) +:Returns: 0 on success, !0 on error + +:: + + struct kvm_ppc_pvinfo { + __u32 flags; + __u32 hcall[4]; + __u8 pad[108]; + }; + +This ioctl fetches PV specific information that need to be passed to the guest +using the device tree or other means from vm context. + +The hcall array defines 4 instructions that make up a hypercall. + +If any additional field gets added to this structure later on, a bit for that +additional piece of information will be set in the flags bitmap. + +The flags bitmap is defined as:: + + /* the host supports the ePAPR idle hcall + #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) + +4.52 KVM_SET_GSI_ROUTING +------------------------ + +:Capability: KVM_CAP_IRQ_ROUTING +:Architectures: x86 s390 arm arm64 +:Type: vm ioctl +:Parameters: struct kvm_irq_routing (in) +:Returns: 0 on success, -1 on error + +Sets the GSI routing table entries, overwriting any previously set entries. + +On arm/arm64, GSI routing has the following limitation: + +- GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. + +:: + + struct kvm_irq_routing { + __u32 nr; + __u32 flags; + struct kvm_irq_routing_entry entries[0]; + }; + +No flags are specified so far, the corresponding field must be set to zero. + +:: + + struct kvm_irq_routing_entry { + __u32 gsi; + __u32 type; + __u32 flags; + __u32 pad; + union { + struct kvm_irq_routing_irqchip irqchip; + struct kvm_irq_routing_msi msi; + struct kvm_irq_routing_s390_adapter adapter; + struct kvm_irq_routing_hv_sint hv_sint; + __u32 pad[8]; + } u; + }; + + /* gsi routing entry types */ + #define KVM_IRQ_ROUTING_IRQCHIP 1 + #define KVM_IRQ_ROUTING_MSI 2 + #define KVM_IRQ_ROUTING_S390_ADAPTER 3 + #define KVM_IRQ_ROUTING_HV_SINT 4 + +flags: + +- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI routing entry + type, specifies that the devid field contains a valid value. The per-VM + KVM_CAP_MSI_DEVID capability advertises the requirement to provide + the device ID. If this capability is not available, userspace should + never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. +- zero otherwise + +:: + + struct kvm_irq_routing_irqchip { + __u32 irqchip; + __u32 pin; + }; + + struct kvm_irq_routing_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + union { + __u32 pad; + __u32 devid; + }; + }; + +If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier +for the device that wrote the MSI message. For PCI, this is usually a +BFD identifier in the lower 16 bits. + +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + +:: + + struct kvm_irq_routing_s390_adapter { + __u64 ind_addr; + __u64 summary_addr; + __u64 ind_offset; + __u32 summary_offset; + __u32 adapter_id; + }; + + struct kvm_irq_routing_hv_sint { + __u32 vcpu; + __u32 sint; + }; + + +4.55 KVM_SET_TSC_KHZ +-------------------- + +:Capability: KVM_CAP_TSC_CONTROL +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: virtual tsc_khz +:Returns: 0 on success, -1 on error + +Specifies the tsc frequency for the virtual machine. The unit of the +frequency is KHz. + + +4.56 KVM_GET_TSC_KHZ +-------------------- + +:Capability: KVM_CAP_GET_TSC_KHZ +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: none +:Returns: virtual tsc-khz on success, negative value on error + +Returns the tsc frequency of the guest. The unit of the return value is +KHz. If the host has unstable tsc this ioctl returns -EIO instead as an +error. + + +4.57 KVM_GET_LAPIC +------------------ + +:Capability: KVM_CAP_IRQCHIP +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_lapic_state (out) +:Returns: 0 on success, -1 on error + +:: + + #define KVM_APIC_REG_SIZE 0x400 + struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; + }; + +Reads the Local APIC registers and copies them into the input argument. The +data format and layout are the same as documented in the architecture manual. + +If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is +enabled, then the format of APIC_ID register depends on the APIC mode +(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in +the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID +which is stored in bits 31-24 of the APIC register, or equivalently in +byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then +be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. + +If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state +always uses xAPIC format. + + +4.58 KVM_SET_LAPIC +------------------ + +:Capability: KVM_CAP_IRQCHIP +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_lapic_state (in) +:Returns: 0 on success, -1 on error + +:: + + #define KVM_APIC_REG_SIZE 0x400 + struct kvm_lapic_state { + char regs[KVM_APIC_REG_SIZE]; + }; + +Copies the input argument into the Local APIC registers. The data format +and layout are the same as documented in the architecture manual. + +The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's +regs field) depends on the state of the KVM_CAP_X2APIC_API capability. +See the note in KVM_GET_LAPIC. + + +4.59 KVM_IOEVENTFD +------------------ + +:Capability: KVM_CAP_IOEVENTFD +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_ioeventfd (in) +:Returns: 0 on success, !0 on error + +This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address +within the guest. A guest write in the registered address will signal the +provided event instead of triggering an exit. + +:: + + struct kvm_ioeventfd { + __u64 datamatch; + __u64 addr; /* legal pio/mmio address */ + __u32 len; /* 0, 1, 2, 4, or 8 bytes */ + __s32 fd; + __u32 flags; + __u8 pad[36]; + }; + +For the special case of virtio-ccw devices on s390, the ioevent is matched +to a subchannel/virtqueue tuple instead. + +The following flags are defined:: + + #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) + #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) + #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) + #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ + (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) + +If datamatch flag is set, the event will be signaled only if the written value +to the registered address is equal to datamatch in struct kvm_ioeventfd. + +For virtio-ccw devices, addr contains the subchannel id and datamatch the +virtqueue index. + +With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and +the kernel will ignore the length of guest write and may get a faster vmexit. +The speedup may only apply to specific architectures, but the ioeventfd will +work anyway. + +4.60 KVM_DIRTY_TLB +------------------ + +:Capability: KVM_CAP_SW_TLB +:Architectures: ppc +:Type: vcpu ioctl +:Parameters: struct kvm_dirty_tlb (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_dirty_tlb { + __u64 bitmap; + __u32 num_dirty; + }; + +This must be called whenever userspace has changed an entry in the shared +TLB, prior to calling KVM_RUN on the associated vcpu. + +The "bitmap" field is the userspace address of an array. This array +consists of a number of bits, equal to the total number of TLB entries as +determined by the last successful call to KVM_CONFIG_TLB, rounded up to the +nearest multiple of 64. + +Each bit corresponds to one TLB entry, ordered the same as in the shared TLB +array. + +The array is little-endian: the bit 0 is the least significant bit of the +first byte, bit 8 is the least significant bit of the second byte, etc. +This avoids any complications with differing word sizes. + +The "num_dirty" field is a performance hint for KVM to determine whether it +should skip processing the bitmap and just invalidate everything. It must +be set to the number of set bits in the bitmap. + + +4.62 KVM_CREATE_SPAPR_TCE +------------------------- + +:Capability: KVM_CAP_SPAPR_TCE +:Architectures: powerpc +:Type: vm ioctl +:Parameters: struct kvm_create_spapr_tce (in) +:Returns: file descriptor for manipulating the created TCE table + +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +:: + + /* for KVM_CAP_SPAPR_TCE */ + struct kvm_create_spapr_tce { + __u64 liobn; + __u32 window_size; + }; + +The liobn field gives the logical IO bus number for which to create a +TCE table. The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every 4kiB of the DMA window. + +When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE +table has been created using this ioctl(), the kernel will handle it +in real mode, updating the TCE table. H_PUT_TCE calls for other +liobns will cause a vm exit and must be handled by userspace. + +The return value is a file descriptor which can be passed to mmap(2) +to map the created TCE table into userspace. This lets userspace read +the entries written by kernel-handled H_PUT_TCE calls, and also lets +userspace update the TCE table directly which is useful in some +circumstances. + + +4.63 KVM_ALLOCATE_RMA +--------------------- + +:Capability: KVM_CAP_PPC_RMA +:Architectures: powerpc +:Type: vm ioctl +:Parameters: struct kvm_allocate_rma (out) +:Returns: file descriptor for mapping the allocated RMA + +This allocates a Real Mode Area (RMA) from the pool allocated at boot +time by the kernel. An RMA is a physically-contiguous, aligned region +of memory used on older POWER processors to provide the memory which +will be accessed by real-mode (MMU off) accesses in a KVM guest. +POWER processors support a set of sizes for the RMA that usually +includes 64MB, 128MB, 256MB and some larger powers of two. + +:: + + /* for KVM_ALLOCATE_RMA */ + struct kvm_allocate_rma { + __u64 rma_size; + }; + +The return value is a file descriptor which can be passed to mmap(2) +to map the allocated RMA into userspace. The mapped area can then be +passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the +RMA for a virtual machine. The size of the RMA in bytes (which is +fixed at host kernel boot time) is returned in the rma_size field of +the argument structure. + +The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl +is supported; 2 if the processor requires all virtual machines to have +an RMA, or 1 if the processor can use an RMA but doesn't require it, +because it supports the Virtual RMA (VRMA) facility. + + +4.64 KVM_NMI +------------ + +:Capability: KVM_CAP_USER_NMI +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: none +:Returns: 0 on success, -1 on error + +Queues an NMI on the thread's vcpu. Note this is well defined only +when KVM_CREATE_IRQCHIP has not been called, since this is an interface +between the virtual cpu core and virtual local APIC. After KVM_CREATE_IRQCHIP +has been called, this interface is completely emulated within the kernel. + +To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the +following algorithm: + + - pause the vcpu + - read the local APIC's state (KVM_GET_LAPIC) + - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1) + - if so, issue KVM_NMI + - resume the vcpu + +Some guests configure the LINT1 NMI input to cause a panic, aiding in +debugging. + + +4.65 KVM_S390_UCAS_MAP +---------------------- + +:Capability: KVM_CAP_S390_UCONTROL +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: struct kvm_s390_ucas_mapping (in) +:Returns: 0 in case of success + +The parameter is defined like this:: + + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl maps the memory at "user_addr" with the length "length" to +the vcpu's address space starting at "vcpu_addr". All parameters need to +be aligned by 1 megabyte. + + +4.66 KVM_S390_UCAS_UNMAP +------------------------ + +:Capability: KVM_CAP_S390_UCONTROL +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: struct kvm_s390_ucas_mapping (in) +:Returns: 0 in case of success + +The parameter is defined like this:: + + struct kvm_s390_ucas_mapping { + __u64 user_addr; + __u64 vcpu_addr; + __u64 length; + }; + +This ioctl unmaps the memory in the vcpu's address space starting at +"vcpu_addr" with the length "length". The field "user_addr" is ignored. +All parameters need to be aligned by 1 megabyte. + + +4.67 KVM_S390_VCPU_FAULT +------------------------ + +:Capability: KVM_CAP_S390_UCONTROL +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: vcpu absolute address (in) +:Returns: 0 in case of success + +This call creates a page table entry on the virtual cpu's address space +(for user controlled virtual machines) or the virtual machine's address +space (for regular virtual machines). This only works for minor faults, +thus it's recommended to access subject memory page via the user page +table upfront. This is useful to handle validity intercepts for user +controlled virtual machines to fault in the virtual cpu's lowcore pages +prior to calling the KVM_RUN ioctl. + + +4.68 KVM_SET_ONE_REG +-------------------- + +:Capability: KVM_CAP_ONE_REG +:Architectures: all +:Type: vcpu ioctl +:Parameters: struct kvm_one_reg (in) +:Returns: 0 on success, negative value on failure + +Errors: + + ====== ============================================================ +  ENOENT   no such register +  EINVAL   invalid register ID, or no such register +  EPERM    (arm64) register access not allowed before vcpu finalization + ====== ============================================================ + +(These error codes are indicative only: do not rely on a specific error +code being returned in a specific situation.) + +:: + + struct kvm_one_reg { + __u64 id; + __u64 addr; + }; + +Using this ioctl, a single vcpu register can be set to a specific value +defined by user space with the passed in struct kvm_one_reg, where id +refers to the register identifier as described below and addr is a pointer +to a variable with the respective size. There can be architecture agnostic +and architecture specific registers. Each have their own range of operation +and their own constants and width. To keep track of the implemented +registers, find a list below: + + ======= =============================== ============ + Arch Register Width (bits) + ======= =============================== ============ + PPC KVM_REG_PPC_HIOR 64 + PPC KVM_REG_PPC_IAC1 64 + PPC KVM_REG_PPC_IAC2 64 + PPC KVM_REG_PPC_IAC3 64 + PPC KVM_REG_PPC_IAC4 64 + PPC KVM_REG_PPC_DAC1 64 + PPC KVM_REG_PPC_DAC2 64 + PPC KVM_REG_PPC_DABR 64 + PPC KVM_REG_PPC_DSCR 64 + PPC KVM_REG_PPC_PURR 64 + PPC KVM_REG_PPC_SPURR 64 + PPC KVM_REG_PPC_DAR 64 + PPC KVM_REG_PPC_DSISR 32 + PPC KVM_REG_PPC_AMR 64 + PPC KVM_REG_PPC_UAMOR 64 + PPC KVM_REG_PPC_MMCR0 64 + PPC KVM_REG_PPC_MMCR1 64 + PPC KVM_REG_PPC_MMCRA 64 + PPC KVM_REG_PPC_MMCR2 64 + PPC KVM_REG_PPC_MMCRS 64 + PPC KVM_REG_PPC_SIAR 64 + PPC KVM_REG_PPC_SDAR 64 + PPC KVM_REG_PPC_SIER 64 + PPC KVM_REG_PPC_PMC1 32 + PPC KVM_REG_PPC_PMC2 32 + PPC KVM_REG_PPC_PMC3 32 + PPC KVM_REG_PPC_PMC4 32 + PPC KVM_REG_PPC_PMC5 32 + PPC KVM_REG_PPC_PMC6 32 + PPC KVM_REG_PPC_PMC7 32 + PPC KVM_REG_PPC_PMC8 32 + PPC KVM_REG_PPC_FPR0 64 + ... + PPC KVM_REG_PPC_FPR31 64 + PPC KVM_REG_PPC_VR0 128 + ... + PPC KVM_REG_PPC_VR31 128 + PPC KVM_REG_PPC_VSR0 128 + ... + PPC KVM_REG_PPC_VSR31 128 + PPC KVM_REG_PPC_FPSCR 64 + PPC KVM_REG_PPC_VSCR 32 + PPC KVM_REG_PPC_VPA_ADDR 64 + PPC KVM_REG_PPC_VPA_SLB 128 + PPC KVM_REG_PPC_VPA_DTL 128 + PPC KVM_REG_PPC_EPCR 32 + PPC KVM_REG_PPC_EPR 32 + PPC KVM_REG_PPC_TCR 32 + PPC KVM_REG_PPC_TSR 32 + PPC KVM_REG_PPC_OR_TSR 32 + PPC KVM_REG_PPC_CLEAR_TSR 32 + PPC KVM_REG_PPC_MAS0 32 + PPC KVM_REG_PPC_MAS1 32 + PPC KVM_REG_PPC_MAS2 64 + PPC KVM_REG_PPC_MAS7_3 64 + PPC KVM_REG_PPC_MAS4 32 + PPC KVM_REG_PPC_MAS6 32 + PPC KVM_REG_PPC_MMUCFG 32 + PPC KVM_REG_PPC_TLB0CFG 32 + PPC KVM_REG_PPC_TLB1CFG 32 + PPC KVM_REG_PPC_TLB2CFG 32 + PPC KVM_REG_PPC_TLB3CFG 32 + PPC KVM_REG_PPC_TLB0PS 32 + PPC KVM_REG_PPC_TLB1PS 32 + PPC KVM_REG_PPC_TLB2PS 32 + PPC KVM_REG_PPC_TLB3PS 32 + PPC KVM_REG_PPC_EPTCFG 32 + PPC KVM_REG_PPC_ICP_STATE 64 + PPC KVM_REG_PPC_VP_STATE 128 + PPC KVM_REG_PPC_TB_OFFSET 64 + PPC KVM_REG_PPC_SPMC1 32 + PPC KVM_REG_PPC_SPMC2 32 + PPC KVM_REG_PPC_IAMR 64 + PPC KVM_REG_PPC_TFHAR 64 + PPC KVM_REG_PPC_TFIAR 64 + PPC KVM_REG_PPC_TEXASR 64 + PPC KVM_REG_PPC_FSCR 64 + PPC KVM_REG_PPC_PSPB 32 + PPC KVM_REG_PPC_EBBHR 64 + PPC KVM_REG_PPC_EBBRR 64 + PPC KVM_REG_PPC_BESCR 64 + PPC KVM_REG_PPC_TAR 64 + PPC KVM_REG_PPC_DPDES 64 + PPC KVM_REG_PPC_DAWR 64 + PPC KVM_REG_PPC_DAWRX 64 + PPC KVM_REG_PPC_CIABR 64 + PPC KVM_REG_PPC_IC 64 + PPC KVM_REG_PPC_VTB 64 + PPC KVM_REG_PPC_CSIGR 64 + PPC KVM_REG_PPC_TACR 64 + PPC KVM_REG_PPC_TCSCR 64 + PPC KVM_REG_PPC_PID 64 + PPC KVM_REG_PPC_ACOP 64 + PPC KVM_REG_PPC_VRSAVE 32 + PPC KVM_REG_PPC_LPCR 32 + PPC KVM_REG_PPC_LPCR_64 64 + PPC KVM_REG_PPC_PPR 64 + PPC KVM_REG_PPC_ARCH_COMPAT 32 + PPC KVM_REG_PPC_DABRX 32 + PPC KVM_REG_PPC_WORT 64 + PPC KVM_REG_PPC_SPRG9 64 + PPC KVM_REG_PPC_DBSR 32 + PPC KVM_REG_PPC_TIDR 64 + PPC KVM_REG_PPC_PSSCR 64 + PPC KVM_REG_PPC_DEC_EXPIRY 64 + PPC KVM_REG_PPC_PTCR 64 + PPC KVM_REG_PPC_TM_GPR0 64 + ... + PPC KVM_REG_PPC_TM_GPR31 64 + PPC KVM_REG_PPC_TM_VSR0 128 + ... + PPC KVM_REG_PPC_TM_VSR63 128 + PPC KVM_REG_PPC_TM_CR 64 + PPC KVM_REG_PPC_TM_LR 64 + PPC KVM_REG_PPC_TM_CTR 64 + PPC KVM_REG_PPC_TM_FPSCR 64 + PPC KVM_REG_PPC_TM_AMR 64 + PPC KVM_REG_PPC_TM_PPR 64 + PPC KVM_REG_PPC_TM_VRSAVE 64 + PPC KVM_REG_PPC_TM_VSCR 32 + PPC KVM_REG_PPC_TM_DSCR 64 + PPC KVM_REG_PPC_TM_TAR 64 + PPC KVM_REG_PPC_TM_XER 64 + + MIPS KVM_REG_MIPS_R0 64 + ... + MIPS KVM_REG_MIPS_R31 64 + MIPS KVM_REG_MIPS_HI 64 + MIPS KVM_REG_MIPS_LO 64 + MIPS KVM_REG_MIPS_PC 64 + MIPS KVM_REG_MIPS_CP0_INDEX 32 + MIPS KVM_REG_MIPS_CP0_ENTRYLO0 64 + MIPS KVM_REG_MIPS_CP0_ENTRYLO1 64 + MIPS KVM_REG_MIPS_CP0_CONTEXT 64 + MIPS KVM_REG_MIPS_CP0_CONTEXTCONFIG 32 + MIPS KVM_REG_MIPS_CP0_USERLOCAL 64 + MIPS KVM_REG_MIPS_CP0_XCONTEXTCONFIG 64 + MIPS KVM_REG_MIPS_CP0_PAGEMASK 32 + MIPS KVM_REG_MIPS_CP0_PAGEGRAIN 32 + MIPS KVM_REG_MIPS_CP0_SEGCTL0 64 + MIPS KVM_REG_MIPS_CP0_SEGCTL1 64 + MIPS KVM_REG_MIPS_CP0_SEGCTL2 64 + MIPS KVM_REG_MIPS_CP0_PWBASE 64 + MIPS KVM_REG_MIPS_CP0_PWFIELD 64 + MIPS KVM_REG_MIPS_CP0_PWSIZE 64 + MIPS KVM_REG_MIPS_CP0_WIRED 32 + MIPS KVM_REG_MIPS_CP0_PWCTL 32 + MIPS KVM_REG_MIPS_CP0_HWRENA 32 + MIPS KVM_REG_MIPS_CP0_BADVADDR 64 + MIPS KVM_REG_MIPS_CP0_BADINSTR 32 + MIPS KVM_REG_MIPS_CP0_BADINSTRP 32 + MIPS KVM_REG_MIPS_CP0_COUNT 32 + MIPS KVM_REG_MIPS_CP0_ENTRYHI 64 + MIPS KVM_REG_MIPS_CP0_COMPARE 32 + MIPS KVM_REG_MIPS_CP0_STATUS 32 + MIPS KVM_REG_MIPS_CP0_INTCTL 32 + MIPS KVM_REG_MIPS_CP0_CAUSE 32 + MIPS KVM_REG_MIPS_CP0_EPC 64 + MIPS KVM_REG_MIPS_CP0_PRID 32 + MIPS KVM_REG_MIPS_CP0_EBASE 64 + MIPS KVM_REG_MIPS_CP0_CONFIG 32 + MIPS KVM_REG_MIPS_CP0_CONFIG1 32 + MIPS KVM_REG_MIPS_CP0_CONFIG2 32 + MIPS KVM_REG_MIPS_CP0_CONFIG3 32 + MIPS KVM_REG_MIPS_CP0_CONFIG4 32 + MIPS KVM_REG_MIPS_CP0_CONFIG5 32 + MIPS KVM_REG_MIPS_CP0_CONFIG7 32 + MIPS KVM_REG_MIPS_CP0_XCONTEXT 64 + MIPS KVM_REG_MIPS_CP0_ERROREPC 64 + MIPS KVM_REG_MIPS_CP0_KSCRATCH1 64 + MIPS KVM_REG_MIPS_CP0_KSCRATCH2 64 + MIPS KVM_REG_MIPS_CP0_KSCRATCH3 64 + MIPS KVM_REG_MIPS_CP0_KSCRATCH4 64 + MIPS KVM_REG_MIPS_CP0_KSCRATCH5 64 + MIPS KVM_REG_MIPS_CP0_KSCRATCH6 64 + MIPS KVM_REG_MIPS_CP0_MAAR(0..63) 64 + MIPS KVM_REG_MIPS_COUNT_CTL 64 + MIPS KVM_REG_MIPS_COUNT_RESUME 64 + MIPS KVM_REG_MIPS_COUNT_HZ 64 + MIPS KVM_REG_MIPS_FPR_32(0..31) 32 + MIPS KVM_REG_MIPS_FPR_64(0..31) 64 + MIPS KVM_REG_MIPS_VEC_128(0..31) 128 + MIPS KVM_REG_MIPS_FCR_IR 32 + MIPS KVM_REG_MIPS_FCR_CSR 32 + MIPS KVM_REG_MIPS_MSA_IR 32 + MIPS KVM_REG_MIPS_MSA_CSR 32 + ======= =============================== ============ + +ARM registers are mapped using the lower 32 bits. The upper 16 of that +is the register group type, or coprocessor number: + +ARM core registers have the following id bit patterns:: + + 0x4020 0000 0010 + +ARM 32-bit CP15 registers have the following id bit patterns:: + + 0x4020 0000 000F + +ARM 64-bit CP15 registers have the following id bit patterns:: + + 0x4030 0000 000F + +ARM CCSIDR registers are demultiplexed by CSSELR value:: + + 0x4020 0000 0011 00 + +ARM 32-bit VFP control registers have the following id bit patterns:: + + 0x4020 0000 0012 1 + +ARM 64-bit FP registers have the following id bit patterns:: + + 0x4030 0000 0012 0 + +ARM firmware pseudo-registers have the following bit pattern:: + + 0x4030 0000 0014 + + +arm64 registers are mapped using the lower 32 bits. The upper 16 of +that is the register group type, or coprocessor number: + +arm64 core/FP-SIMD registers have the following id bit patterns. Note +that the size of the access is variable, as the kvm_regs structure +contains elements ranging from 32 to 128 bits. The index is a 32bit +value in the kvm_regs structure seen as a 32bit array:: + + 0x60x0 0000 0010 + +Specifically: + +======================= ========= ===== ======================================= + Encoding Register Bits kvm_regs member +======================= ========= ===== ======================================= + 0x6030 0000 0010 0000 X0 64 regs.regs[0] + 0x6030 0000 0010 0002 X1 64 regs.regs[1] + ... + 0x6030 0000 0010 003c X30 64 regs.regs[30] + 0x6030 0000 0010 003e SP 64 regs.sp + 0x6030 0000 0010 0040 PC 64 regs.pc + 0x6030 0000 0010 0042 PSTATE 64 regs.pstate + 0x6030 0000 0010 0044 SP_EL1 64 sp_el1 + 0x6030 0000 0010 0046 ELR_EL1 64 elr_el1 + 0x6030 0000 0010 0048 SPSR_EL1 64 spsr[KVM_SPSR_EL1] (alias SPSR_SVC) + 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] + 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] + 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] + 0x6060 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] + 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] [1]_ + 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] [1]_ + ... + 0x6040 0000 0010 00d0 V31 128 fp_regs.vregs[31] [1]_ + 0x6020 0000 0010 00d4 FPSR 32 fp_regs.fpsr + 0x6020 0000 0010 00d5 FPCR 32 fp_regs.fpcr +======================= ========= ===== ======================================= + +.. [1] These encodings are not accepted for SVE-enabled vcpus. See + KVM_ARM_VCPU_INIT. + + The equivalent register content can be accessed via bits [127:0] of + the corresponding SVE Zn registers instead for vcpus that have SVE + enabled (see below). + +arm64 CCSIDR registers are demultiplexed by CSSELR value:: + + 0x6020 0000 0011 00 + +arm64 system registers have the following id bit patterns:: + + 0x6030 0000 0013 + +.. warning:: + + Two system register IDs do not follow the specified pattern. These + are KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT, which map to + system registers CNTV_CVAL_EL0 and CNTVCT_EL0 respectively. These + two had their values accidentally swapped, which means TIMER_CVAL is + derived from the register encoding for CNTVCT_EL0 and TIMER_CNT is + derived from the register encoding for CNTV_CVAL_EL0. As this is + API, it must remain this way. + +arm64 firmware pseudo-registers have the following bit pattern:: + + 0x6030 0000 0014 + +arm64 SVE registers have the following bit patterns:: + + 0x6080 0000 0015 00 Zn bits[2048*slice + 2047 : 2048*slice] + 0x6050 0000 0015 04 Pn bits[256*slice + 255 : 256*slice] + 0x6050 0000 0015 060 FFR bits[256*slice + 255 : 256*slice] + 0x6060 0000 0015 ffff KVM_REG_ARM64_SVE_VLS pseudo-register + +Access to register IDs where 2048 * slice >= 128 * max_vq will fail with +ENOENT. max_vq is the vcpu's maximum supported vector length in 128-bit +quadwords: see [2]_ below. + +These registers are only accessible on vcpus for which SVE is enabled. +See KVM_ARM_VCPU_INIT for details. + +In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not +accessible until the vcpu's SVE configuration has been finalized +using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). See KVM_ARM_VCPU_INIT +and KVM_ARM_VCPU_FINALIZE for more information about this procedure. + +KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector +lengths supported by the vcpu to be discovered and configured by +userspace. When transferred to or from user memory via KVM_GET_ONE_REG +or KVM_SET_ONE_REG, the value of this register is of type +__u64[KVM_ARM64_SVE_VLS_WORDS], and encodes the set of vector lengths as +follows:: + + __u64 vector_lengths[KVM_ARM64_SVE_VLS_WORDS]; + + if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX && + ((vector_lengths[(vq - KVM_ARM64_SVE_VQ_MIN) / 64] >> + ((vq - KVM_ARM64_SVE_VQ_MIN) % 64)) & 1)) + /* Vector length vq * 16 bytes supported */ + else + /* Vector length vq * 16 bytes not supported */ + +.. [2] The maximum value vq for which the above condition is true is + max_vq. This is the maximum vector length available to the guest on + this vcpu, and determines which register slices are visible through + this ioctl interface. + +(See Documentation/arm64/sve.rst for an explanation of the "vq" +nomenclature.) + +KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. +KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that +the host supports. + +Userspace may subsequently modify it if desired until the vcpu's SVE +configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). + +Apart from simply removing all vector lengths from the host set that +exceed some value, support for arbitrarily chosen sets of vector lengths +is hardware-dependent and may not be available. Attempting to configure +an invalid set of vector lengths via KVM_SET_ONE_REG will fail with +EINVAL. + +After the vcpu's SVE configuration is finalized, further attempts to +write this register will fail with EPERM. + + +MIPS registers are mapped using the lower 32 bits. The upper 16 of that is +the register group type: + +MIPS core registers (see above) have the following id bit patterns:: + + 0x7030 0000 0000 + +MIPS CP0 registers (see KVM_REG_MIPS_CP0_* above) have the following id bit +patterns depending on whether they're 32-bit or 64-bit registers:: + + 0x7020 0000 0001 00 (32-bit) + 0x7030 0000 0001 00 (64-bit) + +Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 +versions of the EntryLo registers regardless of the word size of the host +hardware, host kernel, guest, and whether XPA is present in the guest, i.e. +with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and +the PFNX field starting at bit 30. + +MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit +patterns:: + + 0x7030 0000 0001 01 + +MIPS KVM control registers (see above) have the following id bit patterns:: + + 0x7030 0000 0002 + +MIPS FPU registers (see KVM_REG_MIPS_FPR_{32,64}() above) have the following +id bit patterns depending on the size of the register being accessed. They are +always accessed according to the current guest FPU mode (Status.FR and +Config5.FRE), i.e. as the guest would see them, and they become unpredictable +if the guest FPU mode is changed. MIPS SIMD Architecture (MSA) vector +registers (see KVM_REG_MIPS_VEC_128() above) have similar patterns as they +overlap the FPU registers:: + + 0x7020 0000 0003 00 <0:3> (32-bit FPU registers) + 0x7030 0000 0003 00 <0:3> (64-bit FPU registers) + 0x7040 0000 0003 00 <0:3> (128-bit MSA vector registers) + +MIPS FPU control registers (see KVM_REG_MIPS_FCR_{IR,CSR} above) have the +following id bit patterns:: + + 0x7020 0000 0003 01 <0:3> + +MIPS MSA control registers (see KVM_REG_MIPS_MSA_{IR,CSR} above) have the +following id bit patterns:: + + 0x7020 0000 0003 02 <0:3> + + +4.69 KVM_GET_ONE_REG +-------------------- + +:Capability: KVM_CAP_ONE_REG +:Architectures: all +:Type: vcpu ioctl +:Parameters: struct kvm_one_reg (in and out) +:Returns: 0 on success, negative value on failure + +Errors include: + + ======== ============================================================ +  ENOENT   no such register +  EINVAL   invalid register ID, or no such register +  EPERM    (arm64) register access not allowed before vcpu finalization + ======== ============================================================ + +(These error codes are indicative only: do not rely on a specific error +code being returned in a specific situation.) + +This ioctl allows to receive the value of a single register implemented +in a vcpu. The register to read is indicated by the "id" field of the +kvm_one_reg struct passed in. On success, the register value can be found +at the memory location pointed to by "addr". + +The list of registers accessible using this interface is identical to the +list in 4.68. + + +4.70 KVM_KVMCLOCK_CTRL +---------------------- + +:Capability: KVM_CAP_KVMCLOCK_CTRL +:Architectures: Any that implement pvclocks (currently x86 only) +:Type: vcpu ioctl +:Parameters: None +:Returns: 0 on success, -1 on error + +This signals to the host kernel that the specified guest is being paused by +userspace. The host will set a flag in the pvclock structure that is checked +from the soft lockup watchdog. The flag is part of the pvclock structure that +is shared between guest and host, specifically the second bit of the flags +field of the pvclock_vcpu_time_info structure. It will be set exclusively by +the host and read/cleared exclusively by the guest. The guest operation of +checking and clearing the flag must an atomic operation so +load-link/store-conditional, or equivalent must be used. There are two cases +where the guest will clear the flag: when the soft lockup watchdog timer resets +itself or when a soft lockup is detected. This ioctl can be called any time +after pausing the vcpu, but before it is resumed. + + +4.71 KVM_SIGNAL_MSI +------------------- + +:Capability: KVM_CAP_SIGNAL_MSI +:Architectures: x86 arm arm64 +:Type: vm ioctl +:Parameters: struct kvm_msi (in) +:Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error + +Directly inject a MSI message. Only valid with in-kernel irqchip that handles +MSI messages. + +:: + + struct kvm_msi { + __u32 address_lo; + __u32 address_hi; + __u32 data; + __u32 flags; + __u32 devid; + __u8 pad[12]; + }; + +flags: + KVM_MSI_VALID_DEVID: devid contains a valid value. The per-VM + KVM_CAP_MSI_DEVID capability advertises the requirement to provide + the device ID. If this capability is not available, userspace + should never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. + +If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier +for the device that wrote the MSI message. For PCI, this is usually a +BFD identifier in the lower 16 bits. + +On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS +feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, +address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of +address_hi must be zero. + + +4.71 KVM_CREATE_PIT2 +-------------------- + +:Capability: KVM_CAP_PIT2 +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_pit_config (in) +:Returns: 0 on success, -1 on error + +Creates an in-kernel device model for the i8254 PIT. This call is only valid +after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following +parameters have to be passed:: + + struct kvm_pit_config { + __u32 flags; + __u32 pad[15]; + }; + +Valid flags are:: + + #define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ + +PIT timer interrupts may use a per-VM kernel thread for injection. If it +exists, this thread will have a name of the following pattern:: + + kvm-pit/ + +When running a guest with elevated priorities, the scheduling parameters of +this thread may have to be adjusted accordingly. + +This IOCTL replaces the obsolete KVM_CREATE_PIT. + + +4.72 KVM_GET_PIT2 +----------------- + +:Capability: KVM_CAP_PIT_STATE2 +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_pit_state2 (out) +:Returns: 0 on success, -1 on error + +Retrieves the state of the in-kernel PIT model. Only valid after +KVM_CREATE_PIT2. The state is returned in the following structure:: + + struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; + }; + +Valid flags are:: + + /* disable PIT in HPET legacy mode */ + #define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +This IOCTL replaces the obsolete KVM_GET_PIT. + + +4.73 KVM_SET_PIT2 +----------------- + +:Capability: KVM_CAP_PIT_STATE2 +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_pit_state2 (in) +:Returns: 0 on success, -1 on error + +Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. +See KVM_GET_PIT2 for details on struct kvm_pit_state2. + +This IOCTL replaces the obsolete KVM_SET_PIT. + + +4.74 KVM_PPC_GET_SMMU_INFO +-------------------------- + +:Capability: KVM_CAP_PPC_GET_SMMU_INFO +:Architectures: powerpc +:Type: vm ioctl +:Parameters: None +:Returns: 0 on success, -1 on error + +This populates and returns a structure describing the features of +the "Server" class MMU emulation supported by KVM. +This can in turn be used by userspace to generate the appropriate +device-tree properties for the guest operating system. + +The structure contains some global information, followed by an +array of supported segment page sizes:: + + struct kvm_ppc_smmu_info { + __u64 flags; + __u32 slb_size; + __u32 pad; + struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +The supported flags are: + + - KVM_PPC_PAGE_SIZES_REAL: + When that flag is set, guest page sizes must "fit" the backing + store page sizes. When not set, any page size in the list can + be used regardless of how they are backed by userspace. + + - KVM_PPC_1T_SEGMENTS + The emulated MMU supports 1T segments in addition to the + standard 256M ones. + + - KVM_PPC_NO_HASH + This flag indicates that HPT guests are not supported by KVM, + thus all guests must use radix MMU mode. + +The "slb_size" field indicates how many SLB entries are supported + +The "sps" array contains 8 entries indicating the supported base +page sizes for a segment in increasing order. Each entry is defined +as follow:: + + struct kvm_ppc_one_seg_page_size { + __u32 page_shift; /* Base page shift of segment (or 0) */ + __u32 slb_enc; /* SLB encoding for BookS */ + struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; + }; + +An entry with a "page_shift" of 0 is unused. Because the array is +organized in increasing order, a lookup can stop when encoutering +such an entry. + +The "slb_enc" field provides the encoding to use in the SLB for the +page size. The bits are in positions such as the value can directly +be OR'ed into the "vsid" argument of the slbmte instruction. + +The "enc" array is a list which for each of those segment base page +size provides the list of supported actual page sizes (which can be +only larger or equal to the base page size), along with the +corresponding encoding in the hash PTE. Similarly, the array is +8 entries sorted by increasing sizes and an entry with a "0" shift +is an empty entry and a terminator:: + + struct kvm_ppc_one_page_size { + __u32 page_shift; /* Page shift (or 0) */ + __u32 pte_enc; /* Encoding in the HPTE (>>12) */ + }; + +The "pte_enc" field provides a value that can OR'ed into the hash +PTE's RPN field (ie, it needs to be shifted left by 12 to OR it +into the hash PTE second double word). + +4.75 KVM_IRQFD +-------------- + +:Capability: KVM_CAP_IRQFD +:Architectures: x86 s390 arm arm64 +:Type: vm ioctl +:Parameters: struct kvm_irqfd (in) +:Returns: 0 on success, -1 on error + +Allows setting an eventfd to directly trigger a guest interrupt. +kvm_irqfd.fd specifies the file descriptor to use as the eventfd and +kvm_irqfd.gsi specifies the irqchip pin toggled by this event. When +an event is triggered on the eventfd, an interrupt is injected into +the guest using the specified gsi pin. The irqfd is removed using +the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd +and kvm_irqfd.gsi. + +With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify +mechanism allowing emulation of level-triggered, irqfd-based +interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an +additional eventfd in the kvm_irqfd.resamplefd field. When operating +in resample mode, posting of an interrupt through kvm_irq.fd asserts +the specified gsi in the irqchip. When the irqchip is resampled, such +as from an EOI, the gsi is de-asserted and the user is notified via +kvm_irqfd.resamplefd. It is the user's responsibility to re-queue +the interrupt if the device making use of it still requires service. +Note that closing the resamplefd is not sufficient to disable the +irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment +and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. + +On arm/arm64, gsi routing being supported, the following can happen: + +- in case no routing entry is associated to this gsi, injection fails +- in case the gsi is associated to an irqchip routing entry, + irqchip.pin + 32 corresponds to the injected SPI ID. +- in case the gsi is associated to an MSI routing entry, the MSI + message and device ID are translated into an LPI (support restricted + to GICv3 ITS in-kernel emulation). + +4.76 KVM_PPC_ALLOCATE_HTAB +-------------------------- + +:Capability: KVM_CAP_PPC_ALLOC_HTAB +:Architectures: powerpc +:Type: vm ioctl +:Parameters: Pointer to u32 containing hash table order (in/out) +:Returns: 0 on success, -1 on error + +This requests the host kernel to allocate an MMU hash table for a +guest using the PAPR paravirtualization interface. This only does +anything if the kernel is configured to use the Book 3S HV style of +virtualization. Otherwise the capability doesn't exist and the ioctl +returns an ENOTTY error. The rest of this description assumes Book 3S +HV. + +There must be no vcpus running when this ioctl is called; if there +are, it will do nothing and return an EBUSY error. + +The parameter is a pointer to a 32-bit unsigned integer variable +containing the order (log base 2) of the desired size of the hash +table, which must be between 18 and 46. On successful return from the +ioctl, the value will not be changed by the kernel. + +If no hash table has been allocated when any vcpu is asked to run +(with the KVM_RUN ioctl), the host kernel will allocate a +default-sized hash table (16 MB). + +If this ioctl is called when a hash table has already been allocated, +with a different order from the existing hash table, the existing hash +table will be freed and a new one allocated. If this is ioctl is +called when a hash table has already been allocated of the same order +as specified, the kernel will clear out the existing hash table (zero +all HPTEs). In either case, if the guest is using the virtualized +real-mode area (VRMA) facility, the kernel will re-create the VMRA +HPTEs on the next KVM_RUN of any vcpu. + +4.77 KVM_S390_INTERRUPT +----------------------- + +:Capability: basic +:Architectures: s390 +:Type: vm ioctl, vcpu ioctl +:Parameters: struct kvm_s390_interrupt (in) +:Returns: 0 on success, -1 on error + +Allows to inject an interrupt to the guest. Interrupts can be floating +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. + +Interrupt parameters are passed via kvm_s390_interrupt:: + + struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; + }; + +type can be one of the following: + +KVM_S390_SIGP_STOP (vcpu) + - sigp stop; optional flags in parm +KVM_S390_PROGRAM_INT (vcpu) + - program check; code in parm +KVM_S390_SIGP_SET_PREFIX (vcpu) + - sigp set prefix; prefix address in parm +KVM_S390_RESTART (vcpu) + - restart +KVM_S390_INT_CLOCK_COMP (vcpu) + - clock comparator interrupt +KVM_S390_INT_CPU_TIMER (vcpu) + - CPU timer interrupt +KVM_S390_INT_VIRTIO (vm) + - virtio external interrupt; external interrupt + parameters in parm and parm64 +KVM_S390_INT_SERVICE (vm) + - sclp external interrupt; sclp parameter in parm +KVM_S390_INT_EMERGENCY (vcpu) + - sigp emergency; source cpu in parm +KVM_S390_INT_EXTERNAL_CALL (vcpu) + - sigp external call; source cpu in parm +KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) + - compound value to indicate an + I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel); + I/O interruption parameters in parm (subchannel) and parm64 (intparm, + interruption subclass) +KVM_S390_MCHK (vm, vcpu) + - machine check interrupt; cr 14 bits in parm, machine check interrupt + code in parm64 (note that machine checks needing further payload are not + supported by this ioctl) + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +4.78 KVM_PPC_GET_HTAB_FD +------------------------ + +:Capability: KVM_CAP_PPC_HTAB_FD +:Architectures: powerpc +:Type: vm ioctl +:Parameters: Pointer to struct kvm_get_htab_fd (in) +:Returns: file descriptor number (>= 0) on success, -1 on error + +This returns a file descriptor that can be used either to read out the +entries in the guest's hashed page table (HPT), or to write entries to +initialize the HPT. The returned fd can only be written to if the +KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and +can only be read if that bit is clear. The argument struct looks like +this:: + + /* For KVM_PPC_GET_HTAB_FD */ + struct kvm_get_htab_fd { + __u64 flags; + __u64 start_index; + __u64 reserved[2]; + }; + + /* Values for kvm_get_htab_fd.flags */ + #define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) + #define KVM_GET_HTAB_WRITE ((__u64)0x2) + +The 'start_index' field gives the index in the HPT of the entry at +which to start reading. It is ignored when writing. + +Reads on the fd will initially supply information about all +"interesting" HPT entries. Interesting entries are those with the +bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise +all entries. When the end of the HPT is reached, the read() will +return. If read() is called again on the fd, it will start again from +the beginning of the HPT, but will only return HPT entries that have +changed since they were last read. + +Data read or written is structured as a header (8 bytes) followed by a +series of valid HPT entries (16 bytes) each. The header indicates how +many valid HPT entries there are and how many invalid entries follow +the valid entries. The invalid entries are not represented explicitly +in the stream. The header format is:: + + struct kvm_get_htab_header { + __u32 index; + __u16 n_valid; + __u16 n_invalid; + }; + +Writes to the fd create HPT entries starting at the index given in the +header; first 'n_valid' valid entries with contents from the data +written, then 'n_invalid' invalid entries, invalidating any previously +valid entries found. + +4.79 KVM_CREATE_DEVICE +---------------------- + +:Capability: KVM_CAP_DEVICE_CTRL +:Type: vm ioctl +:Parameters: struct kvm_create_device (in/out) +:Returns: 0 on success, -1 on error + +Errors: + + ====== ======================================================= + ENODEV The device type is unknown or unsupported + EEXIST Device already created, and this type of device may not + be instantiated multiple times + ====== ======================================================= + + Other error conditions may be defined by individual device types or + have their standard meanings. + +Creates an emulated device in the kernel. The file descriptor returned +in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. + +If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the +device type is supported (not necessarily whether it can be created +in the current vm). + +Individual devices should not define flags. Attributes should be used +for specifying any behavior that is not implied by the device type +number. + +:: + + struct kvm_create_device { + __u32 type; /* in: KVM_DEV_TYPE_xxx */ + __u32 fd; /* out: device handle */ + __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ + }; + +4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR +-------------------------------------------- + +:Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, + KVM_CAP_VCPU_ATTRIBUTES for vcpu device +:Type: device ioctl, vm ioctl, vcpu ioctl +:Parameters: struct kvm_device_attr +:Returns: 0 on success, -1 on error + +Errors: + + ===== ============================================================= + ENXIO The group or attribute is unknown/unsupported for this device + or hardware support is missing. + EPERM The attribute cannot (currently) be accessed this way + (e.g. read-only attribute, or attribute that only makes + sense when the device is in a different state) + ===== ============================================================= + + Other error conditions may be defined by individual device types. + +Gets/sets a specified piece of device configuration and/or state. The +semantics are device-specific. See individual device documentation in +the "devices" directory. As with ONE_REG, the size of the data +transferred is defined by the particular attribute. + +:: + + struct kvm_device_attr { + __u32 flags; /* no flags currently defined */ + __u32 group; /* device-defined */ + __u64 attr; /* group-defined */ + __u64 addr; /* userspace address of attr data */ + }; + +4.81 KVM_HAS_DEVICE_ATTR +------------------------ + +:Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, + KVM_CAP_VCPU_ATTRIBUTES for vcpu device +:Type: device ioctl, vm ioctl, vcpu ioctl +:Parameters: struct kvm_device_attr +:Returns: 0 on success, -1 on error + +Errors: + + ===== ============================================================= + ENXIO The group or attribute is unknown/unsupported for this device + or hardware support is missing. + ===== ============================================================= + +Tests whether a device supports a particular attribute. A successful +return indicates the attribute is implemented. It does not necessarily +indicate that the attribute can be read or written in the device's +current state. "addr" is ignored. + +4.82 KVM_ARM_VCPU_INIT +---------------------- + +:Capability: basic +:Architectures: arm, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_vcpu_init (in) +:Returns: 0 on success; -1 on error + +Errors: + + ====== ================================================================= +  EINVAL    the target is unknown, or the combination of features is invalid. +  ENOENT    a features bit specified is unknown. + ====== ================================================================= + +This tells KVM what type of CPU to present to the guest, and what +optional features it should have.  This will cause a reset of the cpu +registers to their initial values.  If this is not called, KVM_RUN will +return ENOEXEC for that vcpu. + +Note that because some registers reflect machine topology, all vcpus +should be created before this ioctl is invoked. + +Userspace can call this function multiple times for a given vcpu, including +after the vcpu has been run. This will reset the vcpu to its initial +state. All calls to this function after the initial call must use the same +target and same set of feature flags, otherwise EINVAL will be returned. + +Possible features: + + - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state. + Depends on KVM_CAP_ARM_PSCI. If not set, the CPU will be powered on + and execute guest code when KVM_RUN is called. + - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode. + Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). + - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 (or a future revision + backward compatible with v0.2) for the CPU. + Depends on KVM_CAP_ARM_PSCI_0_2. + - KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU. + Depends on KVM_CAP_ARM_PMU_V3. + + - KVM_ARM_VCPU_PTRAUTH_ADDRESS: Enables Address Pointer authentication + for arm64 only. + Depends on KVM_CAP_ARM_PTRAUTH_ADDRESS. + If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are + both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and + KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be + requested. + + - KVM_ARM_VCPU_PTRAUTH_GENERIC: Enables Generic Pointer authentication + for arm64 only. + Depends on KVM_CAP_ARM_PTRAUTH_GENERIC. + If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are + both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and + KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be + requested. + + - KVM_ARM_VCPU_SVE: Enables SVE for the CPU (arm64 only). + Depends on KVM_CAP_ARM_SVE. + Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + * After KVM_ARM_VCPU_INIT: + + - KVM_REG_ARM64_SVE_VLS may be read using KVM_GET_ONE_REG: the + initial value of this pseudo-register indicates the best set of + vector lengths possible for a vcpu on this host. + + * Before KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + - KVM_RUN and KVM_GET_REG_LIST are not available; + + - KVM_GET_ONE_REG and KVM_SET_ONE_REG cannot be used to access + the scalable archietctural SVE registers + KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() or + KVM_REG_ARM64_SVE_FFR; + + - KVM_REG_ARM64_SVE_VLS may optionally be written using + KVM_SET_ONE_REG, to modify the set of vector lengths available + for the vcpu. + + * After KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): + + - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can + no longer be written using KVM_SET_ONE_REG. + +4.83 KVM_ARM_PREFERRED_TARGET +----------------------------- + +:Capability: basic +:Architectures: arm, arm64 +:Type: vm ioctl +:Parameters: struct struct kvm_vcpu_init (out) +:Returns: 0 on success; -1 on error + +Errors: + + ====== ========================================== + ENODEV no preferred target available for the host + ====== ========================================== + +This queries KVM for preferred CPU target type which can be emulated +by KVM on underlying host. + +The ioctl returns struct kvm_vcpu_init instance containing information +about preferred CPU target type and recommended features for it. The +kvm_vcpu_init->features bitmap returned will have feature bits set if +the preferred target recommends setting these features, but this is +not mandatory. + +The information returned by this ioctl can be used to prepare an instance +of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in +in VCPU matching underlying host. + + +4.84 KVM_GET_REG_LIST +--------------------- + +:Capability: basic +:Architectures: arm, arm64, mips +:Type: vcpu ioctl +:Parameters: struct kvm_reg_list (in/out) +:Returns: 0 on success; -1 on error + +Errors: + + ===== ============================================================== +  E2BIG     the reg index list is too big to fit in the array specified by +             the user (the number required will be written into n). + ===== ============================================================== + +:: + + struct kvm_reg_list { + __u64 n; /* number of registers in reg[] */ + __u64 reg[0]; + }; + +This ioctl returns the guest registers that are supported for the +KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. + + +4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) +----------------------------------------- + +:Capability: KVM_CAP_ARM_SET_DEVICE_ADDR +:Architectures: arm, arm64 +:Type: vm ioctl +:Parameters: struct kvm_arm_device_address (in) +:Returns: 0 on success, -1 on error + +Errors: + + ====== ============================================ + ENODEV The device id is unknown + ENXIO Device not supported on current system + EEXIST Address already set + E2BIG Address outside guest physical address space + EBUSY Address overlaps with other device range + ====== ============================================ + +:: + + struct kvm_arm_device_addr { + __u64 id; + __u64 addr; + }; + +Specify a device address in the guest's physical address space where guests +can access emulated or directly exposed devices, which the host kernel needs +to know about. The id field is an architecture specific identifier for a +specific device. + +ARM/arm64 divides the id field into two parts, a device id and an +address type id specific to the individual device:: + +  bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | + field: | 0x00000000 | device id | addr type id | + +ARM/arm64 currently only require this when using the in-kernel GIC +support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 +as the device id. When setting the base address for the guest's +mapping of the VGIC virtual CPU and distributor interface, the ioctl +must be called after calling KVM_CREATE_IRQCHIP, but before calling +KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the +base addresses will return -EEXIST. + +Note, this IOCTL is deprecated and the more flexible SET/GET_DEVICE_ATTR API +should be used instead. + + +4.86 KVM_PPC_RTAS_DEFINE_TOKEN +------------------------------ + +:Capability: KVM_CAP_PPC_RTAS +:Architectures: ppc +:Type: vm ioctl +:Parameters: struct kvm_rtas_token_args +:Returns: 0 on success, -1 on error + +Defines a token value for a RTAS (Run Time Abstraction Services) +service in order to allow it to be handled in the kernel. The +argument struct gives the name of the service, which must be the name +of a service that has a kernel-side implementation. If the token +value is non-zero, it will be associated with that service, and +subsequent RTAS calls by the guest specifying that token will be +handled by the kernel. If the token value is 0, then any token +associated with the service will be forgotten, and subsequent RTAS +calls by the guest for that service will be passed to userspace to be +handled. + +4.87 KVM_SET_GUEST_DEBUG +------------------------ + +:Capability: KVM_CAP_SET_GUEST_DEBUG +:Architectures: x86, s390, ppc, arm64 +:Type: vcpu ioctl +:Parameters: struct kvm_guest_debug (in) +:Returns: 0 on success; -1 on error + +:: + + struct kvm_guest_debug { + __u32 control; + __u32 pad; + struct kvm_guest_debug_arch arch; + }; + +Set up the processor specific debug registers and configure vcpu for +handling guest debug events. There are two parts to the structure, the +first a control bitfield indicates the type of debug events to handle +when running. Common control bits are: + + - KVM_GUESTDBG_ENABLE: guest debugging is enabled + - KVM_GUESTDBG_SINGLESTEP: the next run should single-step + +The top 16 bits of the control field are architecture specific control +flags which can include the following: + + - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] + - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64] + - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] + - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] + - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] + +For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints +are enabled in memory so we need to ensure breakpoint exceptions are +correctly trapped and the KVM run loop exits at the breakpoint and not +running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP +we need to ensure the guest vCPUs architecture specific registers are +updated to the correct (supplied) values. + +The second part of the structure is architecture specific and +typically contains a set of debug registers. + +For arm64 the number of debug registers is implementation defined and +can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and +KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number +indicating the number of supported registers. + +For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether +the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported. + +When debug events exit the main run loop with the reason +KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run +structure containing architecture specific debug information. + +4.88 KVM_GET_EMULATED_CPUID +--------------------------- + +:Capability: KVM_CAP_EXT_EMUL_CPUID +:Architectures: x86 +:Type: system ioctl +:Parameters: struct kvm_cpuid2 (in/out) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_cpuid2 { + __u32 nent; + __u32 flags; + struct kvm_cpuid_entry2 entries[0]; + }; + +The member 'flags' is used for passing flags from userspace. + +:: + + #define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) + #define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) + #define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) + + struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; + }; + +This ioctl returns x86 cpuid features which are emulated by +kvm.Userspace can use the information returned by this ioctl to query +which features are emulated by kvm instead of being present natively. + +Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 +structure with the 'nent' field indicating the number of entries in +the variable-size array 'entries'. If the number of entries is too low +to describe the cpu capabilities, an error (E2BIG) is returned. If the +number is too high, the 'nent' field is adjusted and an error (ENOMEM) +is returned. If the number is just right, the 'nent' field is adjusted +to the number of valid entries in the 'entries' array, which is then +filled. + +The entries returned are the set CPUID bits of the respective features +which kvm emulates, as returned by the CPUID instruction, with unknown +or unsupported feature bits cleared. + +Features like x2apic, for example, may not be present in the host cpu +but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be +emulated efficiently and thus not included here. + +The fields in each entry are defined as follows: + + function: + the eax value used to obtain the entry + index: + the ecx value used to obtain the entry (for entries that are + affected by ecx) + flags: + an OR of zero or more of the following: + + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: + if the index field is valid + KVM_CPUID_FLAG_STATEFUL_FUNC: + if cpuid for this function returns different values for successive + invocations; there will be several entries with the same function, + all with this flag set + KVM_CPUID_FLAG_STATE_READ_NEXT: + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is + the first entry to be read by a cpu + + eax, ebx, ecx, edx: + + the values returned by the cpuid instruction for + this function/index combination + +4.89 KVM_S390_MEM_OP +-------------------- + +:Capability: KVM_CAP_S390_MEM_OP +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: struct kvm_s390_mem_op (in) +:Returns: = 0 on success, + < 0 on generic error (e.g. -EFAULT or -ENOMEM), + > 0 if an exception occurred while walking the page tables + +Read or write data from/to the logical (virtual) memory of a VCPU. + +Parameters are specified via the following structure:: + + struct kvm_s390_mem_op { + __u64 gaddr; /* the guest address */ + __u64 flags; /* flags */ + __u32 size; /* amount of bytes */ + __u32 op; /* type of operation */ + __u64 buf; /* buffer in userspace */ + __u8 ar; /* the access register number */ + __u8 reserved[31]; /* should be set to 0 */ + }; + +The type of operation is specified in the "op" field. It is either +KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or +KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The +KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check +whether the corresponding memory access would create an access exception +(without touching the data in the memory at the destination). In case an +access exception occurred while walking the MMU tables of the guest, the +ioctl returns a positive error number to indicate the type of exception. +This exception is also raised directly at the corresponding VCPU if the +flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. + +The start address of the memory region has to be specified in the "gaddr" +field, and the length of the region in the "size" field (which must not +be 0). The maximum value for "size" can be obtained by checking the +KVM_CAP_S390_MEM_OP capability. "buf" is the buffer supplied by the +userspace application where the read data should be written to for +KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written is +stored for a KVM_S390_MEMOP_LOGICAL_WRITE. When KVM_S390_MEMOP_F_CHECK_ONLY +is specified, "buf" is unused and can be NULL. "ar" designates the access +register number to be used; the valid range is 0..15. + +The "reserved" field is meant for future extensions. It is not used by +KVM with the currently defined set of flags. + +4.90 KVM_S390_GET_SKEYS +----------------------- + +:Capability: KVM_CAP_S390_SKEYS +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_skeys +:Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage + keys, negative value on error + +This ioctl is used to get guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct:: + + struct kvm_s390_skeys { + __u64 start_gfn; + __u64 count; + __u64 skeydata_addr; + __u32 flags; + __u32 reserved[9]; + }; + +The start_gfn field is the number of the first guest frame whose storage keys +you want to get. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to get. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer large enough to hold count +bytes. This buffer will be filled with storage key data by the ioctl. + +4.91 KVM_S390_SET_SKEYS +----------------------- + +:Capability: KVM_CAP_S390_SKEYS +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_skeys +:Returns: 0 on success, negative value on error + +This ioctl is used to set guest storage key values on the s390 +architecture. The ioctl takes parameters via the kvm_s390_skeys struct. +See section on KVM_S390_GET_SKEYS for struct definition. + +The start_gfn field is the number of the first guest frame whose storage keys +you want to set. + +The count field is the number of consecutive frames (starting from start_gfn) +whose storage keys to get. The count field must be at least 1 and the maximum +allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range +will cause the ioctl to return -EINVAL. + +The skeydata_addr field is the address to a buffer containing count bytes of +storage keys. Each byte in the buffer will be set as the storage key for a +single frame starting at start_gfn for count frames. + +Note: If any architecturally invalid key value is found in the given data then +the ioctl will return -EINVAL. + +4.92 KVM_S390_IRQ +----------------- + +:Capability: KVM_CAP_S390_INJECT_IRQ +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: struct kvm_s390_irq (in) +:Returns: 0 on success, -1 on error + +Errors: + + + ====== ================================================================= + EINVAL interrupt type is invalid + type is KVM_S390_SIGP_STOP and flag parameter is invalid value, + type is KVM_S390_INT_EXTERNAL_CALL and code is bigger + than the maximum of VCPUs + EBUSY type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped, + type is KVM_S390_SIGP_STOP and a stop irq is already pending, + type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt + is already pending + ====== ================================================================= + +Allows to inject an interrupt to the guest. + +Using struct kvm_s390_irq as a parameter allows +to inject additional payload which is not +possible via KVM_S390_INTERRUPT. + +Interrupt parameters are passed via kvm_s390_irq:: + + struct kvm_s390_irq { + __u64 type; + union { + struct kvm_s390_io_info io; + struct kvm_s390_ext_info ext; + struct kvm_s390_pgm_info pgm; + struct kvm_s390_emerg_info emerg; + struct kvm_s390_extcall_info extcall; + struct kvm_s390_prefix_info prefix; + struct kvm_s390_stop_info stop; + struct kvm_s390_mchk_info mchk; + char reserved[64]; + } u; + }; + +type can be one of the following: + +- KVM_S390_SIGP_STOP - sigp stop; parameter in .stop +- KVM_S390_PROGRAM_INT - program check; parameters in .pgm +- KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix +- KVM_S390_RESTART - restart; no parameters +- KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters +- KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters +- KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg +- KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall +- KVM_S390_MCHK - machine check interrupt; parameters in .mchk + +This is an asynchronous vcpu ioctl and can be invoked from any thread. + +4.94 KVM_S390_GET_IRQ_STATE +--------------------------- + +:Capability: KVM_CAP_S390_IRQ_STATE +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: struct kvm_s390_irq_state (out) +:Returns: >= number of bytes copied into buffer, + -EINVAL if buffer size is 0, + -ENOBUFS if buffer size is too small to fit all pending interrupts, + -EFAULT if the buffer address was invalid + +This ioctl allows userspace to retrieve the complete state of all currently +pending interrupts in a single buffer. Use cases include migration +and introspection. The parameter structure contains the address of a +userspace buffer and its length:: + + struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ + }; + +Userspace passes in the above struct and for each pending interrupt a +struct kvm_s390_irq is copied to the provided buffer. + +The structure contains a flags and a reserved field for future extensions. As +the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and +reserved, these fields can not be used in the future without breaking +compatibility. + +If -ENOBUFS is returned the buffer provided was too small and userspace +may retry with a bigger buffer. + +4.95 KVM_S390_SET_IRQ_STATE +--------------------------- + +:Capability: KVM_CAP_S390_IRQ_STATE +:Architectures: s390 +:Type: vcpu ioctl +:Parameters: struct kvm_s390_irq_state (in) +:Returns: 0 on success, + -EFAULT if the buffer address was invalid, + -EINVAL for an invalid buffer length (see below), + -EBUSY if there were already interrupts pending, + errors occurring when actually injecting the + interrupt. See KVM_S390_IRQ. + +This ioctl allows userspace to set the complete state of all cpu-local +interrupts currently pending for the vcpu. It is intended for restoring +interrupt state after a migration. The input parameter is a userspace buffer +containing a struct kvm_s390_irq_state:: + + struct kvm_s390_irq_state { + __u64 buf; + __u32 flags; /* will stay unused for compatibility reasons */ + __u32 len; + __u32 reserved[4]; /* will stay unused for compatibility reasons */ + }; + +The restrictions for flags and reserved apply as well. +(see KVM_S390_GET_IRQ_STATE) + +The userspace memory referenced by buf contains a struct kvm_s390_irq +for each interrupt to be injected into the guest. +If one of the interrupts could not be injected for some reason the +ioctl aborts. + +len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0 +and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), +which is the maximum number of possibly pending cpu-local interrupts. + +4.96 KVM_SMI +------------ + +:Capability: KVM_CAP_X86_SMM +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: none +:Returns: 0 on success, -1 on error + +Queues an SMI on the thread's vcpu. + +4.97 KVM_CAP_PPC_MULTITCE +------------------------- + +:Capability: KVM_CAP_PPC_MULTITCE +:Architectures: ppc +:Type: vm + +This capability means the kernel is capable of handling hypercalls +H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user +space. This significantly accelerates DMA operations for PPC KVM guests. +User space should expect that its handlers for these hypercalls +are not going to be called if user space previously registered LIOBN +in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). + +In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, +user space might have to advertise it for the guest. For example, +IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is +present in the "ibm,hypertas-functions" device-tree property. + +The hypercalls mentioned above may or may not be processed successfully +in the kernel based fast path. If they can not be handled by the kernel, +they will get passed on to user space. So user space still has to have +an implementation for these despite the in kernel acceleration. + +This capability is always enabled. + +4.98 KVM_CREATE_SPAPR_TCE_64 +---------------------------- + +:Capability: KVM_CAP_SPAPR_TCE_64 +:Architectures: powerpc +:Type: vm ioctl +:Parameters: struct kvm_create_spapr_tce_64 (in) +:Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows, described in 4.62 KVM_CREATE_SPAPR_TCE + +This capability uses extended struct in ioctl interface:: + + /* for KVM_CAP_SPAPR_TCE_64 */ + struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ + }; + +The aim of extension is to support an additional bigger DMA window with +a variable page size. +KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and +a bus offset of the corresponding DMA window, @size and @offset are numbers +of IOMMU pages. + +@flags are not used at the moment. + +The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. + +4.99 KVM_REINJECT_CONTROL +------------------------- + +:Capability: KVM_CAP_REINJECT_CONTROL +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_reinject_control (in) +:Returns: 0 on success, + -EFAULT if struct kvm_reinject_control cannot be read, + -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier. + +i8254 (PIT) has two modes, reinject and !reinject. The default is reinject, +where KVM queues elapsed i8254 ticks and monitors completion of interrupt from +vector(s) that i8254 injects. Reinject mode dequeues a tick and injects its +interrupt whenever there isn't a pending interrupt from i8254. +!reinject mode injects an interrupt as soon as a tick arrives. + +:: + + struct kvm_reinject_control { + __u8 pit_reinject; + __u8 reserved[31]; + }; + +pit_reinject = 0 (!reinject mode) is recommended, unless running an old +operating system that uses the PIT for timing (e.g. Linux 2.4.x). + +4.100 KVM_PPC_CONFIGURE_V3_MMU +------------------------------ + +:Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +:Architectures: ppc +:Type: vm ioctl +:Parameters: struct kvm_ppc_mmuv3_cfg (in) +:Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +:: + + struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; + }; + +There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's space. This field is formatted +as the second doubleword of the partition table entry, as defined in +the Power ISA V3.00, Book III section 5.7.6.1. + +4.101 KVM_PPC_GET_RMMU_INFO +--------------------------- + +:Capability: KVM_CAP_PPC_RADIX_MMU +:Architectures: ppc +:Type: vm ioctl +:Parameters: struct kvm_ppc_rmmu_info (out) +:Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list +containing supported radix tree geometries, and (b) a list that maps +page sizes to put in the "AP" (actual page size) field for the tlbie +(TLB invalidate entry) instruction. + +:: + + struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; + }; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + +4.102 KVM_PPC_RESIZE_HPT_PREPARE +-------------------------------- + +:Capability: KVM_CAP_SPAPR_RESIZE_HPT +:Architectures: powerpc +:Type: vm ioctl +:Parameters: struct kvm_ppc_resize_hpt (in) +:Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete, + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid, + -ENOMEM if unable to allocate the new HPT, + -ENOSPC if there was a hash collision + +:: + + struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; + }; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings gives the supported page sizes and their AP field +encodings, encoded with the AP value in the top 3 bits and the log +base 2 of the page size in the bottom 6 bits. + +4.102 KVM_PPC_RESIZE_HPT_PREPARE +-------------------------------- + +:Capability: KVM_CAP_SPAPR_RESIZE_HPT +:Architectures: powerpc +:Type: vm ioctl +:Parameters: struct kvm_ppc_resize_hpt (in) +:Returns: 0 on successful completion, + >0 if a new HPT is being prepared, the value is an estimated + number of milliseconds until preparation is complete, + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid,when moving existing + HPT entries to the new HPT, + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this starts, stops or monitors +the preparation of a new potential HPT for the guest, essentially +implementing the H_RESIZE_HPT_PREPARE hypercall. + +If called with shift > 0 when there is no pending HPT for the guest, +this begins preparation of a new pending HPT of size 2^(shift) bytes. +It then returns a positive integer with the estimated number of +milliseconds until preparation is complete. + +If called when there is a pending HPT whose size does not match that +requested in the parameters, discards the existing pending HPT and +creates a new one as above. + +If called when there is a pending HPT of the size requested, will: + + * If preparation of the pending HPT is already complete, return 0 + * If preparation of the pending HPT has failed, return an error + code, then discard the pending HPT. + * If preparation of the pending HPT is still in progress, return an + estimated number of milliseconds until preparation is complete. + +If called with shift == 0, discards any currently pending HPT and +returns 0 (i.e. cancels any in-progress preparation). + +flags is reserved for future expansion, currently setting any bits in +flags will result in an -EINVAL. + +Normally this will be called repeatedly with the same parameters until +it returns <= 0. The first call will initiate preparation, subsequent +ones will monitor preparation until it completes or fails. + +:: + + struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; + }; + +4.103 KVM_PPC_RESIZE_HPT_COMMIT +------------------------------- + +:Capability: KVM_CAP_SPAPR_RESIZE_HPT +:Architectures: powerpc +:Type: vm ioctl +:Parameters: struct kvm_ppc_resize_hpt (in) +:Returns: 0 on successful completion, + -EFAULT if struct kvm_reinject_control cannot be read, + -EINVAL if the supplied shift or flags are invalid, + -ENXIO is there is no pending HPT, or the pending HPT doesn't + have the requested size, + -EBUSY if the pending HPT is not fully prepared, + -ENOSPC if there was a hash collision when moving existing + HPT entries to the new HPT, + -EIO on other error conditions + +Used to implement the PAPR extension for runtime resizing of a guest's +Hashed Page Table (HPT). Specifically this requests that the guest be +transferred to working with the new HPT, essentially implementing the +H_RESIZE_HPT_COMMIT hypercall. + +This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has +returned 0 with the same parameters. In other cases +KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or +-EBUSY, though others may be possible if the preparation was started, +but failed). + +This will have undefined effects on the guest if it has not already +placed itself in a quiescent state where no vcpu will make MMU enabled +memory accesses. + +On succsful completion, the pending HPT will become the guest's active +HPT and the previous HPT will be discarded. + +On failure, the guest will still be operating on its previous HPT. + +:: + + struct kvm_ppc_resize_hpt { + __u64 flags; + __u32 shift; + __u32 pad; + }; + +4.104 KVM_X86_GET_MCE_CAP_SUPPORTED +----------------------------------- + +:Capability: KVM_CAP_MCE +:Architectures: x86 +:Type: system ioctl +:Parameters: u64 mce_cap (out) +:Returns: 0 on success, -1 on error + +Returns supported MCE capabilities. The u64 mce_cap parameter +has the same format as the MSR_IA32_MCG_CAP register. Supported +capabilities will have the corresponding bits set. + +4.105 KVM_X86_SETUP_MCE +----------------------- + +:Capability: KVM_CAP_MCE +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: u64 mcg_cap (in) +:Returns: 0 on success, + -EFAULT if u64 mcg_cap cannot be read, + -EINVAL if the requested number of banks is invalid, + -EINVAL if requested MCE capability is not supported. + +Initializes MCE support for use. The u64 mcg_cap parameter +has the same format as the MSR_IA32_MCG_CAP register and +specifies which capabilities should be enabled. The maximum +supported number of error-reporting banks can be retrieved when +checking for KVM_CAP_MCE. The supported capabilities can be +retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED. + +4.106 KVM_X86_SET_MCE +--------------------- + +:Capability: KVM_CAP_MCE +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_x86_mce (in) +:Returns: 0 on success, + -EFAULT if struct kvm_x86_mce cannot be read, + -EINVAL if the bank number is invalid, + -EINVAL if VAL bit is not set in status field. + +Inject a machine check error (MCE) into the guest. The input +parameter is:: + + struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; + }; + +If the MCE being reported is an uncorrected error, KVM will +inject it as an MCE exception into the guest. If the guest +MCG_STATUS register reports that an MCE is in progress, KVM +causes an KVM_EXIT_SHUTDOWN vmexit. + +Otherwise, if the MCE is a corrected error, KVM will just +store it in the corresponding bank (provided this bank is +not holding a previously reported uncorrected error). + +4.107 KVM_S390_GET_CMMA_BITS +---------------------------- + +:Capability: KVM_CAP_S390_CMMA_MIGRATION +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_cmma_log (in, out) +:Returns: 0 on success, a negative value on error + +This ioctl is used to get the values of the CMMA bits on the s390 +architecture. It is meant to be used in two scenarios: + +- During live migration to save the CMMA values. Live migration needs + to be enabled via the KVM_REQ_START_MIGRATION VM property. +- To non-destructively peek at the CMMA values, with the flag + KVM_S390_CMMA_PEEK set. + +The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired +values are written to a buffer whose location is indicated via the "values" +member in the kvm_s390_cmma_log struct. The values in the input struct are +also updated as needed. + +Each CMMA value takes up one byte. + +:: + + struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; + }; + +start_gfn is the number of the first guest frame whose CMMA values are +to be retrieved, + +count is the length of the buffer in bytes, + +values points to the buffer where the result will be written to. + +If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be +KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with +other ioctls. + +The result is written in the buffer pointed to by the field values, and +the values of the input parameter are updated as follows. + +Depending on the flags, different actions are performed. The only +supported flag so far is KVM_S390_CMMA_PEEK. + +The default behaviour if KVM_S390_CMMA_PEEK is not set is: +start_gfn will indicate the first page frame whose CMMA bits were dirty. +It is not necessarily the same as the one passed as input, as clean pages +are skipped. + +count will indicate the number of bytes actually written in the buffer. +It can (and very often will) be smaller than the input value, since the +buffer is only filled until 16 bytes of clean values are found (which +are then not copied in the buffer). Since a CMMA migration block needs +the base address and the length, for a total of 16 bytes, we will send +back some clean data if there is some dirty data afterwards, as long as +the size of the clean data does not exceed the size of the header. This +allows to minimize the amount of data to be saved or transferred over +the network at the expense of more roundtrips to userspace. The next +invocation of the ioctl will skip over all the clean values, saving +potentially more than just the 16 bytes we found. + +If KVM_S390_CMMA_PEEK is set: +the existing storage attributes are read even when not in migration +mode, and no other action is performed; + +the output start_gfn will be equal to the input start_gfn, + +the output count will be equal to the input count, except if the end of +memory has been reached. + +In both cases: +the field "remaining" will indicate the total number of dirty CMMA values +still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is +not enabled. + +mask is unused. + +values points to the userspace buffer where the result will be stored. + +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with +-EFAULT if the userspace address is invalid or if no page table is +present for the addresses (e.g. when using hugepages). + +4.108 KVM_S390_SET_CMMA_BITS +---------------------------- + +:Capability: KVM_CAP_S390_CMMA_MIGRATION +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_cmma_log (in) +:Returns: 0 on success, a negative value on error + +This ioctl is used to set the values of the CMMA bits on the s390 +architecture. It is meant to be used during live migration to restore +the CMMA values, but there are no restrictions on its use. +The ioctl takes parameters via the kvm_s390_cmma_values struct. +Each CMMA value takes up one byte. + +:: + + struct kvm_s390_cmma_log { + __u64 start_gfn; + __u32 count; + __u32 flags; + union { + __u64 remaining; + __u64 mask; + }; + __u64 values; + }; + +start_gfn indicates the starting guest frame number, + +count indicates how many values are to be considered in the buffer, + +flags is not used and must be 0. + +mask indicates which PGSTE bits are to be considered. + +remaining is not used. + +values points to the buffer in userspace where to store the values. + +This ioctl can fail with -ENOMEM if not enough memory can be allocated to +complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if +the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or +if the flags field was not 0, with -EFAULT if the userspace address is +invalid, if invalid pages are written to (e.g. after the end of memory) +or if no page table is present for the addresses (e.g. when using +hugepages). + +4.109 KVM_PPC_GET_CPU_CHAR +-------------------------- + +:Capability: KVM_CAP_PPC_GET_CPU_CHAR +:Architectures: powerpc +:Type: vm ioctl +:Parameters: struct kvm_ppc_cpu_char (out) +:Returns: 0 on successful completion, + -EFAULT if struct kvm_ppc_cpu_char cannot be written + +This ioctl gives userspace information about certain characteristics +of the CPU relating to speculative execution of instructions and +possible information leakage resulting from speculative execution (see +CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is +returned in struct kvm_ppc_cpu_char, which looks like this:: + + struct kvm_ppc_cpu_char { + __u64 character; /* characteristics of the CPU */ + __u64 behaviour; /* recommended software behaviour */ + __u64 character_mask; /* valid bits in character */ + __u64 behaviour_mask; /* valid bits in behaviour */ + }; + +For extensibility, the character_mask and behaviour_mask fields +indicate which bits of character and behaviour have been filled in by +the kernel. If the set of defined bits is extended in future then +userspace will be able to tell whether it is running on a kernel that +knows about the new bits. + +The character field describes attributes of the CPU which can help +with preventing inadvertent information disclosure - specifically, +whether there is an instruction to flash-invalidate the L1 data cache +(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set +to a mode where entries can only be used by the thread that created +them, whether the bcctr[l] instruction prevents speculation, and +whether a speculation barrier instruction (ori 31,31,0) is provided. + +The behaviour field describes actions that software should take to +prevent inadvertent information disclosure, and thus describes which +vulnerabilities the hardware is subject to; specifically whether the +L1 data cache should be flushed when returning to user mode from the +kernel, and whether a speculation barrier should be placed between an +array bounds check and the array access. + +These fields use the same bit definitions as the new +H_GET_CPU_CHARACTERISTICS hypercall. + +4.110 KVM_MEMORY_ENCRYPT_OP +--------------------------- + +:Capability: basic +:Architectures: x86 +:Type: system +:Parameters: an opaque platform specific structure (in/out) +:Returns: 0 on success; -1 on error + +If the platform supports creating encrypted VMs then this ioctl can be used +for issuing platform-specific memory encryption commands to manage those +encrypted VMs. + +Currently, this ioctl is used for issuing Secure Encrypted Virtualization +(SEV) commands on AMD Processors. The SEV commands are defined in +Documentation/virt/kvm/amd-memory-encryption.rst. + +4.111 KVM_MEMORY_ENCRYPT_REG_REGION +----------------------------------- + +:Capability: basic +:Architectures: x86 +:Type: system +:Parameters: struct kvm_enc_region (in) +:Returns: 0 on success; -1 on error + +This ioctl can be used to register a guest memory region which may +contain encrypted data (e.g. guest RAM, SMRAM etc). + +It is used in the SEV-enabled guest. When encryption is enabled, a guest +memory region may contain encrypted data. The SEV memory encryption +engine uses a tweak such that two identical plaintext pages, each at +different locations will have differing ciphertexts. So swapping or +moving ciphertext of those pages will not result in plaintext being +swapped. So relocating (or migrating) physical backing pages for the SEV +guest will require some additional steps. + +Note: The current SEV key management spec does not provide commands to +swap or migrate (move) ciphertext pages. Hence, for now we pin the guest +memory region registered with the ioctl. + +4.112 KVM_MEMORY_ENCRYPT_UNREG_REGION +------------------------------------- + +:Capability: basic +:Architectures: x86 +:Type: system +:Parameters: struct kvm_enc_region (in) +:Returns: 0 on success; -1 on error + +This ioctl can be used to unregister the guest memory region registered +with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. + +4.113 KVM_HYPERV_EVENTFD +------------------------ + +:Capability: KVM_CAP_HYPERV_EVENTFD +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_hyperv_eventfd (in) + +This ioctl (un)registers an eventfd to receive notifications from the guest on +the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without +causing a user exit. SIGNAL_EVENT hypercall with non-zero event flag number +(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit. + +:: + + struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; + }; + +The conn_id field should fit within 24 bits:: + + #define KVM_HYPERV_CONN_ID_MASK 0x00ffffff + +The acceptable values for the flags field are:: + + #define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + +:Returns: 0 on success, + -EINVAL if conn_id or flags is outside the allowed range, + -ENOENT on deassign if the conn_id isn't registered, + -EEXIST on assign if the conn_id is already registered + +4.114 KVM_GET_NESTED_STATE +-------------------------- + +:Capability: KVM_CAP_NESTED_STATE +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_nested_state (in/out) +:Returns: 0 on success, -1 on error + +Errors: + + ===== ============================================================= + E2BIG the total state size exceeds the value of 'size' specified by + the user; the size required will be written into size. + ===== ============================================================= + +:: + + struct kvm_nested_state { + __u16 flags; + __u16 format; + __u32 size; + + union { + struct kvm_vmx_nested_state_hdr vmx; + struct kvm_svm_nested_state_hdr svm; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + } hdr; + + union { + struct kvm_vmx_nested_state_data vmx[0]; + struct kvm_svm_nested_state_data svm[0]; + } data; + }; + + #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 + #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + #define KVM_STATE_NESTED_EVMCS 0x00000004 + + #define KVM_STATE_NESTED_FORMAT_VMX 0 + #define KVM_STATE_NESTED_FORMAT_SVM 1 + + #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 + + #define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 + #define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 + + struct kvm_vmx_nested_state_hdr { + __u64 vmxon_pa; + __u64 vmcs12_pa; + + struct { + __u16 flags; + } smm; + }; + + struct kvm_vmx_nested_state_data { + __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; + __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; + }; + +This ioctl copies the vcpu's nested virtualization state from the kernel to +userspace. + +The maximum size of the state can be retrieved by passing KVM_CAP_NESTED_STATE +to the KVM_CHECK_EXTENSION ioctl(). + +4.115 KVM_SET_NESTED_STATE +-------------------------- + +:Capability: KVM_CAP_NESTED_STATE +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_nested_state (in) +:Returns: 0 on success, -1 on error + +This copies the vcpu's kvm_nested_state struct from userspace to the kernel. +For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. + +4.116 KVM_(UN)REGISTER_COALESCED_MMIO +------------------------------------- + +:Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) + KVM_CAP_COALESCED_PIO (for coalesced pio) +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_coalesced_mmio_zone +:Returns: 0 on success, < 0 on error + +Coalesced I/O is a performance optimization that defers hardware +register write emulation so that userspace exits are avoided. It is +typically used to reduce the overhead of emulating frequently accessed +hardware registers. + +When a hardware register is configured for coalesced I/O, write accesses +do not exit to userspace and their value is recorded in a ring buffer +that is shared between kernel and userspace. + +Coalesced I/O is used if one or more write accesses to a hardware +register can be deferred until a read or a write to another hardware +register on the same device. This last access will cause a vmexit and +userspace will process accesses from the ring buffer before emulating +it. That will avoid exiting to userspace on repeated writes. + +Coalesced pio is based on coalesced mmio. There is little difference +between coalesced mmio and pio except that coalesced pio records accesses +to I/O ports. + +4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) +------------------------------------ + +:Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 +:Architectures: x86, arm, arm64, mips +:Type: vm ioctl +:Parameters: struct kvm_dirty_log (in) +:Returns: 0 on success, -1 on error + +:: + + /* for KVM_CLEAR_DIRTY_LOG */ + struct kvm_clear_dirty_log { + __u32 slot; + __u32 num_pages; + __u64 first_page; + union { + void __user *dirty_bitmap; /* one bit per page */ + __u64 padding; + }; + }; + +The ioctl clears the dirty status of pages in a memory slot, according to +the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap +field. Bit 0 of the bitmap corresponds to page "first_page" in the +memory slot, and num_pages is the size in bits of the input bitmap. +first_page must be a multiple of 64; num_pages must also be a multiple of +64 unless first_page + num_pages is the size of the memory slot. For each +bit that is set in the input bitmap, the corresponding page is marked "clean" +in KVM's dirty bitmap, and dirty tracking is re-enabled for that page +(for example via write-protection, or by clearing the dirty bit in +a page table entry). + +If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies +the address space for which you want to return the dirty bitmap. +They must be less than the value that KVM_CHECK_EXTENSION returns for +the KVM_CAP_MULTI_ADDRESS_SPACE capability. + +This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 +is enabled; for more information, see the description of the capability. +However, it can always be used as long as KVM_CHECK_EXTENSION confirms +that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present. + +4.118 KVM_GET_SUPPORTED_HV_CPUID +-------------------------------- + +:Capability: KVM_CAP_HYPERV_CPUID +:Architectures: x86 +:Type: vcpu ioctl +:Parameters: struct kvm_cpuid2 (in/out) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_cpuid2 { + __u32 nent; + __u32 padding; + struct kvm_cpuid_entry2 entries[0]; + }; + + struct kvm_cpuid_entry2 { + __u32 function; + __u32 index; + __u32 flags; + __u32 eax; + __u32 ebx; + __u32 ecx; + __u32 edx; + __u32 padding[3]; + }; + +This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in +KVM. Userspace can use the information returned by this ioctl to construct +cpuid information presented to guests consuming Hyper-V enlightenments (e.g. +Windows or Hyper-V guests). + +CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level +Functional Specification (TLFS). These leaves can't be obtained with +KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature +leaves (0x40000000, 0x40000001). + +Currently, the following list of CPUID leaves are returned: + - HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS + - HYPERV_CPUID_INTERFACE + - HYPERV_CPUID_VERSION + - HYPERV_CPUID_FEATURES + - HYPERV_CPUID_ENLIGHTMENT_INFO + - HYPERV_CPUID_IMPLEMENT_LIMITS + - HYPERV_CPUID_NESTED_FEATURES + +HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was +enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). + +Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure +with the 'nent' field indicating the number of entries in the variable-size +array 'entries'. If the number of entries is too low to describe all Hyper-V +feature leaves, an error (E2BIG) is returned. If the number is more or equal +to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the +number of valid entries in the 'entries' array, which is then filled. + +'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, +userspace should not expect to get any particular value there. + +4.119 KVM_ARM_VCPU_FINALIZE +--------------------------- + +:Architectures: arm, arm64 +:Type: vcpu ioctl +:Parameters: int feature (in) +:Returns: 0 on success, -1 on error + +Errors: + + ====== ============================================================== + EPERM feature not enabled, needs configuration, or already finalized + EINVAL feature unknown or not present + ====== ============================================================== + +Recognised values for feature: + + ===== =========================================== + arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) + ===== =========================================== + +Finalizes the configuration of the specified vcpu feature. + +The vcpu must already have been initialised, enabling the affected feature, by +means of a successful KVM_ARM_VCPU_INIT call with the appropriate flag set in +features[]. + +For affected vcpu features, this is a mandatory step that must be performed +before the vcpu is fully usable. + +Between KVM_ARM_VCPU_INIT and KVM_ARM_VCPU_FINALIZE, the feature may be +configured by use of ioctls such as KVM_SET_ONE_REG. The exact configuration +that should be performaned and how to do it are feature-dependent. + +Other calls that depend on a particular feature being finalized, such as +KVM_RUN, KVM_GET_REG_LIST, KVM_GET_ONE_REG and KVM_SET_ONE_REG, will fail with +-EPERM unless the feature has already been finalized by means of a +KVM_ARM_VCPU_FINALIZE call. + +See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization +using this ioctl. + +4.120 KVM_SET_PMU_EVENT_FILTER +------------------------------ + +:Capability: KVM_CAP_PMU_EVENT_FILTER +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_pmu_event_filter (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_pmu_event_filter { + __u32 action; + __u32 nevents; + __u32 fixed_counter_bitmap; + __u32 flags; + __u32 pad[4]; + __u64 events[0]; + }; + +This ioctl restricts the set of PMU events that the guest can program. +The argument holds a list of events which will be allowed or denied. +The eventsel+umask of each event the guest attempts to program is compared +against the events field to determine whether the guest should have access. +The events field only controls general purpose counters; fixed purpose +counters are controlled by the fixed_counter_bitmap. + +No flags are defined yet, the field must be zero. + +Valid values for 'action':: + + #define KVM_PMU_EVENT_ALLOW 0 + #define KVM_PMU_EVENT_DENY 1 + +4.121 KVM_PPC_SVM_OFF +--------------------- + +:Capability: basic +:Architectures: powerpc +:Type: vm ioctl +:Parameters: none +:Returns: 0 on successful completion, + +Errors: + + ====== ================================================================ + EINVAL if ultravisor failed to terminate the secure guest + ENOMEM if hypervisor failed to allocate new radix page tables for guest + ====== ================================================================ + +This ioctl is used to turn off the secure mode of the guest or transition +the guest from secure mode to normal mode. This is invoked when the guest +is reset. This has no effect if called for a normal guest. + +This ioctl issues an ultravisor call to terminate the secure guest, +unpins the VPA pages and releases all the device pages that are used to +track the secure pages by hypervisor. + +4.122 KVM_S390_NORMAL_RESET + +Capability: KVM_CAP_S390_VCPU_RESETS +Architectures: s390 +Type: vcpu ioctl +Parameters: none +Returns: 0 + +This ioctl resets VCPU registers and control structures according to +the cpu reset definition in the POP (Principles Of Operation). + +4.123 KVM_S390_INITIAL_RESET + +Capability: none +Architectures: s390 +Type: vcpu ioctl +Parameters: none +Returns: 0 + +This ioctl resets VCPU registers and control structures according to +the initial cpu reset definition in the POP. However, the cpu is not +put into ESA mode. This reset is a superset of the normal reset. + +4.124 KVM_S390_CLEAR_RESET + +Capability: KVM_CAP_S390_VCPU_RESETS +Architectures: s390 +Type: vcpu ioctl +Parameters: none +Returns: 0 + +This ioctl resets VCPU registers and control structures according to +the clear cpu reset definition in the POP. However, the cpu is not put +into ESA mode. This reset is a superset of the initial reset. + + +5. The kvm_run structure +======================== + +Application code obtains a pointer to the kvm_run structure by +mmap()ing a vcpu fd. From that point, application code can control +execution by changing fields in kvm_run prior to calling the KVM_RUN +ioctl, and obtain information about the reason KVM_RUN returned by +looking up structure members. + +:: + + struct kvm_run { + /* in */ + __u8 request_interrupt_window; + +Request that KVM_RUN return when it becomes possible to inject external +interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. + +:: + + __u8 immediate_exit; + +This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN +exits immediately, returning -EINTR. In the common scenario where a +signal is used to "kick" a VCPU out of KVM_RUN, this field can be used +to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. +Rather than blocking the signal outside KVM_RUN, userspace can set up +a signal handler that sets run->immediate_exit to a non-zero value. + +This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. + +:: + + __u8 padding1[6]; + + /* out */ + __u32 exit_reason; + +When KVM_RUN has returned successfully (return value 0), this informs +application code why KVM_RUN has returned. Allowable values for this +field are detailed below. + +:: + + __u8 ready_for_interrupt_injection; + +If request_interrupt_window has been specified, this field indicates +an interrupt can be injected now with KVM_INTERRUPT. + +:: + + __u8 if_flag; + +The value of the current interrupt flag. Only valid if in-kernel +local APIC is not used. + +:: + + __u16 flags; + +More architecture-specific flags detailing state of the VCPU that may +affect the device's behavior. The only currently defined flag is +KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the +VCPU is in system management mode. + +:: + + /* in (pre_kvm_run), out (post_kvm_run) */ + __u64 cr8; + +The value of the cr8 register. Only valid if in-kernel local APIC is +not used. Both input and output. + +:: + + __u64 apic_base; + +The value of the APIC BASE msr. Only valid if in-kernel local +APIC is not used. Both input and output. + +:: + + union { + /* KVM_EXIT_UNKNOWN */ + struct { + __u64 hardware_exit_reason; + } hw; + +If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown +reasons. Further architecture-specific information is available in +hardware_exit_reason. + +:: + + /* KVM_EXIT_FAIL_ENTRY */ + struct { + __u64 hardware_entry_failure_reason; + } fail_entry; + +If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due +to unknown reasons. Further architecture-specific information is +available in hardware_entry_failure_reason. + +:: + + /* KVM_EXIT_EXCEPTION */ + struct { + __u32 exception; + __u32 error_code; + } ex; + +Unused. + +:: + + /* KVM_EXIT_IO */ + struct { + #define KVM_EXIT_IO_IN 0 + #define KVM_EXIT_IO_OUT 1 + __u8 direction; + __u8 size; /* bytes */ + __u16 port; + __u32 count; + __u64 data_offset; /* relative to kvm_run start */ + } io; + +If exit_reason is KVM_EXIT_IO, then the vcpu has +executed a port I/O instruction which could not be satisfied by kvm. +data_offset describes where the data is located (KVM_EXIT_IO_OUT) or +where kvm expects application code to place the data for the next +KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. + +:: + + /* KVM_EXIT_DEBUG */ + struct { + struct kvm_debug_exit_arch arch; + } debug; + +If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event +for which architecture specific information is returned. + +:: + + /* KVM_EXIT_MMIO */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } mmio; + +If exit_reason is KVM_EXIT_MMIO, then the vcpu has +executed a memory-mapped I/O instruction which could not be satisfied +by kvm. The 'data' member contains the written data if 'is_write' is +true, and should be filled by application code otherwise. + +The 'data' member contains, in its first 'len' bytes, the value as it would +appear if the VCPU performed a load or store of the appropriate width directly +to the byte array. + +.. note:: + + For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and + KVM_EXIT_EPR the corresponding + +operations are complete (and guest state is consistent) only after userspace +has re-entered the kernel with KVM_RUN. The kernel side will first finish +incomplete operations and then check for pending signals. Userspace +can re-enter the guest with an unmasked signal pending to complete +pending operations. + +:: + + /* KVM_EXIT_HYPERCALL */ + struct { + __u64 nr; + __u64 args[6]; + __u64 ret; + __u32 longmode; + __u32 pad; + } hypercall; + +Unused. This was once used for 'hypercall to userspace'. To implement +such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). + +.. note:: KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. + +:: + + /* KVM_EXIT_TPR_ACCESS */ + struct { + __u64 rip; + __u32 is_write; + __u32 pad; + } tpr_access; + +To be documented (KVM_TPR_ACCESS_REPORTING). + +:: + + /* KVM_EXIT_S390_SIEIC */ + struct { + __u8 icptcode; + __u64 mask; /* psw upper half */ + __u64 addr; /* psw lower half */ + __u16 ipa; + __u32 ipb; + } s390_sieic; + +s390 specific. + +:: + + /* KVM_EXIT_S390_RESET */ + #define KVM_S390_RESET_POR 1 + #define KVM_S390_RESET_CLEAR 2 + #define KVM_S390_RESET_SUBSYSTEM 4 + #define KVM_S390_RESET_CPU_INIT 8 + #define KVM_S390_RESET_IPL 16 + __u64 s390_reset_flags; + +s390 specific. + +:: + + /* KVM_EXIT_S390_UCONTROL */ + struct { + __u64 trans_exc_code; + __u32 pgm_code; + } s390_ucontrol; + +s390 specific. A page fault has occurred for a user controlled virtual +machine (KVM_VM_S390_UNCONTROL) on it's host page table that cannot be +resolved by the kernel. +The program code and the translation exception code that were placed +in the cpu's lowcore are presented here as defined by the z Architecture +Principles of Operation Book in the Chapter for Dynamic Address Translation +(DAT) + +:: + + /* KVM_EXIT_DCR */ + struct { + __u32 dcrn; + __u32 data; + __u8 is_write; + } dcr; + +Deprecated - was used for 440 KVM. + +:: + + /* KVM_EXIT_OSI */ + struct { + __u64 gprs[32]; + } osi; + +MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch +hypercalls and exit with this exit struct that contains all the guest gprs. + +If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. +Userspace can now handle the hypercall and when it's done modify the gprs as +necessary. Upon guest entry all guest GPRs will then be replaced by the values +in this struct. + +:: + + /* KVM_EXIT_PAPR_HCALL */ + struct { + __u64 nr; + __u64 ret; + __u64 args[9]; + } papr_hcall; + +This is used on 64-bit PowerPC when emulating a pSeries partition, +e.g. with the 'pseries' machine type in qemu. It occurs when the +guest does a hypercall using the 'sc 1' instruction. The 'nr' field +contains the hypercall number (from the guest R3), and 'args' contains +the arguments (from the guest R4 - R12). Userspace should put the +return code in 'ret' and any extra returned values in args[]. +The possible hypercalls are defined in the Power Architecture Platform +Requirements (PAPR) document available from www.power.org (free +developer registration required to access it). + +:: + + /* KVM_EXIT_S390_TSCH */ + struct { + __u16 subchannel_id; + __u16 subchannel_nr; + __u32 io_int_parm; + __u32 io_int_word; + __u32 ipb; + __u8 dequeued; + } s390_tsch; + +s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled +and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O +interrupt for the target subchannel has been dequeued and subchannel_id, +subchannel_nr, io_int_parm and io_int_word contain the parameters for that +interrupt. ipb is needed for instruction parameter decoding. + +:: + + /* KVM_EXIT_EPR */ + struct { + __u32 epr; + } epr; + +On FSL BookE PowerPC chips, the interrupt controller has a fast patch +interrupt acknowledge path to the core. When the core successfully +delivers an interrupt, it automatically populates the EPR register with +the interrupt vector number and acknowledges the interrupt inside +the interrupt controller. + +In case the interrupt controller lives in user space, we need to do +the interrupt acknowledge cycle through it to fetch the next to be +delivered interrupt vector using this exit. + +It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an +external interrupt has just been delivered into the guest. User space +should put the acknowledged interrupt vector into the 'epr' field. + +:: + + /* KVM_EXIT_SYSTEM_EVENT */ + struct { + #define KVM_SYSTEM_EVENT_SHUTDOWN 1 + #define KVM_SYSTEM_EVENT_RESET 2 + #define KVM_SYSTEM_EVENT_CRASH 3 + __u32 type; + __u64 flags; + } system_event; + +If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered +a system-level event using some architecture specific mechanism (hypercall +or some special instruction). In case of ARM/ARM64, this is triggered using +HVC instruction based PSCI call from the vcpu. The 'type' field describes +the system-level event type. The 'flags' field describes architecture +specific flags for the system-level event. + +Valid values for 'type' are: + + - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the + VM. Userspace is not obliged to honour this, and if it does honour + this does not need to destroy the VM synchronously (ie it may call + KVM_RUN again before shutdown finally occurs). + - KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM. + As with SHUTDOWN, userspace can choose to ignore the request, or + to schedule the reset to occur in the future and may call KVM_RUN again. + - KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest + has requested a crash condition maintenance. Userspace can choose + to ignore the request, or to gather VM memory core dump and/or + reset/shutdown of the VM. + +:: + + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. + +:: + + struct kvm_hyperv_exit { + #define KVM_EXIT_HYPERV_SYNIC 1 + #define KVM_EXIT_HYPERV_HCALL 2 + __u32 type; + union { + struct { + __u32 msr; + __u64 control; + __u64 evt_page; + __u64 msg_page; + } synic; + struct { + __u64 input; + __u64 result; + __u64 params[2]; + } hcall; + } u; + }; + /* KVM_EXIT_HYPERV */ + struct kvm_hyperv_exit hyperv; + +Indicates that the VCPU exits into userspace to process some tasks +related to Hyper-V emulation. + +Valid values for 'type' are: + + - KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about + +Hyper-V SynIC state change. Notification is used to remap SynIC +event/message pages and to enable/disable SynIC messages/events processing +in userspace. + +:: + + /* KVM_EXIT_ARM_NISV */ + struct { + __u64 esr_iss; + __u64 fault_ipa; + } arm_nisv; + +Used on arm and arm64 systems. If a guest accesses memory not in a memslot, +KVM will typically return to userspace and ask it to do MMIO emulation on its +behalf. However, for certain classes of instructions, no instruction decode +(direction, length of memory access) is provided, and fetching and decoding +the instruction from the VM is overly complicated to live in the kernel. + +Historically, when this situation occurred, KVM would print a warning and kill +the VM. KVM assumed that if the guest accessed non-memslot memory, it was +trying to do I/O, which just couldn't be emulated, and the warning message was +phrased accordingly. However, what happened more often was that a guest bug +caused access outside the guest memory areas which should lead to a more +meaningful warning message and an external abort in the guest, if the access +did not fall within an I/O window. + +Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable +this capability at VM creation. Once this is done, these types of errors will +instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from +the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA +in the fault_ipa field. Userspace can either fix up the access if it's +actually an I/O access by decoding the instruction from guest memory (if it's +very brave) and continue executing the guest, or it can decide to suspend, +dump, or restart the guest. + +Note that KVM does not skip the faulting instruction as it does for +KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state +if it decides to decode and emulate the instruction. + +:: + + /* Fix the size of the union. */ + char padding[256]; + }; + + /* + * shared registers between kvm and userspace. + * kvm_valid_regs specifies the register classes set by the host + * kvm_dirty_regs specified the register classes dirtied by userspace + * struct kvm_sync_regs is architecture specific, as well as the + * bits for kvm_valid_regs and kvm_dirty_regs + */ + __u64 kvm_valid_regs; + __u64 kvm_dirty_regs; + union { + struct kvm_sync_regs regs; + char padding[SYNC_REGS_SIZE_BYTES]; + } s; + +If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access +certain guest registers without having to call SET/GET_*REGS. Thus we can +avoid some system call overhead if userspace has to handle the exit. +Userspace can query the validity of the structure by checking +kvm_valid_regs for specific bits. These bits are architecture specific +and usually define the validity of a groups of registers. (e.g. one bit +for general purpose registers) + +Please note that the kernel is allowed to use the kvm_run structure as the +primary storage for certain register types. Therefore, the kernel may use the +values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. + +:: + + }; + + + +6. Capabilities that can be enabled on vCPUs +============================================ + +There are certain capabilities that change the behavior of the virtual CPU or +the virtual machine when enabled. To enable them, please see section 4.37. +Below you can find a list of capabilities and what their effect on the vCPU or +the virtual machine is when enabling them. + +The following information is provided along with the description: + + Architectures: + which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Target: + whether this is a per-vcpu or per-vm capability. + + Parameters: + what parameters are accepted by the capability. + + Returns: + the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +6.1 KVM_CAP_PPC_OSI +------------------- + +:Architectures: ppc +:Target: vcpu +:Parameters: none +:Returns: 0 on success; -1 on error + +This capability enables interception of OSI hypercalls that otherwise would +be treated as normal system calls to be injected into the guest. OSI hypercalls +were invented by Mac-on-Linux to have a standardized communication mechanism +between the guest and the host. + +When this capability is enabled, KVM_EXIT_OSI can occur. + + +6.2 KVM_CAP_PPC_PAPR +-------------------- + +:Architectures: ppc +:Target: vcpu +:Parameters: none +:Returns: 0 on success; -1 on error + +This capability enables interception of PAPR hypercalls. PAPR hypercalls are +done using the hypercall instruction "sc 1". + +It also sets the guest privilege level to "supervisor" mode. Usually the guest +runs in "hypervisor" privilege mode with a few missing features. + +In addition to the above, it changes the semantics of SDR1. In this mode, the +HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the +HTAB invisible to the guest. + +When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. + + +6.3 KVM_CAP_SW_TLB +------------------ + +:Architectures: ppc +:Target: vcpu +:Parameters: args[0] is the address of a struct kvm_config_tlb +:Returns: 0 on success; -1 on error + +:: + + struct kvm_config_tlb { + __u64 params; + __u64 array; + __u32 mmu_type; + __u32 array_len; + }; + +Configures the virtual CPU's TLB array, establishing a shared memory area +between userspace and KVM. The "params" and "array" fields are userspace +addresses of mmu-type-specific data structures. The "array_len" field is an +safety mechanism, and should be set to the size in bytes of the memory that +userspace has reserved for the array. It must be at least the size dictated +by "mmu_type" and "params". + +While KVM_RUN is active, the shared region is under control of KVM. Its +contents are undefined, and any modification by userspace results in +boundedly undefined behavior. + +On return from KVM_RUN, the shared region will reflect the current state of +the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB +to tell KVM which entries have been changed, prior to calling KVM_RUN again +on this vcpu. + +For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: + + - The "params" field is of type "struct kvm_book3e_206_tlb_params". + - The "array" field points to an array of type "struct + kvm_book3e_206_tlb_entry". + - The array consists of all entries in the first TLB, followed by all + entries in the second TLB. + - Within a TLB, entries are ordered first by increasing set number. Within a + set, entries are ordered by way (increasing ESEL). + - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) + where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. + - The tsize field of mas1 shall be set to 4K on TLB0, even though the + hardware ignores this value for TLB0. + +6.4 KVM_CAP_S390_CSS_SUPPORT +---------------------------- + +:Architectures: s390 +:Target: vcpu +:Parameters: none +:Returns: 0 on success; -1 on error + +This capability enables support for handling of channel I/O instructions. + +TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are +handled in-kernel, while the other I/O instructions are passed to userspace. + +When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST +SUBCHANNEL intercepts. + +Note that even though this capability is enabled per-vcpu, the complete +virtual machine is affected. + +6.5 KVM_CAP_PPC_EPR +------------------- + +:Architectures: ppc +:Target: vcpu +:Parameters: args[0] defines whether the proxy facility is active +:Returns: 0 on success; -1 on error + +This capability enables or disables the delivery of interrupts through the +external proxy facility. + +When enabled (args[0] != 0), every time the guest gets an external interrupt +delivered, it automatically exits into user space with a KVM_EXIT_EPR exit +to receive the topmost interrupt vector. + +When disabled (args[0] == 0), behavior is as if this facility is unsupported. + +When this capability is enabled, KVM_EXIT_EPR can occur. + +6.6 KVM_CAP_IRQ_MPIC +-------------------- + +:Architectures: ppc +:Parameters: args[0] is the MPIC device fd; + args[1] is the MPIC CPU number for this vcpu + +This capability connects the vcpu to an in-kernel MPIC device. + +6.7 KVM_CAP_IRQ_XICS +-------------------- + +:Architectures: ppc +:Target: vcpu +:Parameters: args[0] is the XICS device fd; + args[1] is the XICS CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XICS device. + +6.8 KVM_CAP_S390_IRQCHIP +------------------------ + +:Architectures: s390 +:Target: vm +:Parameters: none + +This capability enables the in-kernel irqchip for s390. Please refer to +"4.24 KVM_CREATE_IRQCHIP" for details. + +6.9 KVM_CAP_MIPS_FPU +-------------------- + +:Architectures: mips +:Target: vcpu +:Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the host Floating Point Unit by the guest. It +allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is +done the ``KVM_REG_MIPS_FPR_*`` and ``KVM_REG_MIPS_FCR_*`` registers can be +accessed (depending on the current guest FPU register mode), and the Status.FR, +Config5.FRE bits are accessible via the KVM API and also from the guest, +depending on them being supported by the FPU. + +6.10 KVM_CAP_MIPS_MSA +--------------------- + +:Architectures: mips +:Target: vcpu +:Parameters: args[0] is reserved for future use (should be 0). + +This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest. +It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest. +Once this is done the ``KVM_REG_MIPS_VEC_*`` and ``KVM_REG_MIPS_MSA_*`` +registers can be accessed, and the Config5.MSAEn bit is accessible via the +KVM API and also from the guest. + +6.74 KVM_CAP_SYNC_REGS +---------------------- + +:Architectures: s390, x86 +:Target: s390: always enabled, x86: vcpu +:Parameters: none +:Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register + sets are supported + (bitfields defined in arch/x86/include/uapi/asm/kvm.h). + +As described above in the kvm_sync_regs struct info in section 5 (kvm_run): +KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers +without having to call SET/GET_*REGS". This reduces overhead by eliminating +repeated ioctl calls for setting and/or getting register values. This is +particularly important when userspace is making synchronous guest state +modifications, e.g. when emulating and/or intercepting instructions in +userspace. + +For s390 specifics, please refer to the source code. + +For x86: + +- the register sets to be copied out to kvm_run are selectable + by userspace (rather that all sets being copied out for every exit). +- vcpu_events are available in addition to regs and sregs. + +For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to +function as an input bit-array field set by userspace to indicate the +specific register sets to be copied out on the next exit. + +To indicate when userspace has modified values that should be copied into +the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set. +This is done using the same bitflags as for the 'kvm_valid_regs' field. +If the dirty bit is not set, then the register set values will not be copied +into the vCPU even if they've been modified. + +Unused bitfields in the bitarrays must be set to zero. + +:: + + struct kvm_sync_regs { + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_vcpu_events events; + }; + +6.75 KVM_CAP_PPC_IRQ_XIVE +------------------------- + +:Architectures: ppc +:Target: vcpu +:Parameters: args[0] is the XIVE device fd; + args[1] is the XIVE CPU number (server ID) for this vcpu + +This capability connects the vcpu to an in-kernel XIVE device. + +7. Capabilities that can be enabled on VMs +========================================== + +There are certain capabilities that change the behavior of the virtual +machine when enabled. To enable them, please see section 4.37. Below +you can find a list of capabilities and what their effect on the VM +is when enabling them. + +The following information is provided along with the description: + + Architectures: + which instruction set architectures provide this ioctl. + x86 includes both i386 and x86_64. + + Parameters: + what parameters are accepted by the capability. + + Returns: + the return value. General error numbers (EBADF, ENOMEM, EINVAL) + are not detailed, but errors with specific meanings are. + + +7.1 KVM_CAP_PPC_ENABLE_HCALL +---------------------------- + +:Architectures: ppc +:Parameters: args[0] is the sPAPR hcall number; + args[1] is 0 to disable, 1 to enable in-kernel handling + +This capability controls whether individual sPAPR hypercalls (hcalls) +get handled by the kernel or not. Enabling or disabling in-kernel +handling of an hcall is effective across the VM. On creation, an +initial set of hcalls are enabled for in-kernel handling, which +consists of those hcalls for which in-kernel handlers were implemented +before this capability was implemented. If disabled, the kernel will +not to attempt to handle the hcall, but will always exit to userspace +to handle it. Note that it may not make sense to enable some and +disable others of a group of related hcalls, but KVM does not prevent +userspace from doing that. + +If the hcall number specified is not one that has an in-kernel +implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL +error. + +7.2 KVM_CAP_S390_USER_SIGP +-------------------------- + +:Architectures: s390 +:Parameters: none + +This capability controls which SIGP orders will be handled completely in user +space. With this capability enabled, all fast orders will be handled completely +in the kernel: + +- SENSE +- SENSE RUNNING +- EXTERNAL CALL +- EMERGENCY SIGNAL +- CONDITIONAL EMERGENCY SIGNAL + +All other orders will be handled completely in user space. + +Only privileged operation exceptions will be checked for in the kernel (or even +in the hardware prior to interception). If this capability is not enabled, the +old way of handling SIGP orders is used (partially in kernel and user space). + +7.3 KVM_CAP_S390_VECTOR_REGISTERS +--------------------------------- + +:Architectures: s390 +:Parameters: none +:Returns: 0 on success, negative value on error + +Allows use of the vector registers introduced with z13 processor, and +provides for the synchronization between host and user space. Will +return -EINVAL if the machine does not support vectors. + +7.4 KVM_CAP_S390_USER_STSI +-------------------------- + +:Architectures: s390 +:Parameters: none + +This capability allows post-handlers for the STSI instruction. After +initial handling in the kernel, KVM exits to user space with +KVM_EXIT_S390_STSI to allow user space to insert further data. + +Before exiting to userspace, kvm handlers should fill in s390_stsi field of +vcpu->run:: + + struct { + __u64 addr; + __u8 ar; + __u8 reserved; + __u8 fc; + __u8 sel1; + __u16 sel2; + } s390_stsi; + + @addr - guest address of STSI SYSIB + @fc - function code + @sel1 - selector 1 + @sel2 - selector 2 + @ar - access register number + +KVM handlers should exit to userspace with rc = -EREMOTE. + +7.5 KVM_CAP_SPLIT_IRQCHIP +------------------------- + +:Architectures: x86 +:Parameters: args[0] - number of routes reserved for userspace IOAPICs +:Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are +used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + +7.6 KVM_CAP_S390_RI +------------------- + +:Architectures: s390 +:Parameters: none + +Allows use of runtime-instrumentation introduced with zEC12 processor. +Will return -EINVAL if the machine does not support runtime-instrumentation. +Will return -EBUSY if a VCPU has already been created. + +7.7 KVM_CAP_X2APIC_API +---------------------- + +:Architectures: x86 +:Parameters: args[0] - features that should be enabled +:Returns: 0 on success, -EINVAL when args[0] contains invalid features + +Valid feature flags in args[0] are:: + + #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) + #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of +KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, +allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their +respective sections. + +KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work +in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff +as a broadcast even in x2APIC mode in order to support physical x2APIC +without interrupt remapping. This is undesirable in logical mode, +where 0xff represents CPUs 0-7 in cluster 0. + +7.8 KVM_CAP_S390_USER_INSTR0 +---------------------------- + +:Architectures: s390 +:Parameters: none + +With this capability enabled, all illegal instructions 0x0000 (2 bytes) will +be intercepted and forwarded to user space. User space can use this +mechanism e.g. to realize 2-byte software breakpoints. The kernel will +not inject an operating exception for these instructions, user space has +to take care of that. + +This capability can be enabled dynamically even if VCPUs were already +created and are running. + +7.9 KVM_CAP_S390_GS +------------------- + +:Architectures: s390 +:Parameters: none +:Returns: 0 on success; -EINVAL if the machine does not support + guarded storage; -EBUSY if a VCPU has already been created. + +Allows use of guarded storage for the KVM guest. + +7.10 KVM_CAP_S390_AIS +--------------------- + +:Architectures: s390 +:Parameters: none + +Allow use of adapter-interruption suppression. +:Returns: 0 on success; -EBUSY if a VCPU has already been created. + +7.11 KVM_CAP_PPC_SMT +-------------------- + +:Architectures: ppc +:Parameters: vsmt_mode, flags + +Enabling this capability on a VM provides userspace with a way to set +the desired virtual SMT mode (i.e. the number of virtual CPUs per +virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2 +between 1 and 8. On POWER8, vsmt_mode must also be no greater than +the number of threads per subcore for the host. Currently flags must +be 0. A successful call to enable this capability will result in +vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is +subsequently queried for the VM. This capability is only supported by +HV KVM, and can only be set before any VCPUs have been created. +The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT +modes are available. + +7.12 KVM_CAP_PPC_FWNMI +---------------------- + +:Architectures: ppc +:Parameters: none + +With this capability a machine check exception in the guest address +space will cause KVM to exit the guest with NMI exit reason. This +enables QEMU to build error log and branch to guest kernel registered +machine check handling routine. Without this capability KVM will +branch to guests' 0x200 interrupt vector. + +7.13 KVM_CAP_X86_DISABLE_EXITS +------------------------------ + +:Architectures: x86 +:Parameters: args[0] defines which exits are disabled +:Returns: 0 on success, -EINVAL when args[0] contains invalid exits + +Valid bits in args[0] are:: + + #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) + #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) + #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) + #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) + +Enabling this capability on a VM provides userspace with a way to no +longer intercept some instructions for improved latency in some +workloads, and is suggested when vCPUs are associated to dedicated +physical CPUs. More bits can be added in the future; userspace can +just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable +all such vmexits. + +Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. + +7.14 KVM_CAP_S390_HPAGE_1M +-------------------------- + +:Architectures: s390 +:Parameters: none +:Returns: 0 on success, -EINVAL if hpage module parameter was not set + or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL + flag set + +With this capability the KVM support for memory backing with 1m pages +through hugetlbfs can be enabled for a VM. After the capability is +enabled, cmma can't be enabled anymore and pfmfi and the storage key +interpretation are disabled. If cmma has already been enabled or the +hpage module parameter is not set to 1, -EINVAL is returned. + +While it is generally possible to create a huge page backed VM without +this capability, the VM will not be able to run. + +7.15 KVM_CAP_MSR_PLATFORM_INFO +------------------------------ + +:Architectures: x86 +:Parameters: args[0] whether feature should be enabled or not + +With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, +a #GP would be raised when the guest tries to access. Currently, this +capability does not enable write permissions of this MSR for the guest. + +7.16 KVM_CAP_PPC_NESTED_HV +-------------------------- + +:Architectures: ppc +:Parameters: none +:Returns: 0 on success, -EINVAL when the implementation doesn't support + nested-HV virtualization. + +HV-KVM on POWER9 and later systems allows for "nested-HV" +virtualization, which provides a way for a guest VM to run guests that +can run using the CPU's supervisor mode (privileged non-hypervisor +state). Enabling this capability on a VM depends on the CPU having +the necessary functionality and on the facility being enabled with a +kvm-hv module parameter. + +7.17 KVM_CAP_EXCEPTION_PAYLOAD +------------------------------ + +:Architectures: x86 +:Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, CR2 will not be modified prior to the +emulated VM-exit when L1 intercepts a #PF exception that occurs in +L2. Similarly, for kvm-intel only, DR6 will not be modified prior to +the emulated VM-exit when L1 intercepts a #DB exception that occurs in +L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or +#DB) exception for L2, exception.has_payload will be set and the +faulting address (or the new DR6 bits*) will be reported in the +exception_payload field. Similarly, when userspace injects a #PF (or +#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set +exception.has_payload and to put the faulting address - or the new DR6 +bits\ [#]_ - in the exception_payload field. + +This capability also enables exception.pending in struct +kvm_vcpu_events, which allows userspace to distinguish between pending +and injected exceptions. + + +.. [#] For the new DR6 bits, note that bit 16 is set iff the #DB exception + will clear DR6.RTM. + +7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 + +:Architectures: x86, arm, arm64, mips +:Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, KVM_GET_DIRTY_LOG will not automatically +clear and write-protect all pages that are returned as dirty. +Rather, userspace will have to do this operation separately using +KVM_CLEAR_DIRTY_LOG. + +At the cost of a slightly more complicated operation, this provides better +scalability and responsiveness for two reasons. First, +KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather +than requiring to sync a full memslot; this ensures that KVM does not +take spinlocks for an extended period of time. Second, in some cases a +large amount of time can pass between a call to KVM_GET_DIRTY_LOG and +userspace actually using the data in the page. Pages can be modified +during this time, which is inefficint for both the guest and userspace: +the guest will incur a higher penalty due to write protection faults, +while userspace can see false reports of dirty pages. Manual reprotection +helps reducing this time, improving guest performance and reducing the +number of dirty log false positives. + +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make +it hard or impossible to use it correctly. The availability of +KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. +Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. + +8. Other capabilities. +====================== + +This section lists capabilities that give information about other +features of the KVM implementation. + +8.1 KVM_CAP_PPC_HWRNG +--------------------- + +:Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel has an implementation of the +H_RANDOM hypercall backed by a hardware random-number generator. +If present, the kernel H_RANDOM handler can be enabled for guest use +with the KVM_CAP_PPC_ENABLE_HCALL capability. + +8.2 KVM_CAP_HYPERV_SYNIC +------------------------ + +:Architectures: x86 + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel has an implementation of the +Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is +used to support Windows Hyper-V based guest paravirt drivers(VMBus). + +In order to use SynIC, it has to be activated by setting this +capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this +will disable the use of APIC hardware virtualization even if supported +by the CPU, as it's incompatible with SynIC auto-EOI behavior. + +8.3 KVM_CAP_PPC_RADIX_MMU +------------------------- + +:Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 +--------------------------- + +:Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. + +8.5 KVM_CAP_MIPS_VZ +------------------- + +:Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that full hardware assisted virtualization capabilities +of the hardware are available for use through KVM. An appropriate +KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which +utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using full hardware assisted virtualization +capabilities of the hardware. This is useful to check after creating a VM with +KVM_VM_MIPS_DEFAULT. + +The value returned by KVM_CHECK_EXTENSION should be compared against known +values (see below). All other values are reserved. This is to allow for the +possibility of other hardware assisted virtualization implementations which +may be incompatible with the MIPS VZ ASE. + +== ========================================================================== + 0 The trap & emulate implementation is in use to run guest code in user + mode. Guest virtual memory segments are rearranged to fit the guest in the + user mode address space. + + 1 The MIPS VZ ASE is in use, providing full hardware assisted + virtualization, including standard guest virtual memory segments. +== ========================================================================== + +8.6 KVM_CAP_MIPS_TE +------------------- + +:Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that the trap & emulate implementation is available to +run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware +assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed +to KVM_CREATE_VM to create a VM which utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using trap & emulate. + +8.7 KVM_CAP_MIPS_64BIT +---------------------- + +:Architectures: mips + +This capability indicates the supported architecture type of the guest, i.e. the +supported register and address width. + +The values returned when this capability is checked by KVM_CHECK_EXTENSION on a +kvm VM handle correspond roughly to the CP0_Config.AT register field, and should +be checked specifically against known values (see below). All other values are +reserved. + +== ======================================================================== + 0 MIPS32 or microMIPS32. + Both registers and addresses are 32-bits wide. + It will only be possible to run 32-bit guest code. + + 1 MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. + Registers are 64-bits wide, but addresses are 32-bits wide. + 64-bit guest code may run but cannot access MIPS64 memory segments. + It will also be possible to run 32-bit guest code. + + 2 MIPS64 or microMIPS64 with access to all address segments. + Both registers and addresses are 64-bits wide. + It will be possible to run 64-bit or 32-bit guest code. +== ======================================================================== + +8.9 KVM_CAP_ARM_USER_IRQ +------------------------ + +:Architectures: arm, arm64 + +This capability, if KVM_CHECK_EXTENSION indicates that it is available, means +that if userspace creates a VM without an in-kernel interrupt controller, it +will be notified of changes to the output level of in-kernel emulated devices, +which can generate virtual interrupts, presented to the VM. +For such VMs, on every return to userspace, the kernel +updates the vcpu's run->s.regs.device_irq_level field to represent the actual +output level of the device. + +Whenever kvm detects a change in the device output level, kvm guarantees at +least one return to userspace before running the VM. This exit could either +be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way, +userspace can always sample the device output level and re-compute the state of +the userspace interrupt controller. Userspace should always check the state +of run->s.regs.device_irq_level on every kvm exit. +The value in run->s.regs.device_irq_level can represent both level and edge +triggered interrupt signals, depending on the device. Edge triggered interrupt +signals will exit to userspace with the bit in run->s.regs.device_irq_level +set exactly once per edge signal. + +The field run->s.regs.device_irq_level is available independent of +run->kvm_valid_regs or run->kvm_dirty_regs bits. + +If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a +number larger than 0 indicating the version of this capability is implemented +and thereby which bits in in run->s.regs.device_irq_level can signal values. + +Currently the following bits are defined for the device_irq_level bitmap:: + + KVM_CAP_ARM_USER_IRQ >= 1: + + KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer + KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer + KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal + +Future versions of kvm may implement additional events. These will get +indicated by returning a higher number from KVM_CHECK_EXTENSION and will be +listed above. + +8.10 KVM_CAP_PPC_SMT_POSSIBLE +----------------------------- + +:Architectures: ppc + +Querying this capability returns a bitmap indicating the possible +virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N +(counting from the right) is set, then a virtual SMT mode of 2^N is +available. + +8.11 KVM_CAP_HYPERV_SYNIC2 +-------------------------- + +:Architectures: x86 + +This capability enables a newer version of Hyper-V Synthetic interrupt +controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM +doesn't clear SynIC message and event flags pages when they are enabled by +writing to the respective MSRs. + +8.12 KVM_CAP_HYPERV_VP_INDEX +---------------------------- + +:Architectures: x86 + +This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its +value is used to denote the target vcpu for a SynIC interrupt. For +compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this +capability is absent, userspace can still query this msr's value. + +8.13 KVM_CAP_S390_AIS_MIGRATION +------------------------------- + +:Architectures: s390 +:Parameters: none + +This capability indicates if the flic device will be able to get/set the +AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows +to discover this without having to create a flic device. + +8.14 KVM_CAP_S390_PSW +--------------------- + +:Architectures: s390 + +This capability indicates that the PSW is exposed via the kvm_run structure. + +8.15 KVM_CAP_S390_GMAP +---------------------- + +:Architectures: s390 + +This capability indicates that the user space memory used as guest mapping can +be anywhere in the user memory address space, as long as the memory slots are +aligned and sized to a segment (1MB) boundary. + +8.16 KVM_CAP_S390_COW +--------------------- + +:Architectures: s390 + +This capability indicates that the user space memory used as guest mapping can +use copy-on-write semantics as well as dirty pages tracking via read-only page +tables. + +8.17 KVM_CAP_S390_BPB +--------------------- + +:Architectures: s390 + +This capability indicates that kvm will implement the interfaces to handle +reset, migration and nested KVM for branch prediction blocking. The stfle +facility 82 should not be provided to the guest without this capability. + +8.18 KVM_CAP_HYPERV_TLBFLUSH +---------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush +hypercalls: +HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, +HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. + +8.19 KVM_CAP_ARM_INJECT_SERROR_ESR +---------------------------------- + +:Architectures: arm, arm64 + +This capability indicates that userspace can specify (via the +KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it +takes a virtual SError interrupt exception. +If KVM advertises this capability, userspace can only specify the ISS field for +the ESR syndrome. Other parts of the ESR, such as the EC are generated by the +CPU when the exception is taken. If this virtual SError is taken to EL1 using +AArch64, this value will be reported in the ISS field of ESR_ELx. + +See KVM_CAP_VCPU_EVENTS for more details. + +8.20 KVM_CAP_HYPERV_SEND_IPI +---------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V IPI send +hypercalls: +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. + +8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH +----------------------------------- + +:Architecture: x86 + +This capability indicates that KVM running on top of Hyper-V hypervisor +enables Direct TLB flush for its guests meaning that TLB flush +hypercalls are handled by Level 0 hypervisor (Hyper-V) bypassing KVM. +Due to the different ABI for hypercall parameters between Hyper-V and +KVM, enabling this capability effectively disables all hypercall +handling by KVM (as some KVM hypercall may be mistakenly treated as TLB +flush hypercalls by Hyper-V) so userspace should disable KVM identification +in CPUID and only exposes Hyper-V identification. In this case, guest +thinks it's running on Hyper-V and only use Hyper-V hypercalls. + +8.22 KVM_CAP_S390_VCPU_RESETS + +Architectures: s390 + +This capability indicates that the KVM_S390_NORMAL_RESET and +KVM_S390_CLEAR_RESET ioctls are available. diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt deleted file mode 100644 index c6e1ce5d40de..000000000000 --- a/Documentation/virt/kvm/api.txt +++ /dev/null @@ -1,5450 +0,0 @@ -The Definitive KVM (Kernel-based Virtual Machine) API Documentation -=================================================================== - -1. General description ----------------------- - -The kvm API is a set of ioctls that are issued to control various aspects -of a virtual machine. The ioctls belong to the following classes: - - - System ioctls: These query and set global attributes which affect the - whole kvm subsystem. In addition a system ioctl is used to create - virtual machines. - - - VM ioctls: These query and set attributes that affect an entire virtual - machine, for example memory layout. In addition a VM ioctl is used to - create virtual cpus (vcpus) and devices. - - VM ioctls must be issued from the same process (address space) that was - used to create the VM. - - - vcpu ioctls: These query and set attributes that control the operation - of a single virtual cpu. - - vcpu ioctls should be issued from the same thread that was used to create - the vcpu, except for asynchronous vcpu ioctl that are marked as such in - the documentation. Otherwise, the first ioctl after switching threads - could see a performance impact. - - - device ioctls: These query and set attributes that control the operation - of a single device. - - device ioctls must be issued from the same process (address space) that - was used to create the VM. - -2. File descriptors -------------------- - -The kvm API is centered around file descriptors. An initial -open("/dev/kvm") obtains a handle to the kvm subsystem; this handle -can be used to issue system ioctls. A KVM_CREATE_VM ioctl on this -handle will create a VM file descriptor which can be used to issue VM -ioctls. A KVM_CREATE_VCPU or KVM_CREATE_DEVICE ioctl on a VM fd will -create a virtual cpu or device and return a file descriptor pointing to -the new resource. Finally, ioctls on a vcpu or device fd can be used -to control the vcpu or device. For vcpus, this includes the important -task of actually running guest code. - -In general file descriptors can be migrated among processes by means -of fork() and the SCM_RIGHTS facility of unix domain socket. These -kinds of tricks are explicitly not supported by kvm. While they will -not cause harm to the host, their actual behavior is not guaranteed by -the API. See "General description" for details on the ioctl usage -model that is supported by KVM. - -It is important to note that althought VM ioctls may only be issued from -the process that created the VM, a VM's lifecycle is associated with its -file descriptor, not its creator (process). In other words, the VM and -its resources, *including the associated address space*, are not freed -until the last reference to the VM's file descriptor has been released. -For example, if fork() is issued after ioctl(KVM_CREATE_VM), the VM will -not be freed until both the parent (original) process and its child have -put their references to the VM's file descriptor. - -Because a VM's resources are not freed until the last reference to its -file descriptor is released, creating additional references to a VM via -via fork(), dup(), etc... without careful consideration is strongly -discouraged and may have unwanted side effects, e.g. memory allocated -by and on behalf of the VM's process may not be freed/unaccounted when -the VM is shut down. - - -3. Extensions -------------- - -As of Linux 2.6.22, the KVM ABI has been stabilized: no backward -incompatible change are allowed. However, there is an extension -facility that allows backward-compatible extensions to the API to be -queried and used. - -The extension mechanism is not based on the Linux version number. -Instead, kvm defines extension identifiers and a facility to query -whether a particular extension identifier is available. If it is, a -set of ioctls is available for application use. - - -4. API description ------------------- - -This section describes ioctls that can be used to control kvm guests. -For each ioctl, the following information is provided along with a -description: - - Capability: which KVM extension provides this ioctl. Can be 'basic', - which means that is will be provided by any kernel that supports - API version 12 (see section 4.1), a KVM_CAP_xyz constant, which - means availability needs to be checked with KVM_CHECK_EXTENSION - (see section 4.4), or 'none' which means that while not all kernels - support this ioctl, there's no capability bit to check its - availability: for kernels that don't support the ioctl, - the ioctl returns -ENOTTY. - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Type: system, vm, or vcpu. - - Parameters: what parameters are accepted by the ioctl. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -4.1 KVM_GET_API_VERSION - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: the constant KVM_API_VERSION (=12) - -This identifies the API version as the stable kvm API. It is not -expected that this number will change. However, Linux 2.6.20 and -2.6.21 report earlier versions; these are not documented and not -supported. Applications should refuse to run if KVM_GET_API_VERSION -returns a value other than 12. If this check passes, all ioctls -described as 'basic' will be available. - - -4.2 KVM_CREATE_VM - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: machine type identifier (KVM_VM_*) -Returns: a VM fd that can be used to control the new virtual machine. - -The new VM has no virtual cpus and no memory. -You probably want to use 0 as machine type. - -In order to create user controlled virtual machines on S390, check -KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as -privileged user (CAP_SYS_ADMIN). - -To use hardware assisted virtualization on MIPS (VZ ASE) rather than -the default trap & emulate implementation (which changes the virtual -memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the -flag KVM_VM_MIPS_VZ. - - -On arm64, the physical address size for a VM (IPA Size limit) is limited -to 40bits by default. The limit can be configured if the host supports the -extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use -KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type -identifier, where IPA_Bits is the maximum width of any physical -address used by the VM. The IPA_Bits is encoded in bits[7-0] of the -machine type identifier. - -e.g, to configure a guest to use 48bit physical address size : - - vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); - -The requested size (IPA_Bits) must be : - 0 - Implies default size, 40bits (for backward compatibility) - - or - - N - Implies N bits, where N is a positive integer such that, - 32 <= N <= Host_IPA_Limit - -Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and -is dependent on the CPU capability and the kernel configuration. The limit can -be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION -ioctl() at run-time. - -Please note that configuring the IPA size does not affect the capability -exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects -size of the address translated by the stage2 level (guest physical to -host physical address translations). - - -4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST - -Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_msr_list (in/out) -Returns: 0 on success; -1 on error -Errors: - EFAULT: the msr index list cannot be read from or written to - E2BIG: the msr index list is to be to fit in the array specified by - the user. - -struct kvm_msr_list { - __u32 nmsrs; /* number of msrs in entries */ - __u32 indices[0]; -}; - -The user fills in the size of the indices array in nmsrs, and in return -kvm adjusts nmsrs to reflect the actual number of msrs and fills in the -indices array with their numbers. - -KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list -varies by kvm version and host processor, but does not change otherwise. - -Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are -not returned in the MSR list, as different vcpus can have a different number -of banks, as set via the KVM_X86_SETUP_MCE ioctl. - -KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed -to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities -and processor features that are exposed via MSRs (e.g., VMX capabilities). -This list also varies by kvm version and host processor, but does not change -otherwise. - - -4.4 KVM_CHECK_EXTENSION - -Capability: basic, KVM_CAP_CHECK_EXTENSION_VM for vm ioctl -Architectures: all -Type: system ioctl, vm ioctl -Parameters: extension identifier (KVM_CAP_*) -Returns: 0 if unsupported; 1 (or some other positive integer) if supported - -The API allows the application to query about extensions to the core -kvm API. Userspace passes an extension identifier (an integer) and -receives an integer that describes the extension availability. -Generally 0 means no and 1 means yes, but some extensions may report -additional information in the integer return value. - -Based on their initialization different VMs may have different capabilities. -It is thus encouraged to use the vm ioctl to query for capabilities (available -with KVM_CAP_CHECK_EXTENSION_VM on the vm fd) - -4.5 KVM_GET_VCPU_MMAP_SIZE - -Capability: basic -Architectures: all -Type: system ioctl -Parameters: none -Returns: size of vcpu mmap area, in bytes - -The KVM_RUN ioctl (cf.) communicates with userspace via a shared -memory region. This ioctl returns the size of that region. See the -KVM_RUN documentation for details. - - -4.6 KVM_SET_MEMORY_REGION - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_memory_region (in) -Returns: 0 on success, -1 on error - -This ioctl is obsolete and has been removed. - - -4.7 KVM_CREATE_VCPU - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: vcpu id (apic id on x86) -Returns: vcpu fd on success, -1 on error - -This API adds a vcpu to a virtual machine. No more than max_vcpus may be added. -The vcpu id is an integer in the range [0, max_vcpu_id). - -The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of -the KVM_CHECK_EXTENSION ioctl() at run-time. -The maximum possible value for max_vcpus can be retrieved using the -KVM_CAP_MAX_VCPUS of the KVM_CHECK_EXTENSION ioctl() at run-time. - -If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 -cpus max. -If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is -same as the value returned from KVM_CAP_NR_VCPUS. - -The maximum possible value for max_vcpu_id can be retrieved using the -KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time. - -If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id -is the same as the value returned from KVM_CAP_MAX_VCPUS. - -On powerpc using book3s_hv mode, the vcpus are mapped onto virtual -threads in one or more virtual CPU cores. (This is because the -hardware requires all the hardware threads in a CPU core to be in the -same partition.) The KVM_CAP_PPC_SMT capability indicates the number -of vcpus per virtual core (vcore). The vcore id is obtained by -dividing the vcpu id by the number of vcpus per vcore. The vcpus in a -given vcore will always be in the same physical core as each other -(though that might be a different physical core from time to time). -Userspace can control the threading (SMT) mode of the guest by its -allocation of vcpu ids. For example, if userspace wants -single-threaded guest vcpus, it should make all vcpu ids be a multiple -of the number of vcpus per vcore. - -For virtual cpus that have been created with S390 user controlled virtual -machines, the resulting vcpu fd can be memory mapped at page offset -KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual -cpu's hardware control block. - - -4.8 KVM_GET_DIRTY_LOG (vm ioctl) - -Capability: basic -Architectures: all -Type: vm ioctl -Parameters: struct kvm_dirty_log (in/out) -Returns: 0 on success, -1 on error - -/* for KVM_GET_DIRTY_LOG */ -struct kvm_dirty_log { - __u32 slot; - __u32 padding; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -Given a memory slot, return a bitmap containing any pages dirtied -since the last call to this ioctl. Bit 0 is the first page in the -memory slot. Ensure the entire structure is cleared to avoid padding -issues. - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies -the address space for which you want to return the dirty bitmap. -They must be less than the value that KVM_CHECK_EXTENSION returns for -the KVM_CAP_MULTI_ADDRESS_SPACE capability. - -The bits in the dirty bitmap are cleared before the ioctl returns, unless -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is enabled. For more information, -see the description of the capability. - -4.9 KVM_SET_MEMORY_ALIAS - -Capability: basic -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_memory_alias (in) -Returns: 0 (success), -1 (error) - -This ioctl is obsolete and has been removed. - - -4.10 KVM_RUN - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error -Errors: - EINTR: an unmasked signal is pending - -This ioctl is used to run a guest virtual cpu. While there are no -explicit parameters, there is an implicit parameter block that can be -obtained by mmap()ing the vcpu fd at offset 0, with the size given by -KVM_GET_VCPU_MMAP_SIZE. The parameter block is formatted as a 'struct -kvm_run' (see below). - - -4.11 KVM_GET_REGS - -Capability: basic -Architectures: all except ARM, arm64 -Type: vcpu ioctl -Parameters: struct kvm_regs (out) -Returns: 0 on success, -1 on error - -Reads the general purpose registers from the vcpu. - -/* x86 */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 rax, rbx, rcx, rdx; - __u64 rsi, rdi, rsp, rbp; - __u64 r8, r9, r10, r11; - __u64 r12, r13, r14, r15; - __u64 rip, rflags; -}; - -/* mips */ -struct kvm_regs { - /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ - __u64 gpr[32]; - __u64 hi; - __u64 lo; - __u64 pc; -}; - - -4.12 KVM_SET_REGS - -Capability: basic -Architectures: all except ARM, arm64 -Type: vcpu ioctl -Parameters: struct kvm_regs (in) -Returns: 0 on success, -1 on error - -Writes the general purpose registers into the vcpu. - -See KVM_GET_REGS for the data structure. - - -4.13 KVM_GET_SREGS - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_sregs (out) -Returns: 0 on success, -1 on error - -Reads special registers from the vcpu. - -/* x86 */ -struct kvm_sregs { - struct kvm_segment cs, ds, es, fs, gs, ss; - struct kvm_segment tr, ldt; - struct kvm_dtable gdt, idt; - __u64 cr0, cr2, cr3, cr4, cr8; - __u64 efer; - __u64 apic_base; - __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64]; -}; - -/* ppc -- see arch/powerpc/include/uapi/asm/kvm.h */ - -interrupt_bitmap is a bitmap of pending external interrupts. At most -one bit may be set. This interrupt has been acknowledged by the APIC -but not yet injected into the cpu core. - - -4.14 KVM_SET_SREGS - -Capability: basic -Architectures: x86, ppc -Type: vcpu ioctl -Parameters: struct kvm_sregs (in) -Returns: 0 on success, -1 on error - -Writes special registers into the vcpu. See KVM_GET_SREGS for the -data structures. - - -4.15 KVM_TRANSLATE - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_translation (in/out) -Returns: 0 on success, -1 on error - -Translates a virtual address according to the vcpu's current address -translation mode. - -struct kvm_translation { - /* in */ - __u64 linear_address; - - /* out */ - __u64 physical_address; - __u8 valid; - __u8 writeable; - __u8 usermode; - __u8 pad[5]; -}; - - -4.16 KVM_INTERRUPT - -Capability: basic -Architectures: x86, ppc, mips -Type: vcpu ioctl -Parameters: struct kvm_interrupt (in) -Returns: 0 on success, negative on failure. - -Queues a hardware interrupt vector to be injected. - -/* for KVM_INTERRUPT */ -struct kvm_interrupt { - /* in */ - __u32 irq; -}; - -X86: - -Returns: 0 on success, - -EEXIST if an interrupt is already enqueued - -EINVAL the the irq number is invalid - -ENXIO if the PIC is in the kernel - -EFAULT if the pointer is invalid - -Note 'irq' is an interrupt vector, not an interrupt pin or line. This -ioctl is useful if the in-kernel PIC is not used. - -PPC: - -Queues an external interrupt to be injected. This ioctl is overleaded -with 3 different irq values: - -a) KVM_INTERRUPT_SET - - This injects an edge type external interrupt into the guest once it's ready - to receive interrupts. When injected, the interrupt is done. - -b) KVM_INTERRUPT_UNSET - - This unsets any pending interrupt. - - Only available with KVM_CAP_PPC_UNSET_IRQ. - -c) KVM_INTERRUPT_SET_LEVEL - - This injects a level type external interrupt into the guest context. The - interrupt stays pending until a specific ioctl with KVM_INTERRUPT_UNSET - is triggered. - - Only available with KVM_CAP_PPC_IRQ_LEVEL. - -Note that any value for 'irq' other than the ones stated above is invalid -and incurs unexpected behavior. - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -MIPS: - -Queues an external interrupt to be injected into the virtual CPU. A negative -interrupt number dequeues the interrupt. - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - - -4.17 KVM_DEBUG_GUEST - -Capability: basic -Architectures: none -Type: vcpu ioctl -Parameters: none) -Returns: -1 on error - -Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead. - - -4.18 KVM_GET_MSRS - -Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system) -Architectures: x86 -Type: system ioctl, vcpu ioctl -Parameters: struct kvm_msrs (in/out) -Returns: number of msrs successfully returned; - -1 on error - -When used as a system ioctl: -Reads the values of MSR-based features that are available for the VM. This -is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values. -The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST -in a system ioctl. - -When used as a vcpu ioctl: -Reads model-specific registers from the vcpu. Supported msr indices can -be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl. - -struct kvm_msrs { - __u32 nmsrs; /* number of msrs in entries */ - __u32 pad; - - struct kvm_msr_entry entries[0]; -}; - -struct kvm_msr_entry { - __u32 index; - __u32 reserved; - __u64 data; -}; - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array) and the 'index' member of each array entry. -kvm will fill in the 'data' member. - - -4.19 KVM_SET_MSRS - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_msrs (in) -Returns: number of msrs successfully set (see below), -1 on error - -Writes model-specific registers to the vcpu. See KVM_GET_MSRS for the -data structures. - -Application code should set the 'nmsrs' member (which indicates the -size of the entries array), and the 'index' and 'data' members of each -array entry. - -It tries to set the MSRs in array entries[] one by one. If setting an MSR -fails, e.g., due to setting reserved bits, the MSR isn't supported/emulated -by KVM, etc..., it stops processing the MSR list and returns the number of -MSRs that have been set successfully. - - -4.20 KVM_SET_CPUID - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid (in) -Returns: 0 on success, -1 on error - -Defines the vcpu responses to the cpuid instruction. Applications -should use the KVM_SET_CPUID2 ioctl if available. - - -struct kvm_cpuid_entry { - __u32 function; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding; -}; - -/* for KVM_SET_CPUID */ -struct kvm_cpuid { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry entries[0]; -}; - - -4.21 KVM_SET_SIGNAL_MASK - -Capability: basic -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_signal_mask (in) -Returns: 0 on success, -1 on error - -Defines which signals are blocked during execution of KVM_RUN. This -signal mask temporarily overrides the threads signal mask. Any -unblocked signal received (except SIGKILL and SIGSTOP, which retain -their traditional behaviour) will cause KVM_RUN to return with -EINTR. - -Note the signal will only be delivered if not blocked by the original -signal mask. - -/* for KVM_SET_SIGNAL_MASK */ -struct kvm_signal_mask { - __u32 len; - __u8 sigset[0]; -}; - - -4.22 KVM_GET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (out) -Returns: 0 on success, -1 on error - -Reads the floating point state from the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - - -4.23 KVM_SET_FPU - -Capability: basic -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_fpu (in) -Returns: 0 on success, -1 on error - -Writes the floating point state to the vcpu. - -/* for KVM_GET_FPU and KVM_SET_FPU */ -struct kvm_fpu { - __u8 fpr[8][16]; - __u16 fcw; - __u16 fsw; - __u8 ftwx; /* in fxsave format */ - __u8 pad1; - __u16 last_opcode; - __u64 last_ip; - __u64 last_dp; - __u8 xmm[16][16]; - __u32 mxcsr; - __u32 pad2; -}; - - -4.24 KVM_CREATE_IRQCHIP - -Capability: KVM_CAP_IRQCHIP, KVM_CAP_S390_IRQCHIP (s390) -Architectures: x86, ARM, arm64, s390 -Type: vm ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Creates an interrupt controller model in the kernel. -On x86, creates a virtual ioapic, a virtual PIC (two PICs, nested), and sets up -future vcpus to have a local APIC. IRQ routing for GSIs 0-15 is set to both -PIC and IOAPIC; GSI 16-23 only go to the IOAPIC. -On ARM/arm64, a GICv2 is created. Any other GIC versions require the usage of -KVM_CREATE_DEVICE, which also supports creating a GICv2. Using -KVM_CREATE_DEVICE is preferred over KVM_CREATE_IRQCHIP for GICv2. -On s390, a dummy irq routing table is created. - -Note that on s390 the KVM_CAP_S390_IRQCHIP vm capability needs to be enabled -before KVM_CREATE_IRQCHIP can be used. - - -4.25 KVM_IRQ_LINE - -Capability: KVM_CAP_IRQCHIP -Architectures: x86, arm, arm64 -Type: vm ioctl -Parameters: struct kvm_irq_level -Returns: 0 on success, -1 on error - -Sets the level of a GSI input to the interrupt controller model in the kernel. -On some architectures it is required that an interrupt controller model has -been previously created with KVM_CREATE_IRQCHIP. Note that edge-triggered -interrupts require the level to be set to 1 and then back to 0. - -On real hardware, interrupt pins can be active-low or active-high. This -does not matter for the level field of struct kvm_irq_level: 1 always -means active (asserted), 0 means inactive (deasserted). - -x86 allows the operating system to program the interrupt polarity -(active-low/active-high) for level-triggered interrupts, and KVM used -to consider the polarity. However, due to bitrot in the handling of -active-low interrupts, the above convention is now valid on x86 too. -This is signaled by KVM_CAP_X86_IOAPIC_POLARITY_IGNORED. Userspace -should not present interrupts to the guest as active-low unless this -capability is present (or unless it is not using the in-kernel irqchip, -of course). - - -ARM/arm64 can signal an interrupt either at the CPU level, or at the -in-kernel irqchip (GIC), and for in-kernel irqchip can tell the GIC to -use PPIs designated for specific cpus. The irq field is interpreted -like this: - -  bits: | 31 ... 28 | 27 ... 24 | 23 ... 16 | 15 ... 0 | - field: | vcpu2_index | irq_type | vcpu_index | irq_id | - -The irq_type field has the following values: -- irq_type[0]: out-of-kernel GIC: irq_id 0 is IRQ, irq_id 1 is FIQ -- irq_type[1]: in-kernel GIC: SPI, irq_id between 32 and 1019 (incl.) - (the vcpu_index field is ignored) -- irq_type[2]: in-kernel GIC: PPI, irq_id between 16 and 31 (incl.) - -(The irq_id field thus corresponds nicely to the IRQ ID in the ARM GIC specs) - -In both cases, level is used to assert/deassert the line. - -When KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 is supported, the target vcpu is -identified as (256 * vcpu2_index + vcpu_index). Otherwise, vcpu2_index -must be zero. - -Note that on arm/arm64, the KVM_CAP_IRQCHIP capability only conditions -injection of interrupts for the in-kernel irqchip. KVM_IRQ_LINE can always -be used for a userspace interrupt controller. - -struct kvm_irq_level { - union { - __u32 irq; /* GSI */ - __s32 status; /* not used for KVM_IRQ_LEVEL */ - }; - __u32 level; /* 0 or 1 */ -}; - - -4.26 KVM_GET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_irqchip (in/out) -Returns: 0 on success, -1 on error - -Reads the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP into a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - - -4.27 KVM_SET_IRQCHIP - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_irqchip (in) -Returns: 0 on success, -1 on error - -Sets the state of a kernel interrupt controller created with -KVM_CREATE_IRQCHIP from a buffer provided by the caller. - -struct kvm_irqchip { - __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ - __u32 pad; - union { - char dummy[512]; /* reserving space */ - struct kvm_pic_state pic; - struct kvm_ioapic_state ioapic; - } chip; -}; - - -4.28 KVM_XEN_HVM_CONFIG - -Capability: KVM_CAP_XEN_HVM -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_xen_hvm_config (in) -Returns: 0 on success, -1 on error - -Sets the MSR that the Xen HVM guest uses to initialize its hypercall -page, and provides the starting address and size of the hypercall -blobs in userspace. When the guest writes the MSR, kvm copies one -page of a blob (32- or 64-bit, depending on the vcpu mode) to guest -memory. - -struct kvm_xen_hvm_config { - __u32 flags; - __u32 msr; - __u64 blob_addr_32; - __u64 blob_addr_64; - __u8 blob_size_32; - __u8 blob_size_64; - __u8 pad2[30]; -}; - - -4.29 KVM_GET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (out) -Returns: 0 on success, -1 on error - -Gets the current timestamp of kvmclock as seen by the current guest. In -conjunction with KVM_SET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -When KVM_CAP_ADJUST_CLOCK is passed to KVM_CHECK_EXTENSION, it returns the -set of bits that KVM can return in struct kvm_clock_data's flag member. - -The only flag defined now is KVM_CLOCK_TSC_STABLE. If set, the returned -value is the exact kvmclock value seen by all VCPUs at the instant -when KVM_GET_CLOCK was called. If clear, the returned value is simply -CLOCK_MONOTONIC plus a constant offset; the offset can be modified -with KVM_SET_CLOCK. KVM will try to make all VCPUs follow this clock, -but the exact value read by each VCPU could differ, because the host -TSC is not stable. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - - -4.30 KVM_SET_CLOCK - -Capability: KVM_CAP_ADJUST_CLOCK -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_clock_data (in) -Returns: 0 on success, -1 on error - -Sets the current timestamp of kvmclock to the value specified in its parameter. -In conjunction with KVM_GET_CLOCK, it is used to ensure monotonicity on scenarios -such as migration. - -struct kvm_clock_data { - __u64 clock; /* kvmclock current value */ - __u32 flags; - __u32 pad[9]; -}; - - -4.31 KVM_GET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_event (out) -Returns: 0 on success, -1 on error - -X86: - -Gets currently pending exceptions, interrupts, and NMIs as well as related -states of the vcpu. - -struct kvm_vcpu_events { - struct { - __u8 injected; - __u8 nr; - __u8 has_error_code; - __u8 pending; - __u32 error_code; - } exception; - struct { - __u8 injected; - __u8 nr; - __u8 soft; - __u8 shadow; - } interrupt; - struct { - __u8 injected; - __u8 pending; - __u8 masked; - __u8 pad; - } nmi; - __u32 sipi_vector; - __u32 flags; - struct { - __u8 smm; - __u8 pending; - __u8 smm_inside_nmi; - __u8 latched_init; - } smi; - __u8 reserved[27]; - __u8 exception_has_payload; - __u64 exception_payload; -}; - -The following bits are defined in the flags field: - -- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that - interrupt.shadow contains a valid state. - -- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a - valid state. - -- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the - exception_has_payload, exception_payload, and exception.pending - fields contain a valid state. This bit will be set whenever - KVM_CAP_EXCEPTION_PAYLOAD is enabled. - -ARM/ARM64: - -If the guest accesses a device that is being emulated by the host kernel in -such a way that a real device would generate a physical SError, KVM may make -a virtual SError pending for that VCPU. This system error interrupt remains -pending until the guest takes the exception by unmasking PSTATE.A. - -Running the VCPU may cause it to take a pending SError, or make an access that -causes an SError to become pending. The event's description is only valid while -the VPCU is not running. - -This API provides a way to read and write the pending 'event' state that is not -visible to the guest. To save, restore or migrate a VCPU the struct representing -the state can be read then written using this GET/SET API, along with the other -guest-visible registers. It is not possible to 'cancel' an SError that has been -made pending. - -A device being emulated in user-space may also wish to generate an SError. To do -this the events structure can be populated by user-space. The current state -should be read first, to ensure no existing SError is pending. If an existing -SError is pending, the architecture's 'Multiple SError interrupts' rules should -be followed. (2.5.3 of DDI0587.a "ARM Reliability, Availability, and -Serviceability (RAS) Specification"). - -SError exceptions always have an ESR value. Some CPUs have the ability to -specify what the virtual SError's ESR value should be. These systems will -advertise KVM_CAP_ARM_INJECT_SERROR_ESR. In this case exception.has_esr will -always have a non-zero value when read, and the agent making an SError pending -should specify the ISS field in the lower 24 bits of exception.serror_esr. If -the system supports KVM_CAP_ARM_INJECT_SERROR_ESR, but user-space sets the events -with exception.has_esr as zero, KVM will choose an ESR. - -Specifying exception.has_esr on a system that does not support it will return --EINVAL. Setting anything other than the lower 24bits of exception.serror_esr -will return -EINVAL. - -It is not possible to read back a pending external abort (injected via -KVM_SET_VCPU_EVENTS or otherwise) because such an exception is always delivered -directly to the virtual CPU). - - -struct kvm_vcpu_events { - struct { - __u8 serror_pending; - __u8 serror_has_esr; - __u8 ext_dabt_pending; - /* Align it to 8 bytes */ - __u8 pad[5]; - __u64 serror_esr; - } exception; - __u32 reserved[12]; -}; - -4.32 KVM_SET_VCPU_EVENTS - -Capability: KVM_CAP_VCPU_EVENTS -Extended by: KVM_CAP_INTR_SHADOW -Architectures: x86, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_event (in) -Returns: 0 on success, -1 on error - -X86: - -Set pending exceptions, interrupts, and NMIs as well as related states of the -vcpu. - -See KVM_GET_VCPU_EVENTS for the data structure. - -Fields that may be modified asynchronously by running VCPUs can be excluded -from the update. These fields are nmi.pending, sipi_vector, smi.smm, -smi.pending. Keep the corresponding bits in the flags field cleared to -suppress overwriting the current in-kernel state. The bits are: - -KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel -KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector -KVM_VCPUEVENT_VALID_SMM - transfer the smi sub-struct. - -If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in -the flags field to signal that interrupt.shadow contains a valid state and -shall be written into the VCPU. - -KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. - -If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD -can be set in the flags field to signal that the -exception_has_payload, exception_payload, and exception.pending fields -contain a valid state and shall be written into the VCPU. - -ARM/ARM64: - -User space may need to inject several types of events to the guest. - -Set the pending SError exception state for this VCPU. It is not possible to -'cancel' an Serror that has been made pending. - -If the guest performed an access to I/O memory which could not be handled by -userspace, for example because of missing instruction syndrome decode -information or because there is no device mapped at the accessed IPA, then -userspace can ask the kernel to inject an external abort using the address -from the exiting fault on the VCPU. It is a programming error to set -ext_dabt_pending after an exit which was not either KVM_EXIT_MMIO or -KVM_EXIT_ARM_NISV. This feature is only available if the system supports -KVM_CAP_ARM_INJECT_EXT_DABT. This is a helper which provides commonality in -how userspace reports accesses for the above cases to guests, across different -userspace implementations. Nevertheless, userspace can still emulate all Arm -exceptions by manipulating individual registers using the KVM_SET_ONE_REG API. - -See KVM_GET_VCPU_EVENTS for the data structure. - - -4.33 KVM_GET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (out) -Returns: 0 on success, -1 on error - -Reads debug registers from the vcpu. - -struct kvm_debugregs { - __u64 db[4]; - __u64 dr6; - __u64 dr7; - __u64 flags; - __u64 reserved[9]; -}; - - -4.34 KVM_SET_DEBUGREGS - -Capability: KVM_CAP_DEBUGREGS -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_debugregs (in) -Returns: 0 on success, -1 on error - -Writes debug registers into the vcpu. - -See KVM_GET_DEBUGREGS for the data structure. The flags field is unused -yet and must be cleared on entry. - - -4.35 KVM_SET_USER_MEMORY_REGION - -Capability: KVM_CAP_USER_MEMORY -Architectures: all -Type: vm ioctl -Parameters: struct kvm_userspace_memory_region (in) -Returns: 0 on success, -1 on error - -struct kvm_userspace_memory_region { - __u32 slot; - __u32 flags; - __u64 guest_phys_addr; - __u64 memory_size; /* bytes */ - __u64 userspace_addr; /* start of the userspace allocated memory */ -}; - -/* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) -#define KVM_MEM_READONLY (1UL << 1) - -This ioctl allows the user to create, modify or delete a guest physical -memory slot. Bits 0-15 of "slot" specify the slot id and this value -should be less than the maximum number of user memory slots supported per -VM. The maximum allowed slots can be queried using KVM_CAP_NR_MEMSLOTS. -Slots may not overlap in guest physical address space. - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 of "slot" -specifies the address space which is being modified. They must be -less than the value that KVM_CHECK_EXTENSION returns for the -KVM_CAP_MULTI_ADDRESS_SPACE capability. Slots in separate address spaces -are unrelated; the restriction on overlapping slots only applies within -each address space. - -Deleting a slot is done by passing zero for memory_size. When changing -an existing slot, it may be moved in the guest physical memory space, -or its flags may be modified, but it may not be resized. - -Memory for the region is taken starting at the address denoted by the -field userspace_addr, which must point at user addressable memory for -the entire memory slot size. Any object may back this memory, including -anonymous memory, ordinary files, and hugetlbfs. - -It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr -be identical. This allows large pages in the guest to be backed by large -pages in the host. - -The flags field supports two flags: KVM_MEM_LOG_DIRTY_PAGES and -KVM_MEM_READONLY. The former can be set to instruct KVM to keep track of -writes to memory within the slot. See KVM_GET_DIRTY_LOG ioctl to know how to -use it. The latter can be set, if KVM_CAP_READONLY_MEM capability allows it, -to make a new slot read-only. In this case, writes to this memory will be -posted to userspace as KVM_EXIT_MMIO exits. - -When the KVM_CAP_SYNC_MMU capability is available, changes in the backing of -the memory region are automatically reflected into the guest. For example, an -mmap() that affects the region will be made visible immediately. Another -example is madvise(MADV_DROP). - -It is recommended to use this API instead of the KVM_SET_MEMORY_REGION ioctl. -The KVM_SET_MEMORY_REGION does not allow fine grained control over memory -allocation and is deprecated. - - -4.36 KVM_SET_TSS_ADDR - -Capability: KVM_CAP_SET_TSS_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long tss_address (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a three-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - - -4.37 KVM_ENABLE_CAP - -Capability: KVM_CAP_ENABLE_CAP -Architectures: mips, ppc, s390 -Type: vcpu ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -Capability: KVM_CAP_ENABLE_CAP_VM -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_enable_cap (in) -Returns: 0 on success; -1 on error - -+Not all extensions are enabled by default. Using this ioctl the application -can enable an extension, making it available to the guest. - -On systems that do not support this ioctl, it always fails. On systems that -do support it, it only works for extensions that are supported for enablement. - -To check if a capability can be enabled, the KVM_CHECK_EXTENSION ioctl should -be used. - -struct kvm_enable_cap { - /* in */ - __u32 cap; - -The capability that is supposed to get enabled. - - __u32 flags; - -A bitfield indicating future enhancements. Has to be 0 for now. - - __u64 args[4]; - -Arguments for enabling a feature. If a feature needs initial values to -function properly, this is the place to put them. - - __u8 pad[64]; -}; - -The vcpu ioctl should be used for vcpu-specific capabilities, the vm ioctl -for vm-wide capabilities. - -4.38 KVM_GET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, s390, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (out) -Returns: 0 on success; -1 on error - -struct kvm_mp_state { - __u32 mp_state; -}; - -Returns the vcpu's current "multiprocessing state" (though also valid on -uniprocessor guests). - -Possible values are: - - - KVM_MP_STATE_RUNNABLE: the vcpu is currently running [x86,arm/arm64] - - KVM_MP_STATE_UNINITIALIZED: the vcpu is an application processor (AP) - which has not yet received an INIT signal [x86] - - KVM_MP_STATE_INIT_RECEIVED: the vcpu has received an INIT signal, and is - now ready for a SIPI [x86] - - KVM_MP_STATE_HALTED: the vcpu has executed a HLT instruction and - is waiting for an interrupt [x86] - - KVM_MP_STATE_SIPI_RECEIVED: the vcpu has just received a SIPI (vector - accessible via KVM_GET_VCPU_EVENTS) [x86] - - KVM_MP_STATE_STOPPED: the vcpu is stopped [s390,arm/arm64] - - KVM_MP_STATE_CHECK_STOP: the vcpu is in a special error state [s390] - - KVM_MP_STATE_OPERATING: the vcpu is operating (running or halted) - [s390] - - KVM_MP_STATE_LOAD: the vcpu is in a special load/startup state - [s390] - -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an -in-kernel irqchip, the multiprocessing state must be maintained by userspace on -these architectures. - -For arm/arm64: - -The only states that are valid are KVM_MP_STATE_STOPPED and -KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not. - -4.39 KVM_SET_MP_STATE - -Capability: KVM_CAP_MP_STATE -Architectures: x86, s390, arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_mp_state (in) -Returns: 0 on success; -1 on error - -Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for -arguments. - -On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an -in-kernel irqchip, the multiprocessing state must be maintained by userspace on -these architectures. - -For arm/arm64: - -The only states that are valid are KVM_MP_STATE_STOPPED and -KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not. - -4.40 KVM_SET_IDENTITY_MAP_ADDR - -Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long identity (in) -Returns: 0 on success, -1 on error - -This ioctl defines the physical address of a one-page region in the guest -physical address space. The region must be within the first 4GB of the -guest physical address space and must not conflict with any memory slot -or any mmio address. The guest may malfunction if it accesses this memory -region. - -Setting the address to 0 will result in resetting the address to its default -(0xfffbc000). - -This ioctl is required on Intel-based hosts. This is needed on Intel hardware -because of a quirk in the virtualization implementation (see the internals -documentation when it pops into existence). - -Fails if any VCPU has already been created. - -4.41 KVM_SET_BOOT_CPU_ID - -Capability: KVM_CAP_SET_BOOT_CPU_ID -Architectures: x86 -Type: vm ioctl -Parameters: unsigned long vcpu_id -Returns: 0 on success, -1 on error - -Define which vcpu is the Bootstrap Processor (BSP). Values are the same -as the vcpu id in KVM_CREATE_VCPU. If this ioctl is not called, the default -is vcpu 0. - - -4.42 KVM_GET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (out) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy current vcpu's xsave struct to the userspace. - - -4.43 KVM_SET_XSAVE - -Capability: KVM_CAP_XSAVE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xsave (in) -Returns: 0 on success, -1 on error - -struct kvm_xsave { - __u32 region[1024]; -}; - -This ioctl would copy userspace's xsave struct to the kernel. - - -4.44 KVM_GET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (out) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would copy current vcpu's xcrs to the userspace. - - -4.45 KVM_SET_XCRS - -Capability: KVM_CAP_XCRS -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_xcrs (in) -Returns: 0 on success, -1 on error - -struct kvm_xcr { - __u32 xcr; - __u32 reserved; - __u64 value; -}; - -struct kvm_xcrs { - __u32 nr_xcrs; - __u32 flags; - struct kvm_xcr xcrs[KVM_MAX_XCRS]; - __u64 padding[16]; -}; - -This ioctl would set vcpu's xcr to the value userspace specified. - - -4.46 KVM_GET_SUPPORTED_CPUID - -Capability: KVM_CAP_EXT_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are supported by both the -hardware and kvm in its default configuration. Userspace can use the -information returned by this ioctl to construct cpuid information (for -KVM_SET_CPUID2) that is consistent with hardware, kernel, and -userspace capabilities, and with user requirements (for example, the -user may wish to constrain cpuid to emulate older hardware, or for -feature consistency across a cluster). - -Note that certain capabilities, such as KVM_CAP_X86_DISABLE_EXITS, may -expose cpuid features (e.g. MONITOR) which are not supported by kvm in -its default configuration. If userspace enables such capabilities, it -is responsible for modifying the results of this ioctl appropriately. - -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'. If the number of entries is too low to describe the cpu -capabilities, an error (E2BIG) is returned. If the number is too high, -the 'nent' field is adjusted and an error (ENOMEM) is returned. If the -number is just right, the 'nent' field is adjusted to the number of valid -entries in the 'entries' array, which is then filled. - -The entries returned are the host cpuid as returned by the cpuid instruction, -with unknown or unsupported features masked out. Some features (for example, -x2apic), may not be present in the host cpu, but are exposed by kvm if it can -emulate them efficiently. The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned -as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC -support. Instead it is reported via - - ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) - -if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the -feature in userspace, then you can enable the feature for KVM_SET_CPUID2. - - -4.47 KVM_PPC_GET_PVINFO - -Capability: KVM_CAP_PPC_GET_PVINFO -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_pvinfo (out) -Returns: 0 on success, !0 on error - -struct kvm_ppc_pvinfo { - __u32 flags; - __u32 hcall[4]; - __u8 pad[108]; -}; - -This ioctl fetches PV specific information that need to be passed to the guest -using the device tree or other means from vm context. - -The hcall array defines 4 instructions that make up a hypercall. - -If any additional field gets added to this structure later on, a bit for that -additional piece of information will be set in the flags bitmap. - -The flags bitmap is defined as: - - /* the host supports the ePAPR idle hcall - #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) - -4.52 KVM_SET_GSI_ROUTING - -Capability: KVM_CAP_IRQ_ROUTING -Architectures: x86 s390 arm arm64 -Type: vm ioctl -Parameters: struct kvm_irq_routing (in) -Returns: 0 on success, -1 on error - -Sets the GSI routing table entries, overwriting any previously set entries. - -On arm/arm64, GSI routing has the following limitation: -- GSI routing does not apply to KVM_IRQ_LINE but only to KVM_IRQFD. - -struct kvm_irq_routing { - __u32 nr; - __u32 flags; - struct kvm_irq_routing_entry entries[0]; -}; - -No flags are specified so far, the corresponding field must be set to zero. - -struct kvm_irq_routing_entry { - __u32 gsi; - __u32 type; - __u32 flags; - __u32 pad; - union { - struct kvm_irq_routing_irqchip irqchip; - struct kvm_irq_routing_msi msi; - struct kvm_irq_routing_s390_adapter adapter; - struct kvm_irq_routing_hv_sint hv_sint; - __u32 pad[8]; - } u; -}; - -/* gsi routing entry types */ -#define KVM_IRQ_ROUTING_IRQCHIP 1 -#define KVM_IRQ_ROUTING_MSI 2 -#define KVM_IRQ_ROUTING_S390_ADAPTER 3 -#define KVM_IRQ_ROUTING_HV_SINT 4 - -flags: -- KVM_MSI_VALID_DEVID: used along with KVM_IRQ_ROUTING_MSI routing entry - type, specifies that the devid field contains a valid value. The per-VM - KVM_CAP_MSI_DEVID capability advertises the requirement to provide - the device ID. If this capability is not available, userspace should - never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. -- zero otherwise - -struct kvm_irq_routing_irqchip { - __u32 irqchip; - __u32 pin; -}; - -struct kvm_irq_routing_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - union { - __u32 pad; - __u32 devid; - }; -}; - -If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier -for the device that wrote the MSI message. For PCI, this is usually a -BFD identifier in the lower 16 bits. - -On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS -feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, -address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of -address_hi must be zero. - -struct kvm_irq_routing_s390_adapter { - __u64 ind_addr; - __u64 summary_addr; - __u64 ind_offset; - __u32 summary_offset; - __u32 adapter_id; -}; - -struct kvm_irq_routing_hv_sint { - __u32 vcpu; - __u32 sint; -}; - - -4.55 KVM_SET_TSC_KHZ - -Capability: KVM_CAP_TSC_CONTROL -Architectures: x86 -Type: vcpu ioctl -Parameters: virtual tsc_khz -Returns: 0 on success, -1 on error - -Specifies the tsc frequency for the virtual machine. The unit of the -frequency is KHz. - - -4.56 KVM_GET_TSC_KHZ - -Capability: KVM_CAP_GET_TSC_KHZ -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: virtual tsc-khz on success, negative value on error - -Returns the tsc frequency of the guest. The unit of the return value is -KHz. If the host has unstable tsc this ioctl returns -EIO instead as an -error. - - -4.57 KVM_GET_LAPIC - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_lapic_state (out) -Returns: 0 on success, -1 on error - -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -Reads the Local APIC registers and copies them into the input argument. The -data format and layout are the same as documented in the architecture manual. - -If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is -enabled, then the format of APIC_ID register depends on the APIC mode -(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in -the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID -which is stored in bits 31-24 of the APIC register, or equivalently in -byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then -be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR. - -If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state -always uses xAPIC format. - - -4.58 KVM_SET_LAPIC - -Capability: KVM_CAP_IRQCHIP -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_lapic_state (in) -Returns: 0 on success, -1 on error - -#define KVM_APIC_REG_SIZE 0x400 -struct kvm_lapic_state { - char regs[KVM_APIC_REG_SIZE]; -}; - -Copies the input argument into the Local APIC registers. The data format -and layout are the same as documented in the architecture manual. - -The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's -regs field) depends on the state of the KVM_CAP_X2APIC_API capability. -See the note in KVM_GET_LAPIC. - - -4.59 KVM_IOEVENTFD - -Capability: KVM_CAP_IOEVENTFD -Architectures: all -Type: vm ioctl -Parameters: struct kvm_ioeventfd (in) -Returns: 0 on success, !0 on error - -This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address -within the guest. A guest write in the registered address will signal the -provided event instead of triggering an exit. - -struct kvm_ioeventfd { - __u64 datamatch; - __u64 addr; /* legal pio/mmio address */ - __u32 len; /* 0, 1, 2, 4, or 8 bytes */ - __s32 fd; - __u32 flags; - __u8 pad[36]; -}; - -For the special case of virtio-ccw devices on s390, the ioevent is matched -to a subchannel/virtqueue tuple instead. - -The following flags are defined: - -#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) -#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio) -#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) -#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \ - (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify) - -If datamatch flag is set, the event will be signaled only if the written value -to the registered address is equal to datamatch in struct kvm_ioeventfd. - -For virtio-ccw devices, addr contains the subchannel id and datamatch the -virtqueue index. - -With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and -the kernel will ignore the length of guest write and may get a faster vmexit. -The speedup may only apply to specific architectures, but the ioeventfd will -work anyway. - -4.60 KVM_DIRTY_TLB - -Capability: KVM_CAP_SW_TLB -Architectures: ppc -Type: vcpu ioctl -Parameters: struct kvm_dirty_tlb (in) -Returns: 0 on success, -1 on error - -struct kvm_dirty_tlb { - __u64 bitmap; - __u32 num_dirty; -}; - -This must be called whenever userspace has changed an entry in the shared -TLB, prior to calling KVM_RUN on the associated vcpu. - -The "bitmap" field is the userspace address of an array. This array -consists of a number of bits, equal to the total number of TLB entries as -determined by the last successful call to KVM_CONFIG_TLB, rounded up to the -nearest multiple of 64. - -Each bit corresponds to one TLB entry, ordered the same as in the shared TLB -array. - -The array is little-endian: the bit 0 is the least significant bit of the -first byte, bit 8 is the least significant bit of the second byte, etc. -This avoids any complications with differing word sizes. - -The "num_dirty" field is a performance hint for KVM to determine whether it -should skip processing the bitmap and just invalidate everything. It must -be set to the number of set bits in the bitmap. - - -4.62 KVM_CREATE_SPAPR_TCE - -Capability: KVM_CAP_SPAPR_TCE -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_create_spapr_tce (in) -Returns: file descriptor for manipulating the created TCE table - -This creates a virtual TCE (translation control entry) table, which -is an IOMMU for PAPR-style virtual I/O. It is used to translate -logical addresses used in virtual I/O into guest physical addresses, -and provides a scatter/gather capability for PAPR virtual I/O. - -/* for KVM_CAP_SPAPR_TCE */ -struct kvm_create_spapr_tce { - __u64 liobn; - __u32 window_size; -}; - -The liobn field gives the logical IO bus number for which to create a -TCE table. The window_size field specifies the size of the DMA window -which this TCE table will translate - the table will contain one 64 -bit TCE entry for every 4kiB of the DMA window. - -When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE -table has been created using this ioctl(), the kernel will handle it -in real mode, updating the TCE table. H_PUT_TCE calls for other -liobns will cause a vm exit and must be handled by userspace. - -The return value is a file descriptor which can be passed to mmap(2) -to map the created TCE table into userspace. This lets userspace read -the entries written by kernel-handled H_PUT_TCE calls, and also lets -userspace update the TCE table directly which is useful in some -circumstances. - - -4.63 KVM_ALLOCATE_RMA - -Capability: KVM_CAP_PPC_RMA -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_allocate_rma (out) -Returns: file descriptor for mapping the allocated RMA - -This allocates a Real Mode Area (RMA) from the pool allocated at boot -time by the kernel. An RMA is a physically-contiguous, aligned region -of memory used on older POWER processors to provide the memory which -will be accessed by real-mode (MMU off) accesses in a KVM guest. -POWER processors support a set of sizes for the RMA that usually -includes 64MB, 128MB, 256MB and some larger powers of two. - -/* for KVM_ALLOCATE_RMA */ -struct kvm_allocate_rma { - __u64 rma_size; -}; - -The return value is a file descriptor which can be passed to mmap(2) -to map the allocated RMA into userspace. The mapped area can then be -passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the -RMA for a virtual machine. The size of the RMA in bytes (which is -fixed at host kernel boot time) is returned in the rma_size field of -the argument structure. - -The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl -is supported; 2 if the processor requires all virtual machines to have -an RMA, or 1 if the processor can use an RMA but doesn't require it, -because it supports the Virtual RMA (VRMA) facility. - - -4.64 KVM_NMI - -Capability: KVM_CAP_USER_NMI -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Queues an NMI on the thread's vcpu. Note this is well defined only -when KVM_CREATE_IRQCHIP has not been called, since this is an interface -between the virtual cpu core and virtual local APIC. After KVM_CREATE_IRQCHIP -has been called, this interface is completely emulated within the kernel. - -To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the -following algorithm: - - - pause the vcpu - - read the local APIC's state (KVM_GET_LAPIC) - - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1) - - if so, issue KVM_NMI - - resume the vcpu - -Some guests configure the LINT1 NMI input to cause a panic, aiding in -debugging. - - -4.65 KVM_S390_UCAS_MAP - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_ucas_mapping (in) -Returns: 0 in case of success - -The parameter is defined like this: - struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; - }; - -This ioctl maps the memory at "user_addr" with the length "length" to -the vcpu's address space starting at "vcpu_addr". All parameters need to -be aligned by 1 megabyte. - - -4.66 KVM_S390_UCAS_UNMAP - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_ucas_mapping (in) -Returns: 0 in case of success - -The parameter is defined like this: - struct kvm_s390_ucas_mapping { - __u64 user_addr; - __u64 vcpu_addr; - __u64 length; - }; - -This ioctl unmaps the memory in the vcpu's address space starting at -"vcpu_addr" with the length "length". The field "user_addr" is ignored. -All parameters need to be aligned by 1 megabyte. - - -4.67 KVM_S390_VCPU_FAULT - -Capability: KVM_CAP_S390_UCONTROL -Architectures: s390 -Type: vcpu ioctl -Parameters: vcpu absolute address (in) -Returns: 0 in case of success - -This call creates a page table entry on the virtual cpu's address space -(for user controlled virtual machines) or the virtual machine's address -space (for regular virtual machines). This only works for minor faults, -thus it's recommended to access subject memory page via the user page -table upfront. This is useful to handle validity intercepts for user -controlled virtual machines to fault in the virtual cpu's lowcore pages -prior to calling the KVM_RUN ioctl. - - -4.68 KVM_SET_ONE_REG - -Capability: KVM_CAP_ONE_REG -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_one_reg (in) -Returns: 0 on success, negative value on failure -Errors: -  ENOENT:   no such register -  EINVAL:   invalid register ID, or no such register -  EPERM:    (arm64) register access not allowed before vcpu finalization -(These error codes are indicative only: do not rely on a specific error -code being returned in a specific situation.) - -struct kvm_one_reg { - __u64 id; - __u64 addr; -}; - -Using this ioctl, a single vcpu register can be set to a specific value -defined by user space with the passed in struct kvm_one_reg, where id -refers to the register identifier as described below and addr is a pointer -to a variable with the respective size. There can be architecture agnostic -and architecture specific registers. Each have their own range of operation -and their own constants and width. To keep track of the implemented -registers, find a list below: - - Arch | Register | Width (bits) - | | - PPC | KVM_REG_PPC_HIOR | 64 - PPC | KVM_REG_PPC_IAC1 | 64 - PPC | KVM_REG_PPC_IAC2 | 64 - PPC | KVM_REG_PPC_IAC3 | 64 - PPC | KVM_REG_PPC_IAC4 | 64 - PPC | KVM_REG_PPC_DAC1 | 64 - PPC | KVM_REG_PPC_DAC2 | 64 - PPC | KVM_REG_PPC_DABR | 64 - PPC | KVM_REG_PPC_DSCR | 64 - PPC | KVM_REG_PPC_PURR | 64 - PPC | KVM_REG_PPC_SPURR | 64 - PPC | KVM_REG_PPC_DAR | 64 - PPC | KVM_REG_PPC_DSISR | 32 - PPC | KVM_REG_PPC_AMR | 64 - PPC | KVM_REG_PPC_UAMOR | 64 - PPC | KVM_REG_PPC_MMCR0 | 64 - PPC | KVM_REG_PPC_MMCR1 | 64 - PPC | KVM_REG_PPC_MMCRA | 64 - PPC | KVM_REG_PPC_MMCR2 | 64 - PPC | KVM_REG_PPC_MMCRS | 64 - PPC | KVM_REG_PPC_SIAR | 64 - PPC | KVM_REG_PPC_SDAR | 64 - PPC | KVM_REG_PPC_SIER | 64 - PPC | KVM_REG_PPC_PMC1 | 32 - PPC | KVM_REG_PPC_PMC2 | 32 - PPC | KVM_REG_PPC_PMC3 | 32 - PPC | KVM_REG_PPC_PMC4 | 32 - PPC | KVM_REG_PPC_PMC5 | 32 - PPC | KVM_REG_PPC_PMC6 | 32 - PPC | KVM_REG_PPC_PMC7 | 32 - PPC | KVM_REG_PPC_PMC8 | 32 - PPC | KVM_REG_PPC_FPR0 | 64 - ... - PPC | KVM_REG_PPC_FPR31 | 64 - PPC | KVM_REG_PPC_VR0 | 128 - ... - PPC | KVM_REG_PPC_VR31 | 128 - PPC | KVM_REG_PPC_VSR0 | 128 - ... - PPC | KVM_REG_PPC_VSR31 | 128 - PPC | KVM_REG_PPC_FPSCR | 64 - PPC | KVM_REG_PPC_VSCR | 32 - PPC | KVM_REG_PPC_VPA_ADDR | 64 - PPC | KVM_REG_PPC_VPA_SLB | 128 - PPC | KVM_REG_PPC_VPA_DTL | 128 - PPC | KVM_REG_PPC_EPCR | 32 - PPC | KVM_REG_PPC_EPR | 32 - PPC | KVM_REG_PPC_TCR | 32 - PPC | KVM_REG_PPC_TSR | 32 - PPC | KVM_REG_PPC_OR_TSR | 32 - PPC | KVM_REG_PPC_CLEAR_TSR | 32 - PPC | KVM_REG_PPC_MAS0 | 32 - PPC | KVM_REG_PPC_MAS1 | 32 - PPC | KVM_REG_PPC_MAS2 | 64 - PPC | KVM_REG_PPC_MAS7_3 | 64 - PPC | KVM_REG_PPC_MAS4 | 32 - PPC | KVM_REG_PPC_MAS6 | 32 - PPC | KVM_REG_PPC_MMUCFG | 32 - PPC | KVM_REG_PPC_TLB0CFG | 32 - PPC | KVM_REG_PPC_TLB1CFG | 32 - PPC | KVM_REG_PPC_TLB2CFG | 32 - PPC | KVM_REG_PPC_TLB3CFG | 32 - PPC | KVM_REG_PPC_TLB0PS | 32 - PPC | KVM_REG_PPC_TLB1PS | 32 - PPC | KVM_REG_PPC_TLB2PS | 32 - PPC | KVM_REG_PPC_TLB3PS | 32 - PPC | KVM_REG_PPC_EPTCFG | 32 - PPC | KVM_REG_PPC_ICP_STATE | 64 - PPC | KVM_REG_PPC_VP_STATE | 128 - PPC | KVM_REG_PPC_TB_OFFSET | 64 - PPC | KVM_REG_PPC_SPMC1 | 32 - PPC | KVM_REG_PPC_SPMC2 | 32 - PPC | KVM_REG_PPC_IAMR | 64 - PPC | KVM_REG_PPC_TFHAR | 64 - PPC | KVM_REG_PPC_TFIAR | 64 - PPC | KVM_REG_PPC_TEXASR | 64 - PPC | KVM_REG_PPC_FSCR | 64 - PPC | KVM_REG_PPC_PSPB | 32 - PPC | KVM_REG_PPC_EBBHR | 64 - PPC | KVM_REG_PPC_EBBRR | 64 - PPC | KVM_REG_PPC_BESCR | 64 - PPC | KVM_REG_PPC_TAR | 64 - PPC | KVM_REG_PPC_DPDES | 64 - PPC | KVM_REG_PPC_DAWR | 64 - PPC | KVM_REG_PPC_DAWRX | 64 - PPC | KVM_REG_PPC_CIABR | 64 - PPC | KVM_REG_PPC_IC | 64 - PPC | KVM_REG_PPC_VTB | 64 - PPC | KVM_REG_PPC_CSIGR | 64 - PPC | KVM_REG_PPC_TACR | 64 - PPC | KVM_REG_PPC_TCSCR | 64 - PPC | KVM_REG_PPC_PID | 64 - PPC | KVM_REG_PPC_ACOP | 64 - PPC | KVM_REG_PPC_VRSAVE | 32 - PPC | KVM_REG_PPC_LPCR | 32 - PPC | KVM_REG_PPC_LPCR_64 | 64 - PPC | KVM_REG_PPC_PPR | 64 - PPC | KVM_REG_PPC_ARCH_COMPAT | 32 - PPC | KVM_REG_PPC_DABRX | 32 - PPC | KVM_REG_PPC_WORT | 64 - PPC | KVM_REG_PPC_SPRG9 | 64 - PPC | KVM_REG_PPC_DBSR | 32 - PPC | KVM_REG_PPC_TIDR | 64 - PPC | KVM_REG_PPC_PSSCR | 64 - PPC | KVM_REG_PPC_DEC_EXPIRY | 64 - PPC | KVM_REG_PPC_PTCR | 64 - PPC | KVM_REG_PPC_TM_GPR0 | 64 - ... - PPC | KVM_REG_PPC_TM_GPR31 | 64 - PPC | KVM_REG_PPC_TM_VSR0 | 128 - ... - PPC | KVM_REG_PPC_TM_VSR63 | 128 - PPC | KVM_REG_PPC_TM_CR | 64 - PPC | KVM_REG_PPC_TM_LR | 64 - PPC | KVM_REG_PPC_TM_CTR | 64 - PPC | KVM_REG_PPC_TM_FPSCR | 64 - PPC | KVM_REG_PPC_TM_AMR | 64 - PPC | KVM_REG_PPC_TM_PPR | 64 - PPC | KVM_REG_PPC_TM_VRSAVE | 64 - PPC | KVM_REG_PPC_TM_VSCR | 32 - PPC | KVM_REG_PPC_TM_DSCR | 64 - PPC | KVM_REG_PPC_TM_TAR | 64 - PPC | KVM_REG_PPC_TM_XER | 64 - | | - MIPS | KVM_REG_MIPS_R0 | 64 - ... - MIPS | KVM_REG_MIPS_R31 | 64 - MIPS | KVM_REG_MIPS_HI | 64 - MIPS | KVM_REG_MIPS_LO | 64 - MIPS | KVM_REG_MIPS_PC | 64 - MIPS | KVM_REG_MIPS_CP0_INDEX | 32 - MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 - MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 - MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 - MIPS | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32 - MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 - MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 - MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 - MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 - MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 - MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 - MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 - MIPS | KVM_REG_MIPS_CP0_PWBASE | 64 - MIPS | KVM_REG_MIPS_CP0_PWFIELD | 64 - MIPS | KVM_REG_MIPS_CP0_PWSIZE | 64 - MIPS | KVM_REG_MIPS_CP0_WIRED | 32 - MIPS | KVM_REG_MIPS_CP0_PWCTL | 32 - MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 - MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 - MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 - MIPS | KVM_REG_MIPS_CP0_BADINSTRP | 32 - MIPS | KVM_REG_MIPS_CP0_COUNT | 32 - MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 - MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 - MIPS | KVM_REG_MIPS_CP0_STATUS | 32 - MIPS | KVM_REG_MIPS_CP0_INTCTL | 32 - MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 - MIPS | KVM_REG_MIPS_CP0_EPC | 64 - MIPS | KVM_REG_MIPS_CP0_PRID | 32 - MIPS | KVM_REG_MIPS_CP0_EBASE | 64 - MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG3 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG4 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 - MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 - MIPS | KVM_REG_MIPS_CP0_XCONTEXT | 64 - MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 - MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 - MIPS | KVM_REG_MIPS_CP0_MAAR(0..63) | 64 - MIPS | KVM_REG_MIPS_COUNT_CTL | 64 - MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 - MIPS | KVM_REG_MIPS_COUNT_HZ | 64 - MIPS | KVM_REG_MIPS_FPR_32(0..31) | 32 - MIPS | KVM_REG_MIPS_FPR_64(0..31) | 64 - MIPS | KVM_REG_MIPS_VEC_128(0..31) | 128 - MIPS | KVM_REG_MIPS_FCR_IR | 32 - MIPS | KVM_REG_MIPS_FCR_CSR | 32 - MIPS | KVM_REG_MIPS_MSA_IR | 32 - MIPS | KVM_REG_MIPS_MSA_CSR | 32 - -ARM registers are mapped using the lower 32 bits. The upper 16 of that -is the register group type, or coprocessor number: - -ARM core registers have the following id bit patterns: - 0x4020 0000 0010 - -ARM 32-bit CP15 registers have the following id bit patterns: - 0x4020 0000 000F - -ARM 64-bit CP15 registers have the following id bit patterns: - 0x4030 0000 000F - -ARM CCSIDR registers are demultiplexed by CSSELR value: - 0x4020 0000 0011 00 - -ARM 32-bit VFP control registers have the following id bit patterns: - 0x4020 0000 0012 1 - -ARM 64-bit FP registers have the following id bit patterns: - 0x4030 0000 0012 0 - -ARM firmware pseudo-registers have the following bit pattern: - 0x4030 0000 0014 - - -arm64 registers are mapped using the lower 32 bits. The upper 16 of -that is the register group type, or coprocessor number: - -arm64 core/FP-SIMD registers have the following id bit patterns. Note -that the size of the access is variable, as the kvm_regs structure -contains elements ranging from 32 to 128 bits. The index is a 32bit -value in the kvm_regs structure seen as a 32bit array. - 0x60x0 0000 0010 - -Specifically: - Encoding Register Bits kvm_regs member ----------------------------------------------------------------- - 0x6030 0000 0010 0000 X0 64 regs.regs[0] - 0x6030 0000 0010 0002 X1 64 regs.regs[1] - ... - 0x6030 0000 0010 003c X30 64 regs.regs[30] - 0x6030 0000 0010 003e SP 64 regs.sp - 0x6030 0000 0010 0040 PC 64 regs.pc - 0x6030 0000 0010 0042 PSTATE 64 regs.pstate - 0x6030 0000 0010 0044 SP_EL1 64 sp_el1 - 0x6030 0000 0010 0046 ELR_EL1 64 elr_el1 - 0x6030 0000 0010 0048 SPSR_EL1 64 spsr[KVM_SPSR_EL1] (alias SPSR_SVC) - 0x6030 0000 0010 004a SPSR_ABT 64 spsr[KVM_SPSR_ABT] - 0x6030 0000 0010 004c SPSR_UND 64 spsr[KVM_SPSR_UND] - 0x6030 0000 0010 004e SPSR_IRQ 64 spsr[KVM_SPSR_IRQ] - 0x6060 0000 0010 0050 SPSR_FIQ 64 spsr[KVM_SPSR_FIQ] - 0x6040 0000 0010 0054 V0 128 fp_regs.vregs[0] (*) - 0x6040 0000 0010 0058 V1 128 fp_regs.vregs[1] (*) - ... - 0x6040 0000 0010 00d0 V31 128 fp_regs.vregs[31] (*) - 0x6020 0000 0010 00d4 FPSR 32 fp_regs.fpsr - 0x6020 0000 0010 00d5 FPCR 32 fp_regs.fpcr - -(*) These encodings are not accepted for SVE-enabled vcpus. See - KVM_ARM_VCPU_INIT. - - The equivalent register content can be accessed via bits [127:0] of - the corresponding SVE Zn registers instead for vcpus that have SVE - enabled (see below). - -arm64 CCSIDR registers are demultiplexed by CSSELR value: - 0x6020 0000 0011 00 - -arm64 system registers have the following id bit patterns: - 0x6030 0000 0013 - -WARNING: - Two system register IDs do not follow the specified pattern. These - are KVM_REG_ARM_TIMER_CVAL and KVM_REG_ARM_TIMER_CNT, which map to - system registers CNTV_CVAL_EL0 and CNTVCT_EL0 respectively. These - two had their values accidentally swapped, which means TIMER_CVAL is - derived from the register encoding for CNTVCT_EL0 and TIMER_CNT is - derived from the register encoding for CNTV_CVAL_EL0. As this is - API, it must remain this way. - -arm64 firmware pseudo-registers have the following bit pattern: - 0x6030 0000 0014 - -arm64 SVE registers have the following bit patterns: - 0x6080 0000 0015 00 Zn bits[2048*slice + 2047 : 2048*slice] - 0x6050 0000 0015 04 Pn bits[256*slice + 255 : 256*slice] - 0x6050 0000 0015 060 FFR bits[256*slice + 255 : 256*slice] - 0x6060 0000 0015 ffff KVM_REG_ARM64_SVE_VLS pseudo-register - -Access to register IDs where 2048 * slice >= 128 * max_vq will fail with -ENOENT. max_vq is the vcpu's maximum supported vector length in 128-bit -quadwords: see (**) below. - -These registers are only accessible on vcpus for which SVE is enabled. -See KVM_ARM_VCPU_INIT for details. - -In addition, except for KVM_REG_ARM64_SVE_VLS, these registers are not -accessible until the vcpu's SVE configuration has been finalized -using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). See KVM_ARM_VCPU_INIT -and KVM_ARM_VCPU_FINALIZE for more information about this procedure. - -KVM_REG_ARM64_SVE_VLS is a pseudo-register that allows the set of vector -lengths supported by the vcpu to be discovered and configured by -userspace. When transferred to or from user memory via KVM_GET_ONE_REG -or KVM_SET_ONE_REG, the value of this register is of type -__u64[KVM_ARM64_SVE_VLS_WORDS], and encodes the set of vector lengths as -follows: - -__u64 vector_lengths[KVM_ARM64_SVE_VLS_WORDS]; - -if (vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX && - ((vector_lengths[(vq - KVM_ARM64_SVE_VQ_MIN) / 64] >> - ((vq - KVM_ARM64_SVE_VQ_MIN) % 64)) & 1)) - /* Vector length vq * 16 bytes supported */ -else - /* Vector length vq * 16 bytes not supported */ - -(**) The maximum value vq for which the above condition is true is -max_vq. This is the maximum vector length available to the guest on -this vcpu, and determines which register slices are visible through -this ioctl interface. - -(See Documentation/arm64/sve.rst for an explanation of the "vq" -nomenclature.) - -KVM_REG_ARM64_SVE_VLS is only accessible after KVM_ARM_VCPU_INIT. -KVM_ARM_VCPU_INIT initialises it to the best set of vector lengths that -the host supports. - -Userspace may subsequently modify it if desired until the vcpu's SVE -configuration is finalized using KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE). - -Apart from simply removing all vector lengths from the host set that -exceed some value, support for arbitrarily chosen sets of vector lengths -is hardware-dependent and may not be available. Attempting to configure -an invalid set of vector lengths via KVM_SET_ONE_REG will fail with -EINVAL. - -After the vcpu's SVE configuration is finalized, further attempts to -write this register will fail with EPERM. - - -MIPS registers are mapped using the lower 32 bits. The upper 16 of that is -the register group type: - -MIPS core registers (see above) have the following id bit patterns: - 0x7030 0000 0000 - -MIPS CP0 registers (see KVM_REG_MIPS_CP0_* above) have the following id bit -patterns depending on whether they're 32-bit or 64-bit registers: - 0x7020 0000 0001 00 (32-bit) - 0x7030 0000 0001 00 (64-bit) - -Note: KVM_REG_MIPS_CP0_ENTRYLO0 and KVM_REG_MIPS_CP0_ENTRYLO1 are the MIPS64 -versions of the EntryLo registers regardless of the word size of the host -hardware, host kernel, guest, and whether XPA is present in the guest, i.e. -with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and -the PFNX field starting at bit 30. - -MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit -patterns: - 0x7030 0000 0001 01 - -MIPS KVM control registers (see above) have the following id bit patterns: - 0x7030 0000 0002 - -MIPS FPU registers (see KVM_REG_MIPS_FPR_{32,64}() above) have the following -id bit patterns depending on the size of the register being accessed. They are -always accessed according to the current guest FPU mode (Status.FR and -Config5.FRE), i.e. as the guest would see them, and they become unpredictable -if the guest FPU mode is changed. MIPS SIMD Architecture (MSA) vector -registers (see KVM_REG_MIPS_VEC_128() above) have similar patterns as they -overlap the FPU registers: - 0x7020 0000 0003 00 <0:3> (32-bit FPU registers) - 0x7030 0000 0003 00 <0:3> (64-bit FPU registers) - 0x7040 0000 0003 00 <0:3> (128-bit MSA vector registers) - -MIPS FPU control registers (see KVM_REG_MIPS_FCR_{IR,CSR} above) have the -following id bit patterns: - 0x7020 0000 0003 01 <0:3> - -MIPS MSA control registers (see KVM_REG_MIPS_MSA_{IR,CSR} above) have the -following id bit patterns: - 0x7020 0000 0003 02 <0:3> - - -4.69 KVM_GET_ONE_REG - -Capability: KVM_CAP_ONE_REG -Architectures: all -Type: vcpu ioctl -Parameters: struct kvm_one_reg (in and out) -Returns: 0 on success, negative value on failure -Errors include: -  ENOENT:   no such register -  EINVAL:   invalid register ID, or no such register -  EPERM:    (arm64) register access not allowed before vcpu finalization -(These error codes are indicative only: do not rely on a specific error -code being returned in a specific situation.) - -This ioctl allows to receive the value of a single register implemented -in a vcpu. The register to read is indicated by the "id" field of the -kvm_one_reg struct passed in. On success, the register value can be found -at the memory location pointed to by "addr". - -The list of registers accessible using this interface is identical to the -list in 4.68. - - -4.70 KVM_KVMCLOCK_CTRL - -Capability: KVM_CAP_KVMCLOCK_CTRL -Architectures: Any that implement pvclocks (currently x86 only) -Type: vcpu ioctl -Parameters: None -Returns: 0 on success, -1 on error - -This signals to the host kernel that the specified guest is being paused by -userspace. The host will set a flag in the pvclock structure that is checked -from the soft lockup watchdog. The flag is part of the pvclock structure that -is shared between guest and host, specifically the second bit of the flags -field of the pvclock_vcpu_time_info structure. It will be set exclusively by -the host and read/cleared exclusively by the guest. The guest operation of -checking and clearing the flag must an atomic operation so -load-link/store-conditional, or equivalent must be used. There are two cases -where the guest will clear the flag: when the soft lockup watchdog timer resets -itself or when a soft lockup is detected. This ioctl can be called any time -after pausing the vcpu, but before it is resumed. - - -4.71 KVM_SIGNAL_MSI - -Capability: KVM_CAP_SIGNAL_MSI -Architectures: x86 arm arm64 -Type: vm ioctl -Parameters: struct kvm_msi (in) -Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error - -Directly inject a MSI message. Only valid with in-kernel irqchip that handles -MSI messages. - -struct kvm_msi { - __u32 address_lo; - __u32 address_hi; - __u32 data; - __u32 flags; - __u32 devid; - __u8 pad[12]; -}; - -flags: KVM_MSI_VALID_DEVID: devid contains a valid value. The per-VM - KVM_CAP_MSI_DEVID capability advertises the requirement to provide - the device ID. If this capability is not available, userspace - should never set the KVM_MSI_VALID_DEVID flag as the ioctl might fail. - -If KVM_MSI_VALID_DEVID is set, devid contains a unique device identifier -for the device that wrote the MSI message. For PCI, this is usually a -BFD identifier in the lower 16 bits. - -On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS -feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled, -address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of -address_hi must be zero. - - -4.71 KVM_CREATE_PIT2 - -Capability: KVM_CAP_PIT2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_config (in) -Returns: 0 on success, -1 on error - -Creates an in-kernel device model for the i8254 PIT. This call is only valid -after enabling in-kernel irqchip support via KVM_CREATE_IRQCHIP. The following -parameters have to be passed: - -struct kvm_pit_config { - __u32 flags; - __u32 pad[15]; -}; - -Valid flags are: - -#define KVM_PIT_SPEAKER_DUMMY 1 /* emulate speaker port stub */ - -PIT timer interrupts may use a per-VM kernel thread for injection. If it -exists, this thread will have a name of the following pattern: - -kvm-pit/ - -When running a guest with elevated priorities, the scheduling parameters of -this thread may have to be adjusted accordingly. - -This IOCTL replaces the obsolete KVM_CREATE_PIT. - - -4.72 KVM_GET_PIT2 - -Capability: KVM_CAP_PIT_STATE2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_state2 (out) -Returns: 0 on success, -1 on error - -Retrieves the state of the in-kernel PIT model. Only valid after -KVM_CREATE_PIT2. The state is returned in the following structure: - -struct kvm_pit_state2 { - struct kvm_pit_channel_state channels[3]; - __u32 flags; - __u32 reserved[9]; -}; - -Valid flags are: - -/* disable PIT in HPET legacy mode */ -#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 - -This IOCTL replaces the obsolete KVM_GET_PIT. - - -4.73 KVM_SET_PIT2 - -Capability: KVM_CAP_PIT_STATE2 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pit_state2 (in) -Returns: 0 on success, -1 on error - -Sets the state of the in-kernel PIT model. Only valid after KVM_CREATE_PIT2. -See KVM_GET_PIT2 for details on struct kvm_pit_state2. - -This IOCTL replaces the obsolete KVM_SET_PIT. - - -4.74 KVM_PPC_GET_SMMU_INFO - -Capability: KVM_CAP_PPC_GET_SMMU_INFO -Architectures: powerpc -Type: vm ioctl -Parameters: None -Returns: 0 on success, -1 on error - -This populates and returns a structure describing the features of -the "Server" class MMU emulation supported by KVM. -This can in turn be used by userspace to generate the appropriate -device-tree properties for the guest operating system. - -The structure contains some global information, followed by an -array of supported segment page sizes: - - struct kvm_ppc_smmu_info { - __u64 flags; - __u32 slb_size; - __u32 pad; - struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; - }; - -The supported flags are: - - - KVM_PPC_PAGE_SIZES_REAL: - When that flag is set, guest page sizes must "fit" the backing - store page sizes. When not set, any page size in the list can - be used regardless of how they are backed by userspace. - - - KVM_PPC_1T_SEGMENTS - The emulated MMU supports 1T segments in addition to the - standard 256M ones. - - - KVM_PPC_NO_HASH - This flag indicates that HPT guests are not supported by KVM, - thus all guests must use radix MMU mode. - -The "slb_size" field indicates how many SLB entries are supported - -The "sps" array contains 8 entries indicating the supported base -page sizes for a segment in increasing order. Each entry is defined -as follow: - - struct kvm_ppc_one_seg_page_size { - __u32 page_shift; /* Base page shift of segment (or 0) */ - __u32 slb_enc; /* SLB encoding for BookS */ - struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ]; - }; - -An entry with a "page_shift" of 0 is unused. Because the array is -organized in increasing order, a lookup can stop when encoutering -such an entry. - -The "slb_enc" field provides the encoding to use in the SLB for the -page size. The bits are in positions such as the value can directly -be OR'ed into the "vsid" argument of the slbmte instruction. - -The "enc" array is a list which for each of those segment base page -size provides the list of supported actual page sizes (which can be -only larger or equal to the base page size), along with the -corresponding encoding in the hash PTE. Similarly, the array is -8 entries sorted by increasing sizes and an entry with a "0" shift -is an empty entry and a terminator: - - struct kvm_ppc_one_page_size { - __u32 page_shift; /* Page shift (or 0) */ - __u32 pte_enc; /* Encoding in the HPTE (>>12) */ - }; - -The "pte_enc" field provides a value that can OR'ed into the hash -PTE's RPN field (ie, it needs to be shifted left by 12 to OR it -into the hash PTE second double word). - -4.75 KVM_IRQFD - -Capability: KVM_CAP_IRQFD -Architectures: x86 s390 arm arm64 -Type: vm ioctl -Parameters: struct kvm_irqfd (in) -Returns: 0 on success, -1 on error - -Allows setting an eventfd to directly trigger a guest interrupt. -kvm_irqfd.fd specifies the file descriptor to use as the eventfd and -kvm_irqfd.gsi specifies the irqchip pin toggled by this event. When -an event is triggered on the eventfd, an interrupt is injected into -the guest using the specified gsi pin. The irqfd is removed using -the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd -and kvm_irqfd.gsi. - -With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify -mechanism allowing emulation of level-triggered, irqfd-based -interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an -additional eventfd in the kvm_irqfd.resamplefd field. When operating -in resample mode, posting of an interrupt through kvm_irq.fd asserts -the specified gsi in the irqchip. When the irqchip is resampled, such -as from an EOI, the gsi is de-asserted and the user is notified via -kvm_irqfd.resamplefd. It is the user's responsibility to re-queue -the interrupt if the device making use of it still requires service. -Note that closing the resamplefd is not sufficient to disable the -irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment -and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. - -On arm/arm64, gsi routing being supported, the following can happen: -- in case no routing entry is associated to this gsi, injection fails -- in case the gsi is associated to an irqchip routing entry, - irqchip.pin + 32 corresponds to the injected SPI ID. -- in case the gsi is associated to an MSI routing entry, the MSI - message and device ID are translated into an LPI (support restricted - to GICv3 ITS in-kernel emulation). - -4.76 KVM_PPC_ALLOCATE_HTAB - -Capability: KVM_CAP_PPC_ALLOC_HTAB -Architectures: powerpc -Type: vm ioctl -Parameters: Pointer to u32 containing hash table order (in/out) -Returns: 0 on success, -1 on error - -This requests the host kernel to allocate an MMU hash table for a -guest using the PAPR paravirtualization interface. This only does -anything if the kernel is configured to use the Book 3S HV style of -virtualization. Otherwise the capability doesn't exist and the ioctl -returns an ENOTTY error. The rest of this description assumes Book 3S -HV. - -There must be no vcpus running when this ioctl is called; if there -are, it will do nothing and return an EBUSY error. - -The parameter is a pointer to a 32-bit unsigned integer variable -containing the order (log base 2) of the desired size of the hash -table, which must be between 18 and 46. On successful return from the -ioctl, the value will not be changed by the kernel. - -If no hash table has been allocated when any vcpu is asked to run -(with the KVM_RUN ioctl), the host kernel will allocate a -default-sized hash table (16 MB). - -If this ioctl is called when a hash table has already been allocated, -with a different order from the existing hash table, the existing hash -table will be freed and a new one allocated. If this is ioctl is -called when a hash table has already been allocated of the same order -as specified, the kernel will clear out the existing hash table (zero -all HPTEs). In either case, if the guest is using the virtualized -real-mode area (VRMA) facility, the kernel will re-create the VMRA -HPTEs on the next KVM_RUN of any vcpu. - -4.77 KVM_S390_INTERRUPT - -Capability: basic -Architectures: s390 -Type: vm ioctl, vcpu ioctl -Parameters: struct kvm_s390_interrupt (in) -Returns: 0 on success, -1 on error - -Allows to inject an interrupt to the guest. Interrupts can be floating -(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. - -Interrupt parameters are passed via kvm_s390_interrupt: - -struct kvm_s390_interrupt { - __u32 type; - __u32 parm; - __u64 parm64; -}; - -type can be one of the following: - -KVM_S390_SIGP_STOP (vcpu) - sigp stop; optional flags in parm -KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm -KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm -KVM_S390_RESTART (vcpu) - restart -KVM_S390_INT_CLOCK_COMP (vcpu) - clock comparator interrupt -KVM_S390_INT_CPU_TIMER (vcpu) - CPU timer interrupt -KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt - parameters in parm and parm64 -KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm -KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm -KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm -KVM_S390_INT_IO(ai,cssid,ssid,schid) (vm) - compound value to indicate an - I/O interrupt (ai - adapter interrupt; cssid,ssid,schid - subchannel); - I/O interruption parameters in parm (subchannel) and parm64 (intparm, - interruption subclass) -KVM_S390_MCHK (vm, vcpu) - machine check interrupt; cr 14 bits in parm, - machine check interrupt code in parm64 (note that - machine checks needing further payload are not - supported by this ioctl) - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -4.78 KVM_PPC_GET_HTAB_FD - -Capability: KVM_CAP_PPC_HTAB_FD -Architectures: powerpc -Type: vm ioctl -Parameters: Pointer to struct kvm_get_htab_fd (in) -Returns: file descriptor number (>= 0) on success, -1 on error - -This returns a file descriptor that can be used either to read out the -entries in the guest's hashed page table (HPT), or to write entries to -initialize the HPT. The returned fd can only be written to if the -KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and -can only be read if that bit is clear. The argument struct looks like -this: - -/* For KVM_PPC_GET_HTAB_FD */ -struct kvm_get_htab_fd { - __u64 flags; - __u64 start_index; - __u64 reserved[2]; -}; - -/* Values for kvm_get_htab_fd.flags */ -#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) -#define KVM_GET_HTAB_WRITE ((__u64)0x2) - -The `start_index' field gives the index in the HPT of the entry at -which to start reading. It is ignored when writing. - -Reads on the fd will initially supply information about all -"interesting" HPT entries. Interesting entries are those with the -bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise -all entries. When the end of the HPT is reached, the read() will -return. If read() is called again on the fd, it will start again from -the beginning of the HPT, but will only return HPT entries that have -changed since they were last read. - -Data read or written is structured as a header (8 bytes) followed by a -series of valid HPT entries (16 bytes) each. The header indicates how -many valid HPT entries there are and how many invalid entries follow -the valid entries. The invalid entries are not represented explicitly -in the stream. The header format is: - -struct kvm_get_htab_header { - __u32 index; - __u16 n_valid; - __u16 n_invalid; -}; - -Writes to the fd create HPT entries starting at the index given in the -header; first `n_valid' valid entries with contents from the data -written, then `n_invalid' invalid entries, invalidating any previously -valid entries found. - -4.79 KVM_CREATE_DEVICE - -Capability: KVM_CAP_DEVICE_CTRL -Type: vm ioctl -Parameters: struct kvm_create_device (in/out) -Returns: 0 on success, -1 on error -Errors: - ENODEV: The device type is unknown or unsupported - EEXIST: Device already created, and this type of device may not - be instantiated multiple times - - Other error conditions may be defined by individual device types or - have their standard meanings. - -Creates an emulated device in the kernel. The file descriptor returned -in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR. - -If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the -device type is supported (not necessarily whether it can be created -in the current vm). - -Individual devices should not define flags. Attributes should be used -for specifying any behavior that is not implied by the device type -number. - -struct kvm_create_device { - __u32 type; /* in: KVM_DEV_TYPE_xxx */ - __u32 fd; /* out: device handle */ - __u32 flags; /* in: KVM_CREATE_DEVICE_xxx */ -}; - -4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR - -Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device -Type: device ioctl, vm ioctl, vcpu ioctl -Parameters: struct kvm_device_attr -Returns: 0 on success, -1 on error -Errors: - ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - EPERM: The attribute cannot (currently) be accessed this way - (e.g. read-only attribute, or attribute that only makes - sense when the device is in a different state) - - Other error conditions may be defined by individual device types. - -Gets/sets a specified piece of device configuration and/or state. The -semantics are device-specific. See individual device documentation in -the "devices" directory. As with ONE_REG, the size of the data -transferred is defined by the particular attribute. - -struct kvm_device_attr { - __u32 flags; /* no flags currently defined */ - __u32 group; /* device-defined */ - __u64 attr; /* group-defined */ - __u64 addr; /* userspace address of attr data */ -}; - -4.81 KVM_HAS_DEVICE_ATTR - -Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device, - KVM_CAP_VCPU_ATTRIBUTES for vcpu device -Type: device ioctl, vm ioctl, vcpu ioctl -Parameters: struct kvm_device_attr -Returns: 0 on success, -1 on error -Errors: - ENXIO: The group or attribute is unknown/unsupported for this device - or hardware support is missing. - -Tests whether a device supports a particular attribute. A successful -return indicates the attribute is implemented. It does not necessarily -indicate that the attribute can be read or written in the device's -current state. "addr" is ignored. - -4.82 KVM_ARM_VCPU_INIT - -Capability: basic -Architectures: arm, arm64 -Type: vcpu ioctl -Parameters: struct kvm_vcpu_init (in) -Returns: 0 on success; -1 on error -Errors: -  EINVAL:    the target is unknown, or the combination of features is invalid. -  ENOENT:    a features bit specified is unknown. - -This tells KVM what type of CPU to present to the guest, and what -optional features it should have.  This will cause a reset of the cpu -registers to their initial values.  If this is not called, KVM_RUN will -return ENOEXEC for that vcpu. - -Note that because some registers reflect machine topology, all vcpus -should be created before this ioctl is invoked. - -Userspace can call this function multiple times for a given vcpu, including -after the vcpu has been run. This will reset the vcpu to its initial -state. All calls to this function after the initial call must use the same -target and same set of feature flags, otherwise EINVAL will be returned. - -Possible features: - - KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state. - Depends on KVM_CAP_ARM_PSCI. If not set, the CPU will be powered on - and execute guest code when KVM_RUN is called. - - KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode. - Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only). - - KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 (or a future revision - backward compatible with v0.2) for the CPU. - Depends on KVM_CAP_ARM_PSCI_0_2. - - KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU. - Depends on KVM_CAP_ARM_PMU_V3. - - - KVM_ARM_VCPU_PTRAUTH_ADDRESS: Enables Address Pointer authentication - for arm64 only. - Depends on KVM_CAP_ARM_PTRAUTH_ADDRESS. - If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are - both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and - KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be - requested. - - - KVM_ARM_VCPU_PTRAUTH_GENERIC: Enables Generic Pointer authentication - for arm64 only. - Depends on KVM_CAP_ARM_PTRAUTH_GENERIC. - If KVM_CAP_ARM_PTRAUTH_ADDRESS and KVM_CAP_ARM_PTRAUTH_GENERIC are - both present, then both KVM_ARM_VCPU_PTRAUTH_ADDRESS and - KVM_ARM_VCPU_PTRAUTH_GENERIC must be requested or neither must be - requested. - - - KVM_ARM_VCPU_SVE: Enables SVE for the CPU (arm64 only). - Depends on KVM_CAP_ARM_SVE. - Requires KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - * After KVM_ARM_VCPU_INIT: - - - KVM_REG_ARM64_SVE_VLS may be read using KVM_GET_ONE_REG: the - initial value of this pseudo-register indicates the best set of - vector lengths possible for a vcpu on this host. - - * Before KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - - KVM_RUN and KVM_GET_REG_LIST are not available; - - - KVM_GET_ONE_REG and KVM_SET_ONE_REG cannot be used to access - the scalable archietctural SVE registers - KVM_REG_ARM64_SVE_ZREG(), KVM_REG_ARM64_SVE_PREG() or - KVM_REG_ARM64_SVE_FFR; - - - KVM_REG_ARM64_SVE_VLS may optionally be written using - KVM_SET_ONE_REG, to modify the set of vector lengths available - for the vcpu. - - * After KVM_ARM_VCPU_FINALIZE(KVM_ARM_VCPU_SVE): - - - the KVM_REG_ARM64_SVE_VLS pseudo-register is immutable, and can - no longer be written using KVM_SET_ONE_REG. - -4.83 KVM_ARM_PREFERRED_TARGET - -Capability: basic -Architectures: arm, arm64 -Type: vm ioctl -Parameters: struct struct kvm_vcpu_init (out) -Returns: 0 on success; -1 on error -Errors: - ENODEV: no preferred target available for the host - -This queries KVM for preferred CPU target type which can be emulated -by KVM on underlying host. - -The ioctl returns struct kvm_vcpu_init instance containing information -about preferred CPU target type and recommended features for it. The -kvm_vcpu_init->features bitmap returned will have feature bits set if -the preferred target recommends setting these features, but this is -not mandatory. - -The information returned by this ioctl can be used to prepare an instance -of struct kvm_vcpu_init for KVM_ARM_VCPU_INIT ioctl which will result in -in VCPU matching underlying host. - - -4.84 KVM_GET_REG_LIST - -Capability: basic -Architectures: arm, arm64, mips -Type: vcpu ioctl -Parameters: struct kvm_reg_list (in/out) -Returns: 0 on success; -1 on error -Errors: -  E2BIG:     the reg index list is too big to fit in the array specified by -             the user (the number required will be written into n). - -struct kvm_reg_list { - __u64 n; /* number of registers in reg[] */ - __u64 reg[0]; -}; - -This ioctl returns the guest registers that are supported for the -KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. - - -4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) - -Capability: KVM_CAP_ARM_SET_DEVICE_ADDR -Architectures: arm, arm64 -Type: vm ioctl -Parameters: struct kvm_arm_device_address (in) -Returns: 0 on success, -1 on error -Errors: - ENODEV: The device id is unknown - ENXIO: Device not supported on current system - EEXIST: Address already set - E2BIG: Address outside guest physical address space - EBUSY: Address overlaps with other device range - -struct kvm_arm_device_addr { - __u64 id; - __u64 addr; -}; - -Specify a device address in the guest's physical address space where guests -can access emulated or directly exposed devices, which the host kernel needs -to know about. The id field is an architecture specific identifier for a -specific device. - -ARM/arm64 divides the id field into two parts, a device id and an -address type id specific to the individual device. - -  bits: | 63 ... 32 | 31 ... 16 | 15 ... 0 | - field: | 0x00000000 | device id | addr type id | - -ARM/arm64 currently only require this when using the in-kernel GIC -support for the hardware VGIC features, using KVM_ARM_DEVICE_VGIC_V2 -as the device id. When setting the base address for the guest's -mapping of the VGIC virtual CPU and distributor interface, the ioctl -must be called after calling KVM_CREATE_IRQCHIP, but before calling -KVM_RUN on any of the VCPUs. Calling this ioctl twice for any of the -base addresses will return -EEXIST. - -Note, this IOCTL is deprecated and the more flexible SET/GET_DEVICE_ATTR API -should be used instead. - - -4.86 KVM_PPC_RTAS_DEFINE_TOKEN - -Capability: KVM_CAP_PPC_RTAS -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_rtas_token_args -Returns: 0 on success, -1 on error - -Defines a token value for a RTAS (Run Time Abstraction Services) -service in order to allow it to be handled in the kernel. The -argument struct gives the name of the service, which must be the name -of a service that has a kernel-side implementation. If the token -value is non-zero, it will be associated with that service, and -subsequent RTAS calls by the guest specifying that token will be -handled by the kernel. If the token value is 0, then any token -associated with the service will be forgotten, and subsequent RTAS -calls by the guest for that service will be passed to userspace to be -handled. - -4.87 KVM_SET_GUEST_DEBUG - -Capability: KVM_CAP_SET_GUEST_DEBUG -Architectures: x86, s390, ppc, arm64 -Type: vcpu ioctl -Parameters: struct kvm_guest_debug (in) -Returns: 0 on success; -1 on error - -struct kvm_guest_debug { - __u32 control; - __u32 pad; - struct kvm_guest_debug_arch arch; -}; - -Set up the processor specific debug registers and configure vcpu for -handling guest debug events. There are two parts to the structure, the -first a control bitfield indicates the type of debug events to handle -when running. Common control bits are: - - - KVM_GUESTDBG_ENABLE: guest debugging is enabled - - KVM_GUESTDBG_SINGLESTEP: the next run should single-step - -The top 16 bits of the control field are architecture specific control -flags which can include the following: - - - KVM_GUESTDBG_USE_SW_BP: using software breakpoints [x86, arm64] - - KVM_GUESTDBG_USE_HW_BP: using hardware breakpoints [x86, s390, arm64] - - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] - -For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints -are enabled in memory so we need to ensure breakpoint exceptions are -correctly trapped and the KVM run loop exits at the breakpoint and not -running off into the normal guest vector. For KVM_GUESTDBG_USE_HW_BP -we need to ensure the guest vCPUs architecture specific registers are -updated to the correct (supplied) values. - -The second part of the structure is architecture specific and -typically contains a set of debug registers. - -For arm64 the number of debug registers is implementation defined and -can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and -KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number -indicating the number of supported registers. - -For ppc, the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability indicates whether -the single-step debug event (KVM_GUESTDBG_SINGLESTEP) is supported. - -When debug events exit the main run loop with the reason -KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run -structure containing architecture specific debug information. - -4.88 KVM_GET_EMULATED_CPUID - -Capability: KVM_CAP_EXT_EMUL_CPUID -Architectures: x86 -Type: system ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 flags; - struct kvm_cpuid_entry2 entries[0]; -}; - -The member 'flags' is used for passing flags from userspace. - -#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0) -#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1) -#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2) - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features which are emulated by -kvm.Userspace can use the information returned by this ioctl to query -which features are emulated by kvm instead of being present natively. - -Userspace invokes KVM_GET_EMULATED_CPUID by passing a kvm_cpuid2 -structure with the 'nent' field indicating the number of entries in -the variable-size array 'entries'. If the number of entries is too low -to describe the cpu capabilities, an error (E2BIG) is returned. If the -number is too high, the 'nent' field is adjusted and an error (ENOMEM) -is returned. If the number is just right, the 'nent' field is adjusted -to the number of valid entries in the 'entries' array, which is then -filled. - -The entries returned are the set CPUID bits of the respective features -which kvm emulates, as returned by the CPUID instruction, with unknown -or unsupported feature bits cleared. - -Features like x2apic, for example, may not be present in the host cpu -but are exposed by kvm in KVM_GET_SUPPORTED_CPUID because they can be -emulated efficiently and thus not included here. - -The fields in each entry are defined as follows: - - function: the eax value used to obtain the entry - index: the ecx value used to obtain the entry (for entries that are - affected by ecx) - flags: an OR of zero or more of the following: - KVM_CPUID_FLAG_SIGNIFCANT_INDEX: - if the index field is valid - KVM_CPUID_FLAG_STATEFUL_FUNC: - if cpuid for this function returns different values for successive - invocations; there will be several entries with the same function, - all with this flag set - KVM_CPUID_FLAG_STATE_READ_NEXT: - for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is - the first entry to be read by a cpu - eax, ebx, ecx, edx: the values returned by the cpuid instruction for - this function/index combination - -4.89 KVM_S390_MEM_OP - -Capability: KVM_CAP_S390_MEM_OP -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_mem_op (in) -Returns: = 0 on success, - < 0 on generic error (e.g. -EFAULT or -ENOMEM), - > 0 if an exception occurred while walking the page tables - -Read or write data from/to the logical (virtual) memory of a VCPU. - -Parameters are specified via the following structure: - -struct kvm_s390_mem_op { - __u64 gaddr; /* the guest address */ - __u64 flags; /* flags */ - __u32 size; /* amount of bytes */ - __u32 op; /* type of operation */ - __u64 buf; /* buffer in userspace */ - __u8 ar; /* the access register number */ - __u8 reserved[31]; /* should be set to 0 */ -}; - -The type of operation is specified in the "op" field. It is either -KVM_S390_MEMOP_LOGICAL_READ for reading from logical memory space or -KVM_S390_MEMOP_LOGICAL_WRITE for writing to logical memory space. The -KVM_S390_MEMOP_F_CHECK_ONLY flag can be set in the "flags" field to check -whether the corresponding memory access would create an access exception -(without touching the data in the memory at the destination). In case an -access exception occurred while walking the MMU tables of the guest, the -ioctl returns a positive error number to indicate the type of exception. -This exception is also raised directly at the corresponding VCPU if the -flag KVM_S390_MEMOP_F_INJECT_EXCEPTION is set in the "flags" field. - -The start address of the memory region has to be specified in the "gaddr" -field, and the length of the region in the "size" field (which must not -be 0). The maximum value for "size" can be obtained by checking the -KVM_CAP_S390_MEM_OP capability. "buf" is the buffer supplied by the -userspace application where the read data should be written to for -KVM_S390_MEMOP_LOGICAL_READ, or where the data that should be written is -stored for a KVM_S390_MEMOP_LOGICAL_WRITE. When KVM_S390_MEMOP_F_CHECK_ONLY -is specified, "buf" is unused and can be NULL. "ar" designates the access -register number to be used; the valid range is 0..15. - -The "reserved" field is meant for future extensions. It is not used by -KVM with the currently defined set of flags. - -4.90 KVM_S390_GET_SKEYS - -Capability: KVM_CAP_S390_SKEYS -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_skeys -Returns: 0 on success, KVM_S390_GET_KEYS_NONE if guest is not using storage - keys, negative value on error - -This ioctl is used to get guest storage key values on the s390 -architecture. The ioctl takes parameters via the kvm_s390_skeys struct. - -struct kvm_s390_skeys { - __u64 start_gfn; - __u64 count; - __u64 skeydata_addr; - __u32 flags; - __u32 reserved[9]; -}; - -The start_gfn field is the number of the first guest frame whose storage keys -you want to get. - -The count field is the number of consecutive frames (starting from start_gfn) -whose storage keys to get. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range -will cause the ioctl to return -EINVAL. - -The skeydata_addr field is the address to a buffer large enough to hold count -bytes. This buffer will be filled with storage key data by the ioctl. - -4.91 KVM_S390_SET_SKEYS - -Capability: KVM_CAP_S390_SKEYS -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_skeys -Returns: 0 on success, negative value on error - -This ioctl is used to set guest storage key values on the s390 -architecture. The ioctl takes parameters via the kvm_s390_skeys struct. -See section on KVM_S390_GET_SKEYS for struct definition. - -The start_gfn field is the number of the first guest frame whose storage keys -you want to set. - -The count field is the number of consecutive frames (starting from start_gfn) -whose storage keys to get. The count field must be at least 1 and the maximum -allowed value is defined as KVM_S390_SKEYS_ALLOC_MAX. Values outside this range -will cause the ioctl to return -EINVAL. - -The skeydata_addr field is the address to a buffer containing count bytes of -storage keys. Each byte in the buffer will be set as the storage key for a -single frame starting at start_gfn for count frames. - -Note: If any architecturally invalid key value is found in the given data then -the ioctl will return -EINVAL. - -4.92 KVM_S390_IRQ - -Capability: KVM_CAP_S390_INJECT_IRQ -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq (in) -Returns: 0 on success, -1 on error -Errors: - EINVAL: interrupt type is invalid - type is KVM_S390_SIGP_STOP and flag parameter is invalid value - type is KVM_S390_INT_EXTERNAL_CALL and code is bigger - than the maximum of VCPUs - EBUSY: type is KVM_S390_SIGP_SET_PREFIX and vcpu is not stopped - type is KVM_S390_SIGP_STOP and a stop irq is already pending - type is KVM_S390_INT_EXTERNAL_CALL and an external call interrupt - is already pending - -Allows to inject an interrupt to the guest. - -Using struct kvm_s390_irq as a parameter allows -to inject additional payload which is not -possible via KVM_S390_INTERRUPT. - -Interrupt parameters are passed via kvm_s390_irq: - -struct kvm_s390_irq { - __u64 type; - union { - struct kvm_s390_io_info io; - struct kvm_s390_ext_info ext; - struct kvm_s390_pgm_info pgm; - struct kvm_s390_emerg_info emerg; - struct kvm_s390_extcall_info extcall; - struct kvm_s390_prefix_info prefix; - struct kvm_s390_stop_info stop; - struct kvm_s390_mchk_info mchk; - char reserved[64]; - } u; -}; - -type can be one of the following: - -KVM_S390_SIGP_STOP - sigp stop; parameter in .stop -KVM_S390_PROGRAM_INT - program check; parameters in .pgm -KVM_S390_SIGP_SET_PREFIX - sigp set prefix; parameters in .prefix -KVM_S390_RESTART - restart; no parameters -KVM_S390_INT_CLOCK_COMP - clock comparator interrupt; no parameters -KVM_S390_INT_CPU_TIMER - CPU timer interrupt; no parameters -KVM_S390_INT_EMERGENCY - sigp emergency; parameters in .emerg -KVM_S390_INT_EXTERNAL_CALL - sigp external call; parameters in .extcall -KVM_S390_MCHK - machine check interrupt; parameters in .mchk - -This is an asynchronous vcpu ioctl and can be invoked from any thread. - -4.94 KVM_S390_GET_IRQ_STATE - -Capability: KVM_CAP_S390_IRQ_STATE -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq_state (out) -Returns: >= number of bytes copied into buffer, - -EINVAL if buffer size is 0, - -ENOBUFS if buffer size is too small to fit all pending interrupts, - -EFAULT if the buffer address was invalid - -This ioctl allows userspace to retrieve the complete state of all currently -pending interrupts in a single buffer. Use cases include migration -and introspection. The parameter structure contains the address of a -userspace buffer and its length: - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - -Userspace passes in the above struct and for each pending interrupt a -struct kvm_s390_irq is copied to the provided buffer. - -The structure contains a flags and a reserved field for future extensions. As -the kernel never checked for flags == 0 and QEMU never pre-zeroed flags and -reserved, these fields can not be used in the future without breaking -compatibility. - -If -ENOBUFS is returned the buffer provided was too small and userspace -may retry with a bigger buffer. - -4.95 KVM_S390_SET_IRQ_STATE - -Capability: KVM_CAP_S390_IRQ_STATE -Architectures: s390 -Type: vcpu ioctl -Parameters: struct kvm_s390_irq_state (in) -Returns: 0 on success, - -EFAULT if the buffer address was invalid, - -EINVAL for an invalid buffer length (see below), - -EBUSY if there were already interrupts pending, - errors occurring when actually injecting the - interrupt. See KVM_S390_IRQ. - -This ioctl allows userspace to set the complete state of all cpu-local -interrupts currently pending for the vcpu. It is intended for restoring -interrupt state after a migration. The input parameter is a userspace buffer -containing a struct kvm_s390_irq_state: - -struct kvm_s390_irq_state { - __u64 buf; - __u32 flags; /* will stay unused for compatibility reasons */ - __u32 len; - __u32 reserved[4]; /* will stay unused for compatibility reasons */ -}; - -The restrictions for flags and reserved apply as well. -(see KVM_S390_GET_IRQ_STATE) - -The userspace memory referenced by buf contains a struct kvm_s390_irq -for each interrupt to be injected into the guest. -If one of the interrupts could not be injected for some reason the -ioctl aborts. - -len must be a multiple of sizeof(struct kvm_s390_irq). It must be > 0 -and it must not exceed (max_vcpus + 32) * sizeof(struct kvm_s390_irq), -which is the maximum number of possibly pending cpu-local interrupts. - -4.96 KVM_SMI - -Capability: KVM_CAP_X86_SMM -Architectures: x86 -Type: vcpu ioctl -Parameters: none -Returns: 0 on success, -1 on error - -Queues an SMI on the thread's vcpu. - -4.97 KVM_CAP_PPC_MULTITCE - -Capability: KVM_CAP_PPC_MULTITCE -Architectures: ppc -Type: vm - -This capability means the kernel is capable of handling hypercalls -H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user -space. This significantly accelerates DMA operations for PPC KVM guests. -User space should expect that its handlers for these hypercalls -are not going to be called if user space previously registered LIOBN -in KVM (via KVM_CREATE_SPAPR_TCE or similar calls). - -In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest, -user space might have to advertise it for the guest. For example, -IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is -present in the "ibm,hypertas-functions" device-tree property. - -The hypercalls mentioned above may or may not be processed successfully -in the kernel based fast path. If they can not be handled by the kernel, -they will get passed on to user space. So user space still has to have -an implementation for these despite the in kernel acceleration. - -This capability is always enabled. - -4.98 KVM_CREATE_SPAPR_TCE_64 - -Capability: KVM_CAP_SPAPR_TCE_64 -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_create_spapr_tce_64 (in) -Returns: file descriptor for manipulating the created TCE table - -This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit -windows, described in 4.62 KVM_CREATE_SPAPR_TCE - -This capability uses extended struct in ioctl interface: - -/* for KVM_CAP_SPAPR_TCE_64 */ -struct kvm_create_spapr_tce_64 { - __u64 liobn; - __u32 page_shift; - __u32 flags; - __u64 offset; /* in pages */ - __u64 size; /* in pages */ -}; - -The aim of extension is to support an additional bigger DMA window with -a variable page size. -KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and -a bus offset of the corresponding DMA window, @size and @offset are numbers -of IOMMU pages. - -@flags are not used at the moment. - -The rest of functionality is identical to KVM_CREATE_SPAPR_TCE. - -4.99 KVM_REINJECT_CONTROL - -Capability: KVM_CAP_REINJECT_CONTROL -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_reinject_control (in) -Returns: 0 on success, - -EFAULT if struct kvm_reinject_control cannot be read, - -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier. - -i8254 (PIT) has two modes, reinject and !reinject. The default is reinject, -where KVM queues elapsed i8254 ticks and monitors completion of interrupt from -vector(s) that i8254 injects. Reinject mode dequeues a tick and injects its -interrupt whenever there isn't a pending interrupt from i8254. -!reinject mode injects an interrupt as soon as a tick arrives. - -struct kvm_reinject_control { - __u8 pit_reinject; - __u8 reserved[31]; -}; - -pit_reinject = 0 (!reinject mode) is recommended, unless running an old -operating system that uses the PIT for timing (e.g. Linux 2.4.x). - -4.100 KVM_PPC_CONFIGURE_V3_MMU - -Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_mmuv3_cfg (in) -Returns: 0 on success, - -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, - -EINVAL if the configuration is invalid - -This ioctl controls whether the guest will use radix or HPT (hashed -page table) translation, and sets the pointer to the process table for -the guest. - -struct kvm_ppc_mmuv3_cfg { - __u64 flags; - __u64 process_table; -}; - -There are two bits that can be set in flags; KVM_PPC_MMUV3_RADIX and -KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest -to use radix tree translation, and if clear, to use HPT translation. -KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest -to be able to use the global TLB and SLB invalidation instructions; -if clear, the guest may not use these instructions. - -The process_table field specifies the address and size of the guest -process table, which is in the guest's space. This field is formatted -as the second doubleword of the partition table entry, as defined in -the Power ISA V3.00, Book III section 5.7.6.1. - -4.101 KVM_PPC_GET_RMMU_INFO - -Capability: KVM_CAP_PPC_RADIX_MMU -Architectures: ppc -Type: vm ioctl -Parameters: struct kvm_ppc_rmmu_info (out) -Returns: 0 on success, - -EFAULT if struct kvm_ppc_rmmu_info cannot be written, - -EINVAL if no useful information can be returned - -This ioctl returns a structure containing two things: (a) a list -containing supported radix tree geometries, and (b) a list that maps -page sizes to put in the "AP" (actual page size) field for the tlbie -(TLB invalidate entry) instruction. - -struct kvm_ppc_rmmu_info { - struct kvm_ppc_radix_geom { - __u8 page_shift; - __u8 level_bits[4]; - __u8 pad[3]; - } geometries[8]; - __u32 ap_encodings[8]; -}; - -The geometries[] field gives up to 8 supported geometries for the -radix page table, in terms of the log base 2 of the smallest page -size, and the number of bits indexed at each level of the tree, from -the PTE level up to the PGD level in that order. Any unused entries -will have 0 in the page_shift field. - -The ap_encodings gives the supported page sizes and their AP field -encodings, encoded with the AP value in the top 3 bits and the log -base 2 of the page size in the bottom 6 bits. - -4.102 KVM_PPC_RESIZE_HPT_PREPARE - -Capability: KVM_CAP_SPAPR_RESIZE_HPT -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_resize_hpt (in) -Returns: 0 on successful completion, - >0 if a new HPT is being prepared, the value is an estimated - number of milliseconds until preparation is complete - -EFAULT if struct kvm_reinject_control cannot be read, - -EINVAL if the supplied shift or flags are invalid - -ENOMEM if unable to allocate the new HPT - -ENOSPC if there was a hash collision when moving existing - HPT entries to the new HPT - -EIO on other error conditions - -Used to implement the PAPR extension for runtime resizing of a guest's -Hashed Page Table (HPT). Specifically this starts, stops or monitors -the preparation of a new potential HPT for the guest, essentially -implementing the H_RESIZE_HPT_PREPARE hypercall. - -If called with shift > 0 when there is no pending HPT for the guest, -this begins preparation of a new pending HPT of size 2^(shift) bytes. -It then returns a positive integer with the estimated number of -milliseconds until preparation is complete. - -If called when there is a pending HPT whose size does not match that -requested in the parameters, discards the existing pending HPT and -creates a new one as above. - -If called when there is a pending HPT of the size requested, will: - * If preparation of the pending HPT is already complete, return 0 - * If preparation of the pending HPT has failed, return an error - code, then discard the pending HPT. - * If preparation of the pending HPT is still in progress, return an - estimated number of milliseconds until preparation is complete. - -If called with shift == 0, discards any currently pending HPT and -returns 0 (i.e. cancels any in-progress preparation). - -flags is reserved for future expansion, currently setting any bits in -flags will result in an -EINVAL. - -Normally this will be called repeatedly with the same parameters until -it returns <= 0. The first call will initiate preparation, subsequent -ones will monitor preparation until it completes or fails. - -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - -4.103 KVM_PPC_RESIZE_HPT_COMMIT - -Capability: KVM_CAP_SPAPR_RESIZE_HPT -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_resize_hpt (in) -Returns: 0 on successful completion, - -EFAULT if struct kvm_reinject_control cannot be read, - -EINVAL if the supplied shift or flags are invalid - -ENXIO is there is no pending HPT, or the pending HPT doesn't - have the requested size - -EBUSY if the pending HPT is not fully prepared - -ENOSPC if there was a hash collision when moving existing - HPT entries to the new HPT - -EIO on other error conditions - -Used to implement the PAPR extension for runtime resizing of a guest's -Hashed Page Table (HPT). Specifically this requests that the guest be -transferred to working with the new HPT, essentially implementing the -H_RESIZE_HPT_COMMIT hypercall. - -This should only be called after KVM_PPC_RESIZE_HPT_PREPARE has -returned 0 with the same parameters. In other cases -KVM_PPC_RESIZE_HPT_COMMIT will return an error (usually -ENXIO or --EBUSY, though others may be possible if the preparation was started, -but failed). - -This will have undefined effects on the guest if it has not already -placed itself in a quiescent state where no vcpu will make MMU enabled -memory accesses. - -On succsful completion, the pending HPT will become the guest's active -HPT and the previous HPT will be discarded. - -On failure, the guest will still be operating on its previous HPT. - -struct kvm_ppc_resize_hpt { - __u64 flags; - __u32 shift; - __u32 pad; -}; - -4.104 KVM_X86_GET_MCE_CAP_SUPPORTED - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: system ioctl -Parameters: u64 mce_cap (out) -Returns: 0 on success, -1 on error - -Returns supported MCE capabilities. The u64 mce_cap parameter -has the same format as the MSR_IA32_MCG_CAP register. Supported -capabilities will have the corresponding bits set. - -4.105 KVM_X86_SETUP_MCE - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: vcpu ioctl -Parameters: u64 mcg_cap (in) -Returns: 0 on success, - -EFAULT if u64 mcg_cap cannot be read, - -EINVAL if the requested number of banks is invalid, - -EINVAL if requested MCE capability is not supported. - -Initializes MCE support for use. The u64 mcg_cap parameter -has the same format as the MSR_IA32_MCG_CAP register and -specifies which capabilities should be enabled. The maximum -supported number of error-reporting banks can be retrieved when -checking for KVM_CAP_MCE. The supported capabilities can be -retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED. - -4.106 KVM_X86_SET_MCE - -Capability: KVM_CAP_MCE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_x86_mce (in) -Returns: 0 on success, - -EFAULT if struct kvm_x86_mce cannot be read, - -EINVAL if the bank number is invalid, - -EINVAL if VAL bit is not set in status field. - -Inject a machine check error (MCE) into the guest. The input -parameter is: - -struct kvm_x86_mce { - __u64 status; - __u64 addr; - __u64 misc; - __u64 mcg_status; - __u8 bank; - __u8 pad1[7]; - __u64 pad2[3]; -}; - -If the MCE being reported is an uncorrected error, KVM will -inject it as an MCE exception into the guest. If the guest -MCG_STATUS register reports that an MCE is in progress, KVM -causes an KVM_EXIT_SHUTDOWN vmexit. - -Otherwise, if the MCE is a corrected error, KVM will just -store it in the corresponding bank (provided this bank is -not holding a previously reported uncorrected error). - -4.107 KVM_S390_GET_CMMA_BITS - -Capability: KVM_CAP_S390_CMMA_MIGRATION -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_cmma_log (in, out) -Returns: 0 on success, a negative value on error - -This ioctl is used to get the values of the CMMA bits on the s390 -architecture. It is meant to be used in two scenarios: -- During live migration to save the CMMA values. Live migration needs - to be enabled via the KVM_REQ_START_MIGRATION VM property. -- To non-destructively peek at the CMMA values, with the flag - KVM_S390_CMMA_PEEK set. - -The ioctl takes parameters via the kvm_s390_cmma_log struct. The desired -values are written to a buffer whose location is indicated via the "values" -member in the kvm_s390_cmma_log struct. The values in the input struct are -also updated as needed. -Each CMMA value takes up one byte. - -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - -start_gfn is the number of the first guest frame whose CMMA values are -to be retrieved, - -count is the length of the buffer in bytes, - -values points to the buffer where the result will be written to. - -If count is greater than KVM_S390_SKEYS_MAX, then it is considered to be -KVM_S390_SKEYS_MAX. KVM_S390_SKEYS_MAX is re-used for consistency with -other ioctls. - -The result is written in the buffer pointed to by the field values, and -the values of the input parameter are updated as follows. - -Depending on the flags, different actions are performed. The only -supported flag so far is KVM_S390_CMMA_PEEK. - -The default behaviour if KVM_S390_CMMA_PEEK is not set is: -start_gfn will indicate the first page frame whose CMMA bits were dirty. -It is not necessarily the same as the one passed as input, as clean pages -are skipped. - -count will indicate the number of bytes actually written in the buffer. -It can (and very often will) be smaller than the input value, since the -buffer is only filled until 16 bytes of clean values are found (which -are then not copied in the buffer). Since a CMMA migration block needs -the base address and the length, for a total of 16 bytes, we will send -back some clean data if there is some dirty data afterwards, as long as -the size of the clean data does not exceed the size of the header. This -allows to minimize the amount of data to be saved or transferred over -the network at the expense of more roundtrips to userspace. The next -invocation of the ioctl will skip over all the clean values, saving -potentially more than just the 16 bytes we found. - -If KVM_S390_CMMA_PEEK is set: -the existing storage attributes are read even when not in migration -mode, and no other action is performed; - -the output start_gfn will be equal to the input start_gfn, - -the output count will be equal to the input count, except if the end of -memory has been reached. - -In both cases: -the field "remaining" will indicate the total number of dirty CMMA values -still remaining, or 0 if KVM_S390_CMMA_PEEK is set and migration mode is -not enabled. - -mask is unused. - -values points to the userspace buffer where the result will be stored. - -This ioctl can fail with -ENOMEM if not enough memory can be allocated to -complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if -KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with --EFAULT if the userspace address is invalid or if no page table is -present for the addresses (e.g. when using hugepages). - -4.108 KVM_S390_SET_CMMA_BITS - -Capability: KVM_CAP_S390_CMMA_MIGRATION -Architectures: s390 -Type: vm ioctl -Parameters: struct kvm_s390_cmma_log (in) -Returns: 0 on success, a negative value on error - -This ioctl is used to set the values of the CMMA bits on the s390 -architecture. It is meant to be used during live migration to restore -the CMMA values, but there are no restrictions on its use. -The ioctl takes parameters via the kvm_s390_cmma_values struct. -Each CMMA value takes up one byte. - -struct kvm_s390_cmma_log { - __u64 start_gfn; - __u32 count; - __u32 flags; - union { - __u64 remaining; - __u64 mask; - }; - __u64 values; -}; - -start_gfn indicates the starting guest frame number, - -count indicates how many values are to be considered in the buffer, - -flags is not used and must be 0. - -mask indicates which PGSTE bits are to be considered. - -remaining is not used. - -values points to the buffer in userspace where to store the values. - -This ioctl can fail with -ENOMEM if not enough memory can be allocated to -complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if -the count field is too large (e.g. more than KVM_S390_CMMA_SIZE_MAX) or -if the flags field was not 0, with -EFAULT if the userspace address is -invalid, if invalid pages are written to (e.g. after the end of memory) -or if no page table is present for the addresses (e.g. when using -hugepages). - -4.109 KVM_PPC_GET_CPU_CHAR - -Capability: KVM_CAP_PPC_GET_CPU_CHAR -Architectures: powerpc -Type: vm ioctl -Parameters: struct kvm_ppc_cpu_char (out) -Returns: 0 on successful completion - -EFAULT if struct kvm_ppc_cpu_char cannot be written - -This ioctl gives userspace information about certain characteristics -of the CPU relating to speculative execution of instructions and -possible information leakage resulting from speculative execution (see -CVE-2017-5715, CVE-2017-5753 and CVE-2017-5754). The information is -returned in struct kvm_ppc_cpu_char, which looks like this: - -struct kvm_ppc_cpu_char { - __u64 character; /* characteristics of the CPU */ - __u64 behaviour; /* recommended software behaviour */ - __u64 character_mask; /* valid bits in character */ - __u64 behaviour_mask; /* valid bits in behaviour */ -}; - -For extensibility, the character_mask and behaviour_mask fields -indicate which bits of character and behaviour have been filled in by -the kernel. If the set of defined bits is extended in future then -userspace will be able to tell whether it is running on a kernel that -knows about the new bits. - -The character field describes attributes of the CPU which can help -with preventing inadvertent information disclosure - specifically, -whether there is an instruction to flash-invalidate the L1 data cache -(ori 30,30,0 or mtspr SPRN_TRIG2,rN), whether the L1 data cache is set -to a mode where entries can only be used by the thread that created -them, whether the bcctr[l] instruction prevents speculation, and -whether a speculation barrier instruction (ori 31,31,0) is provided. - -The behaviour field describes actions that software should take to -prevent inadvertent information disclosure, and thus describes which -vulnerabilities the hardware is subject to; specifically whether the -L1 data cache should be flushed when returning to user mode from the -kernel, and whether a speculation barrier should be placed between an -array bounds check and the array access. - -These fields use the same bit definitions as the new -H_GET_CPU_CHARACTERISTICS hypercall. - -4.110 KVM_MEMORY_ENCRYPT_OP - -Capability: basic -Architectures: x86 -Type: system -Parameters: an opaque platform specific structure (in/out) -Returns: 0 on success; -1 on error - -If the platform supports creating encrypted VMs then this ioctl can be used -for issuing platform-specific memory encryption commands to manage those -encrypted VMs. - -Currently, this ioctl is used for issuing Secure Encrypted Virtualization -(SEV) commands on AMD Processors. The SEV commands are defined in -Documentation/virt/kvm/amd-memory-encryption.rst. - -4.111 KVM_MEMORY_ENCRYPT_REG_REGION - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_enc_region (in) -Returns: 0 on success; -1 on error - -This ioctl can be used to register a guest memory region which may -contain encrypted data (e.g. guest RAM, SMRAM etc). - -It is used in the SEV-enabled guest. When encryption is enabled, a guest -memory region may contain encrypted data. The SEV memory encryption -engine uses a tweak such that two identical plaintext pages, each at -different locations will have differing ciphertexts. So swapping or -moving ciphertext of those pages will not result in plaintext being -swapped. So relocating (or migrating) physical backing pages for the SEV -guest will require some additional steps. - -Note: The current SEV key management spec does not provide commands to -swap or migrate (move) ciphertext pages. Hence, for now we pin the guest -memory region registered with the ioctl. - -4.112 KVM_MEMORY_ENCRYPT_UNREG_REGION - -Capability: basic -Architectures: x86 -Type: system -Parameters: struct kvm_enc_region (in) -Returns: 0 on success; -1 on error - -This ioctl can be used to unregister the guest memory region registered -with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. - -4.113 KVM_HYPERV_EVENTFD - -Capability: KVM_CAP_HYPERV_EVENTFD -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_hyperv_eventfd (in) - -This ioctl (un)registers an eventfd to receive notifications from the guest on -the specified Hyper-V connection id through the SIGNAL_EVENT hypercall, without -causing a user exit. SIGNAL_EVENT hypercall with non-zero event flag number -(bits 24-31) still triggers a KVM_EXIT_HYPERV_HCALL user exit. - -struct kvm_hyperv_eventfd { - __u32 conn_id; - __s32 fd; - __u32 flags; - __u32 padding[3]; -}; - -The conn_id field should fit within 24 bits: - -#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff - -The acceptable values for the flags field are: - -#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) - -Returns: 0 on success, - -EINVAL if conn_id or flags is outside the allowed range - -ENOENT on deassign if the conn_id isn't registered - -EEXIST on assign if the conn_id is already registered - -4.114 KVM_GET_NESTED_STATE - -Capability: KVM_CAP_NESTED_STATE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_nested_state (in/out) -Returns: 0 on success, -1 on error -Errors: - E2BIG: the total state size exceeds the value of 'size' specified by - the user; the size required will be written into size. - -struct kvm_nested_state { - __u16 flags; - __u16 format; - __u32 size; - - union { - struct kvm_vmx_nested_state_hdr vmx; - struct kvm_svm_nested_state_hdr svm; - - /* Pad the header to 128 bytes. */ - __u8 pad[120]; - } hdr; - - union { - struct kvm_vmx_nested_state_data vmx[0]; - struct kvm_svm_nested_state_data svm[0]; - } data; -}; - -#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 -#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 -#define KVM_STATE_NESTED_EVMCS 0x00000004 - -#define KVM_STATE_NESTED_FORMAT_VMX 0 -#define KVM_STATE_NESTED_FORMAT_SVM 1 - -#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 - -#define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 -#define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 - -struct kvm_vmx_nested_state_hdr { - __u64 vmxon_pa; - __u64 vmcs12_pa; - - struct { - __u16 flags; - } smm; -}; - -struct kvm_vmx_nested_state_data { - __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; - __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; -}; - -This ioctl copies the vcpu's nested virtualization state from the kernel to -userspace. - -The maximum size of the state can be retrieved by passing KVM_CAP_NESTED_STATE -to the KVM_CHECK_EXTENSION ioctl(). - -4.115 KVM_SET_NESTED_STATE - -Capability: KVM_CAP_NESTED_STATE -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_nested_state (in) -Returns: 0 on success, -1 on error - -This copies the vcpu's kvm_nested_state struct from userspace to the kernel. -For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. - -4.116 KVM_(UN)REGISTER_COALESCED_MMIO - -Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) - KVM_CAP_COALESCED_PIO (for coalesced pio) -Architectures: all -Type: vm ioctl -Parameters: struct kvm_coalesced_mmio_zone -Returns: 0 on success, < 0 on error - -Coalesced I/O is a performance optimization that defers hardware -register write emulation so that userspace exits are avoided. It is -typically used to reduce the overhead of emulating frequently accessed -hardware registers. - -When a hardware register is configured for coalesced I/O, write accesses -do not exit to userspace and their value is recorded in a ring buffer -that is shared between kernel and userspace. - -Coalesced I/O is used if one or more write accesses to a hardware -register can be deferred until a read or a write to another hardware -register on the same device. This last access will cause a vmexit and -userspace will process accesses from the ring buffer before emulating -it. That will avoid exiting to userspace on repeated writes. - -Coalesced pio is based on coalesced mmio. There is little difference -between coalesced mmio and pio except that coalesced pio records accesses -to I/O ports. - -4.117 KVM_CLEAR_DIRTY_LOG (vm ioctl) - -Capability: KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -Architectures: x86, arm, arm64, mips -Type: vm ioctl -Parameters: struct kvm_dirty_log (in) -Returns: 0 on success, -1 on error - -/* for KVM_CLEAR_DIRTY_LOG */ -struct kvm_clear_dirty_log { - __u32 slot; - __u32 num_pages; - __u64 first_page; - union { - void __user *dirty_bitmap; /* one bit per page */ - __u64 padding; - }; -}; - -The ioctl clears the dirty status of pages in a memory slot, according to -the bitmap that is passed in struct kvm_clear_dirty_log's dirty_bitmap -field. Bit 0 of the bitmap corresponds to page "first_page" in the -memory slot, and num_pages is the size in bits of the input bitmap. -first_page must be a multiple of 64; num_pages must also be a multiple of -64 unless first_page + num_pages is the size of the memory slot. For each -bit that is set in the input bitmap, the corresponding page is marked "clean" -in KVM's dirty bitmap, and dirty tracking is re-enabled for that page -(for example via write-protection, or by clearing the dirty bit in -a page table entry). - -If KVM_CAP_MULTI_ADDRESS_SPACE is available, bits 16-31 specifies -the address space for which you want to return the dirty bitmap. -They must be less than the value that KVM_CHECK_EXTENSION returns for -the KVM_CAP_MULTI_ADDRESS_SPACE capability. - -This ioctl is mostly useful when KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 -is enabled; for more information, see the description of the capability. -However, it can always be used as long as KVM_CHECK_EXTENSION confirms -that KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is present. - -4.118 KVM_GET_SUPPORTED_HV_CPUID - -Capability: KVM_CAP_HYPERV_CPUID -Architectures: x86 -Type: vcpu ioctl -Parameters: struct kvm_cpuid2 (in/out) -Returns: 0 on success, -1 on error - -struct kvm_cpuid2 { - __u32 nent; - __u32 padding; - struct kvm_cpuid_entry2 entries[0]; -}; - -struct kvm_cpuid_entry2 { - __u32 function; - __u32 index; - __u32 flags; - __u32 eax; - __u32 ebx; - __u32 ecx; - __u32 edx; - __u32 padding[3]; -}; - -This ioctl returns x86 cpuid features leaves related to Hyper-V emulation in -KVM. Userspace can use the information returned by this ioctl to construct -cpuid information presented to guests consuming Hyper-V enlightenments (e.g. -Windows or Hyper-V guests). - -CPUID feature leaves returned by this ioctl are defined by Hyper-V Top Level -Functional Specification (TLFS). These leaves can't be obtained with -KVM_GET_SUPPORTED_CPUID ioctl because some of them intersect with KVM feature -leaves (0x40000000, 0x40000001). - -Currently, the following list of CPUID leaves are returned: - HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS - HYPERV_CPUID_INTERFACE - HYPERV_CPUID_VERSION - HYPERV_CPUID_FEATURES - HYPERV_CPUID_ENLIGHTMENT_INFO - HYPERV_CPUID_IMPLEMENT_LIMITS - HYPERV_CPUID_NESTED_FEATURES - -HYPERV_CPUID_NESTED_FEATURES leaf is only exposed when Enlightened VMCS was -enabled on the corresponding vCPU (KVM_CAP_HYPERV_ENLIGHTENED_VMCS). - -Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure -with the 'nent' field indicating the number of entries in the variable-size -array 'entries'. If the number of entries is too low to describe all Hyper-V -feature leaves, an error (E2BIG) is returned. If the number is more or equal -to the number of Hyper-V feature leaves, the 'nent' field is adjusted to the -number of valid entries in the 'entries' array, which is then filled. - -'index' and 'flags' fields in 'struct kvm_cpuid_entry2' are currently reserved, -userspace should not expect to get any particular value there. - -4.119 KVM_ARM_VCPU_FINALIZE - -Architectures: arm, arm64 -Type: vcpu ioctl -Parameters: int feature (in) -Returns: 0 on success, -1 on error -Errors: - EPERM: feature not enabled, needs configuration, or already finalized - EINVAL: feature unknown or not present - -Recognised values for feature: - arm64 KVM_ARM_VCPU_SVE (requires KVM_CAP_ARM_SVE) - -Finalizes the configuration of the specified vcpu feature. - -The vcpu must already have been initialised, enabling the affected feature, by -means of a successful KVM_ARM_VCPU_INIT call with the appropriate flag set in -features[]. - -For affected vcpu features, this is a mandatory step that must be performed -before the vcpu is fully usable. - -Between KVM_ARM_VCPU_INIT and KVM_ARM_VCPU_FINALIZE, the feature may be -configured by use of ioctls such as KVM_SET_ONE_REG. The exact configuration -that should be performaned and how to do it are feature-dependent. - -Other calls that depend on a particular feature being finalized, such as -KVM_RUN, KVM_GET_REG_LIST, KVM_GET_ONE_REG and KVM_SET_ONE_REG, will fail with --EPERM unless the feature has already been finalized by means of a -KVM_ARM_VCPU_FINALIZE call. - -See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization -using this ioctl. - -4.120 KVM_SET_PMU_EVENT_FILTER - -Capability: KVM_CAP_PMU_EVENT_FILTER -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_pmu_event_filter (in) -Returns: 0 on success, -1 on error - -struct kvm_pmu_event_filter { - __u32 action; - __u32 nevents; - __u32 fixed_counter_bitmap; - __u32 flags; - __u32 pad[4]; - __u64 events[0]; -}; - -This ioctl restricts the set of PMU events that the guest can program. -The argument holds a list of events which will be allowed or denied. -The eventsel+umask of each event the guest attempts to program is compared -against the events field to determine whether the guest should have access. -The events field only controls general purpose counters; fixed purpose -counters are controlled by the fixed_counter_bitmap. - -No flags are defined yet, the field must be zero. - -Valid values for 'action': -#define KVM_PMU_EVENT_ALLOW 0 -#define KVM_PMU_EVENT_DENY 1 - -4.121 KVM_PPC_SVM_OFF - -Capability: basic -Architectures: powerpc -Type: vm ioctl -Parameters: none -Returns: 0 on successful completion, -Errors: - EINVAL: if ultravisor failed to terminate the secure guest - ENOMEM: if hypervisor failed to allocate new radix page tables for guest - -This ioctl is used to turn off the secure mode of the guest or transition -the guest from secure mode to normal mode. This is invoked when the guest -is reset. This has no effect if called for a normal guest. - -This ioctl issues an ultravisor call to terminate the secure guest, -unpins the VPA pages and releases all the device pages that are used to -track the secure pages by hypervisor. - -4.122 KVM_S390_NORMAL_RESET - -Capability: KVM_CAP_S390_VCPU_RESETS -Architectures: s390 -Type: vcpu ioctl -Parameters: none -Returns: 0 - -This ioctl resets VCPU registers and control structures according to -the cpu reset definition in the POP (Principles Of Operation). - -4.123 KVM_S390_INITIAL_RESET - -Capability: none -Architectures: s390 -Type: vcpu ioctl -Parameters: none -Returns: 0 - -This ioctl resets VCPU registers and control structures according to -the initial cpu reset definition in the POP. However, the cpu is not -put into ESA mode. This reset is a superset of the normal reset. - -4.124 KVM_S390_CLEAR_RESET - -Capability: KVM_CAP_S390_VCPU_RESETS -Architectures: s390 -Type: vcpu ioctl -Parameters: none -Returns: 0 - -This ioctl resets VCPU registers and control structures according to -the clear cpu reset definition in the POP. However, the cpu is not put -into ESA mode. This reset is a superset of the initial reset. - - -5. The kvm_run structure ------------------------- - -Application code obtains a pointer to the kvm_run structure by -mmap()ing a vcpu fd. From that point, application code can control -execution by changing fields in kvm_run prior to calling the KVM_RUN -ioctl, and obtain information about the reason KVM_RUN returned by -looking up structure members. - -struct kvm_run { - /* in */ - __u8 request_interrupt_window; - -Request that KVM_RUN return when it becomes possible to inject external -interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. - - __u8 immediate_exit; - -This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN -exits immediately, returning -EINTR. In the common scenario where a -signal is used to "kick" a VCPU out of KVM_RUN, this field can be used -to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability. -Rather than blocking the signal outside KVM_RUN, userspace can set up -a signal handler that sets run->immediate_exit to a non-zero value. - -This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available. - - __u8 padding1[6]; - - /* out */ - __u32 exit_reason; - -When KVM_RUN has returned successfully (return value 0), this informs -application code why KVM_RUN has returned. Allowable values for this -field are detailed below. - - __u8 ready_for_interrupt_injection; - -If request_interrupt_window has been specified, this field indicates -an interrupt can be injected now with KVM_INTERRUPT. - - __u8 if_flag; - -The value of the current interrupt flag. Only valid if in-kernel -local APIC is not used. - - __u16 flags; - -More architecture-specific flags detailing state of the VCPU that may -affect the device's behavior. The only currently defined flag is -KVM_RUN_X86_SMM, which is valid on x86 machines and is set if the -VCPU is in system management mode. - - /* in (pre_kvm_run), out (post_kvm_run) */ - __u64 cr8; - -The value of the cr8 register. Only valid if in-kernel local APIC is -not used. Both input and output. - - __u64 apic_base; - -The value of the APIC BASE msr. Only valid if in-kernel local -APIC is not used. Both input and output. - - union { - /* KVM_EXIT_UNKNOWN */ - struct { - __u64 hardware_exit_reason; - } hw; - -If exit_reason is KVM_EXIT_UNKNOWN, the vcpu has exited due to unknown -reasons. Further architecture-specific information is available in -hardware_exit_reason. - - /* KVM_EXIT_FAIL_ENTRY */ - struct { - __u64 hardware_entry_failure_reason; - } fail_entry; - -If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due -to unknown reasons. Further architecture-specific information is -available in hardware_entry_failure_reason. - - /* KVM_EXIT_EXCEPTION */ - struct { - __u32 exception; - __u32 error_code; - } ex; - -Unused. - - /* KVM_EXIT_IO */ - struct { -#define KVM_EXIT_IO_IN 0 -#define KVM_EXIT_IO_OUT 1 - __u8 direction; - __u8 size; /* bytes */ - __u16 port; - __u32 count; - __u64 data_offset; /* relative to kvm_run start */ - } io; - -If exit_reason is KVM_EXIT_IO, then the vcpu has -executed a port I/O instruction which could not be satisfied by kvm. -data_offset describes where the data is located (KVM_EXIT_IO_OUT) or -where kvm expects application code to place the data for the next -KVM_RUN invocation (KVM_EXIT_IO_IN). Data format is a packed array. - - /* KVM_EXIT_DEBUG */ - struct { - struct kvm_debug_exit_arch arch; - } debug; - -If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event -for which architecture specific information is returned. - - /* KVM_EXIT_MMIO */ - struct { - __u64 phys_addr; - __u8 data[8]; - __u32 len; - __u8 is_write; - } mmio; - -If exit_reason is KVM_EXIT_MMIO, then the vcpu has -executed a memory-mapped I/O instruction which could not be satisfied -by kvm. The 'data' member contains the written data if 'is_write' is -true, and should be filled by application code otherwise. - -The 'data' member contains, in its first 'len' bytes, the value as it would -appear if the VCPU performed a load or store of the appropriate width directly -to the byte array. - -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and - KVM_EXIT_EPR the corresponding -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. - - /* KVM_EXIT_HYPERCALL */ - struct { - __u64 nr; - __u64 args[6]; - __u64 ret; - __u32 longmode; - __u32 pad; - } hypercall; - -Unused. This was once used for 'hypercall to userspace'. To implement -such functionality, use KVM_EXIT_IO (x86) or KVM_EXIT_MMIO (all except s390). -Note KVM_EXIT_IO is significantly faster than KVM_EXIT_MMIO. - - /* KVM_EXIT_TPR_ACCESS */ - struct { - __u64 rip; - __u32 is_write; - __u32 pad; - } tpr_access; - -To be documented (KVM_TPR_ACCESS_REPORTING). - - /* KVM_EXIT_S390_SIEIC */ - struct { - __u8 icptcode; - __u64 mask; /* psw upper half */ - __u64 addr; /* psw lower half */ - __u16 ipa; - __u32 ipb; - } s390_sieic; - -s390 specific. - - /* KVM_EXIT_S390_RESET */ -#define KVM_S390_RESET_POR 1 -#define KVM_S390_RESET_CLEAR 2 -#define KVM_S390_RESET_SUBSYSTEM 4 -#define KVM_S390_RESET_CPU_INIT 8 -#define KVM_S390_RESET_IPL 16 - __u64 s390_reset_flags; - -s390 specific. - - /* KVM_EXIT_S390_UCONTROL */ - struct { - __u64 trans_exc_code; - __u32 pgm_code; - } s390_ucontrol; - -s390 specific. A page fault has occurred for a user controlled virtual -machine (KVM_VM_S390_UNCONTROL) on it's host page table that cannot be -resolved by the kernel. -The program code and the translation exception code that were placed -in the cpu's lowcore are presented here as defined by the z Architecture -Principles of Operation Book in the Chapter for Dynamic Address Translation -(DAT) - - /* KVM_EXIT_DCR */ - struct { - __u32 dcrn; - __u32 data; - __u8 is_write; - } dcr; - -Deprecated - was used for 440 KVM. - - /* KVM_EXIT_OSI */ - struct { - __u64 gprs[32]; - } osi; - -MOL uses a special hypercall interface it calls 'OSI'. To enable it, we catch -hypercalls and exit with this exit struct that contains all the guest gprs. - -If exit_reason is KVM_EXIT_OSI, then the vcpu has triggered such a hypercall. -Userspace can now handle the hypercall and when it's done modify the gprs as -necessary. Upon guest entry all guest GPRs will then be replaced by the values -in this struct. - - /* KVM_EXIT_PAPR_HCALL */ - struct { - __u64 nr; - __u64 ret; - __u64 args[9]; - } papr_hcall; - -This is used on 64-bit PowerPC when emulating a pSeries partition, -e.g. with the 'pseries' machine type in qemu. It occurs when the -guest does a hypercall using the 'sc 1' instruction. The 'nr' field -contains the hypercall number (from the guest R3), and 'args' contains -the arguments (from the guest R4 - R12). Userspace should put the -return code in 'ret' and any extra returned values in args[]. -The possible hypercalls are defined in the Power Architecture Platform -Requirements (PAPR) document available from www.power.org (free -developer registration required to access it). - - /* KVM_EXIT_S390_TSCH */ - struct { - __u16 subchannel_id; - __u16 subchannel_nr; - __u32 io_int_parm; - __u32 io_int_word; - __u32 ipb; - __u8 dequeued; - } s390_tsch; - -s390 specific. This exit occurs when KVM_CAP_S390_CSS_SUPPORT has been enabled -and TEST SUBCHANNEL was intercepted. If dequeued is set, a pending I/O -interrupt for the target subchannel has been dequeued and subchannel_id, -subchannel_nr, io_int_parm and io_int_word contain the parameters for that -interrupt. ipb is needed for instruction parameter decoding. - - /* KVM_EXIT_EPR */ - struct { - __u32 epr; - } epr; - -On FSL BookE PowerPC chips, the interrupt controller has a fast patch -interrupt acknowledge path to the core. When the core successfully -delivers an interrupt, it automatically populates the EPR register with -the interrupt vector number and acknowledges the interrupt inside -the interrupt controller. - -In case the interrupt controller lives in user space, we need to do -the interrupt acknowledge cycle through it to fetch the next to be -delivered interrupt vector using this exit. - -It gets triggered whenever both KVM_CAP_PPC_EPR are enabled and an -external interrupt has just been delivered into the guest. User space -should put the acknowledged interrupt vector into the 'epr' field. - - /* KVM_EXIT_SYSTEM_EVENT */ - struct { -#define KVM_SYSTEM_EVENT_SHUTDOWN 1 -#define KVM_SYSTEM_EVENT_RESET 2 -#define KVM_SYSTEM_EVENT_CRASH 3 - __u32 type; - __u64 flags; - } system_event; - -If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered -a system-level event using some architecture specific mechanism (hypercall -or some special instruction). In case of ARM/ARM64, this is triggered using -HVC instruction based PSCI call from the vcpu. The 'type' field describes -the system-level event type. The 'flags' field describes architecture -specific flags for the system-level event. - -Valid values for 'type' are: - KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the - VM. Userspace is not obliged to honour this, and if it does honour - this does not need to destroy the VM synchronously (ie it may call - KVM_RUN again before shutdown finally occurs). - KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM. - As with SHUTDOWN, userspace can choose to ignore the request, or - to schedule the reset to occur in the future and may call KVM_RUN again. - KVM_SYSTEM_EVENT_CRASH -- the guest crash occurred and the guest - has requested a crash condition maintenance. Userspace can choose - to ignore the request, or to gather VM memory core dump and/or - reset/shutdown of the VM. - - /* KVM_EXIT_IOAPIC_EOI */ - struct { - __u8 vector; - } eoi; - -Indicates that the VCPU's in-kernel local APIC received an EOI for a -level-triggered IOAPIC interrupt. This exit only triggers when the -IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); -the userspace IOAPIC should process the EOI and retrigger the interrupt if -it is still asserted. Vector is the LAPIC interrupt vector for which the -EOI was received. - - struct kvm_hyperv_exit { -#define KVM_EXIT_HYPERV_SYNIC 1 -#define KVM_EXIT_HYPERV_HCALL 2 - __u32 type; - union { - struct { - __u32 msr; - __u64 control; - __u64 evt_page; - __u64 msg_page; - } synic; - struct { - __u64 input; - __u64 result; - __u64 params[2]; - } hcall; - } u; - }; - /* KVM_EXIT_HYPERV */ - struct kvm_hyperv_exit hyperv; -Indicates that the VCPU exits into userspace to process some tasks -related to Hyper-V emulation. -Valid values for 'type' are: - KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about -Hyper-V SynIC state change. Notification is used to remap SynIC -event/message pages and to enable/disable SynIC messages/events processing -in userspace. - - /* KVM_EXIT_ARM_NISV */ - struct { - __u64 esr_iss; - __u64 fault_ipa; - } arm_nisv; - -Used on arm and arm64 systems. If a guest accesses memory not in a memslot, -KVM will typically return to userspace and ask it to do MMIO emulation on its -behalf. However, for certain classes of instructions, no instruction decode -(direction, length of memory access) is provided, and fetching and decoding -the instruction from the VM is overly complicated to live in the kernel. - -Historically, when this situation occurred, KVM would print a warning and kill -the VM. KVM assumed that if the guest accessed non-memslot memory, it was -trying to do I/O, which just couldn't be emulated, and the warning message was -phrased accordingly. However, what happened more often was that a guest bug -caused access outside the guest memory areas which should lead to a more -meaningful warning message and an external abort in the guest, if the access -did not fall within an I/O window. - -Userspace implementations can query for KVM_CAP_ARM_NISV_TO_USER, and enable -this capability at VM creation. Once this is done, these types of errors will -instead return to userspace with KVM_EXIT_ARM_NISV, with the valid bits from -the HSR (arm) and ESR_EL2 (arm64) in the esr_iss field, and the faulting IPA -in the fault_ipa field. Userspace can either fix up the access if it's -actually an I/O access by decoding the instruction from guest memory (if it's -very brave) and continue executing the guest, or it can decide to suspend, -dump, or restart the guest. - -Note that KVM does not skip the faulting instruction as it does for -KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state -if it decides to decode and emulate the instruction. - - /* Fix the size of the union. */ - char padding[256]; - }; - - /* - * shared registers between kvm and userspace. - * kvm_valid_regs specifies the register classes set by the host - * kvm_dirty_regs specified the register classes dirtied by userspace - * struct kvm_sync_regs is architecture specific, as well as the - * bits for kvm_valid_regs and kvm_dirty_regs - */ - __u64 kvm_valid_regs; - __u64 kvm_dirty_regs; - union { - struct kvm_sync_regs regs; - char padding[SYNC_REGS_SIZE_BYTES]; - } s; - -If KVM_CAP_SYNC_REGS is defined, these fields allow userspace to access -certain guest registers without having to call SET/GET_*REGS. Thus we can -avoid some system call overhead if userspace has to handle the exit. -Userspace can query the validity of the structure by checking -kvm_valid_regs for specific bits. These bits are architecture specific -and usually define the validity of a groups of registers. (e.g. one bit - for general purpose registers) - -Please note that the kernel is allowed to use the kvm_run structure as the -primary storage for certain register types. Therefore, the kernel may use the -values in kvm_run even if the corresponding bit in kvm_dirty_regs is not set. - -}; - - - -6. Capabilities that can be enabled on vCPUs --------------------------------------------- - -There are certain capabilities that change the behavior of the virtual CPU or -the virtual machine when enabled. To enable them, please see section 4.37. -Below you can find a list of capabilities and what their effect on the vCPU or -the virtual machine is when enabling them. - -The following information is provided along with the description: - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Target: whether this is a per-vcpu or per-vm capability. - - Parameters: what parameters are accepted by the capability. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -6.1 KVM_CAP_PPC_OSI - -Architectures: ppc -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables interception of OSI hypercalls that otherwise would -be treated as normal system calls to be injected into the guest. OSI hypercalls -were invented by Mac-on-Linux to have a standardized communication mechanism -between the guest and the host. - -When this capability is enabled, KVM_EXIT_OSI can occur. - - -6.2 KVM_CAP_PPC_PAPR - -Architectures: ppc -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables interception of PAPR hypercalls. PAPR hypercalls are -done using the hypercall instruction "sc 1". - -It also sets the guest privilege level to "supervisor" mode. Usually the guest -runs in "hypervisor" privilege mode with a few missing features. - -In addition to the above, it changes the semantics of SDR1. In this mode, the -HTAB address part of SDR1 contains an HVA instead of a GPA, as PAPR keeps the -HTAB invisible to the guest. - -When this capability is enabled, KVM_EXIT_PAPR_HCALL can occur. - - -6.3 KVM_CAP_SW_TLB - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the address of a struct kvm_config_tlb -Returns: 0 on success; -1 on error - -struct kvm_config_tlb { - __u64 params; - __u64 array; - __u32 mmu_type; - __u32 array_len; -}; - -Configures the virtual CPU's TLB array, establishing a shared memory area -between userspace and KVM. The "params" and "array" fields are userspace -addresses of mmu-type-specific data structures. The "array_len" field is an -safety mechanism, and should be set to the size in bytes of the memory that -userspace has reserved for the array. It must be at least the size dictated -by "mmu_type" and "params". - -While KVM_RUN is active, the shared region is under control of KVM. Its -contents are undefined, and any modification by userspace results in -boundedly undefined behavior. - -On return from KVM_RUN, the shared region will reflect the current state of -the guest's TLB. If userspace makes any changes, it must call KVM_DIRTY_TLB -to tell KVM which entries have been changed, prior to calling KVM_RUN again -on this vcpu. - -For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: - - The "params" field is of type "struct kvm_book3e_206_tlb_params". - - The "array" field points to an array of type "struct - kvm_book3e_206_tlb_entry". - - The array consists of all entries in the first TLB, followed by all - entries in the second TLB. - - Within a TLB, entries are ordered first by increasing set number. Within a - set, entries are ordered by way (increasing ESEL). - - The hash for determining set number in TLB0 is: (MAS2 >> 12) & (num_sets - 1) - where "num_sets" is the tlb_sizes[] value divided by the tlb_ways[] value. - - The tsize field of mas1 shall be set to 4K on TLB0, even though the - hardware ignores this value for TLB0. - -6.4 KVM_CAP_S390_CSS_SUPPORT - -Architectures: s390 -Target: vcpu -Parameters: none -Returns: 0 on success; -1 on error - -This capability enables support for handling of channel I/O instructions. - -TEST PENDING INTERRUPTION and the interrupt portion of TEST SUBCHANNEL are -handled in-kernel, while the other I/O instructions are passed to userspace. - -When this capability is enabled, KVM_EXIT_S390_TSCH will occur on TEST -SUBCHANNEL intercepts. - -Note that even though this capability is enabled per-vcpu, the complete -virtual machine is affected. - -6.5 KVM_CAP_PPC_EPR - -Architectures: ppc -Target: vcpu -Parameters: args[0] defines whether the proxy facility is active -Returns: 0 on success; -1 on error - -This capability enables or disables the delivery of interrupts through the -external proxy facility. - -When enabled (args[0] != 0), every time the guest gets an external interrupt -delivered, it automatically exits into user space with a KVM_EXIT_EPR exit -to receive the topmost interrupt vector. - -When disabled (args[0] == 0), behavior is as if this facility is unsupported. - -When this capability is enabled, KVM_EXIT_EPR can occur. - -6.6 KVM_CAP_IRQ_MPIC - -Architectures: ppc -Parameters: args[0] is the MPIC device fd - args[1] is the MPIC CPU number for this vcpu - -This capability connects the vcpu to an in-kernel MPIC device. - -6.7 KVM_CAP_IRQ_XICS - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the XICS device fd - args[1] is the XICS CPU number (server ID) for this vcpu - -This capability connects the vcpu to an in-kernel XICS device. - -6.8 KVM_CAP_S390_IRQCHIP - -Architectures: s390 -Target: vm -Parameters: none - -This capability enables the in-kernel irqchip for s390. Please refer to -"4.24 KVM_CREATE_IRQCHIP" for details. - -6.9 KVM_CAP_MIPS_FPU - -Architectures: mips -Target: vcpu -Parameters: args[0] is reserved for future use (should be 0). - -This capability allows the use of the host Floating Point Unit by the guest. It -allows the Config1.FP bit to be set to enable the FPU in the guest. Once this is -done the KVM_REG_MIPS_FPR_* and KVM_REG_MIPS_FCR_* registers can be accessed -(depending on the current guest FPU register mode), and the Status.FR, -Config5.FRE bits are accessible via the KVM API and also from the guest, -depending on them being supported by the FPU. - -6.10 KVM_CAP_MIPS_MSA - -Architectures: mips -Target: vcpu -Parameters: args[0] is reserved for future use (should be 0). - -This capability allows the use of the MIPS SIMD Architecture (MSA) by the guest. -It allows the Config3.MSAP bit to be set to enable the use of MSA by the guest. -Once this is done the KVM_REG_MIPS_VEC_* and KVM_REG_MIPS_MSA_* registers can be -accessed, and the Config5.MSAEn bit is accessible via the KVM API and also from -the guest. - -6.74 KVM_CAP_SYNC_REGS -Architectures: s390, x86 -Target: s390: always enabled, x86: vcpu -Parameters: none -Returns: x86: KVM_CHECK_EXTENSION returns a bit-array indicating which register -sets are supported (bitfields defined in arch/x86/include/uapi/asm/kvm.h). - -As described above in the kvm_sync_regs struct info in section 5 (kvm_run): -KVM_CAP_SYNC_REGS "allow[s] userspace to access certain guest registers -without having to call SET/GET_*REGS". This reduces overhead by eliminating -repeated ioctl calls for setting and/or getting register values. This is -particularly important when userspace is making synchronous guest state -modifications, e.g. when emulating and/or intercepting instructions in -userspace. - -For s390 specifics, please refer to the source code. - -For x86: -- the register sets to be copied out to kvm_run are selectable - by userspace (rather that all sets being copied out for every exit). -- vcpu_events are available in addition to regs and sregs. - -For x86, the 'kvm_valid_regs' field of struct kvm_run is overloaded to -function as an input bit-array field set by userspace to indicate the -specific register sets to be copied out on the next exit. - -To indicate when userspace has modified values that should be copied into -the vCPU, the all architecture bitarray field, 'kvm_dirty_regs' must be set. -This is done using the same bitflags as for the 'kvm_valid_regs' field. -If the dirty bit is not set, then the register set values will not be copied -into the vCPU even if they've been modified. - -Unused bitfields in the bitarrays must be set to zero. - -struct kvm_sync_regs { - struct kvm_regs regs; - struct kvm_sregs sregs; - struct kvm_vcpu_events events; -}; - -6.75 KVM_CAP_PPC_IRQ_XIVE - -Architectures: ppc -Target: vcpu -Parameters: args[0] is the XIVE device fd - args[1] is the XIVE CPU number (server ID) for this vcpu - -This capability connects the vcpu to an in-kernel XIVE device. - -7. Capabilities that can be enabled on VMs ------------------------------------------- - -There are certain capabilities that change the behavior of the virtual -machine when enabled. To enable them, please see section 4.37. Below -you can find a list of capabilities and what their effect on the VM -is when enabling them. - -The following information is provided along with the description: - - Architectures: which instruction set architectures provide this ioctl. - x86 includes both i386 and x86_64. - - Parameters: what parameters are accepted by the capability. - - Returns: the return value. General error numbers (EBADF, ENOMEM, EINVAL) - are not detailed, but errors with specific meanings are. - - -7.1 KVM_CAP_PPC_ENABLE_HCALL - -Architectures: ppc -Parameters: args[0] is the sPAPR hcall number - args[1] is 0 to disable, 1 to enable in-kernel handling - -This capability controls whether individual sPAPR hypercalls (hcalls) -get handled by the kernel or not. Enabling or disabling in-kernel -handling of an hcall is effective across the VM. On creation, an -initial set of hcalls are enabled for in-kernel handling, which -consists of those hcalls for which in-kernel handlers were implemented -before this capability was implemented. If disabled, the kernel will -not to attempt to handle the hcall, but will always exit to userspace -to handle it. Note that it may not make sense to enable some and -disable others of a group of related hcalls, but KVM does not prevent -userspace from doing that. - -If the hcall number specified is not one that has an in-kernel -implementation, the KVM_ENABLE_CAP ioctl will fail with an EINVAL -error. - -7.2 KVM_CAP_S390_USER_SIGP - -Architectures: s390 -Parameters: none - -This capability controls which SIGP orders will be handled completely in user -space. With this capability enabled, all fast orders will be handled completely -in the kernel: -- SENSE -- SENSE RUNNING -- EXTERNAL CALL -- EMERGENCY SIGNAL -- CONDITIONAL EMERGENCY SIGNAL - -All other orders will be handled completely in user space. - -Only privileged operation exceptions will be checked for in the kernel (or even -in the hardware prior to interception). If this capability is not enabled, the -old way of handling SIGP orders is used (partially in kernel and user space). - -7.3 KVM_CAP_S390_VECTOR_REGISTERS - -Architectures: s390 -Parameters: none -Returns: 0 on success, negative value on error - -Allows use of the vector registers introduced with z13 processor, and -provides for the synchronization between host and user space. Will -return -EINVAL if the machine does not support vectors. - -7.4 KVM_CAP_S390_USER_STSI - -Architectures: s390 -Parameters: none - -This capability allows post-handlers for the STSI instruction. After -initial handling in the kernel, KVM exits to user space with -KVM_EXIT_S390_STSI to allow user space to insert further data. - -Before exiting to userspace, kvm handlers should fill in s390_stsi field of -vcpu->run: -struct { - __u64 addr; - __u8 ar; - __u8 reserved; - __u8 fc; - __u8 sel1; - __u16 sel2; -} s390_stsi; - -@addr - guest address of STSI SYSIB -@fc - function code -@sel1 - selector 1 -@sel2 - selector 2 -@ar - access register number - -KVM handlers should exit to userspace with rc = -EREMOTE. - -7.5 KVM_CAP_SPLIT_IRQCHIP - -Architectures: x86 -Parameters: args[0] - number of routes reserved for userspace IOAPICs -Returns: 0 on success, -1 on error - -Create a local apic for each processor in the kernel. This can be used -instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the -IOAPIC and PIC (and also the PIT, even though this has to be enabled -separately). - -This capability also enables in kernel routing of interrupt requests; -when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are -used in the IRQ routing table. The first args[0] MSI routes are reserved -for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, -a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. - -Fails if VCPU has already been created, or if the irqchip is already in the -kernel (i.e. KVM_CREATE_IRQCHIP has already been called). - -7.6 KVM_CAP_S390_RI - -Architectures: s390 -Parameters: none - -Allows use of runtime-instrumentation introduced with zEC12 processor. -Will return -EINVAL if the machine does not support runtime-instrumentation. -Will return -EBUSY if a VCPU has already been created. - -7.7 KVM_CAP_X2APIC_API - -Architectures: x86 -Parameters: args[0] - features that should be enabled -Returns: 0 on success, -EINVAL when args[0] contains invalid features - -Valid feature flags in args[0] are - -#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) -#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) - -Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of -KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC, -allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their -respective sections. - -KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work -in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff -as a broadcast even in x2APIC mode in order to support physical x2APIC -without interrupt remapping. This is undesirable in logical mode, -where 0xff represents CPUs 0-7 in cluster 0. - -7.8 KVM_CAP_S390_USER_INSTR0 - -Architectures: s390 -Parameters: none - -With this capability enabled, all illegal instructions 0x0000 (2 bytes) will -be intercepted and forwarded to user space. User space can use this -mechanism e.g. to realize 2-byte software breakpoints. The kernel will -not inject an operating exception for these instructions, user space has -to take care of that. - -This capability can be enabled dynamically even if VCPUs were already -created and are running. - -7.9 KVM_CAP_S390_GS - -Architectures: s390 -Parameters: none -Returns: 0 on success; -EINVAL if the machine does not support - guarded storage; -EBUSY if a VCPU has already been created. - -Allows use of guarded storage for the KVM guest. - -7.10 KVM_CAP_S390_AIS - -Architectures: s390 -Parameters: none - -Allow use of adapter-interruption suppression. -Returns: 0 on success; -EBUSY if a VCPU has already been created. - -7.11 KVM_CAP_PPC_SMT - -Architectures: ppc -Parameters: vsmt_mode, flags - -Enabling this capability on a VM provides userspace with a way to set -the desired virtual SMT mode (i.e. the number of virtual CPUs per -virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2 -between 1 and 8. On POWER8, vsmt_mode must also be no greater than -the number of threads per subcore for the host. Currently flags must -be 0. A successful call to enable this capability will result in -vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is -subsequently queried for the VM. This capability is only supported by -HV KVM, and can only be set before any VCPUs have been created. -The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT -modes are available. - -7.12 KVM_CAP_PPC_FWNMI - -Architectures: ppc -Parameters: none - -With this capability a machine check exception in the guest address -space will cause KVM to exit the guest with NMI exit reason. This -enables QEMU to build error log and branch to guest kernel registered -machine check handling routine. Without this capability KVM will -branch to guests' 0x200 interrupt vector. - -7.13 KVM_CAP_X86_DISABLE_EXITS - -Architectures: x86 -Parameters: args[0] defines which exits are disabled -Returns: 0 on success, -EINVAL when args[0] contains invalid exits - -Valid bits in args[0] are - -#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) -#define KVM_X86_DISABLE_EXITS_HLT (1 << 1) -#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) -#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) - -Enabling this capability on a VM provides userspace with a way to no -longer intercept some instructions for improved latency in some -workloads, and is suggested when vCPUs are associated to dedicated -physical CPUs. More bits can be added in the future; userspace can -just pass the KVM_CHECK_EXTENSION result to KVM_ENABLE_CAP to disable -all such vmexits. - -Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. - -7.14 KVM_CAP_S390_HPAGE_1M - -Architectures: s390 -Parameters: none -Returns: 0 on success, -EINVAL if hpage module parameter was not set - or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL - flag set - -With this capability the KVM support for memory backing with 1m pages -through hugetlbfs can be enabled for a VM. After the capability is -enabled, cmma can't be enabled anymore and pfmfi and the storage key -interpretation are disabled. If cmma has already been enabled or the -hpage module parameter is not set to 1, -EINVAL is returned. - -While it is generally possible to create a huge page backed VM without -this capability, the VM will not be able to run. - -7.15 KVM_CAP_MSR_PLATFORM_INFO - -Architectures: x86 -Parameters: args[0] whether feature should be enabled or not - -With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, -a #GP would be raised when the guest tries to access. Currently, this -capability does not enable write permissions of this MSR for the guest. - -7.16 KVM_CAP_PPC_NESTED_HV - -Architectures: ppc -Parameters: none -Returns: 0 on success, -EINVAL when the implementation doesn't support - nested-HV virtualization. - -HV-KVM on POWER9 and later systems allows for "nested-HV" -virtualization, which provides a way for a guest VM to run guests that -can run using the CPU's supervisor mode (privileged non-hypervisor -state). Enabling this capability on a VM depends on the CPU having -the necessary functionality and on the facility being enabled with a -kvm-hv module parameter. - -7.17 KVM_CAP_EXCEPTION_PAYLOAD - -Architectures: x86 -Parameters: args[0] whether feature should be enabled or not - -With this capability enabled, CR2 will not be modified prior to the -emulated VM-exit when L1 intercepts a #PF exception that occurs in -L2. Similarly, for kvm-intel only, DR6 will not be modified prior to -the emulated VM-exit when L1 intercepts a #DB exception that occurs in -L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or -#DB) exception for L2, exception.has_payload will be set and the -faulting address (or the new DR6 bits*) will be reported in the -exception_payload field. Similarly, when userspace injects a #PF (or -#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set -exception.has_payload and to put the faulting address (or the new DR6 -bits*) in the exception_payload field. - -This capability also enables exception.pending in struct -kvm_vcpu_events, which allows userspace to distinguish between pending -and injected exceptions. - - -* For the new DR6 bits, note that bit 16 is set iff the #DB exception - will clear DR6.RTM. - -7.18 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 - -Architectures: x86, arm, arm64, mips -Parameters: args[0] whether feature should be enabled or not - -With this capability enabled, KVM_GET_DIRTY_LOG will not automatically -clear and write-protect all pages that are returned as dirty. -Rather, userspace will have to do this operation separately using -KVM_CLEAR_DIRTY_LOG. - -At the cost of a slightly more complicated operation, this provides better -scalability and responsiveness for two reasons. First, -KVM_CLEAR_DIRTY_LOG ioctl can operate on a 64-page granularity rather -than requiring to sync a full memslot; this ensures that KVM does not -take spinlocks for an extended period of time. Second, in some cases a -large amount of time can pass between a call to KVM_GET_DIRTY_LOG and -userspace actually using the data in the page. Pages can be modified -during this time, which is inefficint for both the guest and userspace: -the guest will incur a higher penalty due to write protection faults, -while userspace can see false reports of dirty pages. Manual reprotection -helps reducing this time, improving guest performance and reducing the -number of dirty log false positives. - -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 was previously available under the name -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT, but the implementation had bugs that make -it hard or impossible to use it correctly. The availability of -KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. -Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. - -8. Other capabilities. ----------------------- - -This section lists capabilities that give information about other -features of the KVM implementation. - -8.1 KVM_CAP_PPC_HWRNG - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel has an implementation of the -H_RANDOM hypercall backed by a hardware random-number generator. -If present, the kernel H_RANDOM handler can be enabled for guest use -with the KVM_CAP_PPC_ENABLE_HCALL capability. - -8.2 KVM_CAP_HYPERV_SYNIC - -Architectures: x86 -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel has an implementation of the -Hyper-V Synthetic interrupt controller(SynIC). Hyper-V SynIC is -used to support Windows Hyper-V based guest paravirt drivers(VMBus). - -In order to use SynIC, it has to be activated by setting this -capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this -will disable the use of APIC hardware virtualization even if supported -by the CPU, as it's incompatible with SynIC auto-EOI behavior. - -8.3 KVM_CAP_PPC_RADIX_MMU - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel can support guests using the -radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 -processor). - -8.4 KVM_CAP_PPC_HASH_MMU_V3 - -Architectures: ppc - -This capability, if KVM_CHECK_EXTENSION indicates that it is -available, means that that the kernel can support guests using the -hashed page table MMU defined in Power ISA V3.00 (as implemented in -the POWER9 processor), including in-memory segment tables. - -8.5 KVM_CAP_MIPS_VZ - -Architectures: mips - -This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that -it is available, means that full hardware assisted virtualization capabilities -of the hardware are available for use through KVM. An appropriate -KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which -utilises it. - -If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is -available, it means that the VM is using full hardware assisted virtualization -capabilities of the hardware. This is useful to check after creating a VM with -KVM_VM_MIPS_DEFAULT. - -The value returned by KVM_CHECK_EXTENSION should be compared against known -values (see below). All other values are reserved. This is to allow for the -possibility of other hardware assisted virtualization implementations which -may be incompatible with the MIPS VZ ASE. - - 0: The trap & emulate implementation is in use to run guest code in user - mode. Guest virtual memory segments are rearranged to fit the guest in the - user mode address space. - - 1: The MIPS VZ ASE is in use, providing full hardware assisted - virtualization, including standard guest virtual memory segments. - -8.6 KVM_CAP_MIPS_TE - -Architectures: mips - -This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that -it is available, means that the trap & emulate implementation is available to -run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware -assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed -to KVM_CREATE_VM to create a VM which utilises it. - -If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is -available, it means that the VM is using trap & emulate. - -8.7 KVM_CAP_MIPS_64BIT - -Architectures: mips - -This capability indicates the supported architecture type of the guest, i.e. the -supported register and address width. - -The values returned when this capability is checked by KVM_CHECK_EXTENSION on a -kvm VM handle correspond roughly to the CP0_Config.AT register field, and should -be checked specifically against known values (see below). All other values are -reserved. - - 0: MIPS32 or microMIPS32. - Both registers and addresses are 32-bits wide. - It will only be possible to run 32-bit guest code. - - 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. - Registers are 64-bits wide, but addresses are 32-bits wide. - 64-bit guest code may run but cannot access MIPS64 memory segments. - It will also be possible to run 32-bit guest code. - - 2: MIPS64 or microMIPS64 with access to all address segments. - Both registers and addresses are 64-bits wide. - It will be possible to run 64-bit or 32-bit guest code. - -8.9 KVM_CAP_ARM_USER_IRQ - -Architectures: arm, arm64 -This capability, if KVM_CHECK_EXTENSION indicates that it is available, means -that if userspace creates a VM without an in-kernel interrupt controller, it -will be notified of changes to the output level of in-kernel emulated devices, -which can generate virtual interrupts, presented to the VM. -For such VMs, on every return to userspace, the kernel -updates the vcpu's run->s.regs.device_irq_level field to represent the actual -output level of the device. - -Whenever kvm detects a change in the device output level, kvm guarantees at -least one return to userspace before running the VM. This exit could either -be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way, -userspace can always sample the device output level and re-compute the state of -the userspace interrupt controller. Userspace should always check the state -of run->s.regs.device_irq_level on every kvm exit. -The value in run->s.regs.device_irq_level can represent both level and edge -triggered interrupt signals, depending on the device. Edge triggered interrupt -signals will exit to userspace with the bit in run->s.regs.device_irq_level -set exactly once per edge signal. - -The field run->s.regs.device_irq_level is available independent of -run->kvm_valid_regs or run->kvm_dirty_regs bits. - -If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a -number larger than 0 indicating the version of this capability is implemented -and thereby which bits in in run->s.regs.device_irq_level can signal values. - -Currently the following bits are defined for the device_irq_level bitmap: - - KVM_CAP_ARM_USER_IRQ >= 1: - - KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer - KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer - KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal - -Future versions of kvm may implement additional events. These will get -indicated by returning a higher number from KVM_CHECK_EXTENSION and will be -listed above. - -8.10 KVM_CAP_PPC_SMT_POSSIBLE - -Architectures: ppc - -Querying this capability returns a bitmap indicating the possible -virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N -(counting from the right) is set, then a virtual SMT mode of 2^N is -available. - -8.11 KVM_CAP_HYPERV_SYNIC2 - -Architectures: x86 - -This capability enables a newer version of Hyper-V Synthetic interrupt -controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM -doesn't clear SynIC message and event flags pages when they are enabled by -writing to the respective MSRs. - -8.12 KVM_CAP_HYPERV_VP_INDEX - -Architectures: x86 - -This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its -value is used to denote the target vcpu for a SynIC interrupt. For -compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this -capability is absent, userspace can still query this msr's value. - -8.13 KVM_CAP_S390_AIS_MIGRATION - -Architectures: s390 -Parameters: none - -This capability indicates if the flic device will be able to get/set the -AIS states for migration via the KVM_DEV_FLIC_AISM_ALL attribute and allows -to discover this without having to create a flic device. - -8.14 KVM_CAP_S390_PSW - -Architectures: s390 - -This capability indicates that the PSW is exposed via the kvm_run structure. - -8.15 KVM_CAP_S390_GMAP - -Architectures: s390 - -This capability indicates that the user space memory used as guest mapping can -be anywhere in the user memory address space, as long as the memory slots are -aligned and sized to a segment (1MB) boundary. - -8.16 KVM_CAP_S390_COW - -Architectures: s390 - -This capability indicates that the user space memory used as guest mapping can -use copy-on-write semantics as well as dirty pages tracking via read-only page -tables. - -8.17 KVM_CAP_S390_BPB - -Architectures: s390 - -This capability indicates that kvm will implement the interfaces to handle -reset, migration and nested KVM for branch prediction blocking. The stfle -facility 82 should not be provided to the guest without this capability. - -8.18 KVM_CAP_HYPERV_TLBFLUSH - -Architectures: x86 - -This capability indicates that KVM supports paravirtualized Hyper-V TLB Flush -hypercalls: -HvFlushVirtualAddressSpace, HvFlushVirtualAddressSpaceEx, -HvFlushVirtualAddressList, HvFlushVirtualAddressListEx. - -8.19 KVM_CAP_ARM_INJECT_SERROR_ESR - -Architectures: arm, arm64 - -This capability indicates that userspace can specify (via the -KVM_SET_VCPU_EVENTS ioctl) the syndrome value reported to the guest when it -takes a virtual SError interrupt exception. -If KVM advertises this capability, userspace can only specify the ISS field for -the ESR syndrome. Other parts of the ESR, such as the EC are generated by the -CPU when the exception is taken. If this virtual SError is taken to EL1 using -AArch64, this value will be reported in the ISS field of ESR_ELx. - -See KVM_CAP_VCPU_EVENTS for more details. -8.20 KVM_CAP_HYPERV_SEND_IPI - -Architectures: x86 - -This capability indicates that KVM supports paravirtualized Hyper-V IPI send -hypercalls: -HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. -8.21 KVM_CAP_HYPERV_DIRECT_TLBFLUSH - -Architecture: x86 - -This capability indicates that KVM running on top of Hyper-V hypervisor -enables Direct TLB flush for its guests meaning that TLB flush -hypercalls are handled by Level 0 hypervisor (Hyper-V) bypassing KVM. -Due to the different ABI for hypercall parameters between Hyper-V and -KVM, enabling this capability effectively disables all hypercall -handling by KVM (as some KVM hypercall may be mistakenly treated as TLB -flush hypercalls by Hyper-V) so userspace should disable KVM identification -in CPUID and only exposes Hyper-V identification. In this case, guest -thinks it's running on Hyper-V and only use Hyper-V hypercalls. - -8.22 KVM_CAP_S390_VCPU_RESETS - -Architectures: s390 - -This capability indicates that the KVM_S390_NORMAL_RESET and -KVM_S390_CLEAR_RESET ioctls are available. diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index 24d1076ec680..6fe79185b9bc 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -7,6 +7,7 @@ KVM .. toctree:: :maxdepth: 2 + api amd-memory-encryption cpuid halt-polling -- cgit v1.2.3 From 69bf758bc8a4875a361d7c703995248d808fa24d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:56 +0100 Subject: docs: kvm: convert arm/hyp-abi.txt to ReST - Add proper markups for titles; - Adjust whitespaces and blank lines to match ReST needs; - Mark literal blocks as such. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/arm/hyp-abi.rst | 63 ++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/arm/hyp-abi.txt | 53 ---------------------------- Documentation/virt/kvm/arm/index.rst | 1 + 3 files changed, 64 insertions(+), 53 deletions(-) create mode 100644 Documentation/virt/kvm/arm/hyp-abi.rst delete mode 100644 Documentation/virt/kvm/arm/hyp-abi.txt diff --git a/Documentation/virt/kvm/arm/hyp-abi.rst b/Documentation/virt/kvm/arm/hyp-abi.rst new file mode 100644 index 000000000000..d1fc27d848e9 --- /dev/null +++ b/Documentation/virt/kvm/arm/hyp-abi.rst @@ -0,0 +1,63 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +Internal ABI between the kernel and HYP +======================================= + +This file documents the interaction between the Linux kernel and the +hypervisor layer when running Linux as a hypervisor (for example +KVM). It doesn't cover the interaction of the kernel with the +hypervisor when running as a guest (under Xen, KVM or any other +hypervisor), or any hypervisor-specific interaction when the kernel is +used as a host. + +On arm and arm64 (without VHE), the kernel doesn't run in hypervisor +mode, but still needs to interact with it, allowing a built-in +hypervisor to be either installed or torn down. + +In order to achieve this, the kernel must be booted at HYP (arm) or +EL2 (arm64), allowing it to install a set of stubs before dropping to +SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, +and only act on individual CPUs. + +Unless specified otherwise, any built-in hypervisor must implement +these functions (see arch/arm{,64}/include/asm/virt.h): + +* :: + + r0/x0 = HVC_SET_VECTORS + r1/x1 = vectors + + Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' + must be a physical address, and respect the alignment requirements + of the architecture. Only implemented by the initial stubs, not by + Linux hypervisors. + +* :: + + r0/x0 = HVC_RESET_VECTORS + + Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials + stubs' exception vector value. This effectively disables an existing + hypervisor. + +* :: + + r0/x0 = HVC_SOFT_RESTART + r1/x1 = restart address + x2 = x0's value when entering the next payload (arm64) + x3 = x1's value when entering the next payload (arm64) + x4 = x2's value when entering the next payload (arm64) + + Mask all exceptions, disable the MMU, move the arguments into place + (arm64 only), and jump to the restart address while at HYP/EL2. This + hypercall is not expected to return to its caller. + +Any other value of r0/x0 triggers a hypervisor-specific handling, +which is not documented here. + +The return value of a stub hypercall is held by r0/x0, and is 0 on +success, and HVC_STUB_ERR on error. A stub hypercall is allowed to +clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and +ip on arm). It is thus recommended to use a function call to perform +the hypercall. diff --git a/Documentation/virt/kvm/arm/hyp-abi.txt b/Documentation/virt/kvm/arm/hyp-abi.txt deleted file mode 100644 index a20a0bee268d..000000000000 --- a/Documentation/virt/kvm/arm/hyp-abi.txt +++ /dev/null @@ -1,53 +0,0 @@ -* Internal ABI between the kernel and HYP - -This file documents the interaction between the Linux kernel and the -hypervisor layer when running Linux as a hypervisor (for example -KVM). It doesn't cover the interaction of the kernel with the -hypervisor when running as a guest (under Xen, KVM or any other -hypervisor), or any hypervisor-specific interaction when the kernel is -used as a host. - -On arm and arm64 (without VHE), the kernel doesn't run in hypervisor -mode, but still needs to interact with it, allowing a built-in -hypervisor to be either installed or torn down. - -In order to achieve this, the kernel must be booted at HYP (arm) or -EL2 (arm64), allowing it to install a set of stubs before dropping to -SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, -and only act on individual CPUs. - -Unless specified otherwise, any built-in hypervisor must implement -these functions (see arch/arm{,64}/include/asm/virt.h): - -* r0/x0 = HVC_SET_VECTORS - r1/x1 = vectors - - Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' - must be a physical address, and respect the alignment requirements - of the architecture. Only implemented by the initial stubs, not by - Linux hypervisors. - -* r0/x0 = HVC_RESET_VECTORS - - Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials - stubs' exception vector value. This effectively disables an existing - hypervisor. - -* r0/x0 = HVC_SOFT_RESTART - r1/x1 = restart address - x2 = x0's value when entering the next payload (arm64) - x3 = x1's value when entering the next payload (arm64) - x4 = x2's value when entering the next payload (arm64) - - Mask all exceptions, disable the MMU, move the arguments into place - (arm64 only), and jump to the restart address while at HYP/EL2. This - hypercall is not expected to return to its caller. - -Any other value of r0/x0 triggers a hypervisor-specific handling, -which is not documented here. - -The return value of a stub hypercall is held by r0/x0, and is 0 on -success, and HVC_STUB_ERR on error. A stub hypercall is allowed to -clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and -ip on arm). It is thus recommended to use a function call to perform -the hypercall. diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index e039d9b1e076..134fa5fa77e5 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -7,4 +7,5 @@ ARM .. toctree:: :maxdepth: 2 + hyp-abi pvtime -- cgit v1.2.3 From cec0e48be339f06879d971702f206e9683956ef1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:57 +0100 Subject: docs: kvm: arm/psci.txt: convert to ReST - Add a title for the document; - Adjust whitespaces for it to be properly formatted after parsed. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/arm/index.rst | 1 + Documentation/virt/kvm/arm/psci.rst | 77 ++++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/arm/psci.txt | 61 ---------------------------- 3 files changed, 78 insertions(+), 61 deletions(-) create mode 100644 Documentation/virt/kvm/arm/psci.rst delete mode 100644 Documentation/virt/kvm/arm/psci.txt diff --git a/Documentation/virt/kvm/arm/index.rst b/Documentation/virt/kvm/arm/index.rst index 134fa5fa77e5..3e2b2aba90fc 100644 --- a/Documentation/virt/kvm/arm/index.rst +++ b/Documentation/virt/kvm/arm/index.rst @@ -8,4 +8,5 @@ ARM :maxdepth: 2 hyp-abi + psci pvtime diff --git a/Documentation/virt/kvm/arm/psci.rst b/Documentation/virt/kvm/arm/psci.rst new file mode 100644 index 000000000000..d52c2e83b5b8 --- /dev/null +++ b/Documentation/virt/kvm/arm/psci.rst @@ -0,0 +1,77 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================= +Power State Coordination Interface (PSCI) +========================================= + +KVM implements the PSCI (Power State Coordination Interface) +specification in order to provide services such as CPU on/off, reset +and power-off to the guest. + +The PSCI specification is regularly updated to provide new features, +and KVM implements these updates if they make sense from a virtualization +point of view. + +This means that a guest booted on two different versions of KVM can +observe two different "firmware" revisions. This could cause issues if +a given guest is tied to a particular PSCI revision (unlikely), or if +a migration causes a different PSCI version to be exposed out of the +blue to an unsuspecting guest. + +In order to remedy this situation, KVM exposes a set of "firmware +pseudo-registers" that can be manipulated using the GET/SET_ONE_REG +interface. These registers can be saved/restored by userspace, and set +to a convenient value if required. + +The following register is defined: + +* KVM_REG_ARM_PSCI_VERSION: + + - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set + (and thus has already been initialized) + - Returns the current PSCI version on GET_ONE_REG (defaulting to the + highest PSCI version implemented by KVM and compatible with v0.2) + - Allows any PSCI version implemented by KVM and compatible with + v0.2 to be set with SET_ONE_REG + - Affects the whole VM (even if the register view is per-vcpu) + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: + Holds the state of the firmware support to mitigate CVE-2017-5715, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_1 in [1]. + + Accepted values are: + + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: + KVM does not offer + firmware support for the workaround. The mitigation status for the + guest is unknown. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: + The workaround HVC call is + available to the guest and required for the mitigation. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: + The workaround HVC call + is available to the guest, but it is not needed on this VCPU. + +* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: + Holds the state of the firmware support to mitigate CVE-2018-3639, as + offered by KVM to the guest via a HVC call. The workaround is described + under SMCCC_ARCH_WORKAROUND_2 in [1]_. + + Accepted values are: + + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: + A workaround is not + available. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: + The workaround state is + unknown. KVM does not offer firmware support for the workaround. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: + The workaround is available, + and can be disabled by a vCPU. If + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for + this vCPU. + KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: + The workaround is always active on this vCPU or it is not needed. + +.. [1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf diff --git a/Documentation/virt/kvm/arm/psci.txt b/Documentation/virt/kvm/arm/psci.txt deleted file mode 100644 index 559586fc9d37..000000000000 --- a/Documentation/virt/kvm/arm/psci.txt +++ /dev/null @@ -1,61 +0,0 @@ -KVM implements the PSCI (Power State Coordination Interface) -specification in order to provide services such as CPU on/off, reset -and power-off to the guest. - -The PSCI specification is regularly updated to provide new features, -and KVM implements these updates if they make sense from a virtualization -point of view. - -This means that a guest booted on two different versions of KVM can -observe two different "firmware" revisions. This could cause issues if -a given guest is tied to a particular PSCI revision (unlikely), or if -a migration causes a different PSCI version to be exposed out of the -blue to an unsuspecting guest. - -In order to remedy this situation, KVM exposes a set of "firmware -pseudo-registers" that can be manipulated using the GET/SET_ONE_REG -interface. These registers can be saved/restored by userspace, and set -to a convenient value if required. - -The following register is defined: - -* KVM_REG_ARM_PSCI_VERSION: - - - Only valid if the vcpu has the KVM_ARM_VCPU_PSCI_0_2 feature set - (and thus has already been initialized) - - Returns the current PSCI version on GET_ONE_REG (defaulting to the - highest PSCI version implemented by KVM and compatible with v0.2) - - Allows any PSCI version implemented by KVM and compatible with - v0.2 to be set with SET_ONE_REG - - Affects the whole VM (even if the register view is per-vcpu) - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - Holds the state of the firmware support to mitigate CVE-2017-5715, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_1 in [1]. - Accepted values are: - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: KVM does not offer - firmware support for the workaround. The mitigation status for the - guest is unknown. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: The workaround HVC call is - available to the guest and required for the mitigation. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: The workaround HVC call - is available to the guest, but it is not needed on this VCPU. - -* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - Holds the state of the firmware support to mitigate CVE-2018-3639, as - offered by KVM to the guest via a HVC call. The workaround is described - under SMCCC_ARCH_WORKAROUND_2 in [1]. - Accepted values are: - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: A workaround is not - available. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: The workaround state is - unknown. KVM does not offer firmware support for the workaround. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: The workaround is available, - and can be disabled by a vCPU. If - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for - this vCPU. - KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: The workaround is - always active on this vCPU or it is not needed. - -[1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf -- cgit v1.2.3 From 5a0af4806c25aff4b2f8d2e24d635840ec58a87b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:58 +0100 Subject: docs: kvm: Convert hypercalls.txt to ReST format - Use document title and chapter markups; - Convert tables; - Add markups for literal blocks; - use :field: for field descriptions; - Add blank lines and adjust indentation Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/hypercalls.rst | 171 ++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/hypercalls.txt | 154 ------------------------------ Documentation/virt/kvm/index.rst | 2 + 3 files changed, 173 insertions(+), 154 deletions(-) create mode 100644 Documentation/virt/kvm/hypercalls.rst delete mode 100644 Documentation/virt/kvm/hypercalls.txt diff --git a/Documentation/virt/kvm/hypercalls.rst b/Documentation/virt/kvm/hypercalls.rst new file mode 100644 index 000000000000..dbaf207e560d --- /dev/null +++ b/Documentation/virt/kvm/hypercalls.rst @@ -0,0 +1,171 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +Linux KVM Hypercall +=================== + +X86: + KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall + instruction. The hypervisor can replace it with instructions that are + guaranteed to be supported. + + Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. + The hypercall number should be placed in rax and the return value will be + placed in rax. No other registers will be clobbered unless explicitly stated + by the particular hypercall. + +S390: + R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall + number. The return value is written to R2. + + S390 uses diagnose instruction as hypercall (0x500) along with hypercall + number in R1. + + For further information on the S390 diagnose call as supported by KVM, + refer to Documentation/virt/kvm/s390-diag.txt. + +PowerPC: + It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. + Return value is placed in R3. + + KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' + property inside the device tree's /hypervisor node. + For more information refer to Documentation/virt/kvm/ppc-pv.txt + +MIPS: + KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall + number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and + the return value is placed in $2 (v0). + +KVM Hypercalls Documentation +============================ + +The template for each hypercall is: +1. Hypercall name. +2. Architecture(s) +3. Status (deprecated, obsolete, active) +4. Purpose + +1. KVM_HC_VAPIC_POLL_IRQ +------------------------ + +:Architecture: x86 +:Status: active +:Purpose: Trigger guest exit so that the host can check for pending + interrupts on reentry. + +2. KVM_HC_MMU_OP +---------------- + +:Architecture: x86 +:Status: deprecated. +:Purpose: Support MMU operations such as writing to PTE, + flushing TLB, release PT. + +3. KVM_HC_FEATURES +------------------ + +:Architecture: PPC +:Status: active +:Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid + used to enumerate which hypercalls are available. On PPC, either + device tree based lookup ( which is also what EPAPR dictates) + OR KVM specific enumeration mechanism (which is this hypercall) + can be used. + +4. KVM_HC_PPC_MAP_MAGIC_PAGE +---------------------------- + +:Architecture: PPC +:Status: active +:Purpose: To enable communication between the hypervisor and guest there is a + shared page that contains parts of supervisor visible register state. + The guest can map this shared page to access its supervisor register + through memory using this hypercall. + +5. KVM_HC_KICK_CPU +------------------ + +:Architecture: x86 +:Status: active +:Purpose: Hypercall used to wakeup a vcpu from HLT state +:Usage example: + A vcpu of a paravirtualized guest that is busywaiting in guest + kernel mode for an event to occur (ex: a spinlock to become available) can + execute HLT instruction once it has busy-waited for more than a threshold + time-interval. Execution of HLT instruction would cause the hypervisor to put + the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the + same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, + specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) + is used in the hypercall for future use. + + +6. KVM_HC_CLOCK_PAIRING +----------------------- +:Architecture: x86 +:Status: active +:Purpose: Hypercall used to synchronize host and guest clocks. + +Usage: + +a0: guest physical address where host copies +"struct kvm_clock_offset" structure. + +a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) +is supported (corresponding to the host's CLOCK_REALTIME clock). + + :: + + struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; + }; + + Where: + * sec: seconds from clock_type clock. + * nsec: nanoseconds from clock_type clock. + * tsc: guest TSC value used to calculate sec/nsec pair + * flags: flags, unused (0) at the moment. + +The hypercall lets a guest compute a precise timestamp across +host and guest. The guest can use the returned TSC value to +compute the CLOCK_REALTIME for its clock, at the same instant. + +Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, +or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. + +6. KVM_HC_SEND_IPI +------------------ + +:Architecture: x86 +:Status: active +:Purpose: Send IPIs to multiple vCPUs. + +- a0: lower part of the bitmap of destination APIC IDs +- a1: higher part of the bitmap of destination APIC IDs +- a2: the lowest APIC ID in bitmap +- a3: APIC ICR + +The hypercall lets a guest send multicast IPIs, with at most 128 +128 destinations per hypercall in 64-bit mode and 64 vCPUs per +hypercall in 32-bit mode. The destinations are represented by a +bitmap contained in the first two arguments (a0 and a1). Bit 0 of +a0 corresponds to the APIC ID in the third argument (a2), bit 1 +corresponds to the APIC ID a2+1, and so on. + +Returns the number of CPUs to which the IPIs were delivered successfully. + +7. KVM_HC_SCHED_YIELD +--------------------- + +:Architecture: x86 +:Status: active +:Purpose: Hypercall used to yield if the IPI target vCPU is preempted + +a0: destination APIC ID + +:Usage example: When sending a call-function IPI-many to vCPUs, yield if + any of the IPI target vCPUs was preempted. diff --git a/Documentation/virt/kvm/hypercalls.txt b/Documentation/virt/kvm/hypercalls.txt deleted file mode 100644 index 5f6d291bd004..000000000000 --- a/Documentation/virt/kvm/hypercalls.txt +++ /dev/null @@ -1,154 +0,0 @@ -Linux KVM Hypercall: -=================== -X86: - KVM Hypercalls have a three-byte sequence of either the vmcall or the vmmcall - instruction. The hypervisor can replace it with instructions that are - guaranteed to be supported. - - Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively. - The hypercall number should be placed in rax and the return value will be - placed in rax. No other registers will be clobbered unless explicitly stated - by the particular hypercall. - -S390: - R2-R7 are used for parameters 1-6. In addition, R1 is used for hypercall - number. The return value is written to R2. - - S390 uses diagnose instruction as hypercall (0x500) along with hypercall - number in R1. - - For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virt/kvm/s390-diag.txt. - - PowerPC: - It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. - Return value is placed in R3. - - KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' - property inside the device tree's /hypervisor node. - For more information refer to Documentation/virt/kvm/ppc-pv.txt - -MIPS: - KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall - number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and - the return value is placed in $2 (v0). - -KVM Hypercalls Documentation -=========================== -The template for each hypercall is: -1. Hypercall name. -2. Architecture(s) -3. Status (deprecated, obsolete, active) -4. Purpose - -1. KVM_HC_VAPIC_POLL_IRQ ------------------------- -Architecture: x86 -Status: active -Purpose: Trigger guest exit so that the host can check for pending -interrupts on reentry. - -2. KVM_HC_MMU_OP ------------------------- -Architecture: x86 -Status: deprecated. -Purpose: Support MMU operations such as writing to PTE, -flushing TLB, release PT. - -3. KVM_HC_FEATURES ------------------------- -Architecture: PPC -Status: active -Purpose: Expose hypercall availability to the guest. On x86 platforms, cpuid -used to enumerate which hypercalls are available. On PPC, either device tree -based lookup ( which is also what EPAPR dictates) OR KVM specific enumeration -mechanism (which is this hypercall) can be used. - -4. KVM_HC_PPC_MAP_MAGIC_PAGE ------------------------- -Architecture: PPC -Status: active -Purpose: To enable communication between the hypervisor and guest there is a -shared page that contains parts of supervisor visible register state. -The guest can map this shared page to access its supervisor register through -memory using this hypercall. - -5. KVM_HC_KICK_CPU ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to wakeup a vcpu from HLT state -Usage example : A vcpu of a paravirtualized guest that is busywaiting in guest -kernel mode for an event to occur (ex: a spinlock to become available) can -execute HLT instruction once it has busy-waited for more than a threshold -time-interval. Execution of HLT instruction would cause the hypervisor to put -the vcpu to sleep until occurrence of an appropriate event. Another vcpu of the -same guest can wakeup the sleeping vcpu by issuing KVM_HC_KICK_CPU hypercall, -specifying APIC ID (a1) of the vcpu to be woken up. An additional argument (a0) -is used in the hypercall for future use. - - -6. KVM_HC_CLOCK_PAIRING ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to synchronize host and guest clocks. -Usage: - -a0: guest physical address where host copies -"struct kvm_clock_offset" structure. - -a1: clock_type, ATM only KVM_CLOCK_PAIRING_WALLCLOCK (0) -is supported (corresponding to the host's CLOCK_REALTIME clock). - - struct kvm_clock_pairing { - __s64 sec; - __s64 nsec; - __u64 tsc; - __u32 flags; - __u32 pad[9]; - }; - - Where: - * sec: seconds from clock_type clock. - * nsec: nanoseconds from clock_type clock. - * tsc: guest TSC value used to calculate sec/nsec pair - * flags: flags, unused (0) at the moment. - -The hypercall lets a guest compute a precise timestamp across -host and guest. The guest can use the returned TSC value to -compute the CLOCK_REALTIME for its clock, at the same instant. - -Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource, -or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK. - -6. KVM_HC_SEND_IPI ------------------------- -Architecture: x86 -Status: active -Purpose: Send IPIs to multiple vCPUs. - -a0: lower part of the bitmap of destination APIC IDs -a1: higher part of the bitmap of destination APIC IDs -a2: the lowest APIC ID in bitmap -a3: APIC ICR - -The hypercall lets a guest send multicast IPIs, with at most 128 -128 destinations per hypercall in 64-bit mode and 64 vCPUs per -hypercall in 32-bit mode. The destinations are represented by a -bitmap contained in the first two arguments (a0 and a1). Bit 0 of -a0 corresponds to the APIC ID in the third argument (a2), bit 1 -corresponds to the APIC ID a2+1, and so on. - -Returns the number of CPUs to which the IPIs were delivered successfully. - -7. KVM_HC_SCHED_YIELD ------------------------- -Architecture: x86 -Status: active -Purpose: Hypercall used to yield if the IPI target vCPU is preempted - -a0: destination APIC ID - -Usage example: When sending a call-function IPI-many to vCPUs, yield if -any of the IPI target vCPUs was preempted. diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index 6fe79185b9bc..ac83bc588f7e 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -11,8 +11,10 @@ KVM amd-memory-encryption cpuid halt-polling + hypercalls msr vcpu-requests arm/index + devices/index -- cgit v1.2.3 From 75e7fcdb4a6f394a6644ee1cfe193284945003b5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:02:59 +0100 Subject: docs: kvm: Convert locking.txt to ReST format - Use document title and chapter markups; - Add markups for literal blocks; - use :field: for field descriptions; - Add blank lines and adjust indentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 1 + Documentation/virt/kvm/locking.rst | 243 +++++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/locking.txt | 215 -------------------------------- 3 files changed, 244 insertions(+), 215 deletions(-) create mode 100644 Documentation/virt/kvm/locking.rst delete mode 100644 Documentation/virt/kvm/locking.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index ac83bc588f7e..9be8f53b729d 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -12,6 +12,7 @@ KVM cpuid halt-polling hypercalls + locking msr vcpu-requests diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst new file mode 100644 index 000000000000..c02291beac3f --- /dev/null +++ b/Documentation/virt/kvm/locking.rst @@ -0,0 +1,243 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +KVM Lock Overview +================= + +1. Acquisition Orders +--------------------- + +The acquisition orders for mutexes are as follows: + +- kvm->lock is taken outside vcpu->mutex + +- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock + +- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring + them together is quite rare. + +On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. + +Everything else is a leaf: no other lock is taken inside the critical +sections. + +2. Exception +------------ + +Fast page fault: + +Fast page fault is the fast path which fixes the guest page fault out of +the mmu-lock on x86. Currently, the page fault can be fast in one of the +following two cases: + +1. Access Tracking: The SPTE is not present, but it is marked for access + tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to + restore the saved R/X bits. This is described in more detail later below. + +2. Write-Protection: The SPTE is present and the fault is + caused by write-protect. That means we just need to change the W bit of + the spte. + +What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and +SPTE_MMU_WRITEABLE bit on the spte: + +- SPTE_HOST_WRITEABLE means the gfn is writable on host. +- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when + the gfn is writable on guest mmu and it is not write-protected by shadow + page write-protection. + +On fast page fault path, we will use cmpxchg to atomically set the spte W +bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or +restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This +is safe because whenever changing these bits can be detected by cmpxchg. + +But we need carefully check these cases: + +1) The mapping from gfn to pfn + +The mapping from gfn to pfn may be changed since we can only ensure the pfn +is not changed during cmpxchg. This is a ABA problem, for example, below case +will happen: + ++------------------------------------------------------------------------+ +| At the beginning:: | +| | +| gpte = gfn1 | +| gfn1 is mapped to pfn1 on host | +| spte is the shadow page table entry corresponding with gpte and | +| spte = pfn1 | ++------------------------------------------------------------------------+ +| On fast page fault path: | ++------------------------------------+-----------------------------------+ +| CPU 0: | CPU 1: | ++------------------------------------+-----------------------------------+ +| :: | | +| | | +| old_spte = *spte; | | ++------------------------------------+-----------------------------------+ +| | pfn1 is swapped out:: | +| | | +| | spte = 0; | +| | | +| | pfn1 is re-alloced for gfn2. | +| | | +| | gpte is changed to point to | +| | gfn2 by the guest:: | +| | | +| | spte = pfn1; | ++------------------------------------+-----------------------------------+ +| :: | +| | +| if (cmpxchg(spte, old_spte, old_spte+W) | +| mark_page_dirty(vcpu->kvm, gfn1) | +| OOPS!!! | ++------------------------------------------------------------------------+ + +We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. + +For direct sp, we can easily avoid it since the spte of direct sp is fixed +to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() +to pin gfn to pfn, because after gfn_to_pfn_atomic(): + +- We have held the refcount of pfn that means the pfn can not be freed and + be reused for another gfn. +- The pfn is writable that means it can not be shared between different gfns + by KSM. + +Then, we can ensure the dirty bitmaps is correctly set for a gfn. + +Currently, to simplify the whole things, we disable fast page fault for +indirect shadow page. + +2) Dirty bit tracking + +In the origin code, the spte can be fast updated (non-atomically) if the +spte is read-only and the Accessed bit has already been set since the +Accessed bit and Dirty bit can not be lost. + +But it is not true after fast page fault since the spte can be marked +writable between reading spte and updating spte. Like below case: + ++------------------------------------------------------------------------+ +| At the beginning:: | +| | +| spte.W = 0 | +| spte.Accessed = 1 | ++------------------------------------+-----------------------------------+ +| CPU 0: | CPU 1: | ++------------------------------------+-----------------------------------+ +| In mmu_spte_clear_track_bits():: | | +| | | +| old_spte = *spte; | | +| | | +| | | +| /* 'if' condition is satisfied. */| | +| if (old_spte.Accessed == 1 && | | +| old_spte.W == 0) | | +| spte = 0ull; | | ++------------------------------------+-----------------------------------+ +| | on fast page fault path:: | +| | | +| | spte.W = 1 | +| | | +| | memory write on the spte:: | +| | | +| | spte.Dirty = 1 | ++------------------------------------+-----------------------------------+ +| :: | | +| | | +| else | | +| old_spte = xchg(spte, 0ull) | | +| if (old_spte.Accessed == 1) | | +| kvm_set_pfn_accessed(spte.pfn);| | +| if (old_spte.Dirty == 1) | | +| kvm_set_pfn_dirty(spte.pfn); | | +| OOPS!!! | | ++------------------------------------+-----------------------------------+ + +The Dirty bit is lost in this case. + +In order to avoid this kind of issue, we always treat the spte as "volatile" +if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, +the spte is always atomically updated in this case. + +3) flush tlbs due to spte updated + +If the spte is updated from writable to readonly, we should flush all TLBs, +otherwise rmap_write_protect will find a read-only spte, even though the +writable spte might be cached on a CPU's TLB. + +As mentioned before, the spte can be updated to writable out of mmu-lock on +fast page fault path, in order to easily audit the path, we see if TLBs need +be flushed caused by this reason in mmu_spte_update() since this is a common +function to update spte (present -> present). + +Since the spte is "volatile" if it can be updated out of mmu-lock, we always +atomically update the spte, the race caused by fast page fault can be avoided, +See the comments in spte_has_volatile_bits() and mmu_spte_update(). + +Lockless Access Tracking: + +This is used for Intel CPUs that are using EPT but do not support the EPT A/D +bits. In this case, when the KVM MMU notifier is called to track accesses to a +page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present +by clearing the RWX bits in the PTE and storing the original R & X bits in +some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the +PTE (using the ignored bit 62). When the VM tries to access the page later on, +a fault is generated and the fast page fault mechanism described above is used +to atomically restore the PTE to a Present state. The W bit is not saved when +the PTE is marked for access tracking and during restoration to the Present +state, the W bit is set depending on whether or not it was a write access. If +it wasn't, then the W bit will remain clear until a write access happens, at +which time it will be set using the Dirty tracking mechanism described above. + +3. Reference +------------ + +:Name: kvm_lock +:Type: mutex +:Arch: any +:Protects: - vm_list + +:Name: kvm_count_lock +:Type: raw_spinlock_t +:Arch: any +:Protects: - hardware virtualization enable/disable +:Comment: 'raw' because hardware enabling/disabling must be atomic /wrt + migration. + +:Name: kvm_arch::tsc_write_lock +:Type: raw_spinlock +:Arch: x86 +:Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} + - tsc offset in vmcb +:Comment: 'raw' because updating the tsc offsets must not be preempted. + +:Name: kvm->mmu_lock +:Type: spinlock_t +:Arch: any +:Protects: -shadow page/shadow tlb entry +:Comment: it is a spinlock since it is used in mmu notifier. + +:Name: kvm->srcu +:Type: srcu lock +:Arch: any +:Protects: - kvm->memslots + - kvm->buses +:Comment: The srcu read lock must be held while accessing memslots (e.g. + when using gfn_to_* functions) and while accessing in-kernel + MMIO/PIO address->device structure mapping (kvm->buses). + The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu + if it is needed by multiple functions. + +:Name: blocked_vcpu_on_cpu_lock +:Type: spinlock_t +:Arch: x86 +:Protects: blocked_vcpu_on_cpu +:Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu + protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues + wakeup notification event since external interrupts from the + assigned devices happens, we will find the vCPU on the list to + wakeup. diff --git a/Documentation/virt/kvm/locking.txt b/Documentation/virt/kvm/locking.txt deleted file mode 100644 index 635cd6eaf714..000000000000 --- a/Documentation/virt/kvm/locking.txt +++ /dev/null @@ -1,215 +0,0 @@ -KVM Lock Overview -================= - -1. Acquisition Orders ---------------------- - -The acquisition orders for mutexes are as follows: - -- kvm->lock is taken outside vcpu->mutex - -- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock - -- kvm->slots_lock is taken outside kvm->irq_lock, though acquiring - them together is quite rare. - -On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock. - -Everything else is a leaf: no other lock is taken inside the critical -sections. - -2: Exception ------------- - -Fast page fault: - -Fast page fault is the fast path which fixes the guest page fault out of -the mmu-lock on x86. Currently, the page fault can be fast in one of the -following two cases: - -1. Access Tracking: The SPTE is not present, but it is marked for access -tracking i.e. the SPTE_SPECIAL_MASK is set. That means we need to -restore the saved R/X bits. This is described in more detail later below. - -2. Write-Protection: The SPTE is present and the fault is -caused by write-protect. That means we just need to change the W bit of the -spte. - -What we use to avoid all the race is the SPTE_HOST_WRITEABLE bit and -SPTE_MMU_WRITEABLE bit on the spte: -- SPTE_HOST_WRITEABLE means the gfn is writable on host. -- SPTE_MMU_WRITEABLE means the gfn is writable on mmu. The bit is set when - the gfn is writable on guest mmu and it is not write-protected by shadow - page write-protection. - -On fast page fault path, we will use cmpxchg to atomically set the spte W -bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_WRITE_PROTECT = 1, or -restore the saved R/X bits if VMX_EPT_TRACK_ACCESS mask is set, or both. This -is safe because whenever changing these bits can be detected by cmpxchg. - -But we need carefully check these cases: -1): The mapping from gfn to pfn -The mapping from gfn to pfn may be changed since we can only ensure the pfn -is not changed during cmpxchg. This is a ABA problem, for example, below case -will happen: - -At the beginning: -gpte = gfn1 -gfn1 is mapped to pfn1 on host -spte is the shadow page table entry corresponding with gpte and -spte = pfn1 - - VCPU 0 VCPU0 -on fast page fault path: - - old_spte = *spte; - pfn1 is swapped out: - spte = 0; - - pfn1 is re-alloced for gfn2. - - gpte is changed to point to - gfn2 by the guest: - spte = pfn1; - - if (cmpxchg(spte, old_spte, old_spte+W) - mark_page_dirty(vcpu->kvm, gfn1) - OOPS!!! - -We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. - -For direct sp, we can easily avoid it since the spte of direct sp is fixed -to gfn. For indirect sp, before we do cmpxchg, we call gfn_to_pfn_atomic() -to pin gfn to pfn, because after gfn_to_pfn_atomic(): -- We have held the refcount of pfn that means the pfn can not be freed and - be reused for another gfn. -- The pfn is writable that means it can not be shared between different gfns - by KSM. - -Then, we can ensure the dirty bitmaps is correctly set for a gfn. - -Currently, to simplify the whole things, we disable fast page fault for -indirect shadow page. - -2): Dirty bit tracking -In the origin code, the spte can be fast updated (non-atomically) if the -spte is read-only and the Accessed bit has already been set since the -Accessed bit and Dirty bit can not be lost. - -But it is not true after fast page fault since the spte can be marked -writable between reading spte and updating spte. Like below case: - -At the beginning: -spte.W = 0 -spte.Accessed = 1 - - VCPU 0 VCPU0 -In mmu_spte_clear_track_bits(): - - old_spte = *spte; - - /* 'if' condition is satisfied. */ - if (old_spte.Accessed == 1 && - old_spte.W == 0) - spte = 0ull; - on fast page fault path: - spte.W = 1 - memory write on the spte: - spte.Dirty = 1 - - - else - old_spte = xchg(spte, 0ull) - - - if (old_spte.Accessed == 1) - kvm_set_pfn_accessed(spte.pfn); - if (old_spte.Dirty == 1) - kvm_set_pfn_dirty(spte.pfn); - OOPS!!! - -The Dirty bit is lost in this case. - -In order to avoid this kind of issue, we always treat the spte as "volatile" -if it can be updated out of mmu-lock, see spte_has_volatile_bits(), it means, -the spte is always atomically updated in this case. - -3): flush tlbs due to spte updated -If the spte is updated from writable to readonly, we should flush all TLBs, -otherwise rmap_write_protect will find a read-only spte, even though the -writable spte might be cached on a CPU's TLB. - -As mentioned before, the spte can be updated to writable out of mmu-lock on -fast page fault path, in order to easily audit the path, we see if TLBs need -be flushed caused by this reason in mmu_spte_update() since this is a common -function to update spte (present -> present). - -Since the spte is "volatile" if it can be updated out of mmu-lock, we always -atomically update the spte, the race caused by fast page fault can be avoided, -See the comments in spte_has_volatile_bits() and mmu_spte_update(). - -Lockless Access Tracking: - -This is used for Intel CPUs that are using EPT but do not support the EPT A/D -bits. In this case, when the KVM MMU notifier is called to track accesses to a -page (via kvm_mmu_notifier_clear_flush_young), it marks the PTE as not-present -by clearing the RWX bits in the PTE and storing the original R & X bits in -some unused/ignored bits. In addition, the SPTE_SPECIAL_MASK is also set on the -PTE (using the ignored bit 62). When the VM tries to access the page later on, -a fault is generated and the fast page fault mechanism described above is used -to atomically restore the PTE to a Present state. The W bit is not saved when -the PTE is marked for access tracking and during restoration to the Present -state, the W bit is set depending on whether or not it was a write access. If -it wasn't, then the W bit will remain clear until a write access happens, at -which time it will be set using the Dirty tracking mechanism described above. - -3. Reference ------------- - -Name: kvm_lock -Type: mutex -Arch: any -Protects: - vm_list - -Name: kvm_count_lock -Type: raw_spinlock_t -Arch: any -Protects: - hardware virtualization enable/disable -Comment: 'raw' because hardware enabling/disabling must be atomic /wrt - migration. - -Name: kvm_arch::tsc_write_lock -Type: raw_spinlock -Arch: x86 -Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} - - tsc offset in vmcb -Comment: 'raw' because updating the tsc offsets must not be preempted. - -Name: kvm->mmu_lock -Type: spinlock_t -Arch: any -Protects: -shadow page/shadow tlb entry -Comment: it is a spinlock since it is used in mmu notifier. - -Name: kvm->srcu -Type: srcu lock -Arch: any -Protects: - kvm->memslots - - kvm->buses -Comment: The srcu read lock must be held while accessing memslots (e.g. - when using gfn_to_* functions) and while accessing in-kernel - MMIO/PIO address->device structure mapping (kvm->buses). - The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu - if it is needed by multiple functions. - -Name: blocked_vcpu_on_cpu_lock -Type: spinlock_t -Arch: x86 -Protects: blocked_vcpu_on_cpu -Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. - When VT-d posted-interrupts is supported and the VM has assigned - devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu - protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues - wakeup notification event since external interrupts from the - assigned devices happens, we will find the vCPU on the list to - wakeup. -- cgit v1.2.3 From 037d1f92eff908f794644d49435d8849a3c10461 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:03:00 +0100 Subject: docs: kvm: Convert mmu.txt to ReST format - Use document title and chapter markups; - Add markups for tables; - Add markups for literal blocks; - Add blank lines and adjust indentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 1 + Documentation/virt/kvm/mmu.rst | 483 +++++++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/mmu.txt | 449 ------------------------------------ 3 files changed, 484 insertions(+), 449 deletions(-) create mode 100644 Documentation/virt/kvm/mmu.rst delete mode 100644 Documentation/virt/kvm/mmu.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index 9be8f53b729d..95e2487d38f4 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -13,6 +13,7 @@ KVM halt-polling hypercalls locking + mmu msr vcpu-requests diff --git a/Documentation/virt/kvm/mmu.rst b/Documentation/virt/kvm/mmu.rst new file mode 100644 index 000000000000..60981887d20b --- /dev/null +++ b/Documentation/virt/kvm/mmu.rst @@ -0,0 +1,483 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== +The x86 kvm shadow mmu +====================== + +The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible +for presenting a standard x86 mmu to the guest, while translating guest +physical addresses to host physical addresses. + +The mmu code attempts to satisfy the following requirements: + +- correctness: + the guest should not be able to determine that it is running + on an emulated mmu except for timing (we attempt to comply + with the specification, not emulate the characteristics of + a particular implementation such as tlb size) +- security: + the guest must not be able to touch host memory not assigned + to it +- performance: + minimize the performance penalty imposed by the mmu +- scaling: + need to scale to large memory and large vcpu guests +- hardware: + support the full range of x86 virtualization hardware +- integration: + Linux memory management code must be in control of guest memory + so that swapping, page migration, page merging, transparent + hugepages, and similar features work without change +- dirty tracking: + report writes to guest memory to enable live migration + and framebuffer-based displays +- footprint: + keep the amount of pinned kernel memory low (most memory + should be shrinkable) +- reliability: + avoid multipage or GFP_ATOMIC allocations + +Acronyms +======== + +==== ==================================================================== +pfn host page frame number +hpa host physical address +hva host virtual address +gfn guest frame number +gpa guest physical address +gva guest virtual address +ngpa nested guest physical address +ngva nested guest virtual address +pte page table entry (used also to refer generically to paging structure + entries) +gpte guest pte (referring to gfns) +spte shadow pte (referring to pfns) +tdp two dimensional paging (vendor neutral term for NPT and EPT) +==== ==================================================================== + +Virtual and real hardware supported +=================================== + +The mmu supports first-generation mmu hardware, which allows an atomic switch +of the current paging mode and cr3 during guest entry, as well as +two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware +it exposes is the traditional 2/3/4 level x86 mmu, with support for global +pages, pae, pse, pse36, cr0.wp, and 1GB pages. Emulated hardware also +able to expose NPT capable hardware on NPT capable hosts. + +Translation +=========== + +The primary job of the mmu is to program the processor's mmu to translate +addresses for the guest. Different translations are required at different +times: + +- when guest paging is disabled, we translate guest physical addresses to + host physical addresses (gpa->hpa) +- when guest paging is enabled, we translate guest virtual addresses, to + guest physical addresses, to host physical addresses (gva->gpa->hpa) +- when the guest launches a guest of its own, we translate nested guest + virtual addresses, to nested guest physical addresses, to guest physical + addresses, to host physical addresses (ngva->ngpa->gpa->hpa) + +The primary challenge is to encode between 1 and 3 translations into hardware +that support only 1 (traditional) and 2 (tdp) translations. When the +number of required translations matches the hardware, the mmu operates in +direct mode; otherwise it operates in shadow mode (see below). + +Memory +====== + +Guest memory (gpa) is part of the user address space of the process that is +using kvm. Userspace defines the translation between guest addresses and user +addresses (gpa->hva); note that two gpas may alias to the same hva, but not +vice versa. + +These hvas may be backed using any method available to the host: anonymous +memory, file backed memory, and device memory. Memory might be paged by the +host at any time. + +Events +====== + +The mmu is driven by events, some from the guest, some from the host. + +Guest generated events: + +- writes to control registers (especially cr3) +- invlpg/invlpga instruction execution +- access to missing or protected translations + +Host generated events: + +- changes in the gpa->hpa translation (either through gpa->hva changes or + through hva->hpa changes) +- memory pressure (the shrinker) + +Shadow pages +============ + +The principal data structure is the shadow page, 'struct kvm_mmu_page'. A +shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A +shadow page may contain a mix of leaf and nonleaf sptes. + +A nonleaf spte allows the hardware mmu to reach the leaf pages and +is not related to a translation directly. It points to other shadow pages. + +A leaf spte corresponds to either one or two translations encoded into +one paging structure entry. These are always the lowest level of the +translation stack, with optional higher level translations left to NPT/EPT. +Leaf ptes point at guest pages. + +The following table shows translations encoded by leaf ptes, with higher-level +translations in parentheses: + + Non-nested guests:: + + nonpaging: gpa->hpa + paging: gva->gpa->hpa + paging, tdp: (gva->)gpa->hpa + + Nested guests:: + + non-tdp: ngva->gpa->hpa (*) + tdp: (ngva->)ngpa->gpa->hpa + + (*) the guest hypervisor will encode the ngva->gpa translation into its page + tables if npt is not present + +Shadow pages contain the following information: + role.level: + The level in the shadow paging hierarchy that this shadow page belongs to. + 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. + role.direct: + If set, leaf sptes reachable from this page are for a linear range. + Examples include real mode translation, large guest pages backed by small + host pages, and gpa->hpa translations when NPT or EPT is active. + The linear range starts at (gfn << PAGE_SHIFT) and its size is determined + by role.level (2MB for first level, 1GB for second level, 0.5TB for third + level, 256TB for fourth level) + If clear, this page corresponds to a guest page table denoted by the gfn + field. + role.quadrant: + When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit + sptes. That means a guest page table contains more ptes than the host, + so multiple shadow pages are needed to shadow one guest page. + For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the + first or second 512-gpte block in the guest page table. For second-level + page tables, each 32-bit gpte is converted to two 64-bit sptes + (since each first-level guest page is shadowed by two first-level + shadow pages) so role.quadrant takes values in the range 0..3. Each + quadrant maps 1GB virtual address space. + role.access: + Inherited guest access permissions in the form uwx. Note execute + permission is positive, not negative. + role.invalid: + The page is invalid and should not be used. It is a root page that is + currently pinned (by a cpu hardware register pointing to it); once it is + unpinned it will be destroyed. + role.gpte_is_8_bytes: + Reflects the size of the guest PTE for which the page is valid, i.e. '1' + if 64-bit gptes are in use, '0' if 32-bit gptes are in use. + role.nxe: + Contains the value of efer.nxe for which the page is valid. + role.cr0_wp: + Contains the value of cr0.wp for which the page is valid. + role.smep_andnot_wp: + Contains the value of cr4.smep && !cr0.wp for which the page is valid + (pages for which this is true are different from other pages; see the + treatment of cr0.wp=0 below). + role.smap_andnot_wp: + Contains the value of cr4.smap && !cr0.wp for which the page is valid + (pages for which this is true are different from other pages; see the + treatment of cr0.wp=0 below). + role.ept_sp: + This is a virtual flag to denote a shadowed nested EPT page. ept_sp + is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. + role.smm: + Is 1 if the page is valid in system management mode. This field + determines which of the kvm_memslots array was used to build this + shadow page; it is also used to go back from a struct kvm_mmu_page + to a memslot, through the kvm_memslots_for_spte_role macro and + __gfn_to_memslot. + role.ad_disabled: + Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D + bits before Haswell; shadow EPT page tables also cannot use A/D bits + if the L1 hypervisor does not enable them. + gfn: + Either the guest page table containing the translations shadowed by this + page, or the base page frame for linear translations. See role.direct. + spt: + A pageful of 64-bit sptes containing the translations for this page. + Accessed by both kvm and hardware. + The page pointed to by spt will have its page->private pointing back + at the shadow page structure. + sptes in spt point either at guest pages, or at lower-level shadow pages. + Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point + at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. + The spt array forms a DAG structure with the shadow page as a node, and + guest pages as leaves. + gfns: + An array of 512 guest frame numbers, one for each present pte. Used to + perform a reverse map from a pte to a gfn. When role.direct is set, any + element of this array can be calculated from the gfn field when used, in + this case, the array of gfns is not allocated. See role.direct and gfn. + root_count: + A counter keeping track of how many hardware registers (guest cr3 or + pdptrs) are now pointing at the page. While this counter is nonzero, the + page cannot be destroyed. See role.invalid. + parent_ptes: + The reverse mapping for the pte/ptes pointing at this page's spt. If + parent_ptes bit 0 is zero, only one spte points at this page and + parent_ptes points at this single spte, otherwise, there exists multiple + sptes pointing at this page and (parent_ptes & ~0x1) points at a data + structure with a list of parent sptes. + unsync: + If true, then the translations in this page may not match the guest's + translation. This is equivalent to the state of the tlb when a pte is + changed but before the tlb entry is flushed. Accordingly, unsync ptes + are synchronized when the guest executes invlpg or flushes its tlb by + other means. Valid for leaf pages. + unsync_children: + How many sptes in the page point at pages that are unsync (or have + unsynchronized children). + unsync_child_bitmap: + A bitmap indicating which sptes in spt point (directly or indirectly) at + pages that may be unsynchronized. Used to quickly locate all unsychronized + pages reachable from a given page. + clear_spte_count: + Only present on 32-bit hosts, where a 64-bit spte cannot be written + atomically. The reader uses this while running out of the MMU lock + to detect in-progress updates and retry them until the writer has + finished the write. + write_flooding_count: + A guest may write to a page table many times, causing a lot of + emulations if the page needs to be write-protected (see "Synchronized + and unsynchronized pages" below). Leaf pages can be unsynchronized + so that they do not trigger frequent emulation, but this is not + possible for non-leafs. This field counts the number of emulations + since the last time the page table was actually used; if emulation + is triggered too frequently on this page, KVM will unmap the page + to avoid emulation in the future. + +Reverse map +=========== + +The mmu maintains a reverse mapping whereby all ptes mapping a page can be +reached given its gfn. This is used, for example, when swapping out a page. + +Synchronized and unsynchronized pages +===================================== + +The guest uses two events to synchronize its tlb and page tables: tlb flushes +and page invalidations (invlpg). + +A tlb flush means that we need to synchronize all sptes reachable from the +guest's cr3. This is expensive, so we keep all guest page tables write +protected, and synchronize sptes to gptes when a gpte is written. + +A special case is when a guest page table is reachable from the current +guest cr3. In this case, the guest is obliged to issue an invlpg instruction +before using the translation. We take advantage of that by removing write +protection from the guest page, and allowing the guest to modify it freely. +We synchronize modified gptes when the guest invokes invlpg. This reduces +the amount of emulation we have to do when the guest modifies multiple gptes, +or when the a guest page is no longer used as a page table and is used for +random guest data. + +As a side effect we have to resynchronize all reachable unsynchronized shadow +pages on a tlb flush. + + +Reaction to events +================== + +- guest page fault (or npt page fault, or ept violation) + +This is the most complicated event. The cause of a page fault can be: + + - a true guest fault (the guest translation won't allow the access) (*) + - access to a missing translation + - access to a protected translation + - when logging dirty pages, memory is write protected + - synchronized shadow pages are write protected (*) + - access to untranslatable memory (mmio) + + (*) not applicable in direct mode + +Handling a page fault is performed as follows: + + - if the RSV bit of the error code is set, the page fault is caused by guest + accessing MMIO and cached MMIO information is available. + + - walk shadow page table + - check for valid generation number in the spte (see "Fast invalidation of + MMIO sptes" below) + - cache the information to vcpu->arch.mmio_gva, vcpu->arch.mmio_access and + vcpu->arch.mmio_gfn, and call the emulator + + - If both P bit and R/W bit of error code are set, this could possibly + be handled as a "fast page fault" (fixed without taking the MMU lock). See + the description in Documentation/virt/kvm/locking.txt. + + - if needed, walk the guest page tables to determine the guest translation + (gva->gpa or ngpa->gpa) + + - if permissions are insufficient, reflect the fault back to the guest + + - determine the host page + + - if this is an mmio request, there is no host page; cache the info to + vcpu->arch.mmio_gva, vcpu->arch.mmio_access and vcpu->arch.mmio_gfn + + - walk the shadow page table to find the spte for the translation, + instantiating missing intermediate page tables as necessary + + - If this is an mmio request, cache the mmio info to the spte and set some + reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) + + - try to unsynchronize the page + + - if successful, we can let the guest continue and modify the gpte + + - emulate the instruction + + - if failed, unshadow the page and let the guest continue + + - update any translations that were modified by the instruction + +invlpg handling: + + - walk the shadow page hierarchy and drop affected translations + - try to reinstantiate the indicated translation in the hope that the + guest will use it in the near future + +Guest control register updates: + +- mov to cr3 + + - look up new shadow roots + - synchronize newly reachable shadow pages + +- mov to cr0/cr4/efer + + - set up mmu context for new paging mode + - look up new shadow roots + - synchronize newly reachable shadow pages + +Host translation updates: + + - mmu notifier called with updated hva + - look up affected sptes through reverse map + - drop (or update) translations + +Emulating cr0.wp +================ + +If tdp is not enabled, the host must keep cr0.wp=1 so page write protection +works for the guest kernel, not guest guest userspace. When the guest +cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, +we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the +semantics require allowing any guest kernel access plus user read access). + +We handle this by mapping the permissions to two possible sptes, depending +on fault type: + +- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, + disallows user access) +- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel + write access) + +(user write faults generate a #PF) + +In the first case there are two additional complications: + +- if CR4.SMEP is enabled: since we've turned the page into a kernel page, + the kernel may now execute it. We handle this by also setting spte.nx. + If we get a user fetch or read fault, we'll change spte.u=1 and + spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when + shadow paging is in use. +- if CR4.SMAP is disabled: since the page has been changed to a kernel + page, it can not be reused when CR4.SMAP is enabled. We set + CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, + here we do not care the case that CR4.SMAP is enabled since KVM will + directly inject #PF to guest due to failed permission check. + +To prevent an spte that was converted into a kernel page with cr0.wp=0 +from being written by the kernel after cr0.wp has changed to 1, we make +the value of cr0.wp part of the page role. This means that an spte created +with one value of cr0.wp cannot be used when cr0.wp has a different value - +it will simply be missed by the shadow page lookup code. A similar issue +exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after +changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep +is also made a part of the page role. + +Large pages +=========== + +The mmu supports all combinations of large and small guest and host pages. +Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as +two separate 2M pages, on both guest and host, since the mmu always uses PAE +paging. + +To instantiate a large spte, four constraints must be satisfied: + +- the spte must point to a large host page +- the guest pte must be a large pte of at least equivalent size (if tdp is + enabled, there is no guest pte and this condition is satisfied) +- if the spte will be writeable, the large page frame may not overlap any + write-protected pages +- the guest page must be wholly contained by a single memory slot + +To check the last two conditions, the mmu maintains a ->disallow_lpage set of +arrays for each memory slot and large page size. Every write protected page +causes its disallow_lpage to be incremented, thus preventing instantiation of +a large spte. The frames at the end of an unaligned memory slot have +artificially inflated ->disallow_lpages so they can never be instantiated. + +Fast invalidation of MMIO sptes +=============================== + +As mentioned in "Reaction to events" above, kvm will cache MMIO +information in leaf sptes. When a new memslot is added or an existing +memslot is changed, this information may become stale and needs to be +invalidated. This also needs to hold the MMU lock while walking all +shadow pages, and is made more scalable with a similar technique. + +MMIO sptes have a few spare bits, which are used to store a +generation number. The global generation number is stored in +kvm_memslots(kvm)->generation, and increased whenever guest memory info +changes. + +When KVM finds an MMIO spte, it checks the generation number of the spte. +If the generation number of the spte does not equal the global generation +number, it will ignore the cached MMIO information and handle the page +fault through the slow path. + +Since only 19 bits are used to store generation-number on mmio spte, all +pages are zapped when there is an overflow. + +Unfortunately, a single memory access might access kvm_memslots(kvm) multiple +times, the last one happening when the generation number is retrieved and +stored into the MMIO spte. Thus, the MMIO spte might be created based on +out-of-date information, but with an up-to-date generation number. + +To avoid this, the generation number is incremented again after synchronize_srcu +returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a +memslot update, while some SRCU readers might be using the old copy. We do not +want to use an MMIO sptes created with an odd generation number, and we can do +this without losing a bit in the MMIO spte. The "update in-progress" bit of the +generation is not stored in MMIO spte, and is so is implicitly zero when the +generation is extracted out of the spte. If KVM is unlucky and creates an MMIO +spte while an update is in-progress, the next access to the spte will always be +a cache miss. For example, a subsequent access during the update window will +miss due to the in-progress flag diverging, while an access after the update +window closes will have a higher generation number (as compared to the spte). + + +Further reading +=============== + +- NPT presentation from KVM Forum 2008 + http://www.linux-kvm.org/images/c/c8/KvmForum2008%24kdf2008_21.pdf diff --git a/Documentation/virt/kvm/mmu.txt b/Documentation/virt/kvm/mmu.txt deleted file mode 100644 index dadb29e8738f..000000000000 --- a/Documentation/virt/kvm/mmu.txt +++ /dev/null @@ -1,449 +0,0 @@ -The x86 kvm shadow mmu -====================== - -The mmu (in arch/x86/kvm, files mmu.[ch] and paging_tmpl.h) is responsible -for presenting a standard x86 mmu to the guest, while translating guest -physical addresses to host physical addresses. - -The mmu code attempts to satisfy the following requirements: - -- correctness: the guest should not be able to determine that it is running - on an emulated mmu except for timing (we attempt to comply - with the specification, not emulate the characteristics of - a particular implementation such as tlb size) -- security: the guest must not be able to touch host memory not assigned - to it -- performance: minimize the performance penalty imposed by the mmu -- scaling: need to scale to large memory and large vcpu guests -- hardware: support the full range of x86 virtualization hardware -- integration: Linux memory management code must be in control of guest memory - so that swapping, page migration, page merging, transparent - hugepages, and similar features work without change -- dirty tracking: report writes to guest memory to enable live migration - and framebuffer-based displays -- footprint: keep the amount of pinned kernel memory low (most memory - should be shrinkable) -- reliability: avoid multipage or GFP_ATOMIC allocations - -Acronyms -======== - -pfn host page frame number -hpa host physical address -hva host virtual address -gfn guest frame number -gpa guest physical address -gva guest virtual address -ngpa nested guest physical address -ngva nested guest virtual address -pte page table entry (used also to refer generically to paging structure - entries) -gpte guest pte (referring to gfns) -spte shadow pte (referring to pfns) -tdp two dimensional paging (vendor neutral term for NPT and EPT) - -Virtual and real hardware supported -=================================== - -The mmu supports first-generation mmu hardware, which allows an atomic switch -of the current paging mode and cr3 during guest entry, as well as -two-dimensional paging (AMD's NPT and Intel's EPT). The emulated hardware -it exposes is the traditional 2/3/4 level x86 mmu, with support for global -pages, pae, pse, pse36, cr0.wp, and 1GB pages. Emulated hardware also -able to expose NPT capable hardware on NPT capable hosts. - -Translation -=========== - -The primary job of the mmu is to program the processor's mmu to translate -addresses for the guest. Different translations are required at different -times: - -- when guest paging is disabled, we translate guest physical addresses to - host physical addresses (gpa->hpa) -- when guest paging is enabled, we translate guest virtual addresses, to - guest physical addresses, to host physical addresses (gva->gpa->hpa) -- when the guest launches a guest of its own, we translate nested guest - virtual addresses, to nested guest physical addresses, to guest physical - addresses, to host physical addresses (ngva->ngpa->gpa->hpa) - -The primary challenge is to encode between 1 and 3 translations into hardware -that support only 1 (traditional) and 2 (tdp) translations. When the -number of required translations matches the hardware, the mmu operates in -direct mode; otherwise it operates in shadow mode (see below). - -Memory -====== - -Guest memory (gpa) is part of the user address space of the process that is -using kvm. Userspace defines the translation between guest addresses and user -addresses (gpa->hva); note that two gpas may alias to the same hva, but not -vice versa. - -These hvas may be backed using any method available to the host: anonymous -memory, file backed memory, and device memory. Memory might be paged by the -host at any time. - -Events -====== - -The mmu is driven by events, some from the guest, some from the host. - -Guest generated events: -- writes to control registers (especially cr3) -- invlpg/invlpga instruction execution -- access to missing or protected translations - -Host generated events: -- changes in the gpa->hpa translation (either through gpa->hva changes or - through hva->hpa changes) -- memory pressure (the shrinker) - -Shadow pages -============ - -The principal data structure is the shadow page, 'struct kvm_mmu_page'. A -shadow page contains 512 sptes, which can be either leaf or nonleaf sptes. A -shadow page may contain a mix of leaf and nonleaf sptes. - -A nonleaf spte allows the hardware mmu to reach the leaf pages and -is not related to a translation directly. It points to other shadow pages. - -A leaf spte corresponds to either one or two translations encoded into -one paging structure entry. These are always the lowest level of the -translation stack, with optional higher level translations left to NPT/EPT. -Leaf ptes point at guest pages. - -The following table shows translations encoded by leaf ptes, with higher-level -translations in parentheses: - - Non-nested guests: - nonpaging: gpa->hpa - paging: gva->gpa->hpa - paging, tdp: (gva->)gpa->hpa - Nested guests: - non-tdp: ngva->gpa->hpa (*) - tdp: (ngva->)ngpa->gpa->hpa - -(*) the guest hypervisor will encode the ngva->gpa translation into its page - tables if npt is not present - -Shadow pages contain the following information: - role.level: - The level in the shadow paging hierarchy that this shadow page belongs to. - 1=4k sptes, 2=2M sptes, 3=1G sptes, etc. - role.direct: - If set, leaf sptes reachable from this page are for a linear range. - Examples include real mode translation, large guest pages backed by small - host pages, and gpa->hpa translations when NPT or EPT is active. - The linear range starts at (gfn << PAGE_SHIFT) and its size is determined - by role.level (2MB for first level, 1GB for second level, 0.5TB for third - level, 256TB for fourth level) - If clear, this page corresponds to a guest page table denoted by the gfn - field. - role.quadrant: - When role.gpte_is_8_bytes=0, the guest uses 32-bit gptes while the host uses 64-bit - sptes. That means a guest page table contains more ptes than the host, - so multiple shadow pages are needed to shadow one guest page. - For first-level shadow pages, role.quadrant can be 0 or 1 and denotes the - first or second 512-gpte block in the guest page table. For second-level - page tables, each 32-bit gpte is converted to two 64-bit sptes - (since each first-level guest page is shadowed by two first-level - shadow pages) so role.quadrant takes values in the range 0..3. Each - quadrant maps 1GB virtual address space. - role.access: - Inherited guest access permissions in the form uwx. Note execute - permission is positive, not negative. - role.invalid: - The page is invalid and should not be used. It is a root page that is - currently pinned (by a cpu hardware register pointing to it); once it is - unpinned it will be destroyed. - role.gpte_is_8_bytes: - Reflects the size of the guest PTE for which the page is valid, i.e. '1' - if 64-bit gptes are in use, '0' if 32-bit gptes are in use. - role.nxe: - Contains the value of efer.nxe for which the page is valid. - role.cr0_wp: - Contains the value of cr0.wp for which the page is valid. - role.smep_andnot_wp: - Contains the value of cr4.smep && !cr0.wp for which the page is valid - (pages for which this is true are different from other pages; see the - treatment of cr0.wp=0 below). - role.smap_andnot_wp: - Contains the value of cr4.smap && !cr0.wp for which the page is valid - (pages for which this is true are different from other pages; see the - treatment of cr0.wp=0 below). - role.ept_sp: - This is a virtual flag to denote a shadowed nested EPT page. ept_sp - is true if "cr0_wp && smap_andnot_wp", an otherwise invalid combination. - role.smm: - Is 1 if the page is valid in system management mode. This field - determines which of the kvm_memslots array was used to build this - shadow page; it is also used to go back from a struct kvm_mmu_page - to a memslot, through the kvm_memslots_for_spte_role macro and - __gfn_to_memslot. - role.ad_disabled: - Is 1 if the MMU instance cannot use A/D bits. EPT did not have A/D - bits before Haswell; shadow EPT page tables also cannot use A/D bits - if the L1 hypervisor does not enable them. - gfn: - Either the guest page table containing the translations shadowed by this - page, or the base page frame for linear translations. See role.direct. - spt: - A pageful of 64-bit sptes containing the translations for this page. - Accessed by both kvm and hardware. - The page pointed to by spt will have its page->private pointing back - at the shadow page structure. - sptes in spt point either at guest pages, or at lower-level shadow pages. - Specifically, if sp1 and sp2 are shadow pages, then sp1->spt[n] may point - at __pa(sp2->spt). sp2 will point back at sp1 through parent_pte. - The spt array forms a DAG structure with the shadow page as a node, and - guest pages as leaves. - gfns: - An array of 512 guest frame numbers, one for each present pte. Used to - perform a reverse map from a pte to a gfn. When role.direct is set, any - element of this array can be calculated from the gfn field when used, in - this case, the array of gfns is not allocated. See role.direct and gfn. - root_count: - A counter keeping track of how many hardware registers (guest cr3 or - pdptrs) are now pointing at the page. While this counter is nonzero, the - page cannot be destroyed. See role.invalid. - parent_ptes: - The reverse mapping for the pte/ptes pointing at this page's spt. If - parent_ptes bit 0 is zero, only one spte points at this page and - parent_ptes points at this single spte, otherwise, there exists multiple - sptes pointing at this page and (parent_ptes & ~0x1) points at a data - structure with a list of parent sptes. - unsync: - If true, then the translations in this page may not match the guest's - translation. This is equivalent to the state of the tlb when a pte is - changed but before the tlb entry is flushed. Accordingly, unsync ptes - are synchronized when the guest executes invlpg or flushes its tlb by - other means. Valid for leaf pages. - unsync_children: - How many sptes in the page point at pages that are unsync (or have - unsynchronized children). - unsync_child_bitmap: - A bitmap indicating which sptes in spt point (directly or indirectly) at - pages that may be unsynchronized. Used to quickly locate all unsychronized - pages reachable from a given page. - clear_spte_count: - Only present on 32-bit hosts, where a 64-bit spte cannot be written - atomically. The reader uses this while running out of the MMU lock - to detect in-progress updates and retry them until the writer has - finished the write. - write_flooding_count: - A guest may write to a page table many times, causing a lot of - emulations if the page needs to be write-protected (see "Synchronized - and unsynchronized pages" below). Leaf pages can be unsynchronized - so that they do not trigger frequent emulation, but this is not - possible for non-leafs. This field counts the number of emulations - since the last time the page table was actually used; if emulation - is triggered too frequently on this page, KVM will unmap the page - to avoid emulation in the future. - -Reverse map -=========== - -The mmu maintains a reverse mapping whereby all ptes mapping a page can be -reached given its gfn. This is used, for example, when swapping out a page. - -Synchronized and unsynchronized pages -===================================== - -The guest uses two events to synchronize its tlb and page tables: tlb flushes -and page invalidations (invlpg). - -A tlb flush means that we need to synchronize all sptes reachable from the -guest's cr3. This is expensive, so we keep all guest page tables write -protected, and synchronize sptes to gptes when a gpte is written. - -A special case is when a guest page table is reachable from the current -guest cr3. In this case, the guest is obliged to issue an invlpg instruction -before using the translation. We take advantage of that by removing write -protection from the guest page, and allowing the guest to modify it freely. -We synchronize modified gptes when the guest invokes invlpg. This reduces -the amount of emulation we have to do when the guest modifies multiple gptes, -or when the a guest page is no longer used as a page table and is used for -random guest data. - -As a side effect we have to resynchronize all reachable unsynchronized shadow -pages on a tlb flush. - - -Reaction to events -================== - -- guest page fault (or npt page fault, or ept violation) - -This is the most complicated event. The cause of a page fault can be: - - - a true guest fault (the guest translation won't allow the access) (*) - - access to a missing translation - - access to a protected translation - - when logging dirty pages, memory is write protected - - synchronized shadow pages are write protected (*) - - access to untranslatable memory (mmio) - - (*) not applicable in direct mode - -Handling a page fault is performed as follows: - - - if the RSV bit of the error code is set, the page fault is caused by guest - accessing MMIO and cached MMIO information is available. - - walk shadow page table - - check for valid generation number in the spte (see "Fast invalidation of - MMIO sptes" below) - - cache the information to vcpu->arch.mmio_gva, vcpu->arch.mmio_access and - vcpu->arch.mmio_gfn, and call the emulator - - If both P bit and R/W bit of error code are set, this could possibly - be handled as a "fast page fault" (fixed without taking the MMU lock). See - the description in Documentation/virt/kvm/locking.txt. - - if needed, walk the guest page tables to determine the guest translation - (gva->gpa or ngpa->gpa) - - if permissions are insufficient, reflect the fault back to the guest - - determine the host page - - if this is an mmio request, there is no host page; cache the info to - vcpu->arch.mmio_gva, vcpu->arch.mmio_access and vcpu->arch.mmio_gfn - - walk the shadow page table to find the spte for the translation, - instantiating missing intermediate page tables as necessary - - If this is an mmio request, cache the mmio info to the spte and set some - reserved bit on the spte (see callers of kvm_mmu_set_mmio_spte_mask) - - try to unsynchronize the page - - if successful, we can let the guest continue and modify the gpte - - emulate the instruction - - if failed, unshadow the page and let the guest continue - - update any translations that were modified by the instruction - -invlpg handling: - - - walk the shadow page hierarchy and drop affected translations - - try to reinstantiate the indicated translation in the hope that the - guest will use it in the near future - -Guest control register updates: - -- mov to cr3 - - look up new shadow roots - - synchronize newly reachable shadow pages - -- mov to cr0/cr4/efer - - set up mmu context for new paging mode - - look up new shadow roots - - synchronize newly reachable shadow pages - -Host translation updates: - - - mmu notifier called with updated hva - - look up affected sptes through reverse map - - drop (or update) translations - -Emulating cr0.wp -================ - -If tdp is not enabled, the host must keep cr0.wp=1 so page write protection -works for the guest kernel, not guest guest userspace. When the guest -cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, -we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the -semantics require allowing any guest kernel access plus user read access). - -We handle this by mapping the permissions to two possible sptes, depending -on fault type: - -- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, - disallows user access) -- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel - write access) - -(user write faults generate a #PF) - -In the first case there are two additional complications: -- if CR4.SMEP is enabled: since we've turned the page into a kernel page, - the kernel may now execute it. We handle this by also setting spte.nx. - If we get a user fetch or read fault, we'll change spte.u=1 and - spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when - shadow paging is in use. -- if CR4.SMAP is disabled: since the page has been changed to a kernel - page, it can not be reused when CR4.SMAP is enabled. We set - CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, - here we do not care the case that CR4.SMAP is enabled since KVM will - directly inject #PF to guest due to failed permission check. - -To prevent an spte that was converted into a kernel page with cr0.wp=0 -from being written by the kernel after cr0.wp has changed to 1, we make -the value of cr0.wp part of the page role. This means that an spte created -with one value of cr0.wp cannot be used when cr0.wp has a different value - -it will simply be missed by the shadow page lookup code. A similar issue -exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after -changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep -is also made a part of the page role. - -Large pages -=========== - -The mmu supports all combinations of large and small guest and host pages. -Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as -two separate 2M pages, on both guest and host, since the mmu always uses PAE -paging. - -To instantiate a large spte, four constraints must be satisfied: - -- the spte must point to a large host page -- the guest pte must be a large pte of at least equivalent size (if tdp is - enabled, there is no guest pte and this condition is satisfied) -- if the spte will be writeable, the large page frame may not overlap any - write-protected pages -- the guest page must be wholly contained by a single memory slot - -To check the last two conditions, the mmu maintains a ->disallow_lpage set of -arrays for each memory slot and large page size. Every write protected page -causes its disallow_lpage to be incremented, thus preventing instantiation of -a large spte. The frames at the end of an unaligned memory slot have -artificially inflated ->disallow_lpages so they can never be instantiated. - -Fast invalidation of MMIO sptes -=============================== - -As mentioned in "Reaction to events" above, kvm will cache MMIO -information in leaf sptes. When a new memslot is added or an existing -memslot is changed, this information may become stale and needs to be -invalidated. This also needs to hold the MMU lock while walking all -shadow pages, and is made more scalable with a similar technique. - -MMIO sptes have a few spare bits, which are used to store a -generation number. The global generation number is stored in -kvm_memslots(kvm)->generation, and increased whenever guest memory info -changes. - -When KVM finds an MMIO spte, it checks the generation number of the spte. -If the generation number of the spte does not equal the global generation -number, it will ignore the cached MMIO information and handle the page -fault through the slow path. - -Since only 19 bits are used to store generation-number on mmio spte, all -pages are zapped when there is an overflow. - -Unfortunately, a single memory access might access kvm_memslots(kvm) multiple -times, the last one happening when the generation number is retrieved and -stored into the MMIO spte. Thus, the MMIO spte might be created based on -out-of-date information, but with an up-to-date generation number. - -To avoid this, the generation number is incremented again after synchronize_srcu -returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a -memslot update, while some SRCU readers might be using the old copy. We do not -want to use an MMIO sptes created with an odd generation number, and we can do -this without losing a bit in the MMIO spte. The "update in-progress" bit of the -generation is not stored in MMIO spte, and is so is implicitly zero when the -generation is extracted out of the spte. If KVM is unlucky and creates an MMIO -spte while an update is in-progress, the next access to the spte will always be -a cache miss. For example, a subsequent access during the update window will -miss due to the in-progress flag diverging, while an access after the update -window closes will have a higher generation number (as compared to the spte). - - -Further reading -=============== - -- NPT presentation from KVM Forum 2008 - http://www.linux-kvm.org/images/c/c8/KvmForum2008%24kdf2008_21.pdf - -- cgit v1.2.3 From 320f3f74d9a1a4a193d515de0549eafc82369f47 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:03:01 +0100 Subject: docs: kvm: Convert nested-vmx.txt to ReST format This file is almost in ReST format. Just a small set of changes were needed: - Add markups for lists; - Add markups for a literal block; - Adjust whitespaces. While here, use the standard markup for the document title. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 1 + Documentation/virt/kvm/nested-vmx.rst | 245 ++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/nested-vmx.txt | 240 --------------------------------- 3 files changed, 246 insertions(+), 240 deletions(-) create mode 100644 Documentation/virt/kvm/nested-vmx.rst delete mode 100644 Documentation/virt/kvm/nested-vmx.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index 95e2487d38f4..123385d0a74a 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -15,6 +15,7 @@ KVM locking mmu msr + nested-vmx vcpu-requests arm/index diff --git a/Documentation/virt/kvm/nested-vmx.rst b/Documentation/virt/kvm/nested-vmx.rst new file mode 100644 index 000000000000..592b0ab6970b --- /dev/null +++ b/Documentation/virt/kvm/nested-vmx.rst @@ -0,0 +1,245 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========== +Nested VMX +========== + +Overview +--------- + +On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) +to easily and efficiently run guest operating systems. Normally, these guests +*cannot* themselves be hypervisors running their own guests, because in VMX, +guests cannot use VMX instructions. + +The "Nested VMX" feature adds this missing capability - of running guest +hypervisors (which use VMX) with their own nested guests. It does so by +allowing a guest to use VMX instructions, and correctly and efficiently +emulating them using the single level of VMX available in the hardware. + +We describe in much greater detail the theory behind the nested VMX feature, +its implementation and its performance characteristics, in the OSDI 2010 paper +"The Turtles Project: Design and Implementation of Nested Virtualization", +available at: + + http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf + + +Terminology +----------- + +Single-level virtualization has two levels - the host (KVM) and the guests. +In nested virtualization, we have three levels: The host (KVM), which we call +L0, the guest hypervisor, which we call L1, and its nested guest, which we +call L2. + + +Running nested VMX +------------------ + +The nested VMX feature is disabled by default. It can be enabled by giving +the "nested=1" option to the kvm-intel module. + +No modifications are required to user space (qemu). However, qemu's default +emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be +explicitly enabled, by giving qemu one of the following options: + + - cpu host (emulated CPU has all features of the real CPU) + + - cpu qemu64,+vmx (add just the vmx feature to a named CPU type) + + +ABIs +---- + +Nested VMX aims to present a standard and (eventually) fully-functional VMX +implementation for the a guest hypervisor to use. As such, the official +specification of the ABI that it provides is Intel's VMX specification, +namely volume 3B of their "Intel 64 and IA-32 Architectures Software +Developer's Manual". Not all of VMX's features are currently fully supported, +but the goal is to eventually support them all, starting with the VMX features +which are used in practice by popular hypervisors (KVM and others). + +As a VMX implementation, nested VMX presents a VMCS structure to L1. +As mandated by the spec, other than the two fields revision_id and abort, +this structure is *opaque* to its user, who is not supposed to know or care +about its internal structure. Rather, the structure is accessed through the +VMREAD and VMWRITE instructions. +Still, for debugging purposes, KVM developers might be interested to know the +internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c. + +The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we +also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS +which L0 builds to actually run L2 - how this is done is explained in the +aforementioned paper. + +For convenience, we repeat the content of struct vmcs12 here. If the internals +of this structure changes, this can break live migration across KVM versions. +VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner +struct shadow_vmcs is ever changed. + +:: + + typedef u64 natural_width; + struct __packed vmcs12 { + /* According to the Intel spec, a VMCS region must start with + * these two user-visible fields */ + u32 revision_id; + u32 abort; + + u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ + u32 padding[7]; /* room for future expansion */ + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 apic_access_addr; + u64 ept_pointer; + u64 guest_physical_address; + u64 vmcs_link_pointer; + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + u64 host_ia32_pat; + u64 host_ia32_efer; + u64 padding64[8]; /* room for future expansion */ + natural_width cr0_guest_host_mask; + natural_width cr4_guest_host_mask; + natural_width cr0_read_shadow; + natural_width cr4_read_shadow; + natural_width cr3_target_value0; + natural_width cr3_target_value1; + natural_width cr3_target_value2; + natural_width cr3_target_value3; + natural_width exit_qualification; + natural_width guest_linear_address; + natural_width guest_cr0; + natural_width guest_cr3; + natural_width guest_cr4; + natural_width guest_es_base; + natural_width guest_cs_base; + natural_width guest_ss_base; + natural_width guest_ds_base; + natural_width guest_fs_base; + natural_width guest_gs_base; + natural_width guest_ldtr_base; + natural_width guest_tr_base; + natural_width guest_gdtr_base; + natural_width guest_idtr_base; + natural_width guest_dr7; + natural_width guest_rsp; + natural_width guest_rip; + natural_width guest_rflags; + natural_width guest_pending_dbg_exceptions; + natural_width guest_sysenter_esp; + natural_width guest_sysenter_eip; + natural_width host_cr0; + natural_width host_cr3; + natural_width host_cr4; + natural_width host_fs_base; + natural_width host_gs_base; + natural_width host_tr_base; + natural_width host_gdtr_base; + natural_width host_idtr_base; + natural_width host_ia32_sysenter_esp; + natural_width host_ia32_sysenter_eip; + natural_width host_rsp; + natural_width host_rip; + natural_width paddingl[8]; /* room for future expansion */ + u32 pin_based_vm_exec_control; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + u32 cr3_target_count; + u32 vm_exit_controls; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_controls; + u32 vm_entry_msr_load_count; + u32 vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + u32 secondary_vm_exec_control; + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + u32 guest_interruptibility_info; + u32 guest_activity_state; + u32 guest_sysenter_cs; + u32 host_ia32_sysenter_cs; + u32 padding32[8]; /* room for future expansion */ + u16 virtual_processor_id; + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + }; + + +Authors +------- + +These patches were written by: + - Abel Gordon, abelg il.ibm.com + - Nadav Har'El, nyh il.ibm.com + - Orit Wasserman, oritw il.ibm.com + - Ben-Ami Yassor, benami il.ibm.com + - Muli Ben-Yehuda, muli il.ibm.com + +With contributions by: + - Anthony Liguori, aliguori us.ibm.com + - Mike Day, mdday us.ibm.com + - Michael Factor, factor il.ibm.com + - Zvi Dubitzky, dubi il.ibm.com + +And valuable reviews by: + - Avi Kivity, avi redhat.com + - Gleb Natapov, gleb redhat.com + - Marcelo Tosatti, mtosatti redhat.com + - Kevin Tian, kevin.tian intel.com + - and others. diff --git a/Documentation/virt/kvm/nested-vmx.txt b/Documentation/virt/kvm/nested-vmx.txt deleted file mode 100644 index 97eb1353e962..000000000000 --- a/Documentation/virt/kvm/nested-vmx.txt +++ /dev/null @@ -1,240 +0,0 @@ -Nested VMX -========== - -Overview ---------- - -On Intel processors, KVM uses Intel's VMX (Virtual-Machine eXtensions) -to easily and efficiently run guest operating systems. Normally, these guests -*cannot* themselves be hypervisors running their own guests, because in VMX, -guests cannot use VMX instructions. - -The "Nested VMX" feature adds this missing capability - of running guest -hypervisors (which use VMX) with their own nested guests. It does so by -allowing a guest to use VMX instructions, and correctly and efficiently -emulating them using the single level of VMX available in the hardware. - -We describe in much greater detail the theory behind the nested VMX feature, -its implementation and its performance characteristics, in the OSDI 2010 paper -"The Turtles Project: Design and Implementation of Nested Virtualization", -available at: - - http://www.usenix.org/events/osdi10/tech/full_papers/Ben-Yehuda.pdf - - -Terminology ------------ - -Single-level virtualization has two levels - the host (KVM) and the guests. -In nested virtualization, we have three levels: The host (KVM), which we call -L0, the guest hypervisor, which we call L1, and its nested guest, which we -call L2. - - -Running nested VMX ------------------- - -The nested VMX feature is disabled by default. It can be enabled by giving -the "nested=1" option to the kvm-intel module. - -No modifications are required to user space (qemu). However, qemu's default -emulated CPU type (qemu64) does not list the "VMX" CPU feature, so it must be -explicitly enabled, by giving qemu one of the following options: - - -cpu host (emulated CPU has all features of the real CPU) - - -cpu qemu64,+vmx (add just the vmx feature to a named CPU type) - - -ABIs ----- - -Nested VMX aims to present a standard and (eventually) fully-functional VMX -implementation for the a guest hypervisor to use. As such, the official -specification of the ABI that it provides is Intel's VMX specification, -namely volume 3B of their "Intel 64 and IA-32 Architectures Software -Developer's Manual". Not all of VMX's features are currently fully supported, -but the goal is to eventually support them all, starting with the VMX features -which are used in practice by popular hypervisors (KVM and others). - -As a VMX implementation, nested VMX presents a VMCS structure to L1. -As mandated by the spec, other than the two fields revision_id and abort, -this structure is *opaque* to its user, who is not supposed to know or care -about its internal structure. Rather, the structure is accessed through the -VMREAD and VMWRITE instructions. -Still, for debugging purposes, KVM developers might be interested to know the -internals of this structure; This is struct vmcs12 from arch/x86/kvm/vmx.c. - -The name "vmcs12" refers to the VMCS that L1 builds for L2. In the code we -also have "vmcs01", the VMCS that L0 built for L1, and "vmcs02" is the VMCS -which L0 builds to actually run L2 - how this is done is explained in the -aforementioned paper. - -For convenience, we repeat the content of struct vmcs12 here. If the internals -of this structure changes, this can break live migration across KVM versions. -VMCS12_REVISION (from vmx.c) should be changed if struct vmcs12 or its inner -struct shadow_vmcs is ever changed. - - typedef u64 natural_width; - struct __packed vmcs12 { - /* According to the Intel spec, a VMCS region must start with - * these two user-visible fields */ - u32 revision_id; - u32 abort; - - u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ - u32 padding[7]; /* room for future expansion */ - - u64 io_bitmap_a; - u64 io_bitmap_b; - u64 msr_bitmap; - u64 vm_exit_msr_store_addr; - u64 vm_exit_msr_load_addr; - u64 vm_entry_msr_load_addr; - u64 tsc_offset; - u64 virtual_apic_page_addr; - u64 apic_access_addr; - u64 ept_pointer; - u64 guest_physical_address; - u64 vmcs_link_pointer; - u64 guest_ia32_debugctl; - u64 guest_ia32_pat; - u64 guest_ia32_efer; - u64 guest_pdptr0; - u64 guest_pdptr1; - u64 guest_pdptr2; - u64 guest_pdptr3; - u64 host_ia32_pat; - u64 host_ia32_efer; - u64 padding64[8]; /* room for future expansion */ - natural_width cr0_guest_host_mask; - natural_width cr4_guest_host_mask; - natural_width cr0_read_shadow; - natural_width cr4_read_shadow; - natural_width cr3_target_value0; - natural_width cr3_target_value1; - natural_width cr3_target_value2; - natural_width cr3_target_value3; - natural_width exit_qualification; - natural_width guest_linear_address; - natural_width guest_cr0; - natural_width guest_cr3; - natural_width guest_cr4; - natural_width guest_es_base; - natural_width guest_cs_base; - natural_width guest_ss_base; - natural_width guest_ds_base; - natural_width guest_fs_base; - natural_width guest_gs_base; - natural_width guest_ldtr_base; - natural_width guest_tr_base; - natural_width guest_gdtr_base; - natural_width guest_idtr_base; - natural_width guest_dr7; - natural_width guest_rsp; - natural_width guest_rip; - natural_width guest_rflags; - natural_width guest_pending_dbg_exceptions; - natural_width guest_sysenter_esp; - natural_width guest_sysenter_eip; - natural_width host_cr0; - natural_width host_cr3; - natural_width host_cr4; - natural_width host_fs_base; - natural_width host_gs_base; - natural_width host_tr_base; - natural_width host_gdtr_base; - natural_width host_idtr_base; - natural_width host_ia32_sysenter_esp; - natural_width host_ia32_sysenter_eip; - natural_width host_rsp; - natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ - u32 pin_based_vm_exec_control; - u32 cpu_based_vm_exec_control; - u32 exception_bitmap; - u32 page_fault_error_code_mask; - u32 page_fault_error_code_match; - u32 cr3_target_count; - u32 vm_exit_controls; - u32 vm_exit_msr_store_count; - u32 vm_exit_msr_load_count; - u32 vm_entry_controls; - u32 vm_entry_msr_load_count; - u32 vm_entry_intr_info_field; - u32 vm_entry_exception_error_code; - u32 vm_entry_instruction_len; - u32 tpr_threshold; - u32 secondary_vm_exec_control; - u32 vm_instruction_error; - u32 vm_exit_reason; - u32 vm_exit_intr_info; - u32 vm_exit_intr_error_code; - u32 idt_vectoring_info_field; - u32 idt_vectoring_error_code; - u32 vm_exit_instruction_len; - u32 vmx_instruction_info; - u32 guest_es_limit; - u32 guest_cs_limit; - u32 guest_ss_limit; - u32 guest_ds_limit; - u32 guest_fs_limit; - u32 guest_gs_limit; - u32 guest_ldtr_limit; - u32 guest_tr_limit; - u32 guest_gdtr_limit; - u32 guest_idtr_limit; - u32 guest_es_ar_bytes; - u32 guest_cs_ar_bytes; - u32 guest_ss_ar_bytes; - u32 guest_ds_ar_bytes; - u32 guest_fs_ar_bytes; - u32 guest_gs_ar_bytes; - u32 guest_ldtr_ar_bytes; - u32 guest_tr_ar_bytes; - u32 guest_interruptibility_info; - u32 guest_activity_state; - u32 guest_sysenter_cs; - u32 host_ia32_sysenter_cs; - u32 padding32[8]; /* room for future expansion */ - u16 virtual_processor_id; - u16 guest_es_selector; - u16 guest_cs_selector; - u16 guest_ss_selector; - u16 guest_ds_selector; - u16 guest_fs_selector; - u16 guest_gs_selector; - u16 guest_ldtr_selector; - u16 guest_tr_selector; - u16 host_es_selector; - u16 host_cs_selector; - u16 host_ss_selector; - u16 host_ds_selector; - u16 host_fs_selector; - u16 host_gs_selector; - u16 host_tr_selector; - }; - - -Authors -------- - -These patches were written by: - Abel Gordon, abelg il.ibm.com - Nadav Har'El, nyh il.ibm.com - Orit Wasserman, oritw il.ibm.com - Ben-Ami Yassor, benami il.ibm.com - Muli Ben-Yehuda, muli il.ibm.com - -With contributions by: - Anthony Liguori, aliguori us.ibm.com - Mike Day, mdday us.ibm.com - Michael Factor, factor il.ibm.com - Zvi Dubitzky, dubi il.ibm.com - -And valuable reviews by: - Avi Kivity, avi redhat.com - Gleb Natapov, gleb redhat.com - Marcelo Tosatti, mtosatti redhat.com - Kevin Tian, kevin.tian intel.com - and others. -- cgit v1.2.3 From c849d8613991292d5f945956780bb8134cbce7ed Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:03:02 +0100 Subject: docs: kvm: Convert ppc-pv.txt to ReST format - Use document title and chapter markups; - Add markups for tables; - Use list markups; - Add markups for literal blocks; - Add blank lines. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 1 + Documentation/virt/kvm/ppc-pv.rst | 222 ++++++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/ppc-pv.txt | 212 ------------------------------------ 3 files changed, 223 insertions(+), 212 deletions(-) create mode 100644 Documentation/virt/kvm/ppc-pv.rst delete mode 100644 Documentation/virt/kvm/ppc-pv.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index 123385d0a74a..d0e17e717461 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -16,6 +16,7 @@ KVM mmu msr nested-vmx + ppc-pv vcpu-requests arm/index diff --git a/Documentation/virt/kvm/ppc-pv.rst b/Documentation/virt/kvm/ppc-pv.rst new file mode 100644 index 000000000000..5fdb907670be --- /dev/null +++ b/Documentation/virt/kvm/ppc-pv.rst @@ -0,0 +1,222 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================= +The PPC KVM paravirtual interface +================================= + +The basic execution principle by which KVM on PowerPC works is to run all kernel +space code in PR=1 which is user space. This way we trap all privileged +instructions and can emulate them accordingly. + +Unfortunately that is also the downfall. There are quite some privileged +instructions that needlessly return us to the hypervisor even though they +could be handled differently. + +This is what the PPC PV interface helps with. It takes privileged instructions +and transforms them into unprivileged ones with some help from the hypervisor. +This cuts down virtualization costs by about 50% on some of my benchmarks. + +The code for that interface can be found in arch/powerpc/kernel/kvm* + +Querying for existence +====================== + +To find out if we're running on KVM or not, we leverage the device tree. When +Linux is running on KVM, a node /hypervisor exists. That node contains a +compatible property with the value "linux,kvm". + +Once you determined you're running under a PV capable KVM, you can now use +hypercalls as described below. + +KVM hypercalls +============== + +Inside the device tree's /hypervisor node there's a property called +'hypercall-instructions'. This property contains at most 4 opcodes that make +up the hypercall. To call a hypercall, just call these instructions. + +The parameters are as follows: + + ======== ================ ================ + Register IN OUT + ======== ================ ================ + r0 - volatile + r3 1st parameter Return code + r4 2nd parameter 1st output value + r5 3rd parameter 2nd output value + r6 4th parameter 3rd output value + r7 5th parameter 4th output value + r8 6th parameter 5th output value + r9 7th parameter 6th output value + r10 8th parameter 7th output value + r11 hypercall number 8th output value + r12 - volatile + ======== ================ ================ + +Hypercall definitions are shared in generic code, so the same hypercall numbers +apply for x86 and powerpc alike with the exception that each KVM hypercall +also needs to be ORed with the KVM vendor code which is (42 << 16). + +Return codes can be as follows: + + ==== ========================= + Code Meaning + ==== ========================= + 0 Success + 12 Hypercall not implemented + <0 Error + ==== ========================= + +The magic page +============== + +To enable communication between the hypervisor and guest there is a new shared +page that contains parts of supervisor visible register state. The guest can +map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. + +With this hypercall issued the guest always gets the magic page mapped at the +desired location. The first parameter indicates the effective address when the +MMU is enabled. The second parameter indicates the address in real mode, if +applicable to the target. For now, we always map the page to -4096. This way we +can access it using absolute load and store functions. The following +instruction reads the first field of the magic page:: + + ld rX, -4096(0) + +The interface is designed to be extensible should there be need later to add +additional registers to the magic page. If you add fields to the magic page, +also define a new hypercall feature to indicate that the host can give you more +registers. Only if the host supports the additional features, make use of them. + +The magic page layout is described by struct kvm_vcpu_arch_shared +in arch/powerpc/include/asm/kvm_para.h. + +Magic page features +=================== + +When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, +a second return value is passed to the guest. This second return value contains +a bitmap of available features inside the magic page. + +The following enhancements to the magic page are currently available: + + ============================ ======================================= + KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page + KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs + ============================ ======================================= + +For enhanced features in the magic page, please check for the existence of the +feature before using them! + +Magic page flags +================ + +In addition to features that indicate whether a host is capable of a particular +feature we also have a channel for a guest to tell the guest whether it's capable +of something. This is what we call "flags". + +Flags are passed to the host in the low 12 bits of the Effective Address. + +The following flags are currently available for a guest to expose: + + MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page + +MSR bits +======== + +The MSR contains bits that require hypervisor intervention and bits that do +not require direct hypervisor intervention because they only get interpreted +when entering the guest or don't have any impact on the hypervisor's behavior. + +The following bits are safe to be set inside the guest: + + - MSR_EE + - MSR_RI + +If any other bit changes in the MSR, please still use mtmsr(d). + +Patched instructions +==================== + +The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions +respectively on 32 bit systems with an added offset of 4 to accommodate for big +endianness. + +The following is a list of mapping the Linux kernel performs when running as +guest. Implementing any of those mappings is optional, as the instruction traps +also act on the shared page. So calling privileged instructions still works as +before. + +======================= ================================ +From To +======================= ================================ +mfmsr rX ld rX, magic_page->msr +mfsprg rX, 0 ld rX, magic_page->sprg0 +mfsprg rX, 1 ld rX, magic_page->sprg1 +mfsprg rX, 2 ld rX, magic_page->sprg2 +mfsprg rX, 3 ld rX, magic_page->sprg3 +mfsrr0 rX ld rX, magic_page->srr0 +mfsrr1 rX ld rX, magic_page->srr1 +mfdar rX ld rX, magic_page->dar +mfdsisr rX lwz rX, magic_page->dsisr + +mtmsr rX std rX, magic_page->msr +mtsprg 0, rX std rX, magic_page->sprg0 +mtsprg 1, rX std rX, magic_page->sprg1 +mtsprg 2, rX std rX, magic_page->sprg2 +mtsprg 3, rX std rX, magic_page->sprg3 +mtsrr0 rX std rX, magic_page->srr0 +mtsrr1 rX std rX, magic_page->srr1 +mtdar rX std rX, magic_page->dar +mtdsisr rX stw rX, magic_page->dsisr + +tlbsync nop + +mtmsrd rX, 0 b +mtmsr rX b + +mtmsrd rX, 1 b + +[Book3S only] +mtsrin rX, rY b + +[BookE only] +wrteei [0|1] b +======================= ================================ + +Some instructions require more logic to determine what's going on than a load +or store instruction can deliver. To enable patching of those, we keep some +RAM around where we can live translate instructions to. What happens is the +following: + + 1) copy emulation code to memory + 2) patch that code to fit the emulated instruction + 3) patch that code to return to the original pc + 4 + 4) patch the original instruction to branch to the new code + +That way we can inject an arbitrary amount of code as replacement for a single +instruction. This allows us to check for pending interrupts when setting EE=1 +for example. + +Hypercall ABIs in KVM on PowerPC +================================= + +1) KVM hypercalls (ePAPR) + +These are ePAPR compliant hypercall implementation (mentioned above). Even +generic hypercalls are implemented here, like the ePAPR idle hcall. These are +available on all targets. + +2) PAPR hypercalls + +PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). +These are the same hypercalls that pHyp, the POWER hypervisor implements. Some of +them are handled in the kernel, some are handled in user space. This is only +available on book3s_64. + +3) OSI hypercalls + +Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long +before KVM). This is supported to maintain compatibility. All these hypercalls get +forwarded to user space. This is only useful on book3s_32, but can be used with +book3s_64 as well. diff --git a/Documentation/virt/kvm/ppc-pv.txt b/Documentation/virt/kvm/ppc-pv.txt deleted file mode 100644 index e26115ce4258..000000000000 --- a/Documentation/virt/kvm/ppc-pv.txt +++ /dev/null @@ -1,212 +0,0 @@ -The PPC KVM paravirtual interface -================================= - -The basic execution principle by which KVM on PowerPC works is to run all kernel -space code in PR=1 which is user space. This way we trap all privileged -instructions and can emulate them accordingly. - -Unfortunately that is also the downfall. There are quite some privileged -instructions that needlessly return us to the hypervisor even though they -could be handled differently. - -This is what the PPC PV interface helps with. It takes privileged instructions -and transforms them into unprivileged ones with some help from the hypervisor. -This cuts down virtualization costs by about 50% on some of my benchmarks. - -The code for that interface can be found in arch/powerpc/kernel/kvm* - -Querying for existence -====================== - -To find out if we're running on KVM or not, we leverage the device tree. When -Linux is running on KVM, a node /hypervisor exists. That node contains a -compatible property with the value "linux,kvm". - -Once you determined you're running under a PV capable KVM, you can now use -hypercalls as described below. - -KVM hypercalls -============== - -Inside the device tree's /hypervisor node there's a property called -'hypercall-instructions'. This property contains at most 4 opcodes that make -up the hypercall. To call a hypercall, just call these instructions. - -The parameters are as follows: - - Register IN OUT - - r0 - volatile - r3 1st parameter Return code - r4 2nd parameter 1st output value - r5 3rd parameter 2nd output value - r6 4th parameter 3rd output value - r7 5th parameter 4th output value - r8 6th parameter 5th output value - r9 7th parameter 6th output value - r10 8th parameter 7th output value - r11 hypercall number 8th output value - r12 - volatile - -Hypercall definitions are shared in generic code, so the same hypercall numbers -apply for x86 and powerpc alike with the exception that each KVM hypercall -also needs to be ORed with the KVM vendor code which is (42 << 16). - -Return codes can be as follows: - - Code Meaning - - 0 Success - 12 Hypercall not implemented - <0 Error - -The magic page -============== - -To enable communication between the hypervisor and guest there is a new shared -page that contains parts of supervisor visible register state. The guest can -map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE. - -With this hypercall issued the guest always gets the magic page mapped at the -desired location. The first parameter indicates the effective address when the -MMU is enabled. The second parameter indicates the address in real mode, if -applicable to the target. For now, we always map the page to -4096. This way we -can access it using absolute load and store functions. The following -instruction reads the first field of the magic page: - - ld rX, -4096(0) - -The interface is designed to be extensible should there be need later to add -additional registers to the magic page. If you add fields to the magic page, -also define a new hypercall feature to indicate that the host can give you more -registers. Only if the host supports the additional features, make use of them. - -The magic page layout is described by struct kvm_vcpu_arch_shared -in arch/powerpc/include/asm/kvm_para.h. - -Magic page features -=================== - -When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE, -a second return value is passed to the guest. This second return value contains -a bitmap of available features inside the magic page. - -The following enhancements to the magic page are currently available: - - KVM_MAGIC_FEAT_SR Maps SR registers r/w in the magic page - KVM_MAGIC_FEAT_MAS0_TO_SPRG7 Maps MASn, ESR, PIR and high SPRGs - -For enhanced features in the magic page, please check for the existence of the -feature before using them! - -Magic page flags -================ - -In addition to features that indicate whether a host is capable of a particular -feature we also have a channel for a guest to tell the guest whether it's capable -of something. This is what we call "flags". - -Flags are passed to the host in the low 12 bits of the Effective Address. - -The following flags are currently available for a guest to expose: - - MAGIC_PAGE_FLAG_NOT_MAPPED_NX Guest handles NX bits correctly wrt magic page - -MSR bits -======== - -The MSR contains bits that require hypervisor intervention and bits that do -not require direct hypervisor intervention because they only get interpreted -when entering the guest or don't have any impact on the hypervisor's behavior. - -The following bits are safe to be set inside the guest: - - MSR_EE - MSR_RI - -If any other bit changes in the MSR, please still use mtmsr(d). - -Patched instructions -==================== - -The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions -respectively on 32 bit systems with an added offset of 4 to accommodate for big -endianness. - -The following is a list of mapping the Linux kernel performs when running as -guest. Implementing any of those mappings is optional, as the instruction traps -also act on the shared page. So calling privileged instructions still works as -before. - -From To -==== == - -mfmsr rX ld rX, magic_page->msr -mfsprg rX, 0 ld rX, magic_page->sprg0 -mfsprg rX, 1 ld rX, magic_page->sprg1 -mfsprg rX, 2 ld rX, magic_page->sprg2 -mfsprg rX, 3 ld rX, magic_page->sprg3 -mfsrr0 rX ld rX, magic_page->srr0 -mfsrr1 rX ld rX, magic_page->srr1 -mfdar rX ld rX, magic_page->dar -mfdsisr rX lwz rX, magic_page->dsisr - -mtmsr rX std rX, magic_page->msr -mtsprg 0, rX std rX, magic_page->sprg0 -mtsprg 1, rX std rX, magic_page->sprg1 -mtsprg 2, rX std rX, magic_page->sprg2 -mtsprg 3, rX std rX, magic_page->sprg3 -mtsrr0 rX std rX, magic_page->srr0 -mtsrr1 rX std rX, magic_page->srr1 -mtdar rX std rX, magic_page->dar -mtdsisr rX stw rX, magic_page->dsisr - -tlbsync nop - -mtmsrd rX, 0 b -mtmsr rX b - -mtmsrd rX, 1 b - -[Book3S only] -mtsrin rX, rY b - -[BookE only] -wrteei [0|1] b - - -Some instructions require more logic to determine what's going on than a load -or store instruction can deliver. To enable patching of those, we keep some -RAM around where we can live translate instructions to. What happens is the -following: - - 1) copy emulation code to memory - 2) patch that code to fit the emulated instruction - 3) patch that code to return to the original pc + 4 - 4) patch the original instruction to branch to the new code - -That way we can inject an arbitrary amount of code as replacement for a single -instruction. This allows us to check for pending interrupts when setting EE=1 -for example. - -Hypercall ABIs in KVM on PowerPC -================================= -1) KVM hypercalls (ePAPR) - -These are ePAPR compliant hypercall implementation (mentioned above). Even -generic hypercalls are implemented here, like the ePAPR idle hcall. These are -available on all targets. - -2) PAPR hypercalls - -PAPR hypercalls are needed to run server PowerPC PAPR guests (-M pseries in QEMU). -These are the same hypercalls that pHyp, the POWER hypervisor implements. Some of -them are handled in the kernel, some are handled in user space. This is only -available on book3s_64. - -3) OSI hypercalls - -Mac-on-Linux is another user of KVM on PowerPC, which has its own hypercall (long -before KVM). This is supported to maintain compatibility. All these hypercalls get -forwarded to user space. This is only useful on book3s_32, but can be used with -book3s_64 as well. -- cgit v1.2.3 From a9700af64e1bb28fc18fd362307bc787e10e340b Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:03:03 +0100 Subject: docs: kvm: Convert s390-diag.txt to ReST format This file is almost in ReST format. Just one change was needed: - Add markups for a literal block and change its indentation. While here, use the standard markup for the document title. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 1 + Documentation/virt/kvm/s390-diag.rst | 86 ++++++++++++++++++++++++++++++++++++ Documentation/virt/kvm/s390-diag.txt | 83 ---------------------------------- 3 files changed, 87 insertions(+), 83 deletions(-) create mode 100644 Documentation/virt/kvm/s390-diag.rst delete mode 100644 Documentation/virt/kvm/s390-diag.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index d0e17e717461..e5ea75f97d52 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -17,6 +17,7 @@ KVM msr nested-vmx ppc-pv + s390-diag vcpu-requests arm/index diff --git a/Documentation/virt/kvm/s390-diag.rst b/Documentation/virt/kvm/s390-diag.rst new file mode 100644 index 000000000000..eaac4864d3d6 --- /dev/null +++ b/Documentation/virt/kvm/s390-diag.rst @@ -0,0 +1,86 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================= +The s390 DIAGNOSE call on KVM +============================= + +KVM on s390 supports the DIAGNOSE call for making hypercalls, both for +native hypercalls and for selected hypercalls found on other s390 +hypervisors. + +Note that bits are numbered as by the usual s390 convention (most significant +bit on the left). + + +General remarks +--------------- + +DIAGNOSE calls by the guest cause a mandatory intercept. This implies +all supported DIAGNOSE calls need to be handled by either KVM or its +userspace. + +All DIAGNOSE calls supported by KVM use the RS-a format:: + + -------------------------------------- + | '83' | R1 | R3 | B2 | D2 | + -------------------------------------- + 0 8 12 16 20 31 + +The second-operand address (obtained by the base/displacement calculation) +is not used to address data. Instead, bits 48-63 of this address specify +the function code, and bits 0-47 are ignored. + +The supported DIAGNOSE function codes vary by the userspace used. For +DIAGNOSE function codes not specific to KVM, please refer to the +documentation for the s390 hypervisors defining them. + + +DIAGNOSE function code 'X'500' - KVM virtio functions +----------------------------------------------------- + +If the function code specifies 0x500, various virtio-related functions +are performed. + +General register 1 contains the virtio subfunction code. Supported +virtio subfunctions depend on KVM's userspace. Generally, userspace +provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3). + +Upon completion of the DIAGNOSE instruction, general register 2 contains +the function's return code, which is either a return code or a subcode +specific value. + +Subcode 0 - s390-virtio notification and early console printk + Handled by userspace. + +Subcode 1 - s390-virtio reset + Handled by userspace. + +Subcode 2 - s390-virtio set status + Handled by userspace. + +Subcode 3 - virtio-ccw notification + Handled by either userspace or KVM (ioeventfd case). + + General register 2 contains a subchannel-identification word denoting + the subchannel of the virtio-ccw proxy device to be notified. + + General register 3 contains the number of the virtqueue to be notified. + + General register 4 contains a 64bit identifier for KVM usage (the + kvm_io_bus cookie). If general register 4 does not contain a valid + identifier, it is ignored. + + After completion of the DIAGNOSE call, general register 2 may contain + a 64bit identifier (in the kvm_io_bus cookie case), or a negative + error value, if an internal error occurred. + + See also the virtio standard for a discussion of this hypercall. + + +DIAGNOSE function code 'X'501 - KVM breakpoint +---------------------------------------------- + +If the function code specifies 0x501, breakpoint functions may be performed. +This function code is handled by userspace. + +This diagnose function code has no subfunctions and uses no parameters. diff --git a/Documentation/virt/kvm/s390-diag.txt b/Documentation/virt/kvm/s390-diag.txt deleted file mode 100644 index 7c52e5f8b210..000000000000 --- a/Documentation/virt/kvm/s390-diag.txt +++ /dev/null @@ -1,83 +0,0 @@ -The s390 DIAGNOSE call on KVM -============================= - -KVM on s390 supports the DIAGNOSE call for making hypercalls, both for -native hypercalls and for selected hypercalls found on other s390 -hypervisors. - -Note that bits are numbered as by the usual s390 convention (most significant -bit on the left). - - -General remarks ---------------- - -DIAGNOSE calls by the guest cause a mandatory intercept. This implies -all supported DIAGNOSE calls need to be handled by either KVM or its -userspace. - -All DIAGNOSE calls supported by KVM use the RS-a format: - --------------------------------------- -| '83' | R1 | R3 | B2 | D2 | --------------------------------------- -0 8 12 16 20 31 - -The second-operand address (obtained by the base/displacement calculation) -is not used to address data. Instead, bits 48-63 of this address specify -the function code, and bits 0-47 are ignored. - -The supported DIAGNOSE function codes vary by the userspace used. For -DIAGNOSE function codes not specific to KVM, please refer to the -documentation for the s390 hypervisors defining them. - - -DIAGNOSE function code 'X'500' - KVM virtio functions ------------------------------------------------------ - -If the function code specifies 0x500, various virtio-related functions -are performed. - -General register 1 contains the virtio subfunction code. Supported -virtio subfunctions depend on KVM's userspace. Generally, userspace -provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3). - -Upon completion of the DIAGNOSE instruction, general register 2 contains -the function's return code, which is either a return code or a subcode -specific value. - -Subcode 0 - s390-virtio notification and early console printk - Handled by userspace. - -Subcode 1 - s390-virtio reset - Handled by userspace. - -Subcode 2 - s390-virtio set status - Handled by userspace. - -Subcode 3 - virtio-ccw notification - Handled by either userspace or KVM (ioeventfd case). - - General register 2 contains a subchannel-identification word denoting - the subchannel of the virtio-ccw proxy device to be notified. - - General register 3 contains the number of the virtqueue to be notified. - - General register 4 contains a 64bit identifier for KVM usage (the - kvm_io_bus cookie). If general register 4 does not contain a valid - identifier, it is ignored. - - After completion of the DIAGNOSE call, general register 2 may contain - a 64bit identifier (in the kvm_io_bus cookie case), or a negative - error value, if an internal error occurred. - - See also the virtio standard for a discussion of this hypercall. - - -DIAGNOSE function code 'X'501 - KVM breakpoint ----------------------------------------------- - -If the function code specifies 0x501, breakpoint functions may be performed. -This function code is handled by userspace. - -This diagnose function code has no subfunctions and uses no parameters. -- cgit v1.2.3 From 6012d9a9fa693e608f4de3c5a13741794dc4b2c7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:03:04 +0100 Subject: docs: kvm: Convert timekeeping.txt to ReST format - Use document title and chapter markups; - Add markups for literal blocks; - Add markups for tables; - use :field: for field descriptions; - Add blank lines and adjust indentation. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 1 + Documentation/virt/kvm/timekeeping.rst | 645 +++++++++++++++++++++++++++++++++ Documentation/virt/kvm/timekeeping.txt | 612 ------------------------------- 3 files changed, 646 insertions(+), 612 deletions(-) create mode 100644 Documentation/virt/kvm/timekeeping.rst delete mode 100644 Documentation/virt/kvm/timekeeping.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index e5ea75f97d52..7c1be8910837 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -18,6 +18,7 @@ KVM nested-vmx ppc-pv s390-diag + timekeeping vcpu-requests arm/index diff --git a/Documentation/virt/kvm/timekeeping.rst b/Documentation/virt/kvm/timekeeping.rst new file mode 100644 index 000000000000..21ae7efa29ba --- /dev/null +++ b/Documentation/virt/kvm/timekeeping.rst @@ -0,0 +1,645 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================================================== +Timekeeping Virtualization for X86-Based Architectures +====================================================== + +:Author: Zachary Amsden +:Copyright: (c) 2010, Red Hat. All rights reserved. + +.. Contents + + 1) Overview + 2) Timing Devices + 3) TSC Hardware + 4) Virtualization Problems + +1. Overview +=========== + +One of the most complicated parts of the X86 platform, and specifically, +the virtualization of this platform is the plethora of timing devices available +and the complexity of emulating those devices. In addition, virtualization of +time introduces a new set of challenges because it introduces a multiplexed +division of time beyond the control of the guest CPU. + +First, we will describe the various timekeeping hardware available, then +present some of the problems which arise and solutions available, giving +specific recommendations for certain classes of KVM guests. + +The purpose of this document is to collect data and information relevant to +timekeeping which may be difficult to find elsewhere, specifically, +information relevant to KVM and hardware-based virtualization. + +2. Timing Devices +================= + +First we discuss the basic hardware devices available. TSC and the related +KVM clock are special enough to warrant a full exposition and are described in +the following section. + +2.1. i8254 - PIT +---------------- + +One of the first timer devices available is the programmable interrupt timer, +or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three +channels which can be programmed to deliver periodic or one-shot interrupts. +These three channels can be configured in different modes and have individual +counters. Channel 1 and 2 were not available for general use in the original +IBM PC, and historically were connected to control RAM refresh and the PC +speaker. Now the PIT is typically integrated as part of an emulated chipset +and a separate physical PIT is not used. + +The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done +using single or multiple byte access to the I/O ports. There are 6 modes +available, but not all modes are available to all timers, as only timer 2 +has a connected gate input, required for modes 1 and 5. The gate line is +controlled by port 61h, bit 0, as illustrated in the following diagram:: + + -------------- ---------------- + | | | | + | 1.1932 MHz|---------->| CLOCK OUT | ---------> IRQ 0 + | Clock | | | | + -------------- | +->| GATE TIMER 0 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM + | | | (aka /dev/null) + | +->| GATE TIMER 1 | + | ---------------- + | + | ---------------- + | | | + |------>| CLOCK OUT | ---------> Port 61h, bit 5 + | | | + Port 61h, bit 0 -------->| GATE TIMER 2 | \_.---- ____ + ---------------- _| )--|LPF|---Speaker + / *---- \___/ + Port 61h, bit 1 ---------------------------------/ + +The timer modes are now described. + +Mode 0: Single Timeout. + This is a one-shot software timeout that counts down + when the gate is high (always true for timers 0 and 1). When the count + reaches zero, the output goes high. + +Mode 1: Triggered One-shot. + The output is initially set high. When the gate + line is set high, a countdown is initiated (which does not stop if the gate is + lowered), during which the output is set low. When the count reaches zero, + the output goes high. + +Mode 2: Rate Generator. + The output is initially set high. When the countdown + reaches 1, the output goes low for one count and then returns high. The value + is reloaded and the countdown automatically resumes. If the gate line goes + low, the count is halted. If the output is low when the gate is lowered, the + output automatically goes high (this only affects timer 2). + +Mode 3: Square Wave. + This generates a high / low square wave. The count + determines the length of the pulse, which alternates between high and low + when zero is reached. The count only proceeds when gate is high and is + automatically reloaded on reaching zero. The count is decremented twice at + each clock to generate a full high / low cycle at the full periodic rate. + If the count is even, the clock remains high for N/2 counts and low for N/2 + counts; if the clock is odd, the clock is high for (N+1)/2 counts and low + for (N-1)/2 counts. Only even values are latched by the counter, so odd + values are not observed when reading. This is the intended mode for timer 2, + which generates sine-like tones by low-pass filtering the square wave output. + +Mode 4: Software Strobe. + After programming this mode and loading the counter, + the output remains high until the counter reaches zero. Then the output + goes low for 1 clock cycle and returns high. The counter is not reloaded. + Counting only occurs when gate is high. + +Mode 5: Hardware Strobe. + After programming and loading the counter, the + output remains high. When the gate is raised, a countdown is initiated + (which does not stop if the gate is lowered). When the counter reaches zero, + the output goes low for 1 clock cycle and then returns high. The counter is + not reloaded. + +In addition to normal binary counting, the PIT supports BCD counting. The +command port, 0x43 is used to set the counter and mode for each of the three +timers. + +PIT commands, issued to port 0x43, using the following bit encoding:: + + Bit 7-4: Command (See table below) + Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) + Bit 0 : Binary (0) / BCD (1) + +Command table:: + + 0000 - Latch Timer 0 count for port 0x40 + sample and hold the count to be read in port 0x40; + additional commands ignored until counter is read; + mode bits ignored. + + 0001 - Set Timer 0 LSB mode for port 0x40 + set timer to read LSB only and force MSB to zero; + mode bits set timer mode + + 0010 - Set Timer 0 MSB mode for port 0x40 + set timer to read MSB only and force LSB to zero; + mode bits set timer mode + + 0011 - Set Timer 0 16-bit mode for port 0x40 + set timer to read / write LSB first, then MSB; + mode bits set timer mode + + 0100 - Latch Timer 1 count for port 0x41 - as described above + 0101 - Set Timer 1 LSB mode for port 0x41 - as described above + 0110 - Set Timer 1 MSB mode for port 0x41 - as described above + 0111 - Set Timer 1 16-bit mode for port 0x41 - as described above + + 1000 - Latch Timer 2 count for port 0x42 - as described above + 1001 - Set Timer 2 LSB mode for port 0x42 - as described above + 1010 - Set Timer 2 MSB mode for port 0x42 - as described above + 1011 - Set Timer 2 16-bit mode for port 0x42 as described above + + 1101 - General counter latch + Latch combination of counters into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + Bit 0 = Unused + + 1110 - Latch timer status + Latch combination of counter mode into corresponding ports + Bit 3 = Counter 2 + Bit 2 = Counter 1 + Bit 1 = Counter 0 + + The output of ports 0x40-0x42 following this command will be: + + Bit 7 = Output pin + Bit 6 = Count loaded (0 if timer has expired) + Bit 5-4 = Read / Write mode + 01 = MSB only + 10 = LSB only + 11 = LSB / MSB (16-bit) + Bit 3-1 = Mode + Bit 0 = Binary (0) / BCD mode (1) + +2.2. RTC +-------- + +The second device which was available in the original PC was the MC146818 real +time clock. The original device is now obsolete, and usually emulated by the +system chipset, sometimes by an HPET and some frankenstein IRQ routing. + +The RTC is accessed through CMOS variables, which uses an index register to +control which bytes are read. Since there is only one index register, read +of the CMOS and read of the RTC require lock protection (in addition, it is +dangerous to allow userspace utilities such as hwclock to have direct RTC +access, as they could corrupt kernel reads and writes of CMOS memory). + +The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt +can function as a periodic timer, an additional once a day alarm, and can issue +interrupts after an update of the CMOS registers by the MC146818 is complete. +The type of interrupt is signalled in the RTC status registers. + +The RTC will update the current time fields by battery power even while the +system is off. The current time fields should not be read while an update is +in progress, as indicated in the status register. + +The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be +programmed to a 32kHz divider if the RTC is to count seconds. + +This is the RAM map originally used for the RTC/CMOS:: + + Location Size Description + ------------------------------------------ + 00h byte Current second (BCD) + 01h byte Seconds alarm (BCD) + 02h byte Current minute (BCD) + 03h byte Minutes alarm (BCD) + 04h byte Current hour (BCD) + 05h byte Hours alarm (BCD) + 06h byte Current day of week (BCD) + 07h byte Current day of month (BCD) + 08h byte Current month (BCD) + 09h byte Current year (BCD) + 0Ah byte Register A + bit 7 = Update in progress + bit 6-4 = Divider for clock + 000 = 4.194 MHz + 001 = 1.049 MHz + 010 = 32 kHz + 10X = test modes + 110 = reset / disable + 111 = reset / disable + bit 3-0 = Rate selection for periodic interrupt + 000 = periodic timer disabled + 001 = 3.90625 uS + 010 = 7.8125 uS + 011 = .122070 mS + 100 = .244141 mS + ... + 1101 = 125 mS + 1110 = 250 mS + 1111 = 500 mS + 0Bh byte Register B + bit 7 = Run (0) / Halt (1) + bit 6 = Periodic interrupt enable + bit 5 = Alarm interrupt enable + bit 4 = Update-ended interrupt enable + bit 3 = Square wave interrupt enable + bit 2 = BCD calendar (0) / Binary (1) + bit 1 = 12-hour mode (0) / 24-hour mode (1) + bit 0 = 0 (DST off) / 1 (DST enabled) + OCh byte Register C (read only) + bit 7 = interrupt request flag (IRQF) + bit 6 = periodic interrupt flag (PF) + bit 5 = alarm interrupt flag (AF) + bit 4 = update interrupt flag (UF) + bit 3-0 = reserved + ODh byte Register D (read only) + bit 7 = RTC has power + bit 6-0 = reserved + 32h byte Current century BCD (*) + (*) location vendor specific and now determined from ACPI global tables + +2.3. APIC +--------- + +On Pentium and later processors, an on-board timer is available to each CPU +as part of the Advanced Programmable Interrupt Controller. The APIC is +accessed through memory-mapped registers and provides interrupt service to each +CPU, used for IPIs and local timer interrupts. + +Although in theory the APIC is a safe and stable source for local interrupts, +in practice, many bugs and glitches have occurred due to the special nature of +the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect +the use of the APIC and that workarounds may be required. In addition, some of +these workarounds pose unique constraints for virtualization - requiring either +extra overhead incurred from extra reads of memory-mapped I/O or additional +functionality that may be more computationally expensive to implement. + +Since the APIC is documented quite well in the Intel and AMD manuals, we will +avoid repetition of the detail here. It should be pointed out that the APIC +timer is programmed through the LVT (local vector timer) register, is capable +of one-shot or periodic operation, and is based on the bus clock divided down +by the programmable divider register. + +2.4. HPET +--------- + +HPET is quite complex, and was originally intended to replace the PIT / RTC +support of the X86 PC. It remains to be seen whether that will be the case, as +the de facto standard of PC hardware is to emulate these older devices. Some +systems designated as legacy free may support only the HPET as a hardware timer +device. + +The HPET spec is rather loose and vague, requiring at least 3 hardware timers, +but allowing implementation freedom to support many more. It also imposes no +fixed rate on the timer frequency, but does impose some extremal values on +frequency, error and slew. + +In general, the HPET is recommended as a high precision (compared to PIT /RTC) +time source which is independent of local variation (as there is only one HPET +in any given system). The HPET is also memory-mapped, and its presence is +indicated through ACPI tables by the BIOS. + +Detailed specification of the HPET is beyond the current scope of this +document, as it is also very well documented elsewhere. + +2.5. Offboard Timers +-------------------- + +Several cards, both proprietary (watchdog boards) and commonplace (e1000) have +timing chips built into the cards which may have registers which are accessible +to kernel or user drivers. To the author's knowledge, using these to generate +a clocksource for a Linux or other kernel has not yet been attempted and is in +general frowned upon as not playing by the agreed rules of the game. Such a +timer device would require additional support to be virtualized properly and is +not considered important at this time as no known operating system does this. + +3. TSC Hardware +=============== + +The TSC or time stamp counter is relatively simple in theory; it counts +instruction cycles issued by the processor, which can be used as a measure of +time. In practice, due to a number of problems, it is the most complicated +timekeeping device to use. + +The TSC is represented internally as a 64-bit MSR which can be read with the +RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware +limitations made it possible to write the TSC, but generally on old hardware it +was only possible to write the low 32-bits of the 64-bit counter, and the upper +32-bits of the counter were cleared. Now, however, on Intel processors family +0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction +has been lifted and all 64-bits are writable. On AMD systems, the ability to +write the TSC MSR is not an architectural guarantee. + +The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by +means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. + +Some vendors have implemented an additional instruction, RDTSCP, which returns +atomically not just the TSC, but an indicator which corresponds to the +processor number. This can be used to index into an array of TSC variables to +determine offset information in SMP systems where TSCs are not synchronized. +The presence of this instruction must be determined by consulting CPUID feature +bits. + +Both VMX and SVM provide extension fields in the virtualization hardware which +allows the guest visible TSC to be offset by a constant. Newer implementations +promise to allow the TSC to additionally be scaled, but this hardware is not +yet widely available. + +3.1. TSC synchronization +------------------------ + +The TSC is a CPU-local clock in most implementations. This means, on SMP +platforms, the TSCs of different CPUs may start at different times depending +on when the CPUs are powered on. Generally, CPUs on the same die will share +the same clock, however, this is not always the case. + +The BIOS may attempt to resynchronize the TSCs during the poweron process and +the operating system or other system software may attempt to do this as well. +Several hardware limitations make the problem worse - if it is not possible to +write the full 64-bits of the TSC, it may be impossible to match the TSC in +newly arriving CPUs to that of the rest of the system, resulting in +unsynchronized TSCs. This may be done by BIOS or system software, but in +practice, getting a perfectly synchronized TSC will not be possible unless all +values are read from the same clock, which generally only is possible on single +socket systems or those with special hardware support. + +3.2. TSC and CPU hotplug +------------------------ + +As touched on already, CPUs which arrive later than the boot time of the system +may not have a TSC value that is synchronized with the rest of the system. +Either system software, BIOS, or SMM code may actually try to establish the TSC +to a value matching the rest of the system, but a perfect match is usually not +a guarantee. This can have the effect of bringing a system from a state where +TSC is synchronized back to a state where TSC synchronization flaws, however +small, may be exposed to the OS and any virtualization environment. + +3.3. TSC and multi-socket / NUMA +-------------------------------- + +Multi-socket systems, especially large multi-socket systems are likely to have +individual clocksources rather than a single, universally distributed clock. +Since these clocks are driven by different crystals, they will not have +perfectly matched frequency, and temperature and electrical variations will +cause the CPU clocks, and thus the TSCs to drift over time. Depending on the +exact clock and bus design, the drift may or may not be fixed in absolute +error, and may accumulate over time. + +In addition, very large systems may deliberately slew the clocks of individual +cores. This technique, known as spread-spectrum clocking, reduces EMI at the +clock frequency and harmonics of it, which may be required to pass FCC +standards for telecommunications and computer equipment. + +It is recommended not to trust the TSCs to remain synchronized on NUMA or +multiple socket systems for these reasons. + +3.4. TSC and C-states +--------------------- + +C-states, or idling states of the processor, especially C1E and deeper sleep +states may be problematic for TSC as well. The TSC may stop advancing in such +a state, resulting in a TSC which is behind that of other CPUs when execution +is resumed. Such CPUs must be detected and flagged by the operating system +based on CPU and chipset identifications. + +The TSC in such a case may be corrected by catching it up to a known external +clocksource. + +3.5. TSC frequency change / P-states +------------------------------------ + +To make things slightly more interesting, some CPUs may change frequency. They +may or may not run the TSC at the same rate, and because the frequency change +may be staggered or slewed, at some points in time, the TSC rate may not be +known other than falling within a range of values. In this case, the TSC will +not be a stable time source, and must be calibrated against a known, stable, +external clock to be a usable source of time. + +Whether the TSC runs at a constant rate or scales with the P-state is model +dependent and must be determined by inspecting CPUID, chipset or vendor +specific MSR fields. + +In addition, some vendors have known bugs where the P-state is actually +compensated for properly during normal operation, but when the processor is +inactive, the P-state may be raised temporarily to service cache misses from +other processors. In such cases, the TSC on halted CPUs could advance faster +than that of non-halted processors. AMD Turion processors are known to have +this problem. + +3.6. TSC and STPCLK / T-states +------------------------------ + +External signals given to the processor may also have the effect of stopping +the TSC. This is typically done for thermal emergency power control to prevent +an overheating condition, and typically, there is no way to detect that this +condition has happened. + +3.7. TSC virtualization - VMX +----------------------------- + +VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET +field specified in the VMCS. Special instructions must be used to read and +write the VMCS field. + +3.8. TSC virtualization - SVM +----------------------------- + +SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP +instructions, which is enough for full virtualization of TSC in any manner. In +addition, SVM allows passing through the host TSC plus an additional offset +field specified in the SVM control block. + +3.9. TSC feature bits in Linux +------------------------------ + +In summary, there is no way to guarantee the TSC remains in perfect +synchronization unless it is explicitly guaranteed by the architecture. Even +if so, the TSCs in multi-sockets or NUMA systems may still run independently +despite being locally consistent. + +The following feature bits are used by Linux to signal various TSC attributes, +but they can only be taken to be meaningful for UP or single node systems. + +========================= ======================================= +X86_FEATURE_TSC The TSC is available in hardware +X86_FEATURE_RDTSCP The RDTSCP instruction is available +X86_FEATURE_CONSTANT_TSC The TSC rate is unchanged with P-states +X86_FEATURE_NONSTOP_TSC The TSC does not stop in C-states +X86_FEATURE_TSC_RELIABLE TSC sync checks are skipped (VMware) +========================= ======================================= + +4. Virtualization Problems +========================== + +Timekeeping is especially problematic for virtualization because a number of +challenges arise. The most obvious problem is that time is now shared between +the host and, potentially, a number of virtual machines. Thus the virtual +operating system does not run with 100% usage of the CPU, despite the fact that +it may very well make that assumption. It may expect it to remain true to very +exacting bounds when interrupt sources are disabled, but in reality only its +virtual interrupt sources are disabled, and the machine may still be preempted +at any time. This causes problems as the passage of real time, the injection +of machine interrupts and the associated clock sources are no longer completely +synchronized with real time. + +This same problem can occur on native hardware to a degree, as SMM mode may +steal cycles from the naturally on X86 systems when SMM mode is used by the +BIOS, but not in such an extreme fashion. However, the fact that SMM mode may +cause similar problems to virtualization makes it a good justification for +solving many of these problems on bare metal. + +4.1. Interrupt clocking +----------------------- + +One of the most immediate problems that occurs with legacy operating systems +is that the system timekeeping routines are often designed to keep track of +time by counting periodic interrupts. These interrupts may come from the PIT +or the RTC, but the problem is the same: the host virtualization engine may not +be able to deliver the proper number of interrupts per second, and so guest +time may fall behind. This is especially problematic if a high interrupt rate +is selected, such as 1000 HZ, which is unfortunately the default for many Linux +guests. + +There are three approaches to solving this problem; first, it may be possible +to simply ignore it. Guests which have a separate time source for tracking +'wall clock' or 'real time' may not need any adjustment of their interrupts to +maintain proper time. If this is not sufficient, it may be necessary to inject +additional interrupts into the guest in order to increase the effective +interrupt rate. This approach leads to complications in extreme conditions, +where host load or guest lag is too much to compensate for, and thus another +solution to the problem has risen: the guest may need to become aware of lost +ticks and compensate for them internally. Although promising in theory, the +implementation of this policy in Linux has been extremely error prone, and a +number of buggy variants of lost tick compensation are distributed across +commonly used Linux systems. + +Windows uses periodic RTC clocking as a means of keeping time internally, and +thus requires interrupt slewing to keep proper time. It does use a low enough +rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in +practice. + +4.2. TSC sampling and serialization +----------------------------------- + +As the highest precision time source available, the cycle counter of the CPU +has aroused much interest from developers. As explained above, this timer has +many problems unique to its nature as a local, potentially unstable and +potentially unsynchronized source. One issue which is not unique to the TSC, +but is highlighted because of its very precise nature is sampling delay. By +definition, the counter, once read is already old. However, it is also +possible for the counter to be read ahead of the actual use of the result. +This is a consequence of the superscalar execution of the instruction stream, +which may execute instructions out of order. Such execution is called +non-serialized. Forcing serialized execution is necessary for precise +measurement with the TSC, and requires a serializing instruction, such as CPUID +or an MSR read. + +Since CPUID may actually be virtualized by a trap and emulate mechanism, this +serialization can pose a performance issue for hardware virtualization. An +accurate time stamp counter reading may therefore not always be available, and +it may be necessary for an implementation to guard against "backwards" reads of +the TSC as seen from other CPUs, even in an otherwise perfectly synchronized +system. + +4.3. Timespec aliasing +---------------------- + +Additionally, this lack of serialization from the TSC poses another challenge +when using results of the TSC when measured against another time source. As +the TSC is much higher precision, many possible values of the TSC may be read +while another clock is still expressing the same value. + +That is, you may read (T,T+10) while external clock C maintains the same value. +Due to non-serialized reads, you may actually end up with a range which +fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but +calibrated against an external value may have a range of valid values. +Re-calibrating this computation may actually cause time, as computed after the +calibration, to go backwards, compared with time computed before the +calibration. + +This problem is particularly pronounced with an internal time source in Linux, +the kernel time, which is expressed in the theoretically high resolution +timespec - but which advances in much larger granularity intervals, sometimes +at the rate of jiffies, and possibly in catchup modes, at a much larger step. + +This aliasing requires care in the computation and recalibration of kvmclock +and any other values derived from TSC computation (such as TSC virtualization +itself). + +4.4. Migration +-------------- + +Migration of a virtual machine raises problems for timekeeping in two ways. +First, the migration itself may take time, during which interrupts cannot be +delivered, and after which, the guest time may need to be caught up. NTP may +be able to help to some degree here, as the clock correction required is +typically small enough to fall in the NTP-correctable window. + +An additional concern is that timers based off the TSC (or HPET, if the raw bus +clock is exposed) may now be running at different rates, requiring compensation +in some way in the hypervisor by virtualizing these timers. In addition, +migrating to a faster machine may preclude the use of a passthrough TSC, as a +faster clock cannot be made visible to a guest without the potential of time +advancing faster than usual. A slower clock is less of a problem, as it can +always be caught up to the original rate. KVM clock avoids these problems by +simply storing multipliers and offsets against the TSC for the guest to convert +back into nanosecond resolution values. + +4.5. Scheduling +--------------- + +Since scheduling may be based on precise timing and firing of interrupts, the +scheduling algorithms of an operating system may be adversely affected by +virtualization. In theory, the effect is random and should be universally +distributed, but in contrived as well as real scenarios (guest device access, +causes of virtualization exits, possible context switch), this may not always +be the case. The effect of this has not been well studied. + +In an attempt to work around this, several implementations have provided a +paravirtualized scheduler clock, which reveals the true amount of CPU time for +which a virtual machine has been running. + +4.6. Watchdogs +-------------- + +Watchdog timers, such as the lock detector in Linux may fire accidentally when +running under hardware virtualization due to timer interrupts being delayed or +misinterpretation of the passage of real time. Usually, these warnings are +spurious and can be ignored, but in some circumstances it may be necessary to +disable such detection. + +4.7. Delays and precision timing +-------------------------------- + +Precise timing and delays may not be possible in a virtualized system. This +can happen if the system is controlling physical hardware, or issues delays to +compensate for slower I/O to and from devices. The first issue is not solvable +in general for a virtualized system; hardware control software can't be +adequately virtualized without a full real-time operating system, which would +require an RT aware virtualization platform. + +The second issue may cause performance problems, but this is unlikely to be a +significant issue. In many cases these delays may be eliminated through +configuration or paravirtualization. + +4.8. Covert channels and leaks +------------------------------ + +In addition to the above problems, time information will inevitably leak to the +guest about the host in anything but a perfect implementation of virtualized +time. This may allow the guest to infer the presence of a hypervisor (as in a +red-pill type detection), and it may allow information to leak between guests +by using CPU utilization itself as a signalling channel. Preventing such +problems would require completely isolated virtual time which may not track +real time any longer. This may be useful in certain security or QA contexts, +but in general isn't recommended for real-world deployment scenarios. diff --git a/Documentation/virt/kvm/timekeeping.txt b/Documentation/virt/kvm/timekeeping.txt deleted file mode 100644 index 76808a17ad84..000000000000 --- a/Documentation/virt/kvm/timekeeping.txt +++ /dev/null @@ -1,612 +0,0 @@ - - Timekeeping Virtualization for X86-Based Architectures - - Zachary Amsden - Copyright (c) 2010, Red Hat. All rights reserved. - -1) Overview -2) Timing Devices -3) TSC Hardware -4) Virtualization Problems - -========================================================================= - -1) Overview - -One of the most complicated parts of the X86 platform, and specifically, -the virtualization of this platform is the plethora of timing devices available -and the complexity of emulating those devices. In addition, virtualization of -time introduces a new set of challenges because it introduces a multiplexed -division of time beyond the control of the guest CPU. - -First, we will describe the various timekeeping hardware available, then -present some of the problems which arise and solutions available, giving -specific recommendations for certain classes of KVM guests. - -The purpose of this document is to collect data and information relevant to -timekeeping which may be difficult to find elsewhere, specifically, -information relevant to KVM and hardware-based virtualization. - -========================================================================= - -2) Timing Devices - -First we discuss the basic hardware devices available. TSC and the related -KVM clock are special enough to warrant a full exposition and are described in -the following section. - -2.1) i8254 - PIT - -One of the first timer devices available is the programmable interrupt timer, -or PIT. The PIT has a fixed frequency 1.193182 MHz base clock and three -channels which can be programmed to deliver periodic or one-shot interrupts. -These three channels can be configured in different modes and have individual -counters. Channel 1 and 2 were not available for general use in the original -IBM PC, and historically were connected to control RAM refresh and the PC -speaker. Now the PIT is typically integrated as part of an emulated chipset -and a separate physical PIT is not used. - -The PIT uses I/O ports 0x40 - 0x43. Access to the 16-bit counters is done -using single or multiple byte access to the I/O ports. There are 6 modes -available, but not all modes are available to all timers, as only timer 2 -has a connected gate input, required for modes 1 and 5. The gate line is -controlled by port 61h, bit 0, as illustrated in the following diagram. - - -------------- ---------------- -| | | | -| 1.1932 MHz |---------->| CLOCK OUT | ---------> IRQ 0 -| Clock | | | | - -------------- | +->| GATE TIMER 0 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> 66.3 KHZ DRAM - | | | (aka /dev/null) - | +->| GATE TIMER 1 | - | ---------------- - | - | ---------------- - | | | - |------>| CLOCK OUT | ---------> Port 61h, bit 5 - | | | -Port 61h, bit 0 ---------->| GATE TIMER 2 | \_.---- ____ - ---------------- _| )--|LPF|---Speaker - / *---- \___/ -Port 61h, bit 1 -----------------------------------/ - -The timer modes are now described. - -Mode 0: Single Timeout. This is a one-shot software timeout that counts down - when the gate is high (always true for timers 0 and 1). When the count - reaches zero, the output goes high. - -Mode 1: Triggered One-shot. The output is initially set high. When the gate - line is set high, a countdown is initiated (which does not stop if the gate is - lowered), during which the output is set low. When the count reaches zero, - the output goes high. - -Mode 2: Rate Generator. The output is initially set high. When the countdown - reaches 1, the output goes low for one count and then returns high. The value - is reloaded and the countdown automatically resumes. If the gate line goes - low, the count is halted. If the output is low when the gate is lowered, the - output automatically goes high (this only affects timer 2). - -Mode 3: Square Wave. This generates a high / low square wave. The count - determines the length of the pulse, which alternates between high and low - when zero is reached. The count only proceeds when gate is high and is - automatically reloaded on reaching zero. The count is decremented twice at - each clock to generate a full high / low cycle at the full periodic rate. - If the count is even, the clock remains high for N/2 counts and low for N/2 - counts; if the clock is odd, the clock is high for (N+1)/2 counts and low - for (N-1)/2 counts. Only even values are latched by the counter, so odd - values are not observed when reading. This is the intended mode for timer 2, - which generates sine-like tones by low-pass filtering the square wave output. - -Mode 4: Software Strobe. After programming this mode and loading the counter, - the output remains high until the counter reaches zero. Then the output - goes low for 1 clock cycle and returns high. The counter is not reloaded. - Counting only occurs when gate is high. - -Mode 5: Hardware Strobe. After programming and loading the counter, the - output remains high. When the gate is raised, a countdown is initiated - (which does not stop if the gate is lowered). When the counter reaches zero, - the output goes low for 1 clock cycle and then returns high. The counter is - not reloaded. - -In addition to normal binary counting, the PIT supports BCD counting. The -command port, 0x43 is used to set the counter and mode for each of the three -timers. - -PIT commands, issued to port 0x43, using the following bit encoding: - -Bit 7-4: Command (See table below) -Bit 3-1: Mode (000 = Mode 0, 101 = Mode 5, 11X = undefined) -Bit 0 : Binary (0) / BCD (1) - -Command table: - -0000 - Latch Timer 0 count for port 0x40 - sample and hold the count to be read in port 0x40; - additional commands ignored until counter is read; - mode bits ignored. - -0001 - Set Timer 0 LSB mode for port 0x40 - set timer to read LSB only and force MSB to zero; - mode bits set timer mode - -0010 - Set Timer 0 MSB mode for port 0x40 - set timer to read MSB only and force LSB to zero; - mode bits set timer mode - -0011 - Set Timer 0 16-bit mode for port 0x40 - set timer to read / write LSB first, then MSB; - mode bits set timer mode - -0100 - Latch Timer 1 count for port 0x41 - as described above -0101 - Set Timer 1 LSB mode for port 0x41 - as described above -0110 - Set Timer 1 MSB mode for port 0x41 - as described above -0111 - Set Timer 1 16-bit mode for port 0x41 - as described above - -1000 - Latch Timer 2 count for port 0x42 - as described above -1001 - Set Timer 2 LSB mode for port 0x42 - as described above -1010 - Set Timer 2 MSB mode for port 0x42 - as described above -1011 - Set Timer 2 16-bit mode for port 0x42 as described above - -1101 - General counter latch - Latch combination of counters into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - Bit 0 = Unused - -1110 - Latch timer status - Latch combination of counter mode into corresponding ports - Bit 3 = Counter 2 - Bit 2 = Counter 1 - Bit 1 = Counter 0 - - The output of ports 0x40-0x42 following this command will be: - - Bit 7 = Output pin - Bit 6 = Count loaded (0 if timer has expired) - Bit 5-4 = Read / Write mode - 01 = MSB only - 10 = LSB only - 11 = LSB / MSB (16-bit) - Bit 3-1 = Mode - Bit 0 = Binary (0) / BCD mode (1) - -2.2) RTC - -The second device which was available in the original PC was the MC146818 real -time clock. The original device is now obsolete, and usually emulated by the -system chipset, sometimes by an HPET and some frankenstein IRQ routing. - -The RTC is accessed through CMOS variables, which uses an index register to -control which bytes are read. Since there is only one index register, read -of the CMOS and read of the RTC require lock protection (in addition, it is -dangerous to allow userspace utilities such as hwclock to have direct RTC -access, as they could corrupt kernel reads and writes of CMOS memory). - -The RTC generates an interrupt which is usually routed to IRQ 8. The interrupt -can function as a periodic timer, an additional once a day alarm, and can issue -interrupts after an update of the CMOS registers by the MC146818 is complete. -The type of interrupt is signalled in the RTC status registers. - -The RTC will update the current time fields by battery power even while the -system is off. The current time fields should not be read while an update is -in progress, as indicated in the status register. - -The clock uses a 32.768kHz crystal, so bits 6-4 of register A should be -programmed to a 32kHz divider if the RTC is to count seconds. - -This is the RAM map originally used for the RTC/CMOS: - -Location Size Description ------------------------------------------- -00h byte Current second (BCD) -01h byte Seconds alarm (BCD) -02h byte Current minute (BCD) -03h byte Minutes alarm (BCD) -04h byte Current hour (BCD) -05h byte Hours alarm (BCD) -06h byte Current day of week (BCD) -07h byte Current day of month (BCD) -08h byte Current month (BCD) -09h byte Current year (BCD) -0Ah byte Register A - bit 7 = Update in progress - bit 6-4 = Divider for clock - 000 = 4.194 MHz - 001 = 1.049 MHz - 010 = 32 kHz - 10X = test modes - 110 = reset / disable - 111 = reset / disable - bit 3-0 = Rate selection for periodic interrupt - 000 = periodic timer disabled - 001 = 3.90625 uS - 010 = 7.8125 uS - 011 = .122070 mS - 100 = .244141 mS - ... - 1101 = 125 mS - 1110 = 250 mS - 1111 = 500 mS -0Bh byte Register B - bit 7 = Run (0) / Halt (1) - bit 6 = Periodic interrupt enable - bit 5 = Alarm interrupt enable - bit 4 = Update-ended interrupt enable - bit 3 = Square wave interrupt enable - bit 2 = BCD calendar (0) / Binary (1) - bit 1 = 12-hour mode (0) / 24-hour mode (1) - bit 0 = 0 (DST off) / 1 (DST enabled) -OCh byte Register C (read only) - bit 7 = interrupt request flag (IRQF) - bit 6 = periodic interrupt flag (PF) - bit 5 = alarm interrupt flag (AF) - bit 4 = update interrupt flag (UF) - bit 3-0 = reserved -ODh byte Register D (read only) - bit 7 = RTC has power - bit 6-0 = reserved -32h byte Current century BCD (*) - (*) location vendor specific and now determined from ACPI global tables - -2.3) APIC - -On Pentium and later processors, an on-board timer is available to each CPU -as part of the Advanced Programmable Interrupt Controller. The APIC is -accessed through memory-mapped registers and provides interrupt service to each -CPU, used for IPIs and local timer interrupts. - -Although in theory the APIC is a safe and stable source for local interrupts, -in practice, many bugs and glitches have occurred due to the special nature of -the APIC CPU-local memory-mapped hardware. Beware that CPU errata may affect -the use of the APIC and that workarounds may be required. In addition, some of -these workarounds pose unique constraints for virtualization - requiring either -extra overhead incurred from extra reads of memory-mapped I/O or additional -functionality that may be more computationally expensive to implement. - -Since the APIC is documented quite well in the Intel and AMD manuals, we will -avoid repetition of the detail here. It should be pointed out that the APIC -timer is programmed through the LVT (local vector timer) register, is capable -of one-shot or periodic operation, and is based on the bus clock divided down -by the programmable divider register. - -2.4) HPET - -HPET is quite complex, and was originally intended to replace the PIT / RTC -support of the X86 PC. It remains to be seen whether that will be the case, as -the de facto standard of PC hardware is to emulate these older devices. Some -systems designated as legacy free may support only the HPET as a hardware timer -device. - -The HPET spec is rather loose and vague, requiring at least 3 hardware timers, -but allowing implementation freedom to support many more. It also imposes no -fixed rate on the timer frequency, but does impose some extremal values on -frequency, error and slew. - -In general, the HPET is recommended as a high precision (compared to PIT /RTC) -time source which is independent of local variation (as there is only one HPET -in any given system). The HPET is also memory-mapped, and its presence is -indicated through ACPI tables by the BIOS. - -Detailed specification of the HPET is beyond the current scope of this -document, as it is also very well documented elsewhere. - -2.5) Offboard Timers - -Several cards, both proprietary (watchdog boards) and commonplace (e1000) have -timing chips built into the cards which may have registers which are accessible -to kernel or user drivers. To the author's knowledge, using these to generate -a clocksource for a Linux or other kernel has not yet been attempted and is in -general frowned upon as not playing by the agreed rules of the game. Such a -timer device would require additional support to be virtualized properly and is -not considered important at this time as no known operating system does this. - -========================================================================= - -3) TSC Hardware - -The TSC or time stamp counter is relatively simple in theory; it counts -instruction cycles issued by the processor, which can be used as a measure of -time. In practice, due to a number of problems, it is the most complicated -timekeeping device to use. - -The TSC is represented internally as a 64-bit MSR which can be read with the -RDMSR, RDTSC, or RDTSCP (when available) instructions. In the past, hardware -limitations made it possible to write the TSC, but generally on old hardware it -was only possible to write the low 32-bits of the 64-bit counter, and the upper -32-bits of the counter were cleared. Now, however, on Intel processors family -0Fh, for models 3, 4 and 6, and family 06h, models e and f, this restriction -has been lifted and all 64-bits are writable. On AMD systems, the ability to -write the TSC MSR is not an architectural guarantee. - -The TSC is accessible from CPL-0 and conditionally, for CPL > 0 software by -means of the CR4.TSD bit, which when enabled, disables CPL > 0 TSC access. - -Some vendors have implemented an additional instruction, RDTSCP, which returns -atomically not just the TSC, but an indicator which corresponds to the -processor number. This can be used to index into an array of TSC variables to -determine offset information in SMP systems where TSCs are not synchronized. -The presence of this instruction must be determined by consulting CPUID feature -bits. - -Both VMX and SVM provide extension fields in the virtualization hardware which -allows the guest visible TSC to be offset by a constant. Newer implementations -promise to allow the TSC to additionally be scaled, but this hardware is not -yet widely available. - -3.1) TSC synchronization - -The TSC is a CPU-local clock in most implementations. This means, on SMP -platforms, the TSCs of different CPUs may start at different times depending -on when the CPUs are powered on. Generally, CPUs on the same die will share -the same clock, however, this is not always the case. - -The BIOS may attempt to resynchronize the TSCs during the poweron process and -the operating system or other system software may attempt to do this as well. -Several hardware limitations make the problem worse - if it is not possible to -write the full 64-bits of the TSC, it may be impossible to match the TSC in -newly arriving CPUs to that of the rest of the system, resulting in -unsynchronized TSCs. This may be done by BIOS or system software, but in -practice, getting a perfectly synchronized TSC will not be possible unless all -values are read from the same clock, which generally only is possible on single -socket systems or those with special hardware support. - -3.2) TSC and CPU hotplug - -As touched on already, CPUs which arrive later than the boot time of the system -may not have a TSC value that is synchronized with the rest of the system. -Either system software, BIOS, or SMM code may actually try to establish the TSC -to a value matching the rest of the system, but a perfect match is usually not -a guarantee. This can have the effect of bringing a system from a state where -TSC is synchronized back to a state where TSC synchronization flaws, however -small, may be exposed to the OS and any virtualization environment. - -3.3) TSC and multi-socket / NUMA - -Multi-socket systems, especially large multi-socket systems are likely to have -individual clocksources rather than a single, universally distributed clock. -Since these clocks are driven by different crystals, they will not have -perfectly matched frequency, and temperature and electrical variations will -cause the CPU clocks, and thus the TSCs to drift over time. Depending on the -exact clock and bus design, the drift may or may not be fixed in absolute -error, and may accumulate over time. - -In addition, very large systems may deliberately slew the clocks of individual -cores. This technique, known as spread-spectrum clocking, reduces EMI at the -clock frequency and harmonics of it, which may be required to pass FCC -standards for telecommunications and computer equipment. - -It is recommended not to trust the TSCs to remain synchronized on NUMA or -multiple socket systems for these reasons. - -3.4) TSC and C-states - -C-states, or idling states of the processor, especially C1E and deeper sleep -states may be problematic for TSC as well. The TSC may stop advancing in such -a state, resulting in a TSC which is behind that of other CPUs when execution -is resumed. Such CPUs must be detected and flagged by the operating system -based on CPU and chipset identifications. - -The TSC in such a case may be corrected by catching it up to a known external -clocksource. - -3.5) TSC frequency change / P-states - -To make things slightly more interesting, some CPUs may change frequency. They -may or may not run the TSC at the same rate, and because the frequency change -may be staggered or slewed, at some points in time, the TSC rate may not be -known other than falling within a range of values. In this case, the TSC will -not be a stable time source, and must be calibrated against a known, stable, -external clock to be a usable source of time. - -Whether the TSC runs at a constant rate or scales with the P-state is model -dependent and must be determined by inspecting CPUID, chipset or vendor -specific MSR fields. - -In addition, some vendors have known bugs where the P-state is actually -compensated for properly during normal operation, but when the processor is -inactive, the P-state may be raised temporarily to service cache misses from -other processors. In such cases, the TSC on halted CPUs could advance faster -than that of non-halted processors. AMD Turion processors are known to have -this problem. - -3.6) TSC and STPCLK / T-states - -External signals given to the processor may also have the effect of stopping -the TSC. This is typically done for thermal emergency power control to prevent -an overheating condition, and typically, there is no way to detect that this -condition has happened. - -3.7) TSC virtualization - VMX - -VMX provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, VMX allows passing through the host TSC plus an additional TSC_OFFSET -field specified in the VMCS. Special instructions must be used to read and -write the VMCS field. - -3.8) TSC virtualization - SVM - -SVM provides conditional trapping of RDTSC, RDMSR, WRMSR and RDTSCP -instructions, which is enough for full virtualization of TSC in any manner. In -addition, SVM allows passing through the host TSC plus an additional offset -field specified in the SVM control block. - -3.9) TSC feature bits in Linux - -In summary, there is no way to guarantee the TSC remains in perfect -synchronization unless it is explicitly guaranteed by the architecture. Even -if so, the TSCs in multi-sockets or NUMA systems may still run independently -despite being locally consistent. - -The following feature bits are used by Linux to signal various TSC attributes, -but they can only be taken to be meaningful for UP or single node systems. - -X86_FEATURE_TSC : The TSC is available in hardware -X86_FEATURE_RDTSCP : The RDTSCP instruction is available -X86_FEATURE_CONSTANT_TSC : The TSC rate is unchanged with P-states -X86_FEATURE_NONSTOP_TSC : The TSC does not stop in C-states -X86_FEATURE_TSC_RELIABLE : TSC sync checks are skipped (VMware) - -4) Virtualization Problems - -Timekeeping is especially problematic for virtualization because a number of -challenges arise. The most obvious problem is that time is now shared between -the host and, potentially, a number of virtual machines. Thus the virtual -operating system does not run with 100% usage of the CPU, despite the fact that -it may very well make that assumption. It may expect it to remain true to very -exacting bounds when interrupt sources are disabled, but in reality only its -virtual interrupt sources are disabled, and the machine may still be preempted -at any time. This causes problems as the passage of real time, the injection -of machine interrupts and the associated clock sources are no longer completely -synchronized with real time. - -This same problem can occur on native hardware to a degree, as SMM mode may -steal cycles from the naturally on X86 systems when SMM mode is used by the -BIOS, but not in such an extreme fashion. However, the fact that SMM mode may -cause similar problems to virtualization makes it a good justification for -solving many of these problems on bare metal. - -4.1) Interrupt clocking - -One of the most immediate problems that occurs with legacy operating systems -is that the system timekeeping routines are often designed to keep track of -time by counting periodic interrupts. These interrupts may come from the PIT -or the RTC, but the problem is the same: the host virtualization engine may not -be able to deliver the proper number of interrupts per second, and so guest -time may fall behind. This is especially problematic if a high interrupt rate -is selected, such as 1000 HZ, which is unfortunately the default for many Linux -guests. - -There are three approaches to solving this problem; first, it may be possible -to simply ignore it. Guests which have a separate time source for tracking -'wall clock' or 'real time' may not need any adjustment of their interrupts to -maintain proper time. If this is not sufficient, it may be necessary to inject -additional interrupts into the guest in order to increase the effective -interrupt rate. This approach leads to complications in extreme conditions, -where host load or guest lag is too much to compensate for, and thus another -solution to the problem has risen: the guest may need to become aware of lost -ticks and compensate for them internally. Although promising in theory, the -implementation of this policy in Linux has been extremely error prone, and a -number of buggy variants of lost tick compensation are distributed across -commonly used Linux systems. - -Windows uses periodic RTC clocking as a means of keeping time internally, and -thus requires interrupt slewing to keep proper time. It does use a low enough -rate (ed: is it 18.2 Hz?) however that it has not yet been a problem in -practice. - -4.2) TSC sampling and serialization - -As the highest precision time source available, the cycle counter of the CPU -has aroused much interest from developers. As explained above, this timer has -many problems unique to its nature as a local, potentially unstable and -potentially unsynchronized source. One issue which is not unique to the TSC, -but is highlighted because of its very precise nature is sampling delay. By -definition, the counter, once read is already old. However, it is also -possible for the counter to be read ahead of the actual use of the result. -This is a consequence of the superscalar execution of the instruction stream, -which may execute instructions out of order. Such execution is called -non-serialized. Forcing serialized execution is necessary for precise -measurement with the TSC, and requires a serializing instruction, such as CPUID -or an MSR read. - -Since CPUID may actually be virtualized by a trap and emulate mechanism, this -serialization can pose a performance issue for hardware virtualization. An -accurate time stamp counter reading may therefore not always be available, and -it may be necessary for an implementation to guard against "backwards" reads of -the TSC as seen from other CPUs, even in an otherwise perfectly synchronized -system. - -4.3) Timespec aliasing - -Additionally, this lack of serialization from the TSC poses another challenge -when using results of the TSC when measured against another time source. As -the TSC is much higher precision, many possible values of the TSC may be read -while another clock is still expressing the same value. - -That is, you may read (T,T+10) while external clock C maintains the same value. -Due to non-serialized reads, you may actually end up with a range which -fluctuates - from (T-1.. T+10). Thus, any time calculated from a TSC, but -calibrated against an external value may have a range of valid values. -Re-calibrating this computation may actually cause time, as computed after the -calibration, to go backwards, compared with time computed before the -calibration. - -This problem is particularly pronounced with an internal time source in Linux, -the kernel time, which is expressed in the theoretically high resolution -timespec - but which advances in much larger granularity intervals, sometimes -at the rate of jiffies, and possibly in catchup modes, at a much larger step. - -This aliasing requires care in the computation and recalibration of kvmclock -and any other values derived from TSC computation (such as TSC virtualization -itself). - -4.4) Migration - -Migration of a virtual machine raises problems for timekeeping in two ways. -First, the migration itself may take time, during which interrupts cannot be -delivered, and after which, the guest time may need to be caught up. NTP may -be able to help to some degree here, as the clock correction required is -typically small enough to fall in the NTP-correctable window. - -An additional concern is that timers based off the TSC (or HPET, if the raw bus -clock is exposed) may now be running at different rates, requiring compensation -in some way in the hypervisor by virtualizing these timers. In addition, -migrating to a faster machine may preclude the use of a passthrough TSC, as a -faster clock cannot be made visible to a guest without the potential of time -advancing faster than usual. A slower clock is less of a problem, as it can -always be caught up to the original rate. KVM clock avoids these problems by -simply storing multipliers and offsets against the TSC for the guest to convert -back into nanosecond resolution values. - -4.5) Scheduling - -Since scheduling may be based on precise timing and firing of interrupts, the -scheduling algorithms of an operating system may be adversely affected by -virtualization. In theory, the effect is random and should be universally -distributed, but in contrived as well as real scenarios (guest device access, -causes of virtualization exits, possible context switch), this may not always -be the case. The effect of this has not been well studied. - -In an attempt to work around this, several implementations have provided a -paravirtualized scheduler clock, which reveals the true amount of CPU time for -which a virtual machine has been running. - -4.6) Watchdogs - -Watchdog timers, such as the lock detector in Linux may fire accidentally when -running under hardware virtualization due to timer interrupts being delayed or -misinterpretation of the passage of real time. Usually, these warnings are -spurious and can be ignored, but in some circumstances it may be necessary to -disable such detection. - -4.7) Delays and precision timing - -Precise timing and delays may not be possible in a virtualized system. This -can happen if the system is controlling physical hardware, or issues delays to -compensate for slower I/O to and from devices. The first issue is not solvable -in general for a virtualized system; hardware control software can't be -adequately virtualized without a full real-time operating system, which would -require an RT aware virtualization platform. - -The second issue may cause performance problems, but this is unlikely to be a -significant issue. In many cases these delays may be eliminated through -configuration or paravirtualization. - -4.8) Covert channels and leaks - -In addition to the above problems, time information will inevitably leak to the -guest about the host in anything but a perfect implementation of virtualized -time. This may allow the guest to infer the presence of a hypervisor (as in a -red-pill type detection), and it may allow information to leak between guests -by using CPU utilization itself as a signalling channel. Preventing such -problems would require completely isolated virtual time which may not track -real time any longer. This may be useful in certain security or QA contexts, -but in general isn't recommended for real-world deployment scenarios. -- cgit v1.2.3 From 033741c6c997e60f9c2de280925519d3ccff5366 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:03:05 +0100 Subject: docs: kvm: review-checklist.txt: rename to ReST This file is already in ReST compatible format. So, rename it and add to the kvm's index.rst. While here, use the standard conversion for document titles. Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/index.rst | 2 ++ Documentation/virt/kvm/review-checklist.rst | 41 +++++++++++++++++++++++++++++ Documentation/virt/kvm/review-checklist.txt | 38 -------------------------- 3 files changed, 43 insertions(+), 38 deletions(-) create mode 100644 Documentation/virt/kvm/review-checklist.rst delete mode 100644 Documentation/virt/kvm/review-checklist.txt diff --git a/Documentation/virt/kvm/index.rst b/Documentation/virt/kvm/index.rst index 7c1be8910837..774deaebf7fa 100644 --- a/Documentation/virt/kvm/index.rst +++ b/Documentation/virt/kvm/index.rst @@ -21,6 +21,8 @@ KVM timekeeping vcpu-requests + review-checklist + arm/index devices/index diff --git a/Documentation/virt/kvm/review-checklist.rst b/Documentation/virt/kvm/review-checklist.rst new file mode 100644 index 000000000000..1f86a9d3f705 --- /dev/null +++ b/Documentation/virt/kvm/review-checklist.rst @@ -0,0 +1,41 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================ +Review checklist for kvm patches +================================ + +1. The patch must follow Documentation/process/coding-style.rst and + Documentation/process/submitting-patches.rst. + +2. Patches should be against kvm.git master branch. + +3. If the patch introduces or modifies a new userspace API: + - the API must be documented in Documentation/virt/kvm/api.txt + - the API must be discoverable using KVM_CHECK_EXTENSION + +4. New state must include support for save/restore. + +5. New features must default to off (userspace should explicitly request them). + Performance improvements can and should default to on. + +6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 + +7. Emulator changes should be accompanied by unit tests for qemu-kvm.git + kvm/test directory. + +8. Changes should be vendor neutral when possible. Changes to common code + are better than duplicating changes to vendor code. + +9. Similarly, prefer changes to arch independent code than to arch dependent + code. + +10. User/kernel interfaces and guest/host interfaces must be 64-bit clean + (all variables and sizes naturally aligned on 64-bit; use specific types + only - u64 rather than ulong). + +11. New guest visible features must either be documented in a hardware manual + or be accompanied by documentation. + +12. Features must be robust against reset and kexec - for example, shared + host/guest memory must be unshared to prevent the host from writing to + guest memory that the guest has not reserved for this purpose. diff --git a/Documentation/virt/kvm/review-checklist.txt b/Documentation/virt/kvm/review-checklist.txt deleted file mode 100644 index 499af499e296..000000000000 --- a/Documentation/virt/kvm/review-checklist.txt +++ /dev/null @@ -1,38 +0,0 @@ -Review checklist for kvm patches -================================ - -1. The patch must follow Documentation/process/coding-style.rst and - Documentation/process/submitting-patches.rst. - -2. Patches should be against kvm.git master branch. - -3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/virt/kvm/api.txt - - the API must be discoverable using KVM_CHECK_EXTENSION - -4. New state must include support for save/restore. - -5. New features must default to off (userspace should explicitly request them). - Performance improvements can and should default to on. - -6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 - -7. Emulator changes should be accompanied by unit tests for qemu-kvm.git - kvm/test directory. - -8. Changes should be vendor neutral when possible. Changes to common code - are better than duplicating changes to vendor code. - -9. Similarly, prefer changes to arch independent code than to arch dependent - code. - -10. User/kernel interfaces and guest/host interfaces must be 64-bit clean - (all variables and sizes naturally aligned on 64-bit; use specific types - only - u64 rather than ulong). - -11. New guest visible features must either be documented in a hardware manual - or be accompanied by documentation. - -12. Features must be robust against reset and kexec - for example, shared - host/guest memory must be unshared to prevent the host from writing to - guest memory that the guest has not reserved for this purpose. -- cgit v1.2.3 From 120881b9e888689cbdb90a1dd1689684d8bc95f3 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 10 Feb 2020 07:03:06 +0100 Subject: docs: virt: guest-halt-polling.txt convert to ReST Due to some merge conflict, this file ended being alone under Documentation/virtual. The file itself is almost at ReST format. Just minor adjustments are needed: - Adjust title markup; - Adjust a list identation; - add a literal block markup; - Add some blank lines. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paolo Bonzini --- Documentation/virt/guest-halt-polling.rst | 84 ++++++++++++++++++++++++++++ Documentation/virt/index.rst | 1 + Documentation/virtual/guest-halt-polling.txt | 78 -------------------------- 3 files changed, 85 insertions(+), 78 deletions(-) create mode 100644 Documentation/virt/guest-halt-polling.rst delete mode 100644 Documentation/virtual/guest-halt-polling.txt diff --git a/Documentation/virt/guest-halt-polling.rst b/Documentation/virt/guest-halt-polling.rst new file mode 100644 index 000000000000..b4e747942417 --- /dev/null +++ b/Documentation/virt/guest-halt-polling.rst @@ -0,0 +1,84 @@ +================== +Guest halt polling +================== + +The cpuidle_haltpoll driver, with the haltpoll governor, allows +the guest vcpus to poll for a specified amount of time before +halting. + +This provides the following benefits to host side polling: + + 1) The POLL flag is set while polling is performed, which allows + a remote vCPU to avoid sending an IPI (and the associated + cost of handling the IPI) when performing a wakeup. + + 2) The VM-exit cost can be avoided. + +The downside of guest side polling is that polling is performed +even with other runnable tasks in the host. + +The basic logic as follows: A global value, guest_halt_poll_ns, +is configured by the user, indicating the maximum amount of +time polling is allowed. This value is fixed. + +Each vcpu has an adjustable guest_halt_poll_ns +("per-cpu guest_halt_poll_ns"), which is adjusted by the algorithm +in response to events (explained below). + +Module Parameters +================= + +The haltpoll governor has 5 tunable module parameters: + +1) guest_halt_poll_ns: + +Maximum amount of time, in nanoseconds, that polling is +performed before halting. + +Default: 200000 + +2) guest_halt_poll_shrink: + +Division factor used to shrink per-cpu guest_halt_poll_ns when +wakeup event occurs after the global guest_halt_poll_ns. + +Default: 2 + +3) guest_halt_poll_grow: + +Multiplication factor used to grow per-cpu guest_halt_poll_ns +when event occurs after per-cpu guest_halt_poll_ns +but before global guest_halt_poll_ns. + +Default: 2 + +4) guest_halt_poll_grow_start: + +The per-cpu guest_halt_poll_ns eventually reaches zero +in case of an idle system. This value sets the initial +per-cpu guest_halt_poll_ns when growing. This can +be increased from 10000, to avoid misses during the initial +growth stage: + +10k, 20k, 40k, ... (example assumes guest_halt_poll_grow=2). + +Default: 50000 + +5) guest_halt_poll_allow_shrink: + +Bool parameter which allows shrinking. Set to N +to avoid it (per-cpu guest_halt_poll_ns will remain +high once achieves global guest_halt_poll_ns value). + +Default: Y + +The module parameters can be set from the debugfs files in:: + + /sys/module/haltpoll/parameters/ + +Further Notes +============= + +- Care should be taken when setting the guest_halt_poll_ns parameter as a + large value has the potential to drive the cpu usage to 100% on a machine + which would be almost entirely idle otherwise. diff --git a/Documentation/virt/index.rst b/Documentation/virt/index.rst index 0a8f7fda64ad..de1ab81df958 100644 --- a/Documentation/virt/index.rst +++ b/Documentation/virt/index.rst @@ -10,6 +10,7 @@ Linux Virtualization Support kvm/index uml/user_mode_linux paravirt_ops + guest-halt-polling .. only:: html and subproject diff --git a/Documentation/virtual/guest-halt-polling.txt b/Documentation/virtual/guest-halt-polling.txt deleted file mode 100644 index b3a2a294532d..000000000000 --- a/Documentation/virtual/guest-halt-polling.txt +++ /dev/null @@ -1,78 +0,0 @@ -Guest halt polling -================== - -The cpuidle_haltpoll driver, with the haltpoll governor, allows -the guest vcpus to poll for a specified amount of time before -halting. -This provides the following benefits to host side polling: - - 1) The POLL flag is set while polling is performed, which allows - a remote vCPU to avoid sending an IPI (and the associated - cost of handling the IPI) when performing a wakeup. - - 2) The VM-exit cost can be avoided. - -The downside of guest side polling is that polling is performed -even with other runnable tasks in the host. - -The basic logic as follows: A global value, guest_halt_poll_ns, -is configured by the user, indicating the maximum amount of -time polling is allowed. This value is fixed. - -Each vcpu has an adjustable guest_halt_poll_ns -("per-cpu guest_halt_poll_ns"), which is adjusted by the algorithm -in response to events (explained below). - -Module Parameters -================= - -The haltpoll governor has 5 tunable module parameters: - -1) guest_halt_poll_ns: -Maximum amount of time, in nanoseconds, that polling is -performed before halting. - -Default: 200000 - -2) guest_halt_poll_shrink: -Division factor used to shrink per-cpu guest_halt_poll_ns when -wakeup event occurs after the global guest_halt_poll_ns. - -Default: 2 - -3) guest_halt_poll_grow: -Multiplication factor used to grow per-cpu guest_halt_poll_ns -when event occurs after per-cpu guest_halt_poll_ns -but before global guest_halt_poll_ns. - -Default: 2 - -4) guest_halt_poll_grow_start: -The per-cpu guest_halt_poll_ns eventually reaches zero -in case of an idle system. This value sets the initial -per-cpu guest_halt_poll_ns when growing. This can -be increased from 10000, to avoid misses during the initial -growth stage: - -10k, 20k, 40k, ... (example assumes guest_halt_poll_grow=2). - -Default: 50000 - -5) guest_halt_poll_allow_shrink: - -Bool parameter which allows shrinking. Set to N -to avoid it (per-cpu guest_halt_poll_ns will remain -high once achieves global guest_halt_poll_ns value). - -Default: Y - -The module parameters can be set from the debugfs files in: - - /sys/module/haltpoll/parameters/ - -Further Notes -============= - -- Care should be taken when setting the guest_halt_poll_ns parameter as a -large value has the potential to drive the cpu usage to 100% on a machine which -would be almost entirely idle otherwise. -- cgit v1.2.3 From 242b5e068b25119d6f1d63dac076b79d89580b4b Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Thu, 6 Feb 2020 01:19:59 -0800 Subject: ice: Fix DCB rebuild after reset The function ice_dcb_rebuild had some logic flaws in it, and also didn't differentiate between FW and SW modes needs. For FW flow, the willing setting was being forced to OFF and left that way. Unwilling in DCB FW mode is not a supported model. Leave the config alone and use the return value from the set command to determine if setting the config was successful. The SW DCB flow does not need to need to register for MIB change events (as they are not used in SW mode). Use !is_sw_lldp checks to only perform FW specific task while in FW mode. Also adding a reapplication of the current DCB config after a link event. Some NVMs are not maintaining their DCB configs across link events. Signed-off-by: Dave Ertman Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 88 +++++++++++----------------- drivers/net/ethernet/intel/ice/ice_main.c | 1 + 2 files changed, 36 insertions(+), 53 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 0664e5b8d130..99ee4c009a96 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -315,9 +315,9 @@ ice_dcb_need_recfg(struct ice_pf *pf, struct ice_dcbx_cfg *old_cfg, */ void ice_dcb_rebuild(struct ice_pf *pf) { - struct ice_dcbx_cfg *local_dcbx_cfg, *desired_dcbx_cfg, *prev_cfg; struct ice_aqc_port_ets_elem buf = { 0 }; struct device *dev = ice_pf_to_dev(pf); + struct ice_dcbx_cfg *err_cfg; enum ice_status ret; ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); @@ -330,53 +330,25 @@ void ice_dcb_rebuild(struct ice_pf *pf) if (!test_bit(ICE_FLAG_DCB_ENA, pf->flags)) return; - local_dcbx_cfg = &pf->hw.port_info->local_dcbx_cfg; - desired_dcbx_cfg = &pf->hw.port_info->desired_dcbx_cfg; + mutex_lock(&pf->tc_mutex); - /* Save current willing state and force FW to unwilling */ - local_dcbx_cfg->etscfg.willing = 0x0; - local_dcbx_cfg->pfc.willing = 0x0; - local_dcbx_cfg->app_mode = ICE_DCBX_APPS_NON_WILLING; + if (!pf->hw.port_info->is_sw_lldp) + ice_cfg_etsrec_defaults(pf->hw.port_info); - ice_cfg_etsrec_defaults(pf->hw.port_info); ret = ice_set_dcb_cfg(pf->hw.port_info); if (ret) { - dev_err(dev, "Failed to set DCB to unwilling\n"); + dev_err(dev, "Failed to set DCB config in rebuild\n"); goto dcb_error; } - /* Retrieve DCB config and ensure same as current in SW */ - prev_cfg = kmemdup(local_dcbx_cfg, sizeof(*prev_cfg), GFP_KERNEL); - if (!prev_cfg) - goto dcb_error; - - ice_init_dcb(&pf->hw, true); - if (pf->hw.port_info->dcbx_status == ICE_DCBX_STATUS_DIS) - pf->hw.port_info->is_sw_lldp = true; - else - pf->hw.port_info->is_sw_lldp = false; - - if (ice_dcb_need_recfg(pf, prev_cfg, local_dcbx_cfg)) { - /* difference in cfg detected - disable DCB till next MIB */ - dev_err(dev, "Set local MIB not accurate\n"); - kfree(prev_cfg); - goto dcb_error; + if (!pf->hw.port_info->is_sw_lldp) { + ret = ice_cfg_lldp_mib_change(&pf->hw, true); + if (ret && !pf->hw.port_info->is_sw_lldp) { + dev_err(dev, "Failed to register for MIB changes\n"); + goto dcb_error; + } } - /* fetched config congruent to previous configuration */ - kfree(prev_cfg); - - /* Set the local desired config */ - if (local_dcbx_cfg->dcbx_mode == ICE_DCBX_MODE_CEE) - memcpy(local_dcbx_cfg, desired_dcbx_cfg, - sizeof(*local_dcbx_cfg)); - - ice_cfg_etsrec_defaults(pf->hw.port_info); - ret = ice_set_dcb_cfg(pf->hw.port_info); - if (ret) { - dev_err(dev, "Failed to set desired config\n"); - goto dcb_error; - } dev_info(dev, "DCB restored after reset\n"); ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { @@ -384,26 +356,32 @@ void ice_dcb_rebuild(struct ice_pf *pf) goto dcb_error; } + mutex_unlock(&pf->tc_mutex); + return; dcb_error: dev_err(dev, "Disabling DCB until new settings occur\n"); - prev_cfg = kzalloc(sizeof(*prev_cfg), GFP_KERNEL); - if (!prev_cfg) + err_cfg = kzalloc(sizeof(*err_cfg), GFP_KERNEL); + if (!err_cfg) { + mutex_unlock(&pf->tc_mutex); return; + } - prev_cfg->etscfg.willing = true; - prev_cfg->etscfg.tcbwtable[0] = ICE_TC_MAX_BW; - prev_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; - memcpy(&prev_cfg->etsrec, &prev_cfg->etscfg, sizeof(prev_cfg->etsrec)); + err_cfg->etscfg.willing = true; + err_cfg->etscfg.tcbwtable[0] = ICE_TC_MAX_BW; + err_cfg->etscfg.tsatable[0] = ICE_IEEE_TSA_ETS; + memcpy(&err_cfg->etsrec, &err_cfg->etscfg, sizeof(err_cfg->etsrec)); /* Coverity warns the return code of ice_pf_dcb_cfg() is not checked * here as is done for other calls to that function. That check is * not necessary since this is in this function's error cleanup path. * Suppress the Coverity warning with the following comment... */ /* coverity[check_return] */ - ice_pf_dcb_cfg(pf, prev_cfg, false); - kfree(prev_cfg); + ice_pf_dcb_cfg(pf, err_cfg, false); + kfree(err_cfg); + + mutex_unlock(&pf->tc_mutex); } /** @@ -777,6 +755,8 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, } } + mutex_lock(&pf->tc_mutex); + /* store the old configuration */ tmp_dcbx_cfg = pf->hw.port_info->local_dcbx_cfg; @@ -787,20 +767,20 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, ret = ice_get_dcb_cfg(pf->hw.port_info); if (ret) { dev_err(dev, "Failed to get DCB config\n"); - return; + goto out; } /* No change detected in DCBX configs */ if (!memcmp(&tmp_dcbx_cfg, &pi->local_dcbx_cfg, sizeof(tmp_dcbx_cfg))) { dev_dbg(dev, "No change detected in DCBX configuration.\n"); - return; + goto out; } need_reconfig = ice_dcb_need_recfg(pf, &tmp_dcbx_cfg, &pi->local_dcbx_cfg); ice_dcbnl_flush_apps(pf, &tmp_dcbx_cfg, &pi->local_dcbx_cfg); if (!need_reconfig) - return; + goto out; /* Enable DCB tagging only when more than one TC */ if (ice_dcb_get_num_tc(&pi->local_dcbx_cfg) > 1) { @@ -814,7 +794,7 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, pf_vsi = ice_get_main_vsi(pf); if (!pf_vsi) { dev_dbg(dev, "PF VSI doesn't exist\n"); - return; + goto out; } rtnl_lock(); @@ -823,13 +803,15 @@ ice_dcb_process_lldp_set_mib_change(struct ice_pf *pf, ret = ice_query_port_ets(pf->hw.port_info, &buf, sizeof(buf), NULL); if (ret) { dev_err(dev, "Query Port ETS failed\n"); - rtnl_unlock(); - return; + goto unlock_rtnl; } /* changes in configuration update VSI */ ice_pf_dcb_recfg(pf); ice_ena_vsi(pf_vsi, true); +unlock_rtnl: rtnl_unlock(); +out: + mutex_unlock(&pf->tc_mutex); } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 5ae671609f98..4075d5379b23 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -841,6 +841,7 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, } } + ice_dcb_rebuild(pf); ice_vsi_link_event(vsi, link_up); ice_print_link_msg(vsi, link_up); -- cgit v1.2.3 From 53977ee47410885e7d4eee87d2c811a48a275150 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Thu, 6 Feb 2020 01:20:00 -0800 Subject: ice: Fix switch between FW and SW LLDP When switching between FW and SW LLDP mode, the number of configured TLV apps in the driver's DCB configuration is getting out of synch with what lldpad thinks is configured. This is causing a problem when shutting down lldpad. The cleanup is trying to delete TLV apps that are not defined in the kernel. Since the driver is keeping an accurate account of the apps defined, use the drivers number of apps to determine if there is an app to delete. If the number of apps is <= 1, then do not attempt to delete. Signed-off-by: Dave Ertman Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index d870c1aedc17..926c9772f086 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -713,13 +713,13 @@ static int ice_dcbnl_delapp(struct net_device *netdev, struct dcb_app *app) return -EINVAL; mutex_lock(&pf->tc_mutex); - ret = dcb_ieee_delapp(netdev, app); - if (ret) - goto delapp_out; - old_cfg = &pf->hw.port_info->local_dcbx_cfg; - if (old_cfg->numapps == 1) + if (old_cfg->numapps <= 1) + goto delapp_out; + + ret = dcb_ieee_delapp(netdev, app); + if (ret) goto delapp_out; new_cfg = &pf->hw.port_info->desired_dcbx_cfg; -- cgit v1.2.3 From ad9a87bec3d004ce80e9104e3a0aa6a204a15dbf Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Thu, 6 Feb 2020 01:20:01 -0800 Subject: ice: display supported and advertised link modes Display all of the supported and advertised link modes based on the PHY capability with media. Displaying all supported modes is more informative then only displaying the current link mode. Signed-off-by: Paul Greenwalt Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 282 +-------------------------- 1 file changed, 2 insertions(+), 280 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 90c6a3ca20c9..26eca4ce9e2c 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -1710,291 +1710,13 @@ ice_get_settings_link_up(struct ethtool_link_ksettings *ks, { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_info *pi = np->vsi->port_info; - struct ethtool_link_ksettings cap_ksettings; struct ice_link_status *link_info; struct ice_vsi *vsi = np->vsi; - bool unrecog_phy_high = false; - bool unrecog_phy_low = false; link_info = &vsi->port_info->phy.link_info; - /* Initialize supported and advertised settings based on PHY settings */ - switch (link_info->phy_type_low) { - case ICE_PHY_TYPE_LOW_100BASE_TX: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100baseT_Full); - break; - case ICE_PHY_TYPE_LOW_100M_SGMII: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100baseT_Full); - break; - case ICE_PHY_TYPE_LOW_1000BASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_1G_SGMII: - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_1000BASE_SX: - case ICE_PHY_TYPE_LOW_1000BASE_LX: - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseX_Full); - break; - case ICE_PHY_TYPE_LOW_1000BASE_KX: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 1000baseKX_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 1000baseKX_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseT_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_X: - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - break; - case ICE_PHY_TYPE_LOW_2500BASE_KX: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 2500baseX_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 2500baseX_Full); - break; - case ICE_PHY_TYPE_LOW_5GBASE_T: - case ICE_PHY_TYPE_LOW_5GBASE_KR: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 5000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 5000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_T: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10G_SFI_DA: - case ICE_PHY_TYPE_LOW_10G_SFI_AOC_ACC: - case ICE_PHY_TYPE_LOW_10G_SFI_C2C: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseT_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_SR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseSR_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseLR_Full); - break; - case ICE_PHY_TYPE_LOW_10GBASE_KR_CR1: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 10000baseKR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 10000baseKR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_T: - case ICE_PHY_TYPE_LOW_25GBASE_CR: - case ICE_PHY_TYPE_LOW_25GBASE_CR_S: - case ICE_PHY_TYPE_LOW_25GBASE_CR1: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseCR_Full); - break; - case ICE_PHY_TYPE_LOW_25G_AUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_25G_AUI_C2C: - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseCR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_SR: - case ICE_PHY_TYPE_LOW_25GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseSR_Full); - break; - case ICE_PHY_TYPE_LOW_25GBASE_KR: - case ICE_PHY_TYPE_LOW_25GBASE_KR1: - case ICE_PHY_TYPE_LOW_25GBASE_KR_S: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 25000baseKR_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 25000baseKR_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_CR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_40G_XLAUI_AOC_ACC: - case ICE_PHY_TYPE_LOW_40G_XLAUI: - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_SR4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseSR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_LR4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseLR4_Full); - break; - case ICE_PHY_TYPE_LOW_40GBASE_KR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 40000baseKR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 40000baseKR4_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_CR2: - case ICE_PHY_TYPE_LOW_50GBASE_CP: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseCR2_Full); - break; - case ICE_PHY_TYPE_LOW_50G_LAUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_LAUI2: - case ICE_PHY_TYPE_LOW_50G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI2: - case ICE_PHY_TYPE_LOW_50GBASE_SR: - case ICE_PHY_TYPE_LOW_50G_AUI1_AOC_ACC: - case ICE_PHY_TYPE_LOW_50G_AUI1: - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseCR2_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_KR2: - case ICE_PHY_TYPE_LOW_50GBASE_KR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseKR2_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 50000baseKR2_Full); - break; - case ICE_PHY_TYPE_LOW_50GBASE_SR2: - case ICE_PHY_TYPE_LOW_50GBASE_LR2: - case ICE_PHY_TYPE_LOW_50GBASE_FR: - case ICE_PHY_TYPE_LOW_50GBASE_LR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 50000baseSR2_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_CR4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_100G_CAUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_CAUI4: - case ICE_PHY_TYPE_LOW_100G_AUI4_AOC_ACC: - case ICE_PHY_TYPE_LOW_100G_AUI4: - case ICE_PHY_TYPE_LOW_100GBASE_CR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_CP2: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseCR4_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_SR4: - case ICE_PHY_TYPE_LOW_100GBASE_SR2: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseSR4_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_LR4: - case ICE_PHY_TYPE_LOW_100GBASE_DR: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseLR4_ER4_Full); - break; - case ICE_PHY_TYPE_LOW_100GBASE_KR4: - case ICE_PHY_TYPE_LOW_100GBASE_KR_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseKR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseKR4_Full); - break; - default: - unrecog_phy_low = true; - } - - switch (link_info->phy_type_high) { - case ICE_PHY_TYPE_HIGH_100GBASE_KR2_PAM4: - ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseKR4_Full); - ethtool_link_ksettings_add_link_mode(ks, advertising, Autoneg); - ethtool_link_ksettings_add_link_mode(ks, advertising, - 100000baseKR4_Full); - break; - case ICE_PHY_TYPE_HIGH_100G_CAUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_CAUI2: - case ICE_PHY_TYPE_HIGH_100G_AUI2_AOC_ACC: - case ICE_PHY_TYPE_HIGH_100G_AUI2: - ethtool_link_ksettings_add_link_mode(ks, supported, - 100000baseCR4_Full); - break; - default: - unrecog_phy_high = true; - } - - if (unrecog_phy_low && unrecog_phy_high) { - /* if we got here and link is up something bad is afoot */ - netdev_info(netdev, - "WARNING: Unrecognized PHY_Low (0x%llx).\n", - (u64)link_info->phy_type_low); - netdev_info(netdev, - "WARNING: Unrecognized PHY_High (0x%llx).\n", - (u64)link_info->phy_type_high); - } - - /* Now that we've worked out everything that could be supported by the - * current PHY type, get what is supported by the NVM and intersect - * them to get what is truly supported - */ - memset(&cap_ksettings, 0, sizeof(cap_ksettings)); - ice_phy_type_to_ethtool(netdev, &cap_ksettings); - ethtool_intersect_link_masks(ks, &cap_ksettings); + /* Get supported and advertised settings from PHY ability with media */ + ice_phy_type_to_ethtool(netdev, ks); switch (link_info->link_speed) { case ICE_AQ_LINK_SPEED_100GB: -- cgit v1.2.3 From 168983a8e19b89efd175661e53faa6246be363a0 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 6 Feb 2020 01:20:02 -0800 Subject: ice: Don't allow same value for Rx tail to be written twice Currently we compare the value we are about to write to the Rx tail register with the previous value of next_to_use. The problem with this is we only write tail on 8 descriptor boundaries, but next_to_use is updated whenever we clean Rx descriptors. Fix this by comparing the value we are about to write to tail with the previously written tail value. This will prevent duplicate Rx tail bumps. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 35bbc4ff603c..6da048a6ca7c 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -10,7 +10,7 @@ */ void ice_release_rx_desc(struct ice_ring *rx_ring, u32 val) { - u16 prev_ntu = rx_ring->next_to_use; + u16 prev_ntu = rx_ring->next_to_use & ~0x7; rx_ring->next_to_use = val; -- cgit v1.2.3 From 3d9f99908037a90ba9399e553b9beffcac26f068 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 6 Feb 2020 01:20:03 -0800 Subject: ice: Remove ice_dev_onetime_setup() ice_dev_onetime_setup contains driver workarounds needed for firmware limitations. These issues have now been resolved in newer NVMs so remove the function. Signed-off-by: Brett Creeley Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_common.c | 16 ---------------- drivers/net/ethernet/intel/ice/ice_common.h | 2 -- drivers/net/ethernet/intel/ice/ice_hw_autogen.h | 1 - drivers/net/ethernet/intel/ice/ice_lib.c | 1 - 4 files changed, 20 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 0207e28c2682..cee2c91381bc 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -24,20 +24,6 @@ static enum ice_status ice_set_mac_type(struct ice_hw *hw) return 0; } -/** - * ice_dev_onetime_setup - Temporary HW/FW workarounds - * @hw: pointer to the HW structure - * - * This function provides temporary workarounds for certain issues - * that are expected to be fixed in the HW/FW. - */ -void ice_dev_onetime_setup(struct ice_hw *hw) -{ -#define MBX_PF_VT_PFALLOC 0x00231E80 - /* set VFs per PF */ - wr32(hw, MBX_PF_VT_PFALLOC, rd32(hw, PF_VT_PFALLOC_HIF)); -} - /** * ice_clear_pf_cfg - Clear PF configuration * @hw: pointer to the hardware structure @@ -763,8 +749,6 @@ enum ice_status ice_init_hw(struct ice_hw *hw) if (status) goto err_unroll_sched; - ice_dev_onetime_setup(hw); - /* Get MAC information */ /* A single port can report up to two (LAN and WoL) addresses */ mac_buf = devm_kcalloc(ice_hw_to_dev(hw), 2, diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index b5c013fdaaf9..f9fc005d35a7 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -54,8 +54,6 @@ enum ice_status ice_get_caps(struct ice_hw *hw); void ice_set_safe_mode_caps(struct ice_hw *hw); -void ice_dev_onetime_setup(struct ice_hw *hw); - enum ice_status ice_write_rxq_ctx(struct ice_hw *hw, struct ice_rlan_ctx *rlan_ctx, u32 rxq_index); diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h index f2cababf2561..edff3260060d 100644 --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h @@ -331,7 +331,6 @@ #define GLV_TEPC(_VSI) (0x00312000 + ((_VSI) * 4)) #define GLV_UPRCL(_i) (0x003B2000 + ((_i) * 8)) #define GLV_UPTCL(_i) (0x0030A000 + ((_i) * 8)) -#define PF_VT_PFALLOC_HIF 0x0009DD80 #define VSIQF_HKEY_MAX_INDEX 12 #define VSIQF_HLUT_MAX_INDEX 15 #define VFINT_DYN_CTLN(_i) (0x00003800 + ((_i) * 4)) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 1874c9f51a32..19f908d9b47f 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2686,7 +2686,6 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) ice_vsi_put_qs(vsi); ice_vsi_clear_rings(vsi); ice_vsi_free_arrays(vsi); - ice_dev_onetime_setup(&pf->hw); if (vsi->type == ICE_VSI_VF) ice_vsi_set_num_qs(vsi, vf->vf_id); else -- cgit v1.2.3 From a8b72ce03a114239e95c9dd7e892c65452478183 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Thu, 6 Feb 2020 01:20:04 -0800 Subject: ice: Remove CONFIG_PCI_IOV wrap in ice_set_pf_caps Remove unnecessary CONFIG_PCI_IOV wrapping in ice_set_pf_caps. None of the data structures accessed within the block are wrapped with this flag. When CONFIG_PCI_IOV is undefined, pf->num_vfs_supported will be 0 anyway. Signed-off-by: Anirudh Venkataramanan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 4075d5379b23..32fd3dc3c7c9 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2665,14 +2665,12 @@ static void ice_set_pf_caps(struct ice_pf *pf) clear_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); if (func_caps->common_cap.dcb) set_bit(ICE_FLAG_DCB_CAPABLE, pf->flags); -#ifdef CONFIG_PCI_IOV clear_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags); if (func_caps->common_cap.sr_iov_1_1) { set_bit(ICE_FLAG_SRIOV_CAPABLE, pf->flags); pf->num_vfs_supported = min_t(int, func_caps->num_allocd_vfs, ICE_MAX_VF_COUNT); } -#endif /* CONFIG_PCI_IOV */ clear_bit(ICE_FLAG_RSS_ENA, pf->flags); if (func_caps->common_cap.rss_table_size) set_bit(ICE_FLAG_RSS_ENA, pf->flags); -- cgit v1.2.3 From b55e603252acdf229501d27f6ecb78107cbb1855 Mon Sep 17 00:00:00 2001 From: Akeem G Abodunrin Date: Thu, 6 Feb 2020 01:20:05 -0800 Subject: ice: Modify link message logging This patch modifies link message logging to include "Full Duplex" and "Negotiated" for FEC, so as to distinguish it from "Requested" FEC. Signed-off-by: Akeem G Abodunrin Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 32fd3dc3c7c9..e92af2471635 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -752,7 +752,7 @@ void ice_print_link_msg(struct ice_vsi *vsi, bool isup) kfree(caps); done: - netdev_info(vsi->netdev, "NIC Link is up %sbps, Requested FEC: %s, FEC: %s, Autoneg: %s, Flow Control: %s\n", + netdev_info(vsi->netdev, "NIC Link is up %sbps Full Duplex, Requested FEC: %s, Negotiated FEC: %s, Autoneg: %s, Flow Control: %s\n", speed, fec_req, fec, an, fc); ice_print_topo_conflict(vsi); } -- cgit v1.2.3 From fbf1e1f6988e70287b1bfcad4f655ca96b681929 Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Thu, 6 Feb 2020 01:20:06 -0800 Subject: ice: fix and consolidate logging of NVM/firmware version information Logging the firmware/NVM information during driver load is redundant since that information is also available via ethtool. Move the functionality found in ice_nvm_version_str() directly into ice_get_drvinfo() and remove calling the former and logging that info during driver probe. This also gets rid of a bug in ice_nvm_version_str() where it returns a pointer to a buffer which is free'ed when that function exits. Signed-off-by: Bruce Allan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 15 +++++++++++++-- drivers/net/ethernet/intel/ice/ice_lib.c | 19 ------------------- drivers/net/ethernet/intel/ice/ice_lib.h | 2 -- drivers/net/ethernet/intel/ice/ice_main.c | 5 ----- 4 files changed, 13 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 26eca4ce9e2c..7539fd8147de 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -166,13 +166,24 @@ static void ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { struct ice_netdev_priv *np = netdev_priv(netdev); + u8 oem_ver, oem_patch, nvm_ver_hi, nvm_ver_lo; struct ice_vsi *vsi = np->vsi; struct ice_pf *pf = vsi->back; + struct ice_hw *hw = &pf->hw; + u16 oem_build; strlcpy(drvinfo->driver, KBUILD_MODNAME, sizeof(drvinfo->driver)); strlcpy(drvinfo->version, ice_drv_ver, sizeof(drvinfo->version)); - strlcpy(drvinfo->fw_version, ice_nvm_version_str(&pf->hw), - sizeof(drvinfo->fw_version)); + + /* Display NVM version (from which the firmware version can be + * determined) which contains more pertinent information. + */ + ice_get_nvm_version(hw, &oem_ver, &oem_build, &oem_patch, + &nvm_ver_hi, &nvm_ver_lo); + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%x.%02x 0x%x %d.%d.%d", nvm_ver_hi, nvm_ver_lo, + hw->nvm.eetrack, oem_ver, oem_build, oem_patch); + strlcpy(drvinfo->bus_info, pci_name(pf->pdev), sizeof(drvinfo->bus_info)); drvinfo->n_priv_flags = ICE_PRIV_FLAG_ARRAY_SIZE; diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 19f908d9b47f..921d21285f14 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2890,25 +2890,6 @@ out: } #endif /* CONFIG_DCB */ -/** - * ice_nvm_version_str - format the NVM version strings - * @hw: ptr to the hardware info - */ -char *ice_nvm_version_str(struct ice_hw *hw) -{ - u8 oem_ver, oem_patch, ver_hi, ver_lo; - static char buf[ICE_NVM_VER_LEN]; - u16 oem_build; - - ice_get_nvm_version(hw, &oem_ver, &oem_build, &oem_patch, &ver_hi, - &ver_lo); - - snprintf(buf, sizeof(buf), "%x.%02x 0x%x %d.%d.%d", ver_hi, ver_lo, - hw->nvm.eetrack, oem_ver, oem_build, oem_patch); - - return buf; -} - /** * ice_update_ring_stats - Update ring statistics * @ring: ring to update diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h index 68fd0d4505c2..e2c0dadce920 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -97,8 +97,6 @@ void ice_vsi_cfg_frame_size(struct ice_vsi *vsi); u32 ice_intrl_usec_to_reg(u8 intrl, u8 gran); -char *ice_nvm_version_str(struct ice_hw *hw); - enum ice_status ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index e92af2471635..f08acf18e1c7 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3244,11 +3244,6 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) goto err_exit_unroll; } - dev_info(dev, "firmware %d.%d.%d api %d.%d.%d nvm %s build 0x%08x\n", - hw->fw_maj_ver, hw->fw_min_ver, hw->fw_patch, - hw->api_maj_ver, hw->api_min_ver, hw->api_patch, - ice_nvm_version_str(hw), hw->fw_build); - ice_request_fw(pf); /* if ice_request_fw fails, ICE_FLAG_ADV_FEATURES bit won't be -- cgit v1.2.3 From cf8fc2a0863f9ff27ebd2efcdb1f7d378b9fb8a6 Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Thu, 6 Feb 2020 01:20:07 -0800 Subject: ice: update Unit Load Status bitmask to check after reset After a reset the Unit Load Status bits in the GLNVM_ULD register to check for completion should be 0x7FF before continuing. Update the mask to check (minus the three reserved bits that are always set). Signed-off-by: Bruce Allan Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_common.c | 17 ++++++++++++----- drivers/net/ethernet/intel/ice/ice_hw_autogen.h | 6 ++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index cee2c91381bc..a46d51357650 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -818,7 +818,7 @@ void ice_deinit_hw(struct ice_hw *hw) */ enum ice_status ice_check_reset(struct ice_hw *hw) { - u32 cnt, reg = 0, grst_delay; + u32 cnt, reg = 0, grst_delay, uld_mask; /* Poll for Device Active state in case a recent CORER, GLOBR, * or EMPR has occurred. The grst delay value is in 100ms units. @@ -840,13 +840,20 @@ enum ice_status ice_check_reset(struct ice_hw *hw) return ICE_ERR_RESET_FAILED; } -#define ICE_RESET_DONE_MASK (GLNVM_ULD_CORER_DONE_M | \ - GLNVM_ULD_GLOBR_DONE_M) +#define ICE_RESET_DONE_MASK (GLNVM_ULD_PCIER_DONE_M |\ + GLNVM_ULD_PCIER_DONE_1_M |\ + GLNVM_ULD_CORER_DONE_M |\ + GLNVM_ULD_GLOBR_DONE_M |\ + GLNVM_ULD_POR_DONE_M |\ + GLNVM_ULD_POR_DONE_1_M |\ + GLNVM_ULD_PCIER_DONE_2_M) + + uld_mask = ICE_RESET_DONE_MASK; /* Device is Active; check Global Reset processes are done */ for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) { - reg = rd32(hw, GLNVM_ULD) & ICE_RESET_DONE_MASK; - if (reg == ICE_RESET_DONE_MASK) { + reg = rd32(hw, GLNVM_ULD) & uld_mask; + if (reg == uld_mask) { ice_debug(hw, ICE_DBG_INIT, "Global reset processes done. %d\n", cnt); break; diff --git a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h index edff3260060d..6db3d0494127 100644 --- a/drivers/net/ethernet/intel/ice/ice_hw_autogen.h +++ b/drivers/net/ethernet/intel/ice/ice_hw_autogen.h @@ -267,8 +267,14 @@ #define GLNVM_GENS_SR_SIZE_S 5 #define GLNVM_GENS_SR_SIZE_M ICE_M(0x7, 5) #define GLNVM_ULD 0x000B6008 +#define GLNVM_ULD_PCIER_DONE_M BIT(0) +#define GLNVM_ULD_PCIER_DONE_1_M BIT(1) #define GLNVM_ULD_CORER_DONE_M BIT(3) #define GLNVM_ULD_GLOBR_DONE_M BIT(4) +#define GLNVM_ULD_POR_DONE_M BIT(5) +#define GLNVM_ULD_POR_DONE_1_M BIT(8) +#define GLNVM_ULD_PCIER_DONE_2_M BIT(9) +#define GLNVM_ULD_PE_DONE_M BIT(10) #define GLPCI_CNF2 0x000BE004 #define GLPCI_CNF2_CACHELINE_SIZE_M BIT(1) #define PF_FUNC_RID 0x0009E880 -- cgit v1.2.3 From 0a6ea04e3bbd20833d2b49296e5adc1c5bb86386 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Thu, 6 Feb 2020 01:20:08 -0800 Subject: ice: Remove possible null dereference Commit 1f45ebe0d8fb ("ice: add extra check for null Rx descriptor") moved the call to ice_construct_skb() under a null check as Coverity reported a possible use of null skb. However, the original call was not deleted, do so now. Fixes: 1f45ebe0d8fb ("ice: add extra check for null Rx descriptor") Reported-by: Bruce Allan Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index fd17ace6b226..1d4755acca3d 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1078,8 +1078,6 @@ construct_skb: skb = ice_build_skb(rx_ring, rx_buf, &xdp); else skb = ice_construct_skb(rx_ring, rx_buf, &xdp); - } else { - skb = ice_construct_skb(rx_ring, rx_buf, &xdp); } /* exit if we failed to retrieve a buffer */ if (!skb) { -- cgit v1.2.3 From 9a946843ba5c173e259fef7a035feac994a65b59 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Thu, 6 Feb 2020 01:20:09 -0800 Subject: ice: Use ice_pf_to_dev Use ice_pf_to_dev(pf) instead of &pf->pdev->dev Use ice_pf_to_dev(vsi->back) instead of &vsi->back->pdev->dev When a pointer to the pf instance is available, use ice_pf_to_dev instead of ice_hw_to_dev Signed-off-by: Anirudh Venkataramanan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_base.c | 12 ++++++------ drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 2 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- drivers/net/ethernet/intel/ice/ice_lib.c | 14 +++++++------- drivers/net/ethernet/intel/ice/ice_main.c | 16 ++++++++-------- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 8 ++++---- 6 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index d8e975cceb21..8c7d08e2916e 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -324,7 +324,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) if (err) return err; - dev_info(&vsi->back->pdev->dev, "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", + dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n", ring->q_index); } else { ring->zca.free = NULL; @@ -405,7 +405,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) /* Absolute queue number out of 2K needs to be passed */ err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); if (err) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", pf_q, err); return -EIO; @@ -428,7 +428,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) : ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring)); if (err) - dev_info(&vsi->back->pdev->dev, + dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", ring->xsk_umem ? "UMEM enabled " : "", ring->q_index, pf_q); @@ -815,13 +815,13 @@ ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, * queues at the hardware level anyway. */ if (status == ICE_ERR_RESET_ONGOING) { - dev_dbg(&vsi->back->pdev->dev, + dev_dbg(ice_pf_to_dev(vsi->back), "Reset in progress. LAN Tx queues already disabled\n"); } else if (status == ICE_ERR_DOES_NOT_EXIST) { - dev_dbg(&vsi->back->pdev->dev, + dev_dbg(ice_pf_to_dev(vsi->back), "LAN Tx queues do not exist, nothing to disable\n"); } else if (status) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Failed to disable LAN Tx queues, error: %d\n", status); return -ENODEV; } diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index 926c9772f086..265cf69b321b 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -882,7 +882,7 @@ ice_dcbnl_vsi_del_app(struct ice_vsi *vsi, sapp.protocol = app->prot_id; sapp.priority = app->priority; err = ice_dcbnl_delapp(vsi->netdev, &sapp); - dev_dbg(&vsi->back->pdev->dev, + dev_dbg(ice_pf_to_dev(vsi->back), "Deleting app for VSI idx=%d err=%d sel=%d proto=0x%x, prio=%d\n", vsi->idx, err, app->selector, app->prot_id, app->priority); } diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 7539fd8147de..8110da94c979 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -1058,7 +1058,7 @@ ice_set_fecparam(struct net_device *netdev, struct ethtool_fecparam *fecparam) fec = ICE_FEC_NONE; break; default: - dev_warn(&vsi->back->pdev->dev, "Unsupported FEC mode: %d\n", + dev_warn(ice_pf_to_dev(vsi->back), "Unsupported FEC mode: %d\n", fecparam->fec); return -EINVAL; } diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 921d21285f14..797773a45a98 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -117,7 +117,7 @@ static void ice_vsi_set_num_desc(struct ice_vsi *vsi) vsi->num_tx_desc = ICE_DFLT_NUM_TX_DESC; break; default: - dev_dbg(&vsi->back->pdev->dev, + dev_dbg(ice_pf_to_dev(vsi->back), "Not setting number of Tx/Rx descriptors for VSI type %d\n", vsi->type); break; @@ -724,7 +724,7 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt) vsi->num_txq = tx_count; if (vsi->type == ICE_VSI_VF && vsi->num_txq != vsi->num_rxq) { - dev_dbg(&vsi->back->pdev->dev, "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); + dev_dbg(ice_pf_to_dev(vsi->back), "VF VSI should have same number of Tx and Rx queues. Hence making them equal\n"); /* since there is a chance that num_rxq could have been changed * in the above for loop, make num_txq equal to num_rxq. */ @@ -1453,7 +1453,7 @@ setup_rings: err = ice_setup_rx_ctx(vsi->rx_rings[i]); if (err) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "ice_setup_rx_ctx failed for RxQ %d, err %d\n", i, err); return err; @@ -1623,7 +1623,7 @@ int ice_vsi_manage_vlan_insertion(struct ice_vsi *vsi) status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); if (status) { - dev_err(&vsi->back->pdev->dev, "update VSI for VLAN insert failed, err %d aq_err %d\n", + dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN insert failed, err %d aq_err %d\n", status, hw->adminq.sq_last_status); ret = -EIO; goto out; @@ -1669,7 +1669,7 @@ int ice_vsi_manage_vlan_stripping(struct ice_vsi *vsi, bool ena) status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); if (status) { - dev_err(&vsi->back->pdev->dev, "update VSI for VLAN strip failed, ena = %d err %d aq_err %d\n", + dev_err(ice_pf_to_dev(vsi->back), "update VSI for VLAN strip failed, ena = %d err %d aq_err %d\n", ena, status, hw->adminq.sq_last_status); ret = -EIO; goto out; @@ -1834,7 +1834,7 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) struct ice_q_vector *q_vector = vsi->q_vectors[i]; if (!q_vector) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Failed to set reg_idx on q_vector %d VSI %d\n", i, vsi->vsi_num); goto clear_reg_idx; @@ -2961,7 +2961,7 @@ ice_vsi_cfg_mac_fltr(struct ice_vsi *vsi, const u8 *macaddr, bool set) status = ice_remove_mac(&vsi->back->hw, &tmp_add_list); cfg_mac_fltr_exit: - ice_free_fltr_list(&vsi->back->pdev->dev, &tmp_add_list); + ice_free_fltr_list(ice_pf_to_dev(vsi->back), &tmp_add_list); return status; } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index f08acf18e1c7..044995f69748 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -269,7 +269,7 @@ static int ice_cfg_promisc(struct ice_vsi *vsi, u8 promisc_m, bool set_promisc) */ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) { - struct device *dev = &vsi->back->pdev->dev; + struct device *dev = ice_pf_to_dev(vsi->back); struct net_device *netdev = vsi->netdev; bool promisc_forced_on = false; struct ice_pf *pf = vsi->back; @@ -1368,7 +1368,7 @@ static int ice_force_phys_link_state(struct ice_vsi *vsi, bool link_up) if (vsi->type != ICE_VSI_PF) return 0; - dev = &vsi->back->pdev->dev; + dev = ice_pf_to_dev(vsi->back); pi = vsi->port_info; @@ -1686,7 +1686,7 @@ free_q_irqs: */ static int ice_xdp_alloc_setup_rings(struct ice_vsi *vsi) { - struct device *dev = &vsi->back->pdev->dev; + struct device *dev = ice_pf_to_dev(vsi->back); int i; for (i = 0; i < vsi->num_xdp_txq; i++) { @@ -3870,14 +3870,14 @@ ice_set_features(struct net_device *netdev, netdev_features_t features) /* Don't set any netdev advanced features with device in Safe Mode */ if (ice_is_safe_mode(vsi->back)) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Device is in Safe Mode - not enabling advanced netdev features\n"); return ret; } /* Do not change setting during reset */ if (ice_is_reset_in_progress(pf->state)) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Device is resetting, changing advanced netdev features temporarily unavailable.\n"); return -EBUSY; } @@ -4420,7 +4420,7 @@ int ice_vsi_setup_tx_rings(struct ice_vsi *vsi) int i, err = 0; if (!vsi->num_txq) { - dev_err(&vsi->back->pdev->dev, "VSI %d has 0 Tx queues\n", + dev_err(ice_pf_to_dev(vsi->back), "VSI %d has 0 Tx queues\n", vsi->vsi_num); return -EINVAL; } @@ -4451,7 +4451,7 @@ int ice_vsi_setup_rx_rings(struct ice_vsi *vsi) int i, err = 0; if (!vsi->num_rxq) { - dev_err(&vsi->back->pdev->dev, "VSI %d has 0 Rx queues\n", + dev_err(ice_pf_to_dev(vsi->back), "VSI %d has 0 Rx queues\n", vsi->vsi_num); return -EINVAL; } @@ -4987,7 +4987,7 @@ static int ice_vsi_update_bridge_mode(struct ice_vsi *vsi, u16 bmode) status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); if (status) { - dev_err(&vsi->back->pdev->dev, "update VSI for bridge mode failed, bmode = %d err %d aq_err %d\n", + dev_err(ice_pf_to_dev(vsi->back), "update VSI for bridge mode failed, bmode = %d err %d aq_err %d\n", bmode, status, hw->adminq.sq_last_status); ret = -EIO; goto out; diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 82b1e7a4cb92..8f1a934a318c 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -462,7 +462,7 @@ static int ice_vsi_manage_pvid(struct ice_vsi *vsi, u16 vid, bool enable) status = ice_update_vsi(hw, vsi->idx, ctxt, NULL); if (status) { - dev_info(&vsi->back->pdev->dev, "update VSI for port VLAN failed, err %d aq_err %d\n", + dev_info(ice_pf_to_dev(vsi->back), "update VSI for port VLAN failed, err %d aq_err %d\n", status, hw->adminq.sq_last_status); ret = -EIO; goto out; @@ -2063,7 +2063,7 @@ static int ice_vc_ena_qs_msg(struct ice_vf *vf, u8 *msg) continue; if (ice_vsi_ctrl_rx_ring(vsi, true, vf_q_id)) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Failed to enable Rx ring %d on VSI %d\n", vf_q_id, vsi->vsi_num); v_ret = VIRTCHNL_STATUS_ERR_PARAM; @@ -2166,7 +2166,7 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) if (ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, ring, &txq_meta)) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Tx ring %d on VSI %d\n", vf_q_id, vsi->vsi_num); v_ret = VIRTCHNL_STATUS_ERR_PARAM; @@ -2193,7 +2193,7 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) continue; if (ice_vsi_ctrl_rx_ring(vsi, false, vf_q_id)) { - dev_err(&vsi->back->pdev->dev, + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Rx ring %d on VSI %d\n", vf_q_id, vsi->vsi_num); v_ret = VIRTCHNL_STATUS_ERR_PARAM; -- cgit v1.2.3 From 19cce2c6f6dc43dd9cd6ba8a9123857192990d50 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Thu, 6 Feb 2020 01:20:10 -0800 Subject: ice: Make print statements more compact Formatting strings in print function calls (like dev_info, dev_err, etc.) can exceed 80 columns without making checkpatch unhappy. So remove newlines where applicable and make print statements more compact. Signed-off-by: Anirudh Venkataramanan Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_base.c | 22 ++- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 7 +- drivers/net/ethernet/intel/ice/ice_dcb_nl.c | 10 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 42 ++---- drivers/net/ethernet/intel/ice/ice_lib.c | 36 ++--- drivers/net/ethernet/intel/ice/ice_main.c | 167 ++++++++--------------- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 62 +++------ drivers/net/ethernet/intel/ice/ice_xsk.c | 4 +- 8 files changed, 123 insertions(+), 227 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 8c7d08e2916e..46f427a95056 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -405,8 +405,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) /* Absolute queue number out of 2K needs to be passed */ err = ice_write_rxq_ctx(hw, &rlan_ctx, pf_q); if (err) { - dev_err(ice_pf_to_dev(vsi->back), - "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", + dev_err(ice_pf_to_dev(vsi->back), "Failed to set LAN Rx queue context for absolute Rx queue %d error: %d\n", pf_q, err); return -EIO; } @@ -428,8 +427,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring) ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) : ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring)); if (err) - dev_info(ice_pf_to_dev(vsi->back), - "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", + dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", ring->xsk_umem ? "UMEM enabled " : "", ring->q_index, pf_q); @@ -490,8 +488,7 @@ int ice_vsi_ctrl_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx) /* wait for the change to finish */ ret = ice_pf_rxq_wait(pf, pf_q, ena); if (ret) - dev_err(ice_pf_to_dev(pf), - "VSI idx %d Rx ring %d %sable timeout\n", + dev_err(ice_pf_to_dev(pf), "VSI idx %d Rx ring %d %sable timeout\n", vsi->idx, pf_q, (ena ? "en" : "dis")); return ret; @@ -648,8 +645,7 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_ring *ring, status = ice_ena_vsi_txq(vsi->port_info, vsi->idx, tc, ring->q_handle, 1, qg_buf, buf_len, NULL); if (status) { - dev_err(ice_pf_to_dev(pf), - "Failed to set LAN Tx queue context, error: %d\n", + dev_err(ice_pf_to_dev(pf), "Failed to set LAN Tx queue context, error: %d\n", status); return -ENODEV; } @@ -815,14 +811,12 @@ ice_vsi_stop_tx_ring(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, * queues at the hardware level anyway. */ if (status == ICE_ERR_RESET_ONGOING) { - dev_dbg(ice_pf_to_dev(vsi->back), - "Reset in progress. LAN Tx queues already disabled\n"); + dev_dbg(ice_pf_to_dev(vsi->back), "Reset in progress. LAN Tx queues already disabled\n"); } else if (status == ICE_ERR_DOES_NOT_EXIST) { - dev_dbg(ice_pf_to_dev(vsi->back), - "LAN Tx queues do not exist, nothing to disable\n"); + dev_dbg(ice_pf_to_dev(vsi->back), "LAN Tx queues do not exist, nothing to disable\n"); } else if (status) { - dev_err(ice_pf_to_dev(vsi->back), - "Failed to disable LAN Tx queues, error: %d\n", status); + dev_err(ice_pf_to_dev(vsi->back), "Failed to disable LAN Tx queues, error: %d\n", + status); return -ENODEV; } diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index 99ee4c009a96..adc69fe1bd64 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -577,8 +577,7 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked) goto dcb_init_err; } - dev_info(dev, - "DCB is enabled in the hardware, max number of TCs supported on this port are %d\n", + dev_info(dev, "DCB is enabled in the hardware, max number of TCs supported on this port are %d\n", pf->hw.func_caps.common_cap.maxtc); if (err) { struct ice_vsi *pf_vsi; @@ -588,8 +587,8 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked) clear_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags); err = ice_dcb_sw_dflt_cfg(pf, true, locked); if (err) { - dev_err(dev, - "Failed to set local DCB config %d\n", err); + dev_err(dev, "Failed to set local DCB config %d\n", + err); err = -EIO; goto dcb_init_err; } diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c index 265cf69b321b..b61aba428adb 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_nl.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_nl.c @@ -297,8 +297,7 @@ ice_dcbnl_get_pfc_cfg(struct net_device *netdev, int prio, u8 *setting) return; *setting = (pi->local_dcbx_cfg.pfc.pfcena >> prio) & 0x1; - dev_dbg(ice_pf_to_dev(pf), - "Get PFC Config up=%d, setting=%d, pfcenable=0x%x\n", + dev_dbg(ice_pf_to_dev(pf), "Get PFC Config up=%d, setting=%d, pfcenable=0x%x\n", prio, *setting, pi->local_dcbx_cfg.pfc.pfcena); } @@ -418,8 +417,8 @@ ice_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int prio, return; *pgid = pi->local_dcbx_cfg.etscfg.prio_table[prio]; - dev_dbg(ice_pf_to_dev(pf), - "Get PG config prio=%d tc=%d\n", prio, *pgid); + dev_dbg(ice_pf_to_dev(pf), "Get PG config prio=%d tc=%d\n", prio, + *pgid); } /** @@ -882,8 +881,7 @@ ice_dcbnl_vsi_del_app(struct ice_vsi *vsi, sapp.protocol = app->prot_id; sapp.priority = app->priority; err = ice_dcbnl_delapp(vsi->netdev, &sapp); - dev_dbg(ice_pf_to_dev(vsi->back), - "Deleting app for VSI idx=%d err=%d sel=%d proto=0x%x, prio=%d\n", + dev_dbg(ice_pf_to_dev(vsi->back), "Deleting app for VSI idx=%d err=%d sel=%d proto=0x%x, prio=%d\n", vsi->idx, err, app->selector, app->prot_id, app->priority); } diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 8110da94c979..4b29d1ae56a7 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -374,8 +374,7 @@ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) val = rd32(hw, reg); if (val == pattern) continue; - dev_err(dev, - "%s: reg pattern test failed - reg 0x%08x pat 0x%08x val 0x%08x\n" + dev_err(dev, "%s: reg pattern test failed - reg 0x%08x pat 0x%08x val 0x%08x\n" , __func__, reg, pattern, val); return 1; } @@ -383,8 +382,7 @@ static int ice_reg_pattern_test(struct ice_hw *hw, u32 reg, u32 mask) wr32(hw, reg, orig_val); val = rd32(hw, reg); if (val != orig_val) { - dev_err(dev, - "%s: reg restore test failed - reg 0x%08x orig 0x%08x val 0x%08x\n" + dev_err(dev, "%s: reg restore test failed - reg 0x%08x orig 0x%08x val 0x%08x\n" , __func__, reg, orig_val, val); return 1; } @@ -802,8 +800,7 @@ ice_self_test(struct net_device *netdev, struct ethtool_test *eth_test, set_bit(__ICE_TESTING, pf->state); if (ice_active_vfs(pf)) { - dev_warn(dev, - "Please take active VFs and Netqueues offline and restart the adapter before running NIC diagnostics\n"); + dev_warn(dev, "Please take active VFs and Netqueues offline and restart the adapter before running NIC diagnostics\n"); data[ICE_ETH_TEST_REG] = 1; data[ICE_ETH_TEST_EEPROM] = 1; data[ICE_ETH_TEST_INTR] = 1; @@ -1211,8 +1208,7 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) * events to respond to. */ if (status) - dev_info(dev, - "Failed to unreg for LLDP events\n"); + dev_info(dev, "Failed to unreg for LLDP events\n"); /* The AQ call to stop the FW LLDP agent will generate * an error if the agent is already stopped. @@ -1267,8 +1263,7 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) /* Register for MIB change events */ status = ice_cfg_lldp_mib_change(&pf->hw, true); if (status) - dev_dbg(dev, - "Fail to enable MIB change events\n"); + dev_dbg(dev, "Fail to enable MIB change events\n"); } } if (test_bit(ICE_FLAG_LEGACY_RX, change_flags)) { @@ -1761,8 +1756,7 @@ ice_get_settings_link_up(struct ethtool_link_ksettings *ks, ks->base.speed = SPEED_100; break; default: - netdev_info(netdev, - "WARNING: Unrecognized link_speed (0x%x).\n", + netdev_info(netdev, "WARNING: Unrecognized link_speed (0x%x).\n", link_info->link_speed); break; } @@ -2578,13 +2572,11 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring) new_tx_cnt = ALIGN(ring->tx_pending, ICE_REQ_DESC_MULTIPLE); if (new_tx_cnt != ring->tx_pending) - netdev_info(netdev, - "Requested Tx descriptor count rounded up to %d\n", + netdev_info(netdev, "Requested Tx descriptor count rounded up to %d\n", new_tx_cnt); new_rx_cnt = ALIGN(ring->rx_pending, ICE_REQ_DESC_MULTIPLE); if (new_rx_cnt != ring->rx_pending) - netdev_info(netdev, - "Requested Rx descriptor count rounded up to %d\n", + netdev_info(netdev, "Requested Rx descriptor count rounded up to %d\n", new_rx_cnt); /* if nothing to do return success */ @@ -3451,8 +3443,7 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, if (ec->rx_coalesce_usecs_high > ICE_MAX_INTRL || (ec->rx_coalesce_usecs_high && ec->rx_coalesce_usecs_high < pf->hw.intrl_gran)) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs-high valid values are 0 (disabled), %d-%d\n", + netdev_info(vsi->netdev, "Invalid value, %s-usecs-high valid values are 0 (disabled), %d-%d\n", c_type_str, pf->hw.intrl_gran, ICE_MAX_INTRL); return -EINVAL; @@ -3470,8 +3461,7 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, break; case ICE_TX_CONTAINER: if (ec->tx_coalesce_usecs_high) { - netdev_info(vsi->netdev, - "setting %s-usecs-high is not supported\n", + netdev_info(vsi->netdev, "setting %s-usecs-high is not supported\n", c_type_str); return -EINVAL; } @@ -3488,23 +3478,20 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, itr_setting = rc->itr_setting & ~ICE_ITR_DYNAMIC; if (coalesce_usecs != itr_setting && use_adaptive_coalesce) { - netdev_info(vsi->netdev, - "%s interrupt throttling cannot be changed if adaptive-%s is enabled\n", + netdev_info(vsi->netdev, "%s interrupt throttling cannot be changed if adaptive-%s is enabled\n", c_type_str, c_type_str); return -EINVAL; } if (coalesce_usecs > ICE_ITR_MAX) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs range is 0-%d\n", + netdev_info(vsi->netdev, "Invalid value, %s-usecs range is 0-%d\n", c_type_str, ICE_ITR_MAX); return -EINVAL; } /* hardware only supports an ITR granularity of 2us */ if (coalesce_usecs % 2 != 0) { - netdev_info(vsi->netdev, - "Invalid value, %s-usecs must be even\n", + netdev_info(vsi->netdev, "Invalid value, %s-usecs must be even\n", c_type_str); return -EINVAL; } @@ -3745,8 +3732,7 @@ ice_get_module_info(struct net_device *netdev, } break; default: - netdev_warn(netdev, - "SFF Module Type not recognized.\n"); + netdev_warn(netdev, "SFF Module Type not recognized.\n"); return -EINVAL; } return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 797773a45a98..4f5116554dcc 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -117,8 +117,7 @@ static void ice_vsi_set_num_desc(struct ice_vsi *vsi) vsi->num_tx_desc = ICE_DFLT_NUM_TX_DESC; break; default: - dev_dbg(ice_pf_to_dev(vsi->back), - "Not setting number of Tx/Rx descriptors for VSI type %d\n", + dev_dbg(ice_pf_to_dev(vsi->back), "Not setting number of Tx/Rx descriptors for VSI type %d\n", vsi->type); break; } @@ -929,8 +928,7 @@ static int ice_vsi_setup_vector_base(struct ice_vsi *vsi) vsi->base_vector = ice_get_res(pf, pf->irq_tracker, num_q_vectors, vsi->idx); if (vsi->base_vector < 0) { - dev_err(dev, - "Failed to get tracking for %d vectors for VSI %d, err=%d\n", + dev_err(dev, "Failed to get tracking for %d vectors for VSI %d, err=%d\n", num_q_vectors, vsi->vsi_num, vsi->base_vector); return -ENOENT; } @@ -1392,12 +1390,10 @@ int ice_vsi_kill_vlan(struct ice_vsi *vsi, u16 vid) status = ice_remove_vlan(&pf->hw, &tmp_add_list); if (status == ICE_ERR_DOES_NOT_EXIST) { - dev_dbg(dev, - "Failed to remove VLAN %d on VSI %i, it does not exist, status: %d\n", + dev_dbg(dev, "Failed to remove VLAN %d on VSI %i, it does not exist, status: %d\n", vid, vsi->vsi_num, status); } else if (status) { - dev_err(dev, - "Error removing VLAN %d on vsi %i error: %d\n", + dev_err(dev, "Error removing VLAN %d on vsi %i error: %d\n", vid, vsi->vsi_num, status); err = -EIO; } @@ -1453,8 +1449,7 @@ setup_rings: err = ice_setup_rx_ctx(vsi->rx_rings[i]); if (err) { - dev_err(ice_pf_to_dev(vsi->back), - "ice_setup_rx_ctx failed for RxQ %d, err %d\n", + dev_err(ice_pf_to_dev(vsi->back), "ice_setup_rx_ctx failed for RxQ %d, err %d\n", i, err); return err; } @@ -1834,8 +1829,7 @@ ice_vsi_set_q_vectors_reg_idx(struct ice_vsi *vsi) struct ice_q_vector *q_vector = vsi->q_vectors[i]; if (!q_vector) { - dev_err(ice_pf_to_dev(vsi->back), - "Failed to set reg_idx on q_vector %d VSI %d\n", + dev_err(ice_pf_to_dev(vsi->back), "Failed to set reg_idx on q_vector %d VSI %d\n", i, vsi->vsi_num); goto clear_reg_idx; } @@ -1898,8 +1892,7 @@ ice_vsi_add_rem_eth_mac(struct ice_vsi *vsi, bool add_rule) status = ice_remove_eth_mac(&pf->hw, &tmp_add_list); if (status) - dev_err(dev, - "Failure Adding or Removing Ethertype on VSI %i error: %d\n", + dev_err(dev, "Failure Adding or Removing Ethertype on VSI %i error: %d\n", vsi->vsi_num, status); ice_free_fltr_list(dev, &tmp_add_list); @@ -2384,8 +2377,7 @@ ice_get_res(struct ice_pf *pf, struct ice_res_tracker *res, u16 needed, u16 id) return -EINVAL; if (!needed || needed > res->num_entries || id >= ICE_RES_VALID_BIT) { - dev_err(ice_pf_to_dev(pf), - "param err: needed=%d, num_entries = %d id=0x%04x\n", + dev_err(ice_pf_to_dev(pf), "param err: needed=%d, num_entries = %d id=0x%04x\n", needed, res->num_entries, id); return -EINVAL; } @@ -2764,8 +2756,7 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, bool init_vsi) status = ice_cfg_vsi_lan(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc, max_txqs); if (status) { - dev_err(ice_pf_to_dev(pf), - "VSI %d failed lan queue config, error %d\n", + dev_err(ice_pf_to_dev(pf), "VSI %d failed lan queue config, error %d\n", vsi->vsi_num, status); if (init_vsi) { ret = -EIO; @@ -3023,16 +3014,14 @@ int ice_set_dflt_vsi(struct ice_sw *sw, struct ice_vsi *vsi) /* another VSI is already the default VSI for this switch */ if (ice_is_dflt_vsi_in_use(sw)) { - dev_err(dev, - "Default forwarding VSI %d already in use, disable it and try again\n", + dev_err(dev, "Default forwarding VSI %d already in use, disable it and try again\n", sw->dflt_vsi->vsi_num); return -EEXIST; } status = ice_cfg_dflt_vsi(&vsi->back->hw, vsi->idx, true, ICE_FLTR_RX); if (status) { - dev_err(dev, - "Failed to set VSI %d as the default forwarding VSI, error %d\n", + dev_err(dev, "Failed to set VSI %d as the default forwarding VSI, error %d\n", vsi->vsi_num, status); return -EIO; } @@ -3071,8 +3060,7 @@ int ice_clear_dflt_vsi(struct ice_sw *sw) status = ice_cfg_dflt_vsi(&dflt_vsi->back->hw, dflt_vsi->idx, false, ICE_FLTR_RX); if (status) { - dev_err(dev, - "Failed to clear the default forwarding VSI %d, error %d\n", + dev_err(dev, "Failed to clear the default forwarding VSI %d, error %d\n", dflt_vsi->vsi_num, status); return -EIO; } diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 044995f69748..db7e8069f37b 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -162,8 +162,7 @@ unregister: * had an error */ if (status && vsi->netdev->reg_state == NETREG_REGISTERED) { - dev_err(ice_pf_to_dev(pf), - "Could not add MAC filters error %d. Unregistering device\n", + dev_err(ice_pf_to_dev(pf), "Could not add MAC filters error %d. Unregistering device\n", status); unregister_netdev(vsi->netdev); free_netdev(vsi->netdev); @@ -335,8 +334,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) !test_and_set_bit(__ICE_FLTR_OVERFLOW_PROMISC, vsi->state)) { promisc_forced_on = true; - netdev_warn(netdev, - "Reached MAC filter limit, forcing promisc mode on VSI %d\n", + netdev_warn(netdev, "Reached MAC filter limit, forcing promisc mode on VSI %d\n", vsi->vsi_num); } else { err = -EIO; @@ -382,8 +380,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) if (!ice_is_dflt_vsi_in_use(pf->first_sw)) { err = ice_set_dflt_vsi(pf->first_sw, vsi); if (err && err != -EEXIST) { - netdev_err(netdev, - "Error %d setting default VSI %i Rx rule\n", + netdev_err(netdev, "Error %d setting default VSI %i Rx rule\n", err, vsi->vsi_num); vsi->current_netdev_flags &= ~IFF_PROMISC; @@ -395,8 +392,7 @@ static int ice_vsi_sync_fltr(struct ice_vsi *vsi) if (ice_is_vsi_dflt_vsi(pf->first_sw, vsi)) { err = ice_clear_dflt_vsi(pf->first_sw); if (err) { - netdev_err(netdev, - "Error %d clearing default VSI %i Rx rule\n", + netdev_err(netdev, "Error %d clearing default VSI %i Rx rule\n", err, vsi->vsi_num); vsi->current_netdev_flags |= IFF_PROMISC; @@ -815,8 +811,7 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, */ result = ice_update_link_info(pi); if (result) - dev_dbg(dev, - "Failed to update link status and re-enable link events for port %d\n", + dev_dbg(dev, "Failed to update link status and re-enable link events for port %d\n", pi->lport); /* if the old link up/down and speed is the same as the new */ @@ -834,8 +829,7 @@ ice_link_event(struct ice_pf *pf, struct ice_port_info *pi, bool link_up, result = ice_aq_set_link_restart_an(pi, false, NULL); if (result) { - dev_dbg(dev, - "Failed to set link down, VSI %d error %d\n", + dev_dbg(dev, "Failed to set link down, VSI %d error %d\n", vsi->vsi_num, result); return result; } @@ -893,15 +887,13 @@ static int ice_init_link_events(struct ice_port_info *pi) ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL)); if (ice_aq_set_event_mask(pi->hw, pi->lport, mask, NULL)) { - dev_dbg(ice_hw_to_dev(pi->hw), - "Failed to set link event mask for port %d\n", + dev_dbg(ice_hw_to_dev(pi->hw), "Failed to set link event mask for port %d\n", pi->lport); return -EIO; } if (ice_aq_get_link_info(pi, true, NULL, NULL)) { - dev_dbg(ice_hw_to_dev(pi->hw), - "Failed to enable link events for port %d\n", + dev_dbg(ice_hw_to_dev(pi->hw), "Failed to enable link events for port %d\n", pi->lport); return -EIO; } @@ -930,8 +922,8 @@ ice_handle_link_event(struct ice_pf *pf, struct ice_rq_event_info *event) !!(link_data->link_info & ICE_AQ_LINK_UP), le16_to_cpu(link_data->link_speed)); if (status) - dev_dbg(ice_pf_to_dev(pf), - "Could not process link event, error %d\n", status); + dev_dbg(ice_pf_to_dev(pf), "Could not process link event, error %d\n", + status); return status; } @@ -980,13 +972,11 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) dev_dbg(dev, "%s Receive Queue VF Error detected\n", qtype); if (val & PF_FW_ARQLEN_ARQOVFL_M) { - dev_dbg(dev, - "%s Receive Queue Overflow Error detected\n", + dev_dbg(dev, "%s Receive Queue Overflow Error detected\n", qtype); } if (val & PF_FW_ARQLEN_ARQCRIT_M) - dev_dbg(dev, - "%s Receive Queue Critical Error detected\n", + dev_dbg(dev, "%s Receive Queue Critical Error detected\n", qtype); val &= ~(PF_FW_ARQLEN_ARQVFE_M | PF_FW_ARQLEN_ARQOVFL_M | PF_FW_ARQLEN_ARQCRIT_M); @@ -999,8 +989,8 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) PF_FW_ATQLEN_ATQCRIT_M)) { oldval = val; if (val & PF_FW_ATQLEN_ATQVFE_M) - dev_dbg(dev, - "%s Send Queue VF Error detected\n", qtype); + dev_dbg(dev, "%s Send Queue VF Error detected\n", + qtype); if (val & PF_FW_ATQLEN_ATQOVFL_M) { dev_dbg(dev, "%s Send Queue Overflow Error detected\n", qtype); @@ -1049,8 +1039,7 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) ice_dcb_process_lldp_set_mib_change(pf, &event); break; default: - dev_dbg(dev, - "%s Receive Queue unknown event 0x%04x ignored\n", + dev_dbg(dev, "%s Receive Queue unknown event 0x%04x ignored\n", qtype, opcode); break; } @@ -1336,8 +1325,7 @@ static void ice_handle_mdd_event(struct ice_pf *pf) vf->num_mdd_events++; if (vf->num_mdd_events && vf->num_mdd_events <= ICE_MDD_EVENTS_THRESHOLD) - dev_info(dev, - "VF %d has had %llu MDD events since last boot, Admin might need to reload AVF driver with this number of events\n", + dev_info(dev, "VF %d has had %llu MDD events since last boot, Admin might need to reload AVF driver with this number of events\n", i, vf->num_mdd_events); } } @@ -1379,8 +1367,7 @@ static int ice_force_phys_link_state(struct ice_vsi *vsi, bool link_up) retcode = ice_aq_get_phy_caps(pi, false, ICE_AQC_REPORT_SW_CFG, pcaps, NULL); if (retcode) { - dev_err(dev, - "Failed to get phy capabilities, VSI %d error %d\n", + dev_err(dev, "Failed to get phy capabilities, VSI %d error %d\n", vsi->vsi_num, retcode); retcode = -EIO; goto out; @@ -1650,8 +1637,8 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename) err = devm_request_irq(dev, irq_num, vsi->irq_handler, 0, q_vector->name, q_vector); if (err) { - netdev_err(vsi->netdev, - "MSIX request_irq failed, error: %d\n", err); + netdev_err(vsi->netdev, "MSIX request_irq failed, error: %d\n", + err); goto free_q_irqs; } @@ -2763,8 +2750,7 @@ static int ice_ena_msix_range(struct ice_pf *pf) } if (v_actual < v_budget) { - dev_warn(dev, - "not enough OS MSI-X vectors. requested = %d, obtained = %d\n", + dev_warn(dev, "not enough OS MSI-X vectors. requested = %d, obtained = %d\n", v_budget, v_actual); /* 2 vectors for LAN (traffic + OICR) */ #define ICE_MIN_LAN_VECS 2 @@ -2786,8 +2772,7 @@ msix_err: goto exit_err; no_hw_vecs_left_err: - dev_err(dev, - "not enough device MSI-X vectors. requested = %d, available = %d\n", + dev_err(dev, "not enough device MSI-X vectors. requested = %d, available = %d\n", needed, v_left); err = -ERANGE; exit_err: @@ -2920,16 +2905,14 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) !memcmp(hw->pkg_name, hw->active_pkg_name, sizeof(hw->pkg_name))) { if (hw->pkg_dwnld_status == ICE_AQ_RC_EEXIST) - dev_info(dev, - "DDP package already present on device: %s version %d.%d.%d.%d\n", + dev_info(dev, "DDP package already present on device: %s version %d.%d.%d.%d\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, hw->active_pkg_ver.update, hw->active_pkg_ver.draft); else - dev_info(dev, - "The DDP package was successfully loaded: %s version %d.%d.%d.%d\n", + dev_info(dev, "The DDP package was successfully loaded: %s version %d.%d.%d.%d\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2937,8 +2920,7 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) hw->active_pkg_ver.draft); } else if (hw->active_pkg_ver.major != ICE_PKG_SUPP_VER_MAJ || hw->active_pkg_ver.minor != ICE_PKG_SUPP_VER_MNR) { - dev_err(dev, - "The device has a DDP package that is not supported by the driver. The device has package '%s' version %d.%d.x.x. The driver requires version %d.%d.x.x. Entering Safe Mode.\n", + dev_err(dev, "The device has a DDP package that is not supported by the driver. The device has package '%s' version %d.%d.x.x. The driver requires version %d.%d.x.x. Entering Safe Mode.\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2946,8 +2928,7 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) *status = ICE_ERR_NOT_SUPPORTED; } else if (hw->active_pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->active_pkg_ver.minor == ICE_PKG_SUPP_VER_MNR) { - dev_info(dev, - "The driver could not load the DDP package file because a compatible DDP package is already present on the device. The device has package '%s' version %d.%d.%d.%d. The package file found by the driver: '%s' version %d.%d.%d.%d.\n", + dev_info(dev, "The driver could not load the DDP package file because a compatible DDP package is already present on the device. The device has package '%s' version %d.%d.%d.%d. The package file found by the driver: '%s' version %d.%d.%d.%d.\n", hw->active_pkg_name, hw->active_pkg_ver.major, hw->active_pkg_ver.minor, @@ -2959,54 +2940,46 @@ ice_log_pkg_init(struct ice_hw *hw, enum ice_status *status) hw->pkg_ver.update, hw->pkg_ver.draft); } else { - dev_err(dev, - "An unknown error occurred when loading the DDP package, please reboot the system. If the problem persists, update the NVM. Entering Safe Mode.\n"); + dev_err(dev, "An unknown error occurred when loading the DDP package, please reboot the system. If the problem persists, update the NVM. Entering Safe Mode.\n"); *status = ICE_ERR_NOT_SUPPORTED; } break; case ICE_ERR_BUF_TOO_SHORT: /* fall-through */ case ICE_ERR_CFG: - dev_err(dev, - "The DDP package file is invalid. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file is invalid. Entering Safe Mode.\n"); break; case ICE_ERR_NOT_SUPPORTED: /* Package File version not supported */ if (hw->pkg_ver.major > ICE_PKG_SUPP_VER_MAJ || (hw->pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->pkg_ver.minor > ICE_PKG_SUPP_VER_MNR)) - dev_err(dev, - "The DDP package file version is higher than the driver supports. Please use an updated driver. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file version is higher than the driver supports. Please use an updated driver. Entering Safe Mode.\n"); else if (hw->pkg_ver.major < ICE_PKG_SUPP_VER_MAJ || (hw->pkg_ver.major == ICE_PKG_SUPP_VER_MAJ && hw->pkg_ver.minor < ICE_PKG_SUPP_VER_MNR)) - dev_err(dev, - "The DDP package file version is lower than the driver supports. The driver requires version %d.%d.x.x. Please use an updated DDP Package file. Entering Safe Mode.\n", + dev_err(dev, "The DDP package file version is lower than the driver supports. The driver requires version %d.%d.x.x. Please use an updated DDP Package file. Entering Safe Mode.\n", ICE_PKG_SUPP_VER_MAJ, ICE_PKG_SUPP_VER_MNR); break; case ICE_ERR_AQ_ERROR: switch (hw->pkg_dwnld_status) { case ICE_AQ_RC_ENOSEC: case ICE_AQ_RC_EBADSIG: - dev_err(dev, - "The DDP package could not be loaded because its signature is not valid. Please use a valid DDP Package. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package could not be loaded because its signature is not valid. Please use a valid DDP Package. Entering Safe Mode.\n"); return; case ICE_AQ_RC_ESVN: - dev_err(dev, - "The DDP Package could not be loaded because its security revision is too low. Please use an updated DDP Package. Entering Safe Mode.\n"); + dev_err(dev, "The DDP Package could not be loaded because its security revision is too low. Please use an updated DDP Package. Entering Safe Mode.\n"); return; case ICE_AQ_RC_EBADMAN: case ICE_AQ_RC_EBADBUF: - dev_err(dev, - "An error occurred on the device while loading the DDP package. The device will be reset.\n"); + dev_err(dev, "An error occurred on the device while loading the DDP package. The device will be reset.\n"); return; default: break; } /* fall-through */ default: - dev_err(dev, - "An unknown error (%d) occurred when loading the DDP package. Entering Safe Mode.\n", + dev_err(dev, "An unknown error (%d) occurred when loading the DDP package. Entering Safe Mode.\n", *status); break; } @@ -3037,8 +3010,7 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) status = ice_init_pkg(hw, hw->pkg_copy, hw->pkg_size); ice_log_pkg_init(hw, &status); } else { - dev_err(dev, - "The DDP package file failed to load. Entering Safe Mode.\n"); + dev_err(dev, "The DDP package file failed to load. Entering Safe Mode.\n"); } if (status) { @@ -3064,8 +3036,7 @@ ice_load_pkg(const struct firmware *firmware, struct ice_pf *pf) static void ice_verify_cacheline_size(struct ice_pf *pf) { if (rd32(&pf->hw, GLPCI_CNF2) & GLPCI_CNF2_CACHELINE_SIZE_M) - dev_warn(ice_pf_to_dev(pf), - "%d Byte cache line assumption is invalid, driver may have Tx timeouts!\n", + dev_warn(ice_pf_to_dev(pf), "%d Byte cache line assumption is invalid, driver may have Tx timeouts!\n", ICE_CACHE_LINE_BYTES); } @@ -3158,8 +3129,7 @@ static void ice_request_fw(struct ice_pf *pf) dflt_pkg_load: err = request_firmware(&firmware, ICE_DDP_PKG_FILE, dev); if (err) { - dev_err(dev, - "The DDP package file was not found or could not be read. Entering Safe Mode\n"); + dev_err(dev, "The DDP package file was not found or could not be read. Entering Safe Mode\n"); return; } @@ -3251,8 +3221,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) * true */ if (ice_is_safe_mode(pf)) { - dev_err(dev, - "Package download failed. Advanced features disabled - Device now in Safe Mode\n"); + dev_err(dev, "Package download failed. Advanced features disabled - Device now in Safe Mode\n"); /* we already got function/device capabilities but these don't * reflect what the driver needs to do in safe mode. Instead of * adding conditional logic everywhere to ignore these @@ -3329,8 +3298,7 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) /* tell the firmware we are up */ err = ice_send_version(pf); if (err) { - dev_err(dev, - "probe failed sending driver version %s. error: %d\n", + dev_err(dev, "probe failed sending driver version %s. error: %d\n", ice_drv_ver, err); goto err_alloc_sw_unroll; } @@ -3471,8 +3439,7 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) err = pci_enable_device_mem(pdev); if (err) { - dev_err(&pdev->dev, - "Cannot re-enable PCI device after reset, error %d\n", + dev_err(&pdev->dev, "Cannot re-enable PCI device after reset, error %d\n", err); result = PCI_ERS_RESULT_DISCONNECT; } else { @@ -3491,8 +3458,7 @@ static pci_ers_result_t ice_pci_err_slot_reset(struct pci_dev *pdev) err = pci_cleanup_aer_uncorrect_error_status(pdev); if (err) - dev_dbg(&pdev->dev, - "pci_cleanup_aer_uncorrect_error_status failed, error %d\n", + dev_dbg(&pdev->dev, "pci_cleanup_aer_uncorrect_error_status failed, error %d\n", err); /* non-fatal, continue */ @@ -3511,8 +3477,8 @@ static void ice_pci_err_resume(struct pci_dev *pdev) struct ice_pf *pf = pci_get_drvdata(pdev); if (!pf) { - dev_err(&pdev->dev, - "%s failed, device is unrecoverable\n", __func__); + dev_err(&pdev->dev, "%s failed, device is unrecoverable\n", + __func__); return; } @@ -3760,8 +3726,7 @@ ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate) /* Validate maxrate requested is within permitted range */ if (maxrate && (maxrate > (ICE_SCHED_MAX_BW / 1000))) { - netdev_err(netdev, - "Invalid max rate %d specified for the queue %d\n", + netdev_err(netdev, "Invalid max rate %d specified for the queue %d\n", maxrate, queue_index); return -EINVAL; } @@ -3777,8 +3742,8 @@ ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate) status = ice_cfg_q_bw_lmt(vsi->port_info, vsi->idx, tc, q_handle, ICE_MAX_BW, maxrate * 1000); if (status) { - netdev_err(netdev, - "Unable to set Tx max rate, error %d\n", status); + netdev_err(netdev, "Unable to set Tx max rate, error %d\n", + status); return -EIO; } @@ -3870,15 +3835,13 @@ ice_set_features(struct net_device *netdev, netdev_features_t features) /* Don't set any netdev advanced features with device in Safe Mode */ if (ice_is_safe_mode(vsi->back)) { - dev_err(ice_pf_to_dev(vsi->back), - "Device is in Safe Mode - not enabling advanced netdev features\n"); + dev_err(ice_pf_to_dev(vsi->back), "Device is in Safe Mode - not enabling advanced netdev features\n"); return ret; } /* Do not change setting during reset */ if (ice_is_reset_in_progress(pf->state)) { - dev_err(ice_pf_to_dev(vsi->back), - "Device is resetting, changing advanced netdev features temporarily unavailable.\n"); + dev_err(ice_pf_to_dev(vsi->back), "Device is resetting, changing advanced netdev features temporarily unavailable.\n"); return -EBUSY; } @@ -4366,21 +4329,18 @@ int ice_down(struct ice_vsi *vsi) tx_err = ice_vsi_stop_lan_tx_rings(vsi, ICE_NO_RESET, 0); if (tx_err) - netdev_err(vsi->netdev, - "Failed stop Tx rings, VSI %d error %d\n", + netdev_err(vsi->netdev, "Failed stop Tx rings, VSI %d error %d\n", vsi->vsi_num, tx_err); if (!tx_err && ice_is_xdp_ena_vsi(vsi)) { tx_err = ice_vsi_stop_xdp_tx_rings(vsi); if (tx_err) - netdev_err(vsi->netdev, - "Failed stop XDP rings, VSI %d error %d\n", + netdev_err(vsi->netdev, "Failed stop XDP rings, VSI %d error %d\n", vsi->vsi_num, tx_err); } rx_err = ice_vsi_stop_rx_rings(vsi); if (rx_err) - netdev_err(vsi->netdev, - "Failed stop Rx rings, VSI %d error %d\n", + netdev_err(vsi->netdev, "Failed stop Rx rings, VSI %d error %d\n", vsi->vsi_num, rx_err); ice_napi_disable_all(vsi); @@ -4388,8 +4348,7 @@ int ice_down(struct ice_vsi *vsi) if (test_bit(ICE_FLAG_LINK_DOWN_ON_CLOSE_ENA, vsi->back->flags)) { link_err = ice_force_phys_link_state(vsi, false); if (link_err) - netdev_err(vsi->netdev, - "Failed to set physical link down, VSI %d error %d\n", + netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", vsi->vsi_num, link_err); } @@ -4400,8 +4359,7 @@ int ice_down(struct ice_vsi *vsi) ice_clean_rx_ring(vsi->rx_rings[i]); if (tx_err || rx_err || link_err) { - netdev_err(vsi->netdev, - "Failed to close VSI 0x%04X on switch 0x%04X\n", + netdev_err(vsi->netdev, "Failed to close VSI 0x%04X on switch 0x%04X\n", vsi->vsi_num, vsi->vsw->sw_id); return -EIO; } @@ -4548,8 +4506,7 @@ static void ice_vsi_release_all(struct ice_pf *pf) err = ice_vsi_release(pf->vsi[i]); if (err) - dev_dbg(ice_pf_to_dev(pf), - "Failed to release pf->vsi[%d], err %d, vsi_num = %d\n", + dev_dbg(ice_pf_to_dev(pf), "Failed to release pf->vsi[%d], err %d, vsi_num = %d\n", i, err, pf->vsi[i]->vsi_num); } } @@ -4576,8 +4533,7 @@ static int ice_vsi_rebuild_by_type(struct ice_pf *pf, enum ice_vsi_type type) /* rebuild the VSI */ err = ice_vsi_rebuild(vsi, true); if (err) { - dev_err(dev, - "rebuild VSI failed, err %d, VSI index %d, type %s\n", + dev_err(dev, "rebuild VSI failed, err %d, VSI index %d, type %s\n", err, vsi->idx, ice_vsi_type_str(type)); return err; } @@ -4585,8 +4541,7 @@ static int ice_vsi_rebuild_by_type(struct ice_pf *pf, enum ice_vsi_type type) /* replay filters for the VSI */ status = ice_replay_vsi(&pf->hw, vsi->idx); if (status) { - dev_err(dev, - "replay VSI failed, status %d, VSI index %d, type %s\n", + dev_err(dev, "replay VSI failed, status %d, VSI index %d, type %s\n", status, vsi->idx, ice_vsi_type_str(type)); return -EIO; } @@ -4599,8 +4554,7 @@ static int ice_vsi_rebuild_by_type(struct ice_pf *pf, enum ice_vsi_type type) /* enable the VSI */ err = ice_ena_vsi(vsi, false); if (err) { - dev_err(dev, - "enable VSI failed, err %d, VSI index %d, type %s\n", + dev_err(dev, "enable VSI failed, err %d, VSI index %d, type %s\n", err, vsi->idx, ice_vsi_type_str(type)); return err; } @@ -4678,8 +4632,7 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) } if (pf->first_sw->dflt_vsi_ena) - dev_info(dev, - "Clearing default VSI, re-enable after reset completes\n"); + dev_info(dev, "Clearing default VSI, re-enable after reset completes\n"); /* clear the default VSI configuration if it exists */ pf->first_sw->dflt_vsi = NULL; pf->first_sw->dflt_vsi_ena = false; @@ -4730,8 +4683,7 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) /* tell the firmware we are up */ ret = ice_send_version(pf); if (ret) { - dev_err(dev, - "Rebuild failed due to error sending driver version: %d\n", + dev_err(dev, "Rebuild failed due to error sending driver version: %d\n", ret); goto err_vsi_rebuild; } @@ -5179,8 +5131,7 @@ int ice_open(struct net_device *netdev) if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) { err = ice_force_phys_link_state(vsi, true); if (err) { - netdev_err(netdev, - "Failed to set physical link up, error %d\n", + netdev_err(netdev, "Failed to set physical link up, error %d\n", err); return err; } diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 8f1a934a318c..eb60abd8394f 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -199,8 +199,7 @@ static void ice_dis_vf_mappings(struct ice_vf *vf) if (vsi->rx_mapping_mode == ICE_VSI_MAP_CONTIG) wr32(hw, VPLAN_RX_QBASE(vf->vf_id), 0); else - dev_err(dev, - "Scattered mode for VF Rx queues is not yet implemented\n"); + dev_err(dev, "Scattered mode for VF Rx queues is not yet implemented\n"); } /** @@ -402,8 +401,7 @@ static void ice_trigger_vf_reset(struct ice_vf *vf, bool is_vflr, bool is_pfr) if ((reg & VF_TRANS_PENDING_M) == 0) break; - dev_err(dev, - "VF %d PCI transactions stuck\n", vf->vf_id); + dev_err(dev, "VF %d PCI transactions stuck\n", vf->vf_id); udelay(ICE_PCI_CIAD_WAIT_DELAY_US); } } @@ -1553,8 +1551,7 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, dev_info(dev, "VF %d failed opcode %d, retval: %d\n", vf->vf_id, v_opcode, v_retval); if (vf->num_inval_msgs > ICE_DFLT_NUM_INVAL_MSGS_ALLOWED) { - dev_err(dev, - "Number of invalid messages exceeded for VF %d\n", + dev_err(dev, "Number of invalid messages exceeded for VF %d\n", vf->vf_id); dev_err(dev, "Use PF Control I/F to enable the VF\n"); set_bit(ICE_VF_STATE_DIS, vf->vf_states); @@ -1569,8 +1566,7 @@ ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, aq_ret = ice_aq_send_msg_to_vf(&pf->hw, vf->vf_id, v_opcode, v_retval, msg, msglen, NULL); if (aq_ret && pf->hw.mailboxq.sq_last_status != ICE_AQ_RC_ENOSYS) { - dev_info(dev, - "Unable to send the message to VF %d ret %d aq_err %d\n", + dev_info(dev, "Unable to send the message to VF %d ret %d aq_err %d\n", vf->vf_id, aq_ret, pf->hw.mailboxq.sq_last_status); return -EIO; } @@ -1914,8 +1910,7 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) } if (vf_vsi->type != ICE_VSI_VF) { - netdev_err(netdev, - "Type %d of VSI %d for VF %d is no ICE_VSI_VF\n", + netdev_err(netdev, "Type %d of VSI %d for VF %d is no ICE_VSI_VF\n", vf_vsi->type, vf_vsi->vsi_num, vf->vf_id); return -ENODEV; } @@ -1945,8 +1940,7 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) status = ice_update_vsi(&pf->hw, vf_vsi->idx, ctx, NULL); if (status) { - dev_err(dev, - "Failed to %sable spoofchk on VF %d VSI %d\n error %d", + dev_err(dev, "Failed to %sable spoofchk on VF %d VSI %d\n error %d", ena ? "en" : "dis", vf->vf_id, vf_vsi->vsi_num, status); ret = -EIO; goto out; @@ -2063,8 +2057,7 @@ static int ice_vc_ena_qs_msg(struct ice_vf *vf, u8 *msg) continue; if (ice_vsi_ctrl_rx_ring(vsi, true, vf_q_id)) { - dev_err(ice_pf_to_dev(vsi->back), - "Failed to enable Rx ring %d on VSI %d\n", + dev_err(ice_pf_to_dev(vsi->back), "Failed to enable Rx ring %d on VSI %d\n", vf_q_id, vsi->vsi_num); v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; @@ -2166,8 +2159,7 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) if (ice_vsi_stop_tx_ring(vsi, ICE_NO_RESET, vf->vf_id, ring, &txq_meta)) { - dev_err(ice_pf_to_dev(vsi->back), - "Failed to stop Tx ring %d on VSI %d\n", + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Tx ring %d on VSI %d\n", vf_q_id, vsi->vsi_num); v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; @@ -2193,8 +2185,7 @@ static int ice_vc_dis_qs_msg(struct ice_vf *vf, u8 *msg) continue; if (ice_vsi_ctrl_rx_ring(vsi, false, vf_q_id)) { - dev_err(ice_pf_to_dev(vsi->back), - "Failed to stop Rx ring %d on VSI %d\n", + dev_err(ice_pf_to_dev(vsi->back), "Failed to stop Rx ring %d on VSI %d\n", vf_q_id, vsi->vsi_num); v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; @@ -2357,8 +2348,7 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) if (qci->num_queue_pairs > ICE_MAX_BASE_QS_PER_VF || qci->num_queue_pairs > min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)) { - dev_err(ice_pf_to_dev(pf), - "VF-%d requesting more than supported number of queues: %d\n", + dev_err(ice_pf_to_dev(pf), "VF-%d requesting more than supported number of queues: %d\n", vf->vf_id, min_t(u16, vsi->alloc_txq, vsi->alloc_rxq)); v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto error_param; @@ -2570,8 +2560,7 @@ ice_vc_handle_mac_addr_msg(struct ice_vf *vf, u8 *msg, bool set) */ if (set && !ice_is_vf_trusted(vf) && (vf->num_mac + al->num_elements) > ICE_MAX_MACADDR_PER_VF) { - dev_err(ice_pf_to_dev(pf), - "Can't add more MAC addresses, because VF-%d is not trusted, switch the VF to trusted mode in order to add more functionalities\n", + dev_err(ice_pf_to_dev(pf), "Can't add more MAC addresses, because VF-%d is not trusted, switch the VF to trusted mode in order to add more functionalities\n", vf->vf_id); v_ret = VIRTCHNL_STATUS_ERR_PARAM; goto handle_mac_exit; @@ -2670,8 +2659,7 @@ static int ice_vc_request_qs_msg(struct ice_vf *vf, u8 *msg) vfres->num_queue_pairs = ICE_MAX_BASE_QS_PER_VF; } else if (req_queues > cur_queues && req_queues - cur_queues > tx_rx_queue_left) { - dev_warn(dev, - "VF %d requested %u more queues, but only %u left.\n", + dev_warn(dev, "VF %d requested %u more queues, but only %u left.\n", vf->vf_id, req_queues - cur_queues, tx_rx_queue_left); vfres->num_queue_pairs = min_t(u16, max_allowed_vf_queues, ICE_MAX_BASE_QS_PER_VF); @@ -2821,8 +2809,8 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) for (i = 0; i < vfl->num_elements; i++) { if (vfl->vlan_id[i] > ICE_MAX_VLANID) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(dev, - "invalid VF VLAN id %d\n", vfl->vlan_id[i]); + dev_err(dev, "invalid VF VLAN id %d\n", + vfl->vlan_id[i]); goto error_param; } } @@ -2836,8 +2824,7 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) if (add_v && !ice_is_vf_trusted(vf) && vsi->num_vlan >= ICE_MAX_VLAN_PER_VF) { - dev_info(dev, - "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", + dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", vf->vf_id); /* There is no need to let VF know about being not trusted, * so we can just return success message here @@ -2860,8 +2847,7 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) if (!ice_is_vf_trusted(vf) && vsi->num_vlan >= ICE_MAX_VLAN_PER_VF) { - dev_info(dev, - "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", + dev_info(dev, "VF-%d is not trusted, switch the VF to trusted mode, in order to add more VLAN addresses\n", vf->vf_id); /* There is no need to let VF know about being * not trusted, so we can just return success @@ -2889,8 +2875,7 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) status = ice_cfg_vlan_pruning(vsi, true, false); if (status) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(dev, - "Enable VLAN pruning on VLAN ID: %d failed error-%d\n", + dev_err(dev, "Enable VLAN pruning on VLAN ID: %d failed error-%d\n", vid, status); goto error_param; } @@ -2903,8 +2888,7 @@ static int ice_vc_process_vlan_msg(struct ice_vf *vf, u8 *msg, bool add_v) promisc_m, vid); if (status) { v_ret = VIRTCHNL_STATUS_ERR_PARAM; - dev_err(dev, - "Enable Unicast/multicast promiscuous mode on VLAN ID:%d failed error-%d\n", + dev_err(dev, "Enable Unicast/multicast promiscuous mode on VLAN ID:%d failed error-%d\n", vid, status); } } @@ -3140,8 +3124,7 @@ error_handler: case VIRTCHNL_OP_GET_VF_RESOURCES: err = ice_vc_get_vf_res_msg(vf, msg); if (ice_vf_init_vlan_stripping(vf)) - dev_err(dev, - "Failed to initialize VLAN stripping for VF %d\n", + dev_err(dev, "Failed to initialize VLAN stripping for VF %d\n", vf->vf_id); ice_vc_notify_vf_link_state(vf); break; @@ -3313,8 +3296,7 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) */ ether_addr_copy(vf->dflt_lan_addr.addr, mac); vf->pf_set_mac = true; - netdev_info(netdev, - "MAC on VF %d set to %pM. VF driver will be reinitialized\n", + netdev_info(netdev, "MAC on VF %d set to %pM. VF driver will be reinitialized\n", vf_id, mac); ice_vc_reset_vf(vf); @@ -3332,10 +3314,8 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) { struct ice_pf *pf = ice_netdev_to_pf(netdev); - struct device *dev; struct ice_vf *vf; - dev = ice_pf_to_dev(pf); if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; @@ -3358,7 +3338,7 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) vf->trusted = trusted; ice_vc_reset_vf(vf); - dev_info(dev, "VF %u is now %strusted\n", + dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n", vf_id, trusted ? "" : "un"); return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 149dca0012ba..4d3407bbd4c4 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -338,8 +338,8 @@ static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem) DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR); if (dma_mapping_error(dev, dma)) { - dev_dbg(dev, - "XSK UMEM DMA mapping error on page num %d", i); + dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n", + i); goto out_unmap; } -- cgit v1.2.3 From 3306f79f428e858a80d5eadb974c80a1faa399f3 Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Thu, 6 Feb 2020 01:20:11 -0800 Subject: ice: Cleanup ice_vsi_alloc_q_vectors 1. Remove local variable num_q_vectors and use vsi->num_q_vectors instead 2. Remove local variable pf and pass vsi->back to ice_pf_to_dev Signed-off-by: Anirudh Venkataramanan Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_base.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 46f427a95056..81885efadc7a 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -503,20 +503,15 @@ int ice_vsi_ctrl_rx_ring(struct ice_vsi *vsi, bool ena, u16 rxq_idx) */ int ice_vsi_alloc_q_vectors(struct ice_vsi *vsi) { - struct ice_pf *pf = vsi->back; - int v_idx = 0, num_q_vectors; - struct device *dev; - int err; + struct device *dev = ice_pf_to_dev(vsi->back); + int v_idx, err; - dev = ice_pf_to_dev(pf); if (vsi->q_vectors[0]) { dev_dbg(dev, "VSI %d has existing q_vectors\n", vsi->vsi_num); return -EEXIST; } - num_q_vectors = vsi->num_q_vectors; - - for (v_idx = 0; v_idx < num_q_vectors; v_idx++) { + for (v_idx = 0; v_idx < vsi->num_q_vectors; v_idx++) { err = ice_vsi_alloc_q_vector(vsi, v_idx); if (err) goto err_out; -- cgit v1.2.3 From 1d8bd9927234081db15a1d42a7f99505244e3703 Mon Sep 17 00:00:00 2001 From: Ben Shelton Date: Thu, 6 Feb 2020 01:20:12 -0800 Subject: ice: Use correct netif error function Use the correct netif_msg_[tx,rx]_error() function to determine whether to print the MDD event type. Signed-off-by: Ben Shelton Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index db7e8069f37b..49f4f5eb90d1 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1228,7 +1228,7 @@ static void ice_handle_mdd_event(struct ice_pf *pf) u16 queue = ((reg & GL_MDET_TX_TCLAN_QNUM_M) >> GL_MDET_TX_TCLAN_QNUM_S); - if (netif_msg_rx_err(pf)) + if (netif_msg_tx_err(pf)) dev_info(dev, "Malicious Driver Detection event %d on TX queue %d PF# %d VF# %d\n", event, queue, pf_num, vf_num); wr32(hw, GL_MDET_TX_TCLAN, 0xffffffff); -- cgit v1.2.3 From 4ee656bba8013929bcc050bcebc39a47fe763ee9 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Thu, 6 Feb 2020 01:20:13 -0800 Subject: ice: Trivial fixes This is a collection of trivial fixes including fixing whitespace, typos, function headers, reverse Christmas tree, etc. Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 1 + drivers/net/ethernet/intel/ice/ice_common.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_dcb.c | 8 ++++---- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_ethtool.c | 14 +++++++------- drivers/net/ethernet/intel/ice/ice_lib.c | 7 ++++--- drivers/net/ethernet/intel/ice/ice_main.c | 4 +++- drivers/net/ethernet/intel/ice/ice_txrx.c | 9 ++++----- drivers/net/ethernet/intel/ice/ice_txrx.h | 4 ++-- drivers/net/ethernet/intel/ice/ice_type.h | 2 +- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 3 +-- 11 files changed, 31 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 4459bc564b11..6873998cf145 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -1660,6 +1660,7 @@ struct ice_aqc_get_pkg_info_resp { __le32 count; struct ice_aqc_get_pkg_info pkg_info[1]; }; + /** * struct ice_aq_desc - Admin Queue (AQ) descriptor * @flags: ICE_AQ_FLAG_* flags diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index a46d51357650..04d5db0a25bf 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -588,10 +588,10 @@ void ice_output_fw_log(struct ice_hw *hw, struct ice_aq_desc *desc, void *buf) } /** - * ice_get_itr_intrl_gran - determine int/intrl granularity + * ice_get_itr_intrl_gran * @hw: pointer to the HW struct * - * Determines the ITR/intrl granularities based on the maximum aggregate + * Determines the ITR/INTRL granularities based on the maximum aggregate * bandwidth according to the device's configuration during power-on. */ static void ice_get_itr_intrl_gran(struct ice_hw *hw) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb.c b/drivers/net/ethernet/intel/ice/ice_dcb.c index 713e8a892e14..adb8dab765c8 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb.c @@ -1323,13 +1323,13 @@ enum ice_status ice_set_dcb_cfg(struct ice_port_info *pi) } /** - * ice_aq_query_port_ets - query port ets configuration + * ice_aq_query_port_ets - query port ETS configuration * @pi: port information structure * @buf: pointer to buffer * @buf_size: buffer size in bytes * @cd: pointer to command details structure or NULL * - * query current port ets configuration + * query current port ETS configuration */ static enum ice_status ice_aq_query_port_ets(struct ice_port_info *pi, @@ -1416,13 +1416,13 @@ ice_update_port_tc_tree_cfg(struct ice_port_info *pi, } /** - * ice_query_port_ets - query port ets configuration + * ice_query_port_ets - query port ETS configuration * @pi: port information structure * @buf: pointer to buffer * @buf_size: buffer size in bytes * @cd: pointer to command details structure or NULL * - * query current port ets configuration and update the + * query current port ETS configuration and update the * SW DB with the TC changes */ enum ice_status diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index adc69fe1bd64..7108fb41b604 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -412,9 +412,9 @@ static int ice_dcb_init_cfg(struct ice_pf *pf, bool locked) } /** - * ice_dcb_sw_default_config - Apply a default DCB config + * ice_dcb_sw_dflt_cfg - Apply a default DCB config * @pf: PF to apply config to - * @ets_willing: configure ets willing + * @ets_willing: configure ETS willing * @locked: was this function called with RTNL held */ static int ice_dcb_sw_dflt_cfg(struct ice_pf *pf, bool ets_willing, bool locked) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 4b29d1ae56a7..b002ab4e5838 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3800,11 +3800,11 @@ ice_get_module_eeprom(struct net_device *netdev, static const struct ethtool_ops ice_ethtool_ops = { .get_link_ksettings = ice_get_link_ksettings, .set_link_ksettings = ice_set_link_ksettings, - .get_drvinfo = ice_get_drvinfo, - .get_regs_len = ice_get_regs_len, - .get_regs = ice_get_regs, - .get_msglevel = ice_get_msglevel, - .set_msglevel = ice_set_msglevel, + .get_drvinfo = ice_get_drvinfo, + .get_regs_len = ice_get_regs_len, + .get_regs = ice_get_regs, + .get_msglevel = ice_get_msglevel, + .set_msglevel = ice_set_msglevel, .self_test = ice_self_test, .get_link = ethtool_op_get_link, .get_eeprom_len = ice_get_eeprom_len, @@ -3831,8 +3831,8 @@ static const struct ethtool_ops ice_ethtool_ops = { .get_channels = ice_get_channels, .set_channels = ice_set_channels, .get_ts_info = ethtool_op_get_ts_info, - .get_per_queue_coalesce = ice_get_per_q_coalesce, - .set_per_queue_coalesce = ice_set_per_q_coalesce, + .get_per_queue_coalesce = ice_get_per_q_coalesce, + .set_per_queue_coalesce = ice_set_per_q_coalesce, .get_fecparam = ice_get_fecparam, .set_fecparam = ice_set_fecparam, .get_module_info = ice_get_module_info, diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 4f5116554dcc..d974e2fa3e63 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -1230,8 +1230,9 @@ static void ice_vsi_set_rss_flow_fld(struct ice_vsi *vsi) * * Returns 0 on success or ENOMEM on failure. */ -int ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list, - const u8 *macaddr) +int +ice_add_mac_to_list(struct ice_vsi *vsi, struct list_head *add_list, + const u8 *macaddr) { struct ice_fltr_list_entry *tmp; struct ice_pf *pf = vsi->back; @@ -2824,8 +2825,8 @@ static void ice_vsi_update_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctx) int ice_vsi_cfg_tc(struct ice_vsi *vsi, u8 ena_tc) { u16 max_txqs[ICE_MAX_TRAFFIC_CLASS] = { 0 }; - struct ice_vsi_ctx *ctx; struct ice_pf *pf = vsi->back; + struct ice_vsi_ctx *ctx; enum ice_status status; struct device *dev; int i, ret = 0; diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 49f4f5eb90d1..5ef28052c0f8 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -3153,7 +3153,9 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) struct ice_hw *hw; int err; - /* this driver uses devres, see Documentation/driver-api/driver-model/devres.rst */ + /* this driver uses devres, see + * Documentation/driver-api/driver-model/devres.rst + */ err = pcim_enable_device(pdev); if (err) return err; diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 1d4755acca3d..4de61dbedd36 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -644,7 +644,7 @@ static bool ice_page_is_reserved(struct page *page) * Update the offset within page so that Rx buf will be ready to be reused. * For systems with PAGE_SIZE < 8192 this function will flip the page offset * so the second half of page assigned to Rx buffer will be used, otherwise - * the offset is moved by the @size bytes + * the offset is moved by "size" bytes */ static void ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) @@ -1619,11 +1619,11 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, { u64 td_offset, td_tag, td_cmd; u16 i = tx_ring->next_to_use; - skb_frag_t *frag; unsigned int data_len, size; struct ice_tx_desc *tx_desc; struct ice_tx_buf *tx_buf; struct sk_buff *skb; + skb_frag_t *frag; dma_addr_t dma; td_tag = off->td_l2tag1; @@ -1736,9 +1736,8 @@ ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first, ice_maybe_stop_tx(tx_ring, DESC_NEEDED); /* notify HW of packet */ - if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) { + if (netif_xmit_stopped(txring_txq(tx_ring)) || !netdev_xmit_more()) writel(i, tx_ring->tail); - } return; @@ -2076,7 +2075,7 @@ static bool __ice_chk_linearize(struct sk_buff *skb) frag = &skb_shinfo(skb)->frags[0]; /* Initialize size to the negative value of gso_size minus 1. We - * use this as the worst case scenerio in which the frag ahead + * use this as the worst case scenario in which the frag ahead * of us only provides one byte which is why we are limited to 6 * descriptors for a single transmit as the header and previous * fragment are already consuming 2 descriptors. diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index a86270696df1..14a1bf445889 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -33,8 +33,8 @@ * frame. * * Note: For cache line sizes 256 or larger this value is going to end - * up negative. In these cases we should fall back to the legacy - * receive path. + * up negative. In these cases we should fall back to the legacy + * receive path. */ #if (PAGE_SIZE < 8192) #define ICE_2K_TOO_SMALL_WITH_PADDING \ diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index b361ffabb0ca..db0ef6ba907f 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -517,7 +517,7 @@ struct ice_hw { struct ice_fw_log_cfg fw_log; /* Device max aggregate bandwidths corresponding to the GL_PWR_MODE_CTL - * register. Used for determining the ITR/intrl granularity during + * register. Used for determining the ITR/INTRL granularity during * initialization. */ #define ICE_MAX_AGG_BW_200G 0x0 diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index eb60abd8394f..262714d5f54a 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1093,7 +1093,6 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) * finished resetting. */ for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) { - /* Check each VF in sequence */ while (v < pf->num_alloc_vfs) { u32 reg; @@ -2637,8 +2636,8 @@ static int ice_vc_request_qs_msg(struct ice_vf *vf, u8 *msg) struct ice_pf *pf = vf->pf; u16 max_allowed_vf_queues; u16 tx_rx_queue_left; - u16 cur_queues; struct device *dev; + u16 cur_queues; dev = ice_pf_to_dev(pf); if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { -- cgit v1.2.3 From fe154a2422333c4cd87bb0d8a19fb2df066cc6ee Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Mon, 3 Feb 2020 15:27:24 +0000 Subject: drm/panfrost: Remove set but not used variable 'bo' Fixes gcc '-Wunused-but-set-variable' warning: drivers/gpu/drm/panfrost/panfrost_job.c: In function 'panfrost_job_cleanup': drivers/gpu/drm/panfrost/panfrost_job.c:278:31: warning: variable 'bo' set but not used [-Wunused-but-set-variable] commit bdefca2d8dc0 ("drm/panfrost: Add the panfrost_gem_mapping concept") involved this unused variable. Reported-by: Hulk Robot Signed-off-by: YueHaibing Reviewed-by: Steven Price Reviewed-by: Alyssas Rosenzweig Signed-off-by: Rob Herring Link: https://patchwork.freedesktop.org/patch/msgid/20200203152724.42611-1-yuehaibing@huawei.com --- drivers/gpu/drm/panfrost/panfrost_job.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index 7157dfd7dea3..9a1a72a748e7 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -280,12 +280,8 @@ static void panfrost_job_cleanup(struct kref *ref) } if (job->bos) { - struct panfrost_gem_object *bo; - - for (i = 0; i < job->bo_count; i++) { - bo = to_panfrost_bo(job->bos[i]); + for (i = 0; i < job->bo_count; i++) drm_gem_object_put_unlocked(job->bos[i]); - } kvfree(job->bos); } -- cgit v1.2.3 From 6cd1ed50efd88261298577cd92a14f2768eddeeb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Feb 2020 11:07:21 -0800 Subject: vt: vt_ioctl: fix race in VT_RESIZEX We need to make sure vc_cons[i].d is not NULL after grabbing console_lock(), or risk a crash. general protection fault, probably for non-canonical address 0xdffffc0000000068: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000340-0x0000000000000347] CPU: 1 PID: 19462 Comm: syz-executor.5 Not tainted 5.5.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:vt_ioctl+0x1f96/0x26d0 drivers/tty/vt/vt_ioctl.c:883 Code: 74 41 e8 bd a6 84 fd 48 89 d8 48 c1 e8 03 42 80 3c 28 00 0f 85 e4 04 00 00 48 8b 03 48 8d b8 40 03 00 00 48 89 fa 48 c1 ea 03 <42> 0f b6 14 2a 84 d2 74 09 80 fa 03 0f 8e b1 05 00 00 44 89 b8 40 RSP: 0018:ffffc900086d7bb0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffffffff8c34ee88 RCX: ffffc9001415c000 RDX: 0000000000000068 RSI: ffffffff83f0e6e3 RDI: 0000000000000340 RBP: ffffc900086d7cd0 R08: ffff888054ce0100 R09: fffffbfff16a2f6d R10: ffff888054ce0998 R11: ffff888054ce0100 R12: 000000000000001d R13: dffffc0000000000 R14: 1ffff920010daf79 R15: 000000000000ff7f FS: 00007f7d13c12700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd477e3c38 CR3: 0000000095d0a000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tty_ioctl+0xa37/0x14f0 drivers/tty/tty_io.c:2660 vfs_ioctl fs/ioctl.c:47 [inline] ksys_ioctl+0x123/0x180 fs/ioctl.c:763 __do_sys_ioctl fs/ioctl.c:772 [inline] __se_sys_ioctl fs/ioctl.c:770 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:770 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x45b399 Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f7d13c11c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f7d13c126d4 RCX: 000000000045b399 RDX: 0000000020000080 RSI: 000000000000560a RDI: 0000000000000003 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000000666 R14: 00000000004c7f04 R15: 000000000075bf2c Modules linked in: ---[ end trace 80970faf7a67eb77 ]--- RIP: 0010:vt_ioctl+0x1f96/0x26d0 drivers/tty/vt/vt_ioctl.c:883 Code: 74 41 e8 bd a6 84 fd 48 89 d8 48 c1 e8 03 42 80 3c 28 00 0f 85 e4 04 00 00 48 8b 03 48 8d b8 40 03 00 00 48 89 fa 48 c1 ea 03 <42> 0f b6 14 2a 84 d2 74 09 80 fa 03 0f 8e b1 05 00 00 44 89 b8 40 RSP: 0018:ffffc900086d7bb0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffffffff8c34ee88 RCX: ffffc9001415c000 RDX: 0000000000000068 RSI: ffffffff83f0e6e3 RDI: 0000000000000340 RBP: ffffc900086d7cd0 R08: ffff888054ce0100 R09: fffffbfff16a2f6d R10: ffff888054ce0998 R11: ffff888054ce0100 R12: 000000000000001d R13: dffffc0000000000 R14: 1ffff920010daf79 R15: 000000000000ff7f FS: 00007f7d13c12700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd477e3c38 CR3: 0000000095d0a000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Cc: stable Reported-by: syzbot Link: https://lore.kernel.org/r/20200210190721.200418-1-edumazet@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt_ioctl.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index 8b0ed139592f..ee6c91ef1f6c 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -876,15 +876,20 @@ int vt_ioctl(struct tty_struct *tty, return -EINVAL; for (i = 0; i < MAX_NR_CONSOLES; i++) { + struct vc_data *vcp; + if (!vc_cons[i].d) continue; console_lock(); - if (v.v_vlin) - vc_cons[i].d->vc_scan_lines = v.v_vlin; - if (v.v_clin) - vc_cons[i].d->vc_font.height = v.v_clin; - vc_cons[i].d->vc_resize_user = 1; - vc_resize(vc_cons[i].d, v.v_cols, v.v_rows); + vcp = vc_cons[i].d; + if (vcp) { + if (v.v_vlin) + vcp->vc_scan_lines = v.v_vlin; + if (v.v_clin) + vcp->vc_font.height = v.v_clin; + vcp->vc_resize_user = 1; + vc_resize(vcp, v.v_cols, v.v_rows); + } console_unlock(); } break; -- cgit v1.2.3 From 0bf999f9c5e74c7ecf9dafb527146601e5c848b9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 9 Feb 2020 19:36:14 -0800 Subject: linux/pipe_fs_i.h: fix kernel-doc warnings after @wait was split Fix kernel-doc warnings in struct pipe_inode_info after @wait was split into @rd_wait and @wr_wait. include/linux/pipe_fs_i.h:66: warning: Function parameter or member 'rd_wait' not described in 'pipe_inode_info' include/linux/pipe_fs_i.h:66: warning: Function parameter or member 'wr_wait' not described in 'pipe_inode_info' Fixes: 0ddad21d3e99 ("pipe: use exclusive waits when reading or writing") Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index d5765039652a..ae58fad7f1e0 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -29,7 +29,8 @@ struct pipe_buffer { /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing - * @wait: reader/writer wait point in case of empty/full pipe + * @rd_wait: reader wait point in case of empty pipe + * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption * @max_usage: The maximum number of slots that may be used in the ring -- cgit v1.2.3 From f76707831829530ffdd3888bebc108aecefccaa0 Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Tue, 11 Feb 2020 14:16:01 +0800 Subject: tty: serial: imx: setup the correct sg entry for tx dma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There has oops as below happen on i.MX8MP EVK platform that has 6G bytes DDR memory. when (xmit->tail < xmit->head) && (xmit->head == 0), it setups one sg entry with sg->length is zero: sg_set_buf(sgl + 1, xmit->buf, xmit->head); if xmit->buf is allocated from >4G address space, and SDMA only support <4G address space, then dma_map_sg() will call swiotlb_map() to do bounce buffer copying and mapping. But swiotlb_map() don't allow sg entry's length is zero, otherwise report BUG_ON(). So the patch is to correct the tx DMA scatter list. Oops: [ 287.675715] kernel BUG at kernel/dma/swiotlb.c:497! [ 287.680592] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 287.686075] Modules linked in: [ 287.689133] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.3-00016-g3fdc4e0-dirty #10 [ 287.696872] Hardware name: FSL i.MX8MP EVK (DT) [ 287.701402] pstate: 80000085 (Nzcv daIf -PAN -UAO) [ 287.706199] pc : swiotlb_tbl_map_single+0x1fc/0x310 [ 287.711076] lr : swiotlb_map+0x60/0x148 [ 287.714909] sp : ffff800010003c00 [ 287.718221] x29: ffff800010003c00 x28: 0000000000000000 [ 287.723533] x27: 0000000000000040 x26: ffff800011ae0000 [ 287.728844] x25: ffff800011ae09f8 x24: 0000000000000000 [ 287.734155] x23: 00000001b7af9000 x22: 0000000000000000 [ 287.739465] x21: ffff000176409c10 x20: 00000000001f7ffe [ 287.744776] x19: ffff000176409c10 x18: 000000000000002e [ 287.750087] x17: 0000000000000000 x16: 0000000000000000 [ 287.755397] x15: 0000000000000000 x14: 0000000000000000 [ 287.760707] x13: ffff00017f334000 x12: 0000000000000001 [ 287.766018] x11: 00000000001fffff x10: 0000000000000000 [ 287.771328] x9 : 0000000000000003 x8 : 0000000000000000 [ 287.776638] x7 : 0000000000000000 x6 : 0000000000000000 [ 287.781949] x5 : 0000000000200000 x4 : 0000000000000000 [ 287.787259] x3 : 0000000000000001 x2 : 00000001b7af9000 [ 287.792570] x1 : 00000000fbfff000 x0 : 0000000000000000 [ 287.797881] Call trace: [ 287.800328] swiotlb_tbl_map_single+0x1fc/0x310 [ 287.804859] swiotlb_map+0x60/0x148 [ 287.808347] dma_direct_map_page+0xf0/0x130 [ 287.812530] dma_direct_map_sg+0x78/0xe0 [ 287.816453] imx_uart_dma_tx+0x134/0x2f8 [ 287.820374] imx_uart_dma_tx_callback+0xd8/0x168 [ 287.824992] vchan_complete+0x194/0x200 [ 287.828828] tasklet_action_common.isra.0+0x154/0x1a0 [ 287.833879] tasklet_action+0x24/0x30 [ 287.837540] __do_softirq+0x120/0x23c [ 287.841202] irq_exit+0xb8/0xd8 [ 287.844343] __handle_domain_irq+0x64/0xb8 [ 287.848438] gic_handle_irq+0x5c/0x148 [ 287.852185] el1_irq+0xb8/0x180 [ 287.855327] cpuidle_enter_state+0x84/0x360 [ 287.859508] cpuidle_enter+0x34/0x48 [ 287.863083] call_cpuidle+0x18/0x38 [ 287.866571] do_idle+0x1e0/0x280 [ 287.869798] cpu_startup_entry+0x20/0x40 [ 287.873721] rest_init+0xd4/0xe0 [ 287.876949] arch_call_rest_init+0xc/0x14 [ 287.880958] start_kernel+0x420/0x44c [ 287.884622] Code: 9124c021 9417aff8 a94363f7 17ffffd5 (d4210000) [ 287.890718] ---[ end trace 5bc44c4ab6b009ce ]--- [ 287.895334] Kernel panic - not syncing: Fatal exception in interrupt [ 287.901686] SMP: stopping secondary CPUs [ 288.905607] SMP: failed to stop secondary CPUs 0-1 [ 288.910395] Kernel Offset: disabled [ 288.913882] CPU features: 0x0002,2000200c [ 288.917888] Memory Limit: none [ 288.920944] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Reported-by: Eagle Zhou Tested-by: Eagle Zhou Signed-off-by: Fugang Duan Cc: stable Fixes: 7942f8577f2a ("serial: imx: TX DMA: clean up sg initialization") Reviewed-by: Uwe Kleine-König Link: https://lore.kernel.org/r/1581401761-6378-1-git-send-email-fugang.duan@nxp.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 0c6c63166250..d337782b3648 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -599,7 +599,7 @@ static void imx_uart_dma_tx(struct imx_port *sport) sport->tx_bytes = uart_circ_chars_pending(xmit); - if (xmit->tail < xmit->head) { + if (xmit->tail < xmit->head || xmit->head == 0) { sport->dma_tx_nents = 1; sg_init_one(sgl, xmit->buf + xmit->tail, sport->tx_bytes); } else { -- cgit v1.2.3 From 7febbcbc48fc92e3f33863b32ed715ba4aff18c4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 11 Feb 2020 15:55:59 +0200 Subject: serial: 8250: Check UPF_IRQ_SHARED in advance The commit 54e53b2e8081 ("tty: serial: 8250: pass IRQ shared flag to UART ports") nicely explained the problem: ---8<---8<--- On some systems IRQ lines between multiple UARTs might be shared. If so, the irqflags have to be configured accordingly. The reason is: The 8250 port startup code performs IRQ tests *before* the IRQ handler for that particular port is registered. This is performed in serial8250_do_startup(). This function checks whether IRQF_SHARED is configured and only then disables the IRQ line while testing. This test is performed upon each open() of the UART device. Imagine two UARTs share the same IRQ line: On is already opened and the IRQ is active. When the second UART is opened, the IRQ line has to be disabled while performing IRQ tests. Otherwise an IRQ might handler might be invoked, but the IRQ itself cannot be handled, because the corresponding handler isn't registered, yet. That's because the 8250 code uses a chain-handler and invokes the corresponding port's IRQ handling routines himself. Unfortunately this IRQF_SHARED flag isn't configured for UARTs probed via device tree even if the IRQs are shared. This way, the actual and shared IRQ line isn't disabled while performing tests and the kernel correctly detects a spurious IRQ. So, adding this flag to the DT probe solves the issue. Note: The UPF_SHARE_IRQ flag is configured unconditionally. Therefore, the IRQF_SHARED flag can be set unconditionally as well. Example stack trace by performing `echo 1 > /dev/ttyS2` on a non-patched system: |irq 85: nobody cared (try booting with the "irqpoll" option) | [...] |handlers: |[] irq_default_primary_handler threaded [] serial8250_interrupt |Disabling IRQ #85 ---8<---8<--- But unfortunately didn't fix the root cause. Let's try again here by moving IRQ flag assignment from serial_link_irq_chain() to serial8250_do_startup(). This should fix the similar issue reported for 8250_pnp case. Since this change we don't need to have custom solutions in 8250_aspeed_vuart and 8250_of drivers, thus, drop them. Fixes: 1c2f04937b3e ("serial: 8250: add IRQ trigger support") Reported-by: Li RongQing Cc: Kurt Kanzenbach Cc: Vikram Pandita Signed-off-by: Andy Shevchenko Cc: stable Acked-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20200211135559.85960-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_aspeed_vuart.c | 1 - drivers/tty/serial/8250/8250_core.c | 5 ++--- drivers/tty/serial/8250/8250_of.c | 1 - drivers/tty/serial/8250/8250_port.c | 4 ++++ 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c index d657aa14c3e4..c33e02cbde93 100644 --- a/drivers/tty/serial/8250/8250_aspeed_vuart.c +++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c @@ -446,7 +446,6 @@ static int aspeed_vuart_probe(struct platform_device *pdev) port.port.line = rc; port.port.irq = irq_of_parse_and_map(np, 0); - port.port.irqflags = IRQF_SHARED; port.port.handle_irq = aspeed_vuart_handle_irq; port.port.iotype = UPIO_MEM; port.port.type = PORT_16550A; diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 0894a22fd702..f2a33c9082a6 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -174,7 +174,7 @@ static int serial_link_irq_chain(struct uart_8250_port *up) struct hlist_head *h; struct hlist_node *n; struct irq_info *i; - int ret, irq_flags = up->port.flags & UPF_SHARE_IRQ ? IRQF_SHARED : 0; + int ret; mutex_lock(&hash_mutex); @@ -209,9 +209,8 @@ static int serial_link_irq_chain(struct uart_8250_port *up) INIT_LIST_HEAD(&up->list); i->head = &up->list; spin_unlock_irq(&i->lock); - irq_flags |= up->port.irqflags; ret = request_irq(up->port.irq, serial8250_interrupt, - irq_flags, up->port.name, i); + up->port.irqflags, up->port.name, i); if (ret < 0) serial_do_unlink(i, up); } diff --git a/drivers/tty/serial/8250/8250_of.c b/drivers/tty/serial/8250/8250_of.c index 531ad67395e0..f6687756ec5e 100644 --- a/drivers/tty/serial/8250/8250_of.c +++ b/drivers/tty/serial/8250/8250_of.c @@ -202,7 +202,6 @@ static int of_platform_serial_setup(struct platform_device *ofdev, port->type = type; port->uartclk = clk; - port->irqflags |= IRQF_SHARED; if (of_property_read_bool(np, "no-loopback-test")) port->flags |= UPF_SKIP_TEST; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 430e3467aff7..0325f2e53b74 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -2177,6 +2177,10 @@ int serial8250_do_startup(struct uart_port *port) } } + /* Check if we need to have shared IRQs */ + if (port->irq && (up->port.flags & UPF_SHARE_IRQ)) + up->port.irqflags |= IRQF_SHARED; + if (port->irq && !(up->port.flags & UPF_NO_THRE_TEST)) { unsigned char iir1; /* -- cgit v1.2.3 From 679aac5ead2f18d223554a52b543e1195e181811 Mon Sep 17 00:00:00 2001 From: satya priya Date: Tue, 11 Feb 2020 15:43:02 +0530 Subject: tty: serial: qcom_geni_serial: Fix RX cancel command failure RX cancel command fails when BT is switched on and off multiple times. To handle this, poll for the cancel bit in SE_GENI_S_IRQ_STATUS register instead of SE_GENI_S_CMD_CTRL_REG. As per the HPG update, handle the RX last bit after cancel command and flush out the RX FIFO buffer. Signed-off-by: satya priya Cc: stable Link: https://lore.kernel.org/r/1581415982-8793-1-git-send-email-skakit@codeaurora.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/qcom_geni_serial.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c index 191abb18fc2a..0bd1684cabb3 100644 --- a/drivers/tty/serial/qcom_geni_serial.c +++ b/drivers/tty/serial/qcom_geni_serial.c @@ -129,6 +129,7 @@ static int handle_rx_console(struct uart_port *uport, u32 bytes, bool drop); static int handle_rx_uart(struct uart_port *uport, u32 bytes, bool drop); static unsigned int qcom_geni_serial_tx_empty(struct uart_port *port); static void qcom_geni_serial_stop_rx(struct uart_port *uport); +static void qcom_geni_serial_handle_rx(struct uart_port *uport, bool drop); static const unsigned long root_freq[] = {7372800, 14745600, 19200000, 29491200, 32000000, 48000000, 64000000, 80000000, @@ -599,7 +600,7 @@ static void qcom_geni_serial_stop_rx(struct uart_port *uport) u32 irq_en; u32 status; struct qcom_geni_serial_port *port = to_dev_port(uport, uport); - u32 irq_clear = S_CMD_DONE_EN; + u32 s_irq_status; irq_en = readl(uport->membase + SE_GENI_S_IRQ_EN); irq_en &= ~(S_RX_FIFO_WATERMARK_EN | S_RX_FIFO_LAST_EN); @@ -615,10 +616,19 @@ static void qcom_geni_serial_stop_rx(struct uart_port *uport) return; geni_se_cancel_s_cmd(&port->se); - qcom_geni_serial_poll_bit(uport, SE_GENI_S_CMD_CTRL_REG, - S_GENI_CMD_CANCEL, false); + qcom_geni_serial_poll_bit(uport, SE_GENI_S_IRQ_STATUS, + S_CMD_CANCEL_EN, true); + /* + * If timeout occurs secondary engine remains active + * and Abort sequence is executed. + */ + s_irq_status = readl(uport->membase + SE_GENI_S_IRQ_STATUS); + /* Flush the Rx buffer */ + if (s_irq_status & S_RX_FIFO_LAST_EN) + qcom_geni_serial_handle_rx(uport, true); + writel(s_irq_status, uport->membase + SE_GENI_S_IRQ_CLEAR); + status = readl(uport->membase + SE_GENI_STATUS); - writel(irq_clear, uport->membase + SE_GENI_S_IRQ_CLEAR); if (status & S_GENI_CMD_ACTIVE) qcom_geni_serial_abort_rx(uport); } -- cgit v1.2.3 From dde2bb2da01e96c17f0a44b4a3cf72a30e66e3ef Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 6 Feb 2020 15:13:27 +0100 Subject: drm/panfrost: perfcnt: Reserve/use the AS attached to the perfcnt MMU context We need to use the AS attached to the opened FD when dumping counters. Reported-by: Antonio Caggiano Fixes: 7282f7645d06 ("drm/panfrost: Implement per FD address spaces") Cc: Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Tested-by: Antonio Caggiano Signed-off-by: Rob Herring Link: https://patchwork.freedesktop.org/patch/msgid/20200206141327.446127-1-boris.brezillon@collabora.com --- drivers/gpu/drm/panfrost/panfrost_mmu.c | 7 ++++++- drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 11 ++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_mmu.c b/drivers/gpu/drm/panfrost/panfrost_mmu.c index 763cfca886a7..3107b0738e40 100644 --- a/drivers/gpu/drm/panfrost/panfrost_mmu.c +++ b/drivers/gpu/drm/panfrost/panfrost_mmu.c @@ -151,7 +151,12 @@ u32 panfrost_mmu_as_get(struct panfrost_device *pfdev, struct panfrost_mmu *mmu) as = mmu->as; if (as >= 0) { int en = atomic_inc_return(&mmu->as_count); - WARN_ON(en >= NUM_JOB_SLOTS); + + /* + * AS can be retained by active jobs or a perfcnt context, + * hence the '+ 1' here. + */ + WARN_ON(en >= (NUM_JOB_SLOTS + 1)); list_move(&mmu->list, &pfdev->as_lru_list); goto out; diff --git a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c index 684820448be3..6913578d5aa7 100644 --- a/drivers/gpu/drm/panfrost/panfrost_perfcnt.c +++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c @@ -73,7 +73,7 @@ static int panfrost_perfcnt_enable_locked(struct panfrost_device *pfdev, struct panfrost_file_priv *user = file_priv->driver_priv; struct panfrost_perfcnt *perfcnt = pfdev->perfcnt; struct drm_gem_shmem_object *bo; - u32 cfg; + u32 cfg, as; int ret; if (user == perfcnt->user) @@ -126,12 +126,8 @@ static int panfrost_perfcnt_enable_locked(struct panfrost_device *pfdev, perfcnt->user = user; - /* - * Always use address space 0 for now. - * FIXME: this needs to be updated when we start using different - * address space. - */ - cfg = GPU_PERFCNT_CFG_AS(0) | + as = panfrost_mmu_as_get(pfdev, perfcnt->mapping->mmu); + cfg = GPU_PERFCNT_CFG_AS(as) | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL); /* @@ -195,6 +191,7 @@ static int panfrost_perfcnt_disable_locked(struct panfrost_device *pfdev, drm_gem_shmem_vunmap(&perfcnt->mapping->obj->base.base, perfcnt->buf); perfcnt->buf = NULL; panfrost_gem_close(&perfcnt->mapping->obj->base.base, file_priv); + panfrost_mmu_as_put(pfdev, perfcnt->mapping->mmu); panfrost_gem_mapping_put(perfcnt->mapping); perfcnt->mapping = NULL; pm_runtime_mark_last_busy(pfdev->dev); -- cgit v1.2.3 From 205447fa9e0a44cc42a74788eb2f6c96f91d5cd6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 12 Feb 2020 10:24:26 +0100 Subject: hwmon: (pmbus/xdpe12284) fix typo in compatible strings Make sure that the driver compatible strings matches the binding by removing the space between the manufacturer and model. Fixes: aaafb7c8eb1c ("hwmon: (pmbus) Add support for Infineon Multi-phase xdpe122 family controllers") Cc: Vadim Pasternak Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20200212092426.24012-1-johan@kernel.org Signed-off-by: Guenter Roeck --- drivers/hwmon/pmbus/xdpe12284.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/hwmon/pmbus/xdpe12284.c b/drivers/hwmon/pmbus/xdpe12284.c index 3d47806ff4d3..ecd9b65627ec 100644 --- a/drivers/hwmon/pmbus/xdpe12284.c +++ b/drivers/hwmon/pmbus/xdpe12284.c @@ -94,8 +94,8 @@ static const struct i2c_device_id xdpe122_id[] = { MODULE_DEVICE_TABLE(i2c, xdpe122_id); static const struct of_device_id __maybe_unused xdpe122_of_match[] = { - {.compatible = "infineon, xdpe12254"}, - {.compatible = "infineon, xdpe12284"}, + {.compatible = "infineon,xdpe12254"}, + {.compatible = "infineon,xdpe12284"}, {} }; MODULE_DEVICE_TABLE(of, xdpe122_of_match); -- cgit v1.2.3 From c14335ebb92a98646ddbf447e6cacc66de5269ad Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 9 Feb 2020 21:12:02 -0800 Subject: scsi: Revert "target/core: Inline transport_lun_remove_cmd()" Commit 83f85b8ec305 postponed the percpu_ref_put(&se_cmd->se_lun->lun_ref) call from command completion to the time when the final command reference is dropped. That approach is not compatible with the iSCSI target driver because the iSCSI target driver keeps the command with the highest stat_sn after it has completed until the next command is received (see also iscsit_ack_from_expstatsn()). Fix this regression by reverting commit 83f85b8ec305. Fixes: 83f85b8ec305 ("scsi: target/core: Inline transport_lun_remove_cmd()") Cc: Pavel Zakharov Cc: Mike Christie Cc: Link: https://lore.kernel.org/r/20200210051202.12934-1-bvanassche@acm.org Reported-by: Pavel Zakharov Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/target/target_core_transport.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index ea482d4b1f00..0ae9e60fc4d5 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -666,6 +666,11 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd) target_remove_from_state_list(cmd); + /* + * Clear struct se_cmd->se_lun before the handoff to FE. + */ + cmd->se_lun = NULL; + spin_lock_irqsave(&cmd->t_state_lock, flags); /* * Determine if frontend context caller is requesting the stopping of @@ -693,6 +698,17 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd) return cmd->se_tfo->check_stop_free(cmd); } +static void transport_lun_remove_cmd(struct se_cmd *cmd) +{ + struct se_lun *lun = cmd->se_lun; + + if (!lun) + return; + + if (cmpxchg(&cmd->lun_ref_active, true, false)) + percpu_ref_put(&lun->lun_ref); +} + static void target_complete_failure_work(struct work_struct *work) { struct se_cmd *cmd = container_of(work, struct se_cmd, work); @@ -783,6 +799,8 @@ static void target_handle_abort(struct se_cmd *cmd) WARN_ON_ONCE(kref_read(&cmd->cmd_kref) == 0); + transport_lun_remove_cmd(cmd); + transport_cmd_check_stop_to_fabric(cmd); } @@ -1708,6 +1726,7 @@ static void target_complete_tmr_failure(struct work_struct *work) se_cmd->se_tmr_req->response = TMR_LUN_DOES_NOT_EXIST; se_cmd->se_tfo->queue_tm_rsp(se_cmd); + transport_lun_remove_cmd(se_cmd); transport_cmd_check_stop_to_fabric(se_cmd); } @@ -1898,6 +1917,7 @@ void transport_generic_request_failure(struct se_cmd *cmd, goto queue_full; check_stop: + transport_lun_remove_cmd(cmd); transport_cmd_check_stop_to_fabric(cmd); return; @@ -2195,6 +2215,7 @@ queue_status: transport_handle_queue_full(cmd, cmd->se_dev, ret, false); return; } + transport_lun_remove_cmd(cmd); transport_cmd_check_stop_to_fabric(cmd); } @@ -2289,6 +2310,7 @@ static void target_complete_ok_work(struct work_struct *work) if (ret) goto queue_full; + transport_lun_remove_cmd(cmd); transport_cmd_check_stop_to_fabric(cmd); return; } @@ -2314,6 +2336,7 @@ static void target_complete_ok_work(struct work_struct *work) if (ret) goto queue_full; + transport_lun_remove_cmd(cmd); transport_cmd_check_stop_to_fabric(cmd); return; } @@ -2349,6 +2372,7 @@ queue_rsp: if (ret) goto queue_full; + transport_lun_remove_cmd(cmd); transport_cmd_check_stop_to_fabric(cmd); return; } @@ -2384,6 +2408,7 @@ queue_status: break; } + transport_lun_remove_cmd(cmd); transport_cmd_check_stop_to_fabric(cmd); return; @@ -2710,6 +2735,9 @@ int transport_generic_free_cmd(struct se_cmd *cmd, int wait_for_tasks) */ if (cmd->state_active) target_remove_from_state_list(cmd); + + if (cmd->se_lun) + transport_lun_remove_cmd(cmd); } if (aborted) cmd->free_compl = &compl; @@ -2781,9 +2809,6 @@ static void target_release_cmd_kref(struct kref *kref) struct completion *abrt_compl = se_cmd->abrt_compl; unsigned long flags; - if (se_cmd->lun_ref_active) - percpu_ref_put(&se_cmd->se_lun->lun_ref); - if (se_sess) { spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); list_del_init(&se_cmd->se_cmd_list); -- cgit v1.2.3 From 0e99b2c625da181aebf1a3d13493e3f7a5057a9c Mon Sep 17 00:00:00 2001 From: Tomas Henzl Date: Tue, 4 Feb 2020 16:24:13 +0100 Subject: scsi: megaraid_sas: silence a warning Add a flag to DMA memory allocation to silence a warning. This driver allocates DMA memory for IO frames. This allocation may exceed MAX_ORDER pages for few megaraid_sas controllers (controllers with very high queue depth). Consequently, the driver has logic to keep reducing the controller queue depth until the DMA memory allocation succeeds. On impacted megaraid_sas controllers there would be multiple DMA allocation failures until driver settled on an allocation that fit. These failed DMA allocation requests caused stack traces in system logs. These were not harmful and this patch silences those warnings/stack traces. [mkp: clarified commit desc] Link: https://lore.kernel.org/r/20200204152413.7107-1-thenzl@redhat.com Signed-off-by: Tomas Henzl Acked-by: Sumit Saxena Reviewed-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_fusion.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index f3b36fd0a0eb..b2ad96564484 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -623,7 +623,8 @@ retry_alloc: fusion->io_request_frames = dma_pool_alloc(fusion->io_request_frames_pool, - GFP_KERNEL, &fusion->io_request_frames_phys); + GFP_KERNEL | __GFP_NOWARN, + &fusion->io_request_frames_phys); if (!fusion->io_request_frames) { if (instance->max_fw_cmds >= (MEGASAS_REDUCE_QD_COUNT * 2)) { instance->max_fw_cmds -= MEGASAS_REDUCE_QD_COUNT; @@ -661,7 +662,7 @@ retry_alloc: fusion->io_request_frames = dma_pool_alloc(fusion->io_request_frames_pool, - GFP_KERNEL, + GFP_KERNEL | __GFP_NOWARN, &fusion->io_request_frames_phys); if (!fusion->io_request_frames) { -- cgit v1.2.3 From 7563439adfae153b20331f1567c8b5d0e5cbd8a7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Feb 2020 06:30:06 -0700 Subject: io-wq: don't call kXalloc_node() with non-online node Glauber reports a crash on init on a box he has: RIP: 0010:__alloc_pages_nodemask+0x132/0x340 Code: 18 01 75 04 41 80 ce 80 89 e8 48 8b 54 24 08 8b 74 24 1c c1 e8 0c 48 8b 3c 24 83 e0 01 88 44 24 20 48 85 d2 0f 85 74 01 00 00 <3b> 77 08 0f 82 6b 01 00 00 48 89 7c 24 10 89 ea 48 8b 07 b9 00 02 RSP: 0018:ffffb8be4d0b7c28 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000000000e8e8 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000002080 RBP: 0000000000012cc0 R08: 0000000000000000 R09: 0000000000000002 R10: 0000000000000dc0 R11: ffff995c60400100 R12: 0000000000000000 R13: 0000000000012cc0 R14: 0000000000000001 R15: ffff995c60db00f0 FS: 00007f4d115ca900(0000) GS:ffff995c60d80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000002088 CR3: 00000017cca66002 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: alloc_slab_page+0x46/0x320 new_slab+0x9d/0x4e0 ___slab_alloc+0x507/0x6a0 ? io_wq_create+0xb4/0x2a0 __slab_alloc+0x1c/0x30 kmem_cache_alloc_node_trace+0xa6/0x260 io_wq_create+0xb4/0x2a0 io_uring_setup+0x97f/0xaa0 ? io_remove_personalities+0x30/0x30 ? io_poll_trigger_evfd+0x30/0x30 do_syscall_64+0x5b/0x1c0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f4d116cb1ed which is due to the 'wqe' and 'worker' allocation being node affine. But it isn't valid to call the node affine allocation if the node isn't online. Setup structures for even offline nodes, as usual, but skip them in terms of thread setup to not waste resources. If the node isn't online, just alloc memory with NUMA_NO_NODE. Reported-by: Glauber Costa Tested-by: Glauber Costa Signed-off-by: Jens Axboe --- fs/io-wq.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 182aa17dc2ca..0a5ab1a8f69a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -699,11 +699,16 @@ static int io_wq_manager(void *data) /* create fixed workers */ refcount_set(&wq->refs, workers_to_create); for_each_node(node) { + if (!node_online(node)) + continue; if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) goto err; workers_to_create--; } + while (workers_to_create--) + refcount_dec(&wq->refs); + complete(&wq->done); while (!kthread_should_stop()) { @@ -711,6 +716,9 @@ static int io_wq_manager(void *data) struct io_wqe *wqe = wq->wqes[node]; bool fork_worker[2] = { false, false }; + if (!node_online(node)) + continue; + spin_lock_irq(&wqe->lock); if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) fork_worker[IO_WQ_ACCT_BOUND] = true; @@ -829,7 +837,9 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { if (io_worker_get(worker)) { - ret = func(worker, data); + /* no task if node is/was offline */ + if (worker->task) + ret = func(worker, data); io_worker_release(worker); if (ret) break; @@ -1084,6 +1094,8 @@ void io_wq_flush(struct io_wq *wq) for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; + if (!node_online(node)) + continue; init_completion(&data.done); INIT_IO_WORK(&data.work, io_wq_flush_func); data.work.flags |= IO_WQ_WORK_INTERNAL; @@ -1115,12 +1127,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) for_each_node(node) { struct io_wqe *wqe; + int alloc_node = node; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node); + if (!node_online(alloc_node)) + alloc_node = NUMA_NO_NODE; + wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); if (!wqe) goto err; wq->wqes[node] = wqe; - wqe->node = node; + wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); if (wq->user) { @@ -1128,7 +1143,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) task_rlimit(current, RLIMIT_NPROC); } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); - wqe->node = node; wqe->wq = wq; spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); -- cgit v1.2.3 From f52aa79df43c4509146140de0241bc21a4a3b4c7 Mon Sep 17 00:00:00 2001 From: Frank Sorenson Date: Wed, 12 Feb 2020 15:31:48 -0600 Subject: cifs: Fix mode output in debugging statements A number of the debug statements output file or directory mode in hex. Change these to print using octal. Signed-off-by: Frank Sorenson Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 4 ++-- fs/cifs/connect.c | 2 +- fs/cifs/inode.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 440828afcdde..716574aab3b6 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -601,7 +601,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) *pmode |= (S_IXUGO & (*pbits_to_set)); - cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode); + cifs_dbg(NOISY, "access flags 0x%x mode now %04o\n", flags, *pmode); return; } @@ -630,7 +630,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, if (mode & S_IXUGO) *pace_flags |= SET_FILE_EXEC_RIGHTS; - cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n", + cifs_dbg(NOISY, "mode: %04o, access flags now 0x%x\n", mode, *pace_flags); return; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a941ac7a659d..4804d1df8c1c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4151,7 +4151,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_gid = pvolume_info->linux_gid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; - cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n", + cifs_dbg(FYI, "file mode: %04ho dir mode: %04ho\n", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); cifs_sb->actimeo = pvolume_info->actimeo; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9ba623b601ec..b5e6635c578e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1648,7 +1648,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) struct TCP_Server_Info *server; char *full_path; - cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n", + cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n", mode, inode); cifs_sb = CIFS_SB(inode->i_sb); -- cgit v1.2.3 From 9f35a31283775e6f6af73fb2c95c686a4c0acac7 Mon Sep 17 00:00:00 2001 From: Alexander Tsoy Date: Thu, 13 Feb 2020 02:54:50 +0300 Subject: ALSA: usb-audio: Add clock validity quirk for Denon MC7000/MCX8000 It should be safe to ignore clock validity check result if the following conditions are met: - only one single sample rate is supported; - the terminal is directly connected to the clock source; - the clock type is internal. This is to deal with some Denon DJ controllers that always reports that clock is invalid. Tested-by: Tobias Oszlanyi Signed-off-by: Alexander Tsoy Cc: Link: https://lore.kernel.org/r/20200212235450.697348-1-alexander@tsoy.me Signed-off-by: Takashi Iwai --- sound/usb/clock.c | 91 +++++++++++++++++++++++++++++++++++++----------------- sound/usb/clock.h | 4 +-- sound/usb/format.c | 3 +- 3 files changed, 66 insertions(+), 32 deletions(-) diff --git a/sound/usb/clock.c b/sound/usb/clock.c index 018b1ecb5404..a48313dfa967 100644 --- a/sound/usb/clock.c +++ b/sound/usb/clock.c @@ -151,8 +151,34 @@ static int uac_clock_selector_set_val(struct snd_usb_audio *chip, int selector_i return ret; } +/* + * Assume the clock is valid if clock source supports only one single sample + * rate, the terminal is connected directly to it (there is no clock selector) + * and clock type is internal. This is to deal with some Denon DJ controllers + * that always reports that clock is invalid. + */ +static bool uac_clock_source_is_valid_quirk(struct snd_usb_audio *chip, + struct audioformat *fmt, + int source_id) +{ + if (fmt->protocol == UAC_VERSION_2) { + struct uac_clock_source_descriptor *cs_desc = + snd_usb_find_clock_source(chip->ctrl_intf, source_id); + + if (!cs_desc) + return false; + + return (fmt->nr_rates == 1 && + (fmt->clock & 0xff) == cs_desc->bClockID && + (cs_desc->bmAttributes & 0x3) != + UAC_CLOCK_SOURCE_TYPE_EXT); + } + + return false; +} + static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, - int protocol, + struct audioformat *fmt, int source_id) { int err; @@ -160,7 +186,7 @@ static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, struct usb_device *dev = chip->dev; u32 bmControls; - if (protocol == UAC_VERSION_3) { + if (fmt->protocol == UAC_VERSION_3) { struct uac3_clock_source_descriptor *cs_desc = snd_usb_find_clock_source_v3(chip->ctrl_intf, source_id); @@ -194,10 +220,14 @@ static bool uac_clock_source_is_valid(struct snd_usb_audio *chip, return false; } - return data ? true : false; + if (data) + return true; + else + return uac_clock_source_is_valid_quirk(chip, fmt, source_id); } -static int __uac_clock_find_source(struct snd_usb_audio *chip, int entity_id, +static int __uac_clock_find_source(struct snd_usb_audio *chip, + struct audioformat *fmt, int entity_id, unsigned long *visited, bool validate) { struct uac_clock_source_descriptor *source; @@ -217,7 +247,7 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip, int entity_id, source = snd_usb_find_clock_source(chip->ctrl_intf, entity_id); if (source) { entity_id = source->bClockID; - if (validate && !uac_clock_source_is_valid(chip, UAC_VERSION_2, + if (validate && !uac_clock_source_is_valid(chip, fmt, entity_id)) { usb_audio_err(chip, "clock source %d is not valid, cannot use\n", @@ -248,8 +278,9 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip, int entity_id, } cur = ret; - ret = __uac_clock_find_source(chip, selector->baCSourceID[ret - 1], - visited, validate); + ret = __uac_clock_find_source(chip, fmt, + selector->baCSourceID[ret - 1], + visited, validate); if (!validate || ret > 0 || !chip->autoclock) return ret; @@ -260,8 +291,9 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip, int entity_id, if (i == cur) continue; - ret = __uac_clock_find_source(chip, selector->baCSourceID[i - 1], - visited, true); + ret = __uac_clock_find_source(chip, fmt, + selector->baCSourceID[i - 1], + visited, true); if (ret < 0) continue; @@ -281,14 +313,16 @@ static int __uac_clock_find_source(struct snd_usb_audio *chip, int entity_id, /* FIXME: multipliers only act as pass-thru element for now */ multiplier = snd_usb_find_clock_multiplier(chip->ctrl_intf, entity_id); if (multiplier) - return __uac_clock_find_source(chip, multiplier->bCSourceID, - visited, validate); + return __uac_clock_find_source(chip, fmt, + multiplier->bCSourceID, + visited, validate); return -EINVAL; } -static int __uac3_clock_find_source(struct snd_usb_audio *chip, int entity_id, - unsigned long *visited, bool validate) +static int __uac3_clock_find_source(struct snd_usb_audio *chip, + struct audioformat *fmt, int entity_id, + unsigned long *visited, bool validate) { struct uac3_clock_source_descriptor *source; struct uac3_clock_selector_descriptor *selector; @@ -307,7 +341,7 @@ static int __uac3_clock_find_source(struct snd_usb_audio *chip, int entity_id, source = snd_usb_find_clock_source_v3(chip->ctrl_intf, entity_id); if (source) { entity_id = source->bClockID; - if (validate && !uac_clock_source_is_valid(chip, UAC_VERSION_3, + if (validate && !uac_clock_source_is_valid(chip, fmt, entity_id)) { usb_audio_err(chip, "clock source %d is not valid, cannot use\n", @@ -338,7 +372,8 @@ static int __uac3_clock_find_source(struct snd_usb_audio *chip, int entity_id, } cur = ret; - ret = __uac3_clock_find_source(chip, selector->baCSourceID[ret - 1], + ret = __uac3_clock_find_source(chip, fmt, + selector->baCSourceID[ret - 1], visited, validate); if (!validate || ret > 0 || !chip->autoclock) return ret; @@ -350,8 +385,9 @@ static int __uac3_clock_find_source(struct snd_usb_audio *chip, int entity_id, if (i == cur) continue; - ret = __uac3_clock_find_source(chip, selector->baCSourceID[i - 1], - visited, true); + ret = __uac3_clock_find_source(chip, fmt, + selector->baCSourceID[i - 1], + visited, true); if (ret < 0) continue; @@ -372,7 +408,8 @@ static int __uac3_clock_find_source(struct snd_usb_audio *chip, int entity_id, multiplier = snd_usb_find_clock_multiplier_v3(chip->ctrl_intf, entity_id); if (multiplier) - return __uac3_clock_find_source(chip, multiplier->bCSourceID, + return __uac3_clock_find_source(chip, fmt, + multiplier->bCSourceID, visited, validate); return -EINVAL; @@ -389,18 +426,18 @@ static int __uac3_clock_find_source(struct snd_usb_audio *chip, int entity_id, * * Returns the clock source UnitID (>=0) on success, or an error. */ -int snd_usb_clock_find_source(struct snd_usb_audio *chip, int protocol, - int entity_id, bool validate) +int snd_usb_clock_find_source(struct snd_usb_audio *chip, + struct audioformat *fmt, bool validate) { DECLARE_BITMAP(visited, 256); memset(visited, 0, sizeof(visited)); - switch (protocol) { + switch (fmt->protocol) { case UAC_VERSION_2: - return __uac_clock_find_source(chip, entity_id, visited, + return __uac_clock_find_source(chip, fmt, fmt->clock, visited, validate); case UAC_VERSION_3: - return __uac3_clock_find_source(chip, entity_id, visited, + return __uac3_clock_find_source(chip, fmt, fmt->clock, visited, validate); default: return -EINVAL; @@ -501,8 +538,7 @@ static int set_sample_rate_v2v3(struct snd_usb_audio *chip, int iface, * automatic clock selection if the current clock is not * valid. */ - clock = snd_usb_clock_find_source(chip, fmt->protocol, - fmt->clock, true); + clock = snd_usb_clock_find_source(chip, fmt, true); if (clock < 0) { /* We did not find a valid clock, but that might be * because the current sample rate does not match an @@ -510,8 +546,7 @@ static int set_sample_rate_v2v3(struct snd_usb_audio *chip, int iface, * and we will do another validation after setting the * rate. */ - clock = snd_usb_clock_find_source(chip, fmt->protocol, - fmt->clock, false); + clock = snd_usb_clock_find_source(chip, fmt, false); if (clock < 0) return clock; } @@ -577,7 +612,7 @@ static int set_sample_rate_v2v3(struct snd_usb_audio *chip, int iface, validation: /* validate clock after rate change */ - if (!uac_clock_source_is_valid(chip, fmt->protocol, clock)) + if (!uac_clock_source_is_valid(chip, fmt, clock)) return -ENXIO; return 0; } diff --git a/sound/usb/clock.h b/sound/usb/clock.h index 076e31b79ee0..68df0fbe09d0 100644 --- a/sound/usb/clock.h +++ b/sound/usb/clock.h @@ -6,7 +6,7 @@ int snd_usb_init_sample_rate(struct snd_usb_audio *chip, int iface, struct usb_host_interface *alts, struct audioformat *fmt, int rate); -int snd_usb_clock_find_source(struct snd_usb_audio *chip, int protocol, - int entity_id, bool validate); +int snd_usb_clock_find_source(struct snd_usb_audio *chip, + struct audioformat *fmt, bool validate); #endif /* __USBAUDIO_CLOCK_H */ diff --git a/sound/usb/format.c b/sound/usb/format.c index 50cb183958bf..9f5cb4ed3a0c 100644 --- a/sound/usb/format.c +++ b/sound/usb/format.c @@ -336,8 +336,7 @@ static int parse_audio_format_rates_v2v3(struct snd_usb_audio *chip, struct usb_device *dev = chip->dev; unsigned char tmp[2], *data; int nr_triplets, data_size, ret = 0, ret_l6; - int clock = snd_usb_clock_find_source(chip, fp->protocol, - fp->clock, false); + int clock = snd_usb_clock_find_source(chip, fp, false); if (clock < 0) { dev_err(&dev->dev, -- cgit v1.2.3 From 216aa145aaf379a50b17afc812db71d893bd6683 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 12 Feb 2020 18:25:18 +0100 Subject: EDAC/mc: Fix use-after-free and memleaks during device removal A test kernel with the options DEBUG_TEST_DRIVER_REMOVE, KASAN and DEBUG_KMEMLEAK set, revealed several issues when removing an mci device: 1) Use-after-free: On 27.11.19 17:07:33, John Garry wrote: > [ 22.104498] BUG: KASAN: use-after-free in > edac_remove_sysfs_mci_device+0x148/0x180 The use-after-free is caused by the mci_for_each_dimm() macro called in edac_remove_sysfs_mci_device(). The iterator was introduced with c498afaf7df8 ("EDAC: Introduce an mci_for_each_dimm() iterator"). The iterator loop calls device_unregister(&dimm->dev), which removes the sysfs entry of the device, but also frees the dimm struct in dimm_attr_release(). When incrementing the loop in mci_for_each_dimm(), the dimm struct is accessed again, after having been freed already. The fix is to free all the mci device's subsequent dimm and csrow objects at a later point, in _edac_mc_free(), when the mci device itself is being freed. This keeps the data structures intact and the mci device can be fully used until its removal. The change allows the safe usage of mci_for_each_dimm() to release dimm devices from sysfs. 2) Memory leaks: Following memory leaks have been detected: # grep edac /sys/kernel/debug/kmemleak | sort | uniq -c 1 [<000000003c0f58f9>] edac_mc_alloc+0x3bc/0x9d0 # mci->csrows 16 [<00000000bb932dc0>] edac_mc_alloc+0x49c/0x9d0 # csr->channels 16 [<00000000e2734dba>] edac_mc_alloc+0x518/0x9d0 # csr->channels[chn] 1 [<00000000eb040168>] edac_mc_alloc+0x5c8/0x9d0 # mci->dimms 34 [<00000000ef737c29>] ghes_edac_register+0x1c8/0x3f8 # see edac_mc_alloc() All leaks are from memory allocated by edac_mc_alloc(). Note: The test above shows that edac_mc_alloc() was called here from ghes_edac_register(), thus both functions show up in the stack trace but the module causing the leaks is edac_mc. The comments with the data structures involved were made manually by analyzing the objdump. The data structures listed above and created by edac_mc_alloc() are not properly removed during device removal, which is done in edac_mc_free(). There are two paths implemented to remove the device depending on device registration, _edac_mc_free() is called if the device is not registered and edac_unregister_sysfs() otherwise. The implemenations differ. For the sysfs case, the mci device removal lacks the removal of subsequent data structures (csrows, channels, dimms). This causes the memory leaks (see mci_attr_release()). [ bp: Massage commit message. ] Fixes: c498afaf7df8 ("EDAC: Introduce an mci_for_each_dimm() iterator") Fixes: faa2ad09c01c ("edac_mc: edac_mc_free() cannot assume mem_ctl_info is registered in sysfs.") Fixes: 7a623c039075 ("edac: rewrite the sysfs code to use struct device") Reported-by: John Garry Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Tested-by: John Garry Cc: Link: https://lkml.kernel.org/r/20200212120340.4764-3-rrichter@marvell.com --- drivers/edac/edac_mc.c | 12 +++--------- drivers/edac/edac_mc_sysfs.c | 15 +++------------ 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 7243b88f81d8..69e0d90460e6 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -505,16 +505,10 @@ void edac_mc_free(struct mem_ctl_info *mci) { edac_dbg(1, "\n"); - /* If we're not yet registered with sysfs free only what was allocated - * in edac_mc_alloc(). - */ - if (!device_is_registered(&mci->dev)) { - _edac_mc_free(mci); - return; - } + if (device_is_registered(&mci->dev)) + edac_unregister_sysfs(mci); - /* the mci instance is freed here, when the sysfs object is dropped */ - edac_unregister_sysfs(mci); + _edac_mc_free(mci); } EXPORT_SYMBOL_GPL(edac_mc_free); diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 0367554e7437..1c9c6a7b9f66 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -276,10 +276,7 @@ static const struct attribute_group *csrow_attr_groups[] = { static void csrow_attr_release(struct device *dev) { - struct csrow_info *csrow = container_of(dev, struct csrow_info, dev); - - edac_dbg(1, "device %s released\n", dev_name(dev)); - kfree(csrow); + /* release device with _edac_mc_free() */ } static const struct device_type csrow_attr_type = { @@ -608,10 +605,7 @@ static const struct attribute_group *dimm_attr_groups[] = { static void dimm_attr_release(struct device *dev) { - struct dimm_info *dimm = container_of(dev, struct dimm_info, dev); - - edac_dbg(1, "device %s released\n", dev_name(dev)); - kfree(dimm); + /* release device with _edac_mc_free() */ } static const struct device_type dimm_attr_type = { @@ -893,10 +887,7 @@ static const struct attribute_group *mci_attr_groups[] = { static void mci_attr_release(struct device *dev) { - struct mem_ctl_info *mci = container_of(dev, struct mem_ctl_info, dev); - - edac_dbg(1, "device %s released\n", dev_name(dev)); - kfree(mci); + /* release device with _edac_mc_free() */ } static const struct device_type mci_attr_type = { -- cgit v1.2.3 From 4d59588c09f2a2daedad2a544d4d1b602ab3a8af Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 12 Feb 2020 13:03:39 +0100 Subject: EDAC/sysfs: Remove csrow objects on errors All created csrow objects must be removed in the error path of edac_create_csrow_objects(). The objects have been added as devices. They need to be removed by doing a device_del() *and* put_device() call to also free their memory. The missing put_device() leaves a memory leak. Use device_unregister() instead of device_del() which properly unregisters the device doing both. Fixes: 7adc05d2dc3a ("EDAC/sysfs: Drop device references properly") Signed-off-by: Robert Richter Signed-off-by: Borislav Petkov Tested-by: John Garry Cc: Link: https://lkml.kernel.org/r/20200212120340.4764-4-rrichter@marvell.com --- drivers/edac/edac_mc_sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 1c9c6a7b9f66..c70ec0a306d8 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -444,8 +444,7 @@ error: csrow = mci->csrows[i]; if (!nr_pages_per_csrow(csrow)) continue; - - device_del(&mci->csrows[i]->dev); + device_unregister(&mci->csrows[i]->dev); } return err; -- cgit v1.2.3 From 03cd45d2e219301880cabc357e3cf478a500080f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 13 Feb 2020 12:56:04 +0300 Subject: thunderbolt: Prevent crash if non-active NVMem file is read The driver does not populate .reg_read callback for the non-active NVMem because the file is supposed to be write-only. However, it turns out NVMem subsystem does not yet support this and expects that the .reg_read callback is provided. If user reads the binary attribute it triggers NULL pointer dereference like this one: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... Call Trace: bin_attr_nvmem_read+0x64/0x80 kernfs_fop_read+0xa7/0x180 vfs_read+0xbd/0x170 ksys_read+0x5a/0xd0 do_syscall_64+0x43/0x150 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix this in the driver by providing .reg_read callback that always returns an error. Reported-by: Nicholas Johnson Fixes: e6b245ccd524 ("thunderbolt: Add support for host and device NVM firmware upgrade") Signed-off-by: Mika Westerberg Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200213095604.1074-1-mika.westerberg@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/thunderbolt/switch.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index ad5479f21174..7d6ecc342508 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -348,6 +348,12 @@ out: return ret; } +static int tb_switch_nvm_no_read(void *priv, unsigned int offset, void *val, + size_t bytes) +{ + return -EPERM; +} + static int tb_switch_nvm_write(void *priv, unsigned int offset, void *val, size_t bytes) { @@ -393,6 +399,7 @@ static struct nvmem_device *register_nvmem(struct tb_switch *sw, int id, config.read_only = true; } else { config.name = "nvm_non_active"; + config.reg_read = tb_switch_nvm_no_read; config.reg_write = tb_switch_nvm_write; config.root_only = true; } -- cgit v1.2.3 From b6570fdb96edf45bcf71884bd2644bd73d348d1a Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Thu, 13 Feb 2020 00:11:44 -0600 Subject: ASoC: codec2codec: avoid invalid/double-free of pcm runtime The PCM runtime was freed during PMU in the case that the event hook encountered an error. However, it is also unconditionally freed during PMD. Avoid a double-free by dropping the call to kfree in the PMU hook. Fixes: a72706ed8208 ("ASoC: codec2codec: remove ephemeral variables") Cc: stable@vger.kernel.org Signed-off-by: Samuel Holland Link: https://lore.kernel.org/r/20200213061147.29386-2-samuel@sholland.org Signed-off-by: Mark Brown --- sound/soc/soc-dapm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index bc20ad9abf8b..8b24396675ec 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3916,9 +3916,6 @@ snd_soc_dai_link_event_pre_pmu(struct snd_soc_dapm_widget *w, runtime->rate = params_rate(params); out: - if (ret < 0) - kfree(runtime); - kfree(params); return ret; } -- cgit v1.2.3 From a0767da7774d91a668f9c223cec3e76172cd833b Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Wed, 12 Feb 2020 09:26:31 +0200 Subject: RDMA/core: Add missing list deletion on freeing event queue When the uobject file scheme was revised to allow device disassociation from the file it became possible for read() to still happen the driver destroys the uobject. The old clode code was not tolerant to concurrent read, and when it was moved to the driver destroy it creates a bug. Ensure the event_list is empty after driver destroy by adding the missing list_del(). Otherwise read() can trigger a use after free and double kfree. Fixes: f7c8416ccea5 ("RDMA/core: Simplify destruction of FD uobjects") Link: https://lore.kernel.org/r/20200212072635.682689-6-leon@kernel.org Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_std_types.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 994d8744b246..3abfc63225cb 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -220,6 +220,7 @@ void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue) list_for_each_entry_safe(entry, tmp, &event_queue->event_list, list) { if (entry->counter) list_del(&entry->obj_list); + list_del(&entry->list); kfree(entry); } spin_unlock_irq(&event_queue->lock); -- cgit v1.2.3 From a8af8694a5e8ddaaef4bd7b6426c12b7759c846c Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 12 Feb 2020 09:26:32 +0200 Subject: RDMA/mlx5: Fix async events cleanup flows As in the prior patch, the devx code is not fully cleaning up its event_lists before finishing driver_destroy allowing a later read to trigger user after free conditions. Re-arrange things so that the event_list is always empty after destroy and ensure it remains empty until the file is closed. Fixes: f7c8416ccea5 ("RDMA/core: Simplify destruction of FD uobjects") Link: https://lore.kernel.org/r/20200212072635.682689-7-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 51 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index d7efc9f6daf0..46e1ab771f10 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2319,14 +2319,12 @@ static int deliver_event(struct devx_event_subscription *event_sub, if (ev_file->omit_data) { spin_lock_irqsave(&ev_file->lock, flags); - if (!list_empty(&event_sub->event_list)) { + if (!list_empty(&event_sub->event_list) || + ev_file->is_destroyed) { spin_unlock_irqrestore(&ev_file->lock, flags); return 0; } - /* is_destroyed is ignored here because we don't have any memory - * allocation to clean up for the omit_data case - */ list_add_tail(&event_sub->event_list, &ev_file->event_list); spin_unlock_irqrestore(&ev_file->lock, flags); wake_up_interruptible(&ev_file->poll_wait); @@ -2473,11 +2471,11 @@ static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, return -ERESTARTSYS; } - if (list_empty(&ev_queue->event_list) && - ev_queue->is_destroyed) - return -EIO; - spin_lock_irq(&ev_queue->lock); + if (ev_queue->is_destroyed) { + spin_unlock_irq(&ev_queue->lock); + return -EIO; + } } event = list_entry(ev_queue->event_list.next, @@ -2551,10 +2549,6 @@ static ssize_t devx_async_event_read(struct file *filp, char __user *buf, return -EOVERFLOW; } - if (ev_file->is_destroyed) { - spin_unlock_irq(&ev_file->lock); - return -EIO; - } while (list_empty(&ev_file->event_list)) { spin_unlock_irq(&ev_file->lock); @@ -2667,8 +2661,10 @@ static int devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj, spin_lock_irq(&comp_ev_file->ev_queue.lock); list_for_each_entry_safe(entry, tmp, - &comp_ev_file->ev_queue.event_list, list) + &comp_ev_file->ev_queue.event_list, list) { + list_del(&entry->list); kvfree(entry); + } spin_unlock_irq(&comp_ev_file->ev_queue.lock); return 0; }; @@ -2680,11 +2676,29 @@ static int devx_async_event_destroy_uobj(struct ib_uobject *uobj, container_of(uobj, struct devx_async_event_file, uobj); struct devx_event_subscription *event_sub, *event_sub_tmp; - struct devx_async_event_data *entry, *tmp; struct mlx5_ib_dev *dev = ev_file->dev; spin_lock_irq(&ev_file->lock); ev_file->is_destroyed = 1; + + /* free the pending events allocation */ + if (ev_file->omit_data) { + struct devx_event_subscription *event_sub, *tmp; + + list_for_each_entry_safe(event_sub, tmp, &ev_file->event_list, + event_list) + list_del_init(&event_sub->event_list); + + } else { + struct devx_async_event_data *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, &ev_file->event_list, + list) { + list_del(&entry->list); + kfree(entry); + } + } + spin_unlock_irq(&ev_file->lock); wake_up_interruptible(&ev_file->poll_wait); @@ -2699,15 +2713,6 @@ static int devx_async_event_destroy_uobj(struct ib_uobject *uobj, } mutex_unlock(&dev->devx_event_table.event_xa_lock); - /* free the pending events allocation */ - if (!ev_file->omit_data) { - spin_lock_irq(&ev_file->lock); - list_for_each_entry_safe(entry, tmp, - &ev_file->event_list, list) - kfree(entry); /* read can't come any more */ - spin_unlock_irq(&ev_file->lock); - } - put_device(&dev->ib_dev.dev); return 0; }; -- cgit v1.2.3 From 9ea04d0df6e6541c6736b43bff45f1e54875a1db Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Wed, 12 Feb 2020 09:26:34 +0200 Subject: IB/umad: Fix kernel crash while unloading ib_umad When disassociating a device from umad we must ensure that the sysfs access is prevented before blocking the fops, otherwise assumptions in syfs don't hold: CPU0 CPU1 ib_umad_kill_port() ibdev_show() port->ib_dev = NULL dev_name(port->ib_dev) The prior patch made an error in moving the device_destroy(), it should have been split into device_del() (above) and put_device() (below). At this point we already have the split, so move the device_del() back to its original place. kernel stack PF: error_code(0x0000) - not-present page Oops: 0000 [#1] SMP DEBUG_PAGEALLOC PTI RIP: 0010:ibdev_show+0x18/0x50 [ib_umad] RSP: 0018:ffffc9000097fe40 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffffa0441120 RCX: ffff8881df514000 RDX: ffff8881df514000 RSI: ffffffffa0441120 RDI: ffff8881df1e8870 RBP: ffffffff81caf000 R08: ffff8881df1e8870 R09: 0000000000000000 R10: 0000000000001000 R11: 0000000000000003 R12: ffff88822f550b40 R13: 0000000000000001 R14: ffffc9000097ff08 R15: ffff8882238bad58 FS: 00007f1437ff3740(0000) GS:ffff888236940000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000004e8 CR3: 00000001e0dfc001 CR4: 00000000001606e0 Call Trace: dev_attr_show+0x15/0x50 sysfs_kf_seq_show+0xb8/0x1a0 seq_read+0x12d/0x350 vfs_read+0x89/0x140 ksys_read+0x55/0xd0 do_syscall_64+0x55/0x1b0 entry_SYSCALL_64_after_hwframe+0x44/0xa9: Fixes: cf7ad3030271 ("IB/umad: Avoid destroying device while it is accessed") Link: https://lore.kernel.org/r/20200212072635.682689-9-leon@kernel.org Signed-off-by: Yonatan Cohen Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/user_mad.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index d1407fa378e8..1235ffb2389b 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1312,6 +1312,9 @@ static void ib_umad_kill_port(struct ib_umad_port *port) struct ib_umad_file *file; int id; + cdev_device_del(&port->sm_cdev, &port->sm_dev); + cdev_device_del(&port->cdev, &port->dev); + mutex_lock(&port->file_mutex); /* Mark ib_dev NULL and block ioctl or other file ops to progress @@ -1331,8 +1334,6 @@ static void ib_umad_kill_port(struct ib_umad_port *port) mutex_unlock(&port->file_mutex); - cdev_device_del(&port->sm_cdev, &port->sm_dev); - cdev_device_del(&port->cdev, &port->dev); ida_free(&umad_ida, port->dev_num); /* balances device_initialize() */ -- cgit v1.2.3 From 9b6d3bbc1335404b331f4f11dc896066bdf1c752 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 12 Feb 2020 09:26:35 +0200 Subject: RDMA/mlx5: Prevent overflow in mmap offset calculations The cmd and index variables declared as u16 and the result is supposed to be stored in u64. The C arithmetic rules doesn't promote "(index >> 8) << 16" to be u64 and leaves the end result to be u16. Fixes: 7be76bef320b ("IB/mlx5: Introduce VAR object and its alloc/destroy methods") Link: https://lore.kernel.org/r/20200212072635.682689-10-leon@kernel.org Signed-off-by: Leon Romanovsky Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e874d688d040..987bfdcd12a5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2283,8 +2283,8 @@ static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev, static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry) { - u16 cmd = entry->rdma_entry.start_pgoff >> 16; - u16 index = entry->rdma_entry.start_pgoff & 0xFFFF; + u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF; + u64 index = entry->rdma_entry.start_pgoff & 0xFFFF; return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) | (index & 0xFF)) << PAGE_SHIFT; -- cgit v1.2.3 From 8ac0e6641c7ca14833a2a8c6f13d8e0a435e535c Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Wed, 12 Feb 2020 09:26:33 +0200 Subject: RDMA/rxe: Fix soft lockup problem due to using tasklets in softirq When run stress tests with RXE, the following Call Traces often occur watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [swapper/2:0] ... Call Trace: create_object+0x3f/0x3b0 kmem_cache_alloc_node_trace+0x129/0x2d0 __kmalloc_reserve.isra.52+0x2e/0x80 __alloc_skb+0x83/0x270 rxe_init_packet+0x99/0x150 [rdma_rxe] rxe_requester+0x34e/0x11a0 [rdma_rxe] rxe_do_task+0x85/0xf0 [rdma_rxe] tasklet_action_common.isra.21+0xeb/0x100 __do_softirq+0xd0/0x298 irq_exit+0xc5/0xd0 smp_apic_timer_interrupt+0x68/0x120 apic_timer_interrupt+0xf/0x20 ... The root cause is that tasklet is actually a softirq. In a tasklet handler, another softirq handler is triggered. Usually these softirq handlers run on the same cpu core. So this will cause "soft lockup Bug". Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20200212072635.682689-8-leon@kernel.org Signed-off-by: Zhu Yanjun Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 116cafc9afcf..4bc88708b355 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -329,7 +329,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_run_task(&qp->req.task, 1); + rxe_run_task(&qp->req.task, 0); } } return COMPST_ERROR_RETRY; @@ -463,7 +463,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_run_task(&qp->req.task, 1); + rxe_run_task(&qp->req.task, 0); } } @@ -479,7 +479,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_run_task(&qp->req.task, 1); + rxe_run_task(&qp->req.task, 0); } } @@ -725,7 +725,7 @@ int rxe_completer(void *arg) RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_run_task(&qp->req.task, 1); + rxe_run_task(&qp->req.task, 0); } if (pkt) { -- cgit v1.2.3 From 0fbb027b44e79700da80e4b8bd1c1914d4796af6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 13 Feb 2020 07:03:49 +0100 Subject: ALSA: pcm: Fix double hw_free calls The commit 66f2d19f8116 ("ALSA: pcm: Fix memory leak at closing a stream without hw_free") tried to fix the regression wrt the missing hw_free call at closing without SNDRV_PCM_IOCTL_HW_FREE ioctl. However, the code change dropped mistakenly the state check, resulting in calling hw_free twice when SNDRV_PCM_IOCTL_HW_FRE got called beforehand. For most drivers, this is almost harmless, but the drivers like SOF show another regression now. This patch adds the state condition check before calling do_hw_free() at releasing the stream for avoiding the double hw_free calls. Fixes: 66f2d19f8116 ("ALSA: pcm: Fix memory leak at closing a stream without hw_free") Reported-by: Bard Liao Reported-by: Pierre-Louis Bossart Tested-by: Pierre-Louis Bossart Cc: Link: https://lore.kernel.org/r/s5hd0ajyprg.wl-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/pcm_native.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 336406bcb59e..d5443eeb8b63 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -2594,7 +2594,8 @@ void snd_pcm_release_substream(struct snd_pcm_substream *substream) snd_pcm_drop(substream); if (substream->hw_opened) { - do_hw_free(substream); + if (substream->runtime->status->state != SNDRV_PCM_STATE_OPEN) + do_hw_free(substream); substream->ops->close(substream); substream->hw_opened = 0; } -- cgit v1.2.3 From 0b96da639a4874311e9b5156405f69ef9fc3bef8 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Feb 2020 22:12:05 +0800 Subject: bcache: ignore pending signals when creating gc and allocator thread When run a cache set, all the bcache btree node of this cache set will be checked by bch_btree_check(). If the bcache btree is very large, iterating all the btree nodes will occupy too much system memory and the bcache registering process might be selected and killed by system OOM killer. kthread_run() will fail if current process has pending signal, therefore the kthread creating in run_cache_set() for gc and allocator kernel threads are very probably failed for a very large bcache btree. Indeed such OOM is safe and the registering process will exit after the registration done. Therefore this patch flushes pending signals during the cache set start up, specificly in bch_cache_allocator_start() and bch_gc_thread_start(), to make sure run_cache_set() won't fail for large cahced data set. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 18 ++++++++++++++++-- drivers/md/bcache/btree.c | 13 +++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..8bc1faf71ff2 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #define MAX_OPEN_BUCKETS 128 @@ -733,8 +734,21 @@ int bch_open_buckets_alloc(struct cache_set *c) int bch_cache_allocator_start(struct cache *ca) { - struct task_struct *k = kthread_run(bch_allocator_thread, - ca, "bcache_allocator"); + struct task_struct *k; + + /* + * In case previous btree check operation occupies too many + * system memory for bcache btree node cache, and the + * registering process is selected by OOM killer. Here just + * ignore the SIGKILL sent by OOM killer if there is, to + * avoid kthread_run() being failed by pending signals. The + * bcache registering process will exit after the registration + * done. + */ + if (signal_pending(current)) + flush_signals(current); + + k = kthread_run(bch_allocator_thread, ca, "bcache_allocator"); if (IS_ERR(k)) return PTR_ERR(k); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fa872df4e770..b12186c87f52 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1913,6 +1914,18 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { + /* + * In case previous btree check operation occupies too many + * system memory for bcache btree node cache, and the + * registering process is selected by OOM killer. Here just + * ignore the SIGKILL sent by OOM killer if there is, to + * avoid kthread_run() being failed by pending signals. The + * bcache registering process will exit after the registration + * done. + */ + if (signal_pending(current)) + flush_signals(current); + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); return PTR_ERR_OR_ZERO(c->gc_thread); } -- cgit v1.2.3 From 309cc719a2c869b71a7388209a0a80d4284d98fd Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Feb 2020 22:12:06 +0800 Subject: bcache: Revert "bcache: shrink btree node cache after bch_btree_check()" This reverts commit 1df3877ff6a4810054237c3259d900ded4468969. In my testing, sometimes even all the cached btree nodes are freed, creating gc and allocator kernel threads may still fail. Finally it turns out that kthread_run() may fail if there is pending signal for current task. And the pending signal is sent from OOM killer which is triggered by memory consuption in bch_btree_check(). Therefore explicitly shrinking bcache btree node here does not help, and after the shrinker callback is improved, as well as pending signals are ignored before creating kernel threads, now such operation is unncessary anymore. This patch reverts the commit 1df3877ff6a4 ("bcache: shrink btree node cache after bch_btree_check()") because we have better improvement now. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2749daf09724..0c3c5419c52b 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1917,23 +1917,6 @@ static int run_cache_set(struct cache_set *c) if (bch_btree_check(c)) goto err; - /* - * bch_btree_check() may occupy too much system memory which - * has negative effects to user space application (e.g. data - * base) performance. Shrink the mca cache memory proactively - * here to avoid competing memory with user space workloads.. - */ - if (!c->shrinker_disabled) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = c->btree_cache_used * c->btree_pages; - /* first run to clear b->accessed tag */ - c->shrink.scan_objects(&c->shrink, &sc); - /* second run to reap non-accessed nodes */ - c->shrink.scan_objects(&c->shrink, &sc); - } - bch_journal_mark(c, &journal); bch_initial_gc_finish(c); pr_debug("btree_check() done"); -- cgit v1.2.3 From 4ec31cb6241d95879aac337cc6b50c45dd10cfcb Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Feb 2020 22:12:07 +0800 Subject: bcache: remove macro nr_to_fifo_front() Macro nr_to_fifo_front() is only used once in btree_flush_write(), it is unncessary indeed. This patch removes this macro and does calculation directly in place. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6730820780b0..0e3ff9745ac7 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -417,8 +417,6 @@ err: /* Journalling */ -#define nr_to_fifo_front(p, front_p, mask) (((p) - (front_p)) & (mask)) - static void btree_flush_write(struct cache_set *c) { struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; @@ -510,9 +508,8 @@ static void btree_flush_write(struct cache_set *c) * journal entry can be reclaimed). These selected nodes * will be ignored and skipped in the folowing for-loop. */ - if (nr_to_fifo_front(btree_current_write(b)->journal, - fifo_front_p, - mask) != 0) { + if (((btree_current_write(b)->journal - fifo_front_p) & + mask) != 0) { mutex_unlock(&b->write_lock); continue; } -- cgit v1.2.3 From 1dd017882e01d2fcd9c5dbbf1eb376211111c393 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 12 Feb 2020 10:06:51 +0200 Subject: RDMA/core: Fix protection fault in get_pkey_idx_qp_list We don't need to set pkey as valid in case that user set only one of pkey index or port number, otherwise it will be resulted in NULL pointer dereference while accessing to uninitialized pkey list. The following crash from Syzkaller revealed it. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 14753 Comm: syz-executor.2 Not tainted 5.5.0-rc5 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 RIP: 0010:get_pkey_idx_qp_list+0x161/0x2d0 Code: 01 00 00 49 8b 5e 20 4c 39 e3 0f 84 b9 00 00 00 e8 e4 42 6e fe 48 8d 7b 10 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 01 0f 8e d0 00 00 00 48 8d 7d 04 48 b8 RSP: 0018:ffffc9000bc6f950 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff82c8bdec RDX: 0000000000000002 RSI: ffffc900030a8000 RDI: 0000000000000010 RBP: ffff888112c8ce80 R08: 0000000000000004 R09: fffff5200178df1f R10: 0000000000000001 R11: fffff5200178df1f R12: ffff888115dc4430 R13: ffff888115da8498 R14: ffff888115dc4410 R15: ffff888115da8000 FS: 00007f20777de700(0000) GS:ffff88811b100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2f721000 CR3: 00000001173ca002 CR4: 0000000000360ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: port_pkey_list_insert+0xd7/0x7c0 ib_security_modify_qp+0x6fa/0xfc0 _ib_modify_qp+0x8c4/0xbf0 modify_qp+0x10da/0x16d0 ib_uverbs_modify_qp+0x9a/0x100 ib_uverbs_write+0xaa5/0xdf0 __vfs_write+0x7c/0x100 vfs_write+0x168/0x4a0 ksys_write+0xc8/0x200 do_syscall_64+0x9c/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: d291f1a65232 ("IB/core: Enforce PKey security on QPs") Link: https://lore.kernel.org/r/20200212080651.GB679970@unreal Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Message-Id: <20200212080651.GB679970@unreal> --- drivers/infiniband/core/security.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 6eb6d2717ca5..2b4d80393bd0 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -339,22 +339,16 @@ static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp, if (!new_pps) return NULL; - if (qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) { - if (!qp_pps) { - new_pps->main.port_num = qp_attr->port_num; - new_pps->main.pkey_index = qp_attr->pkey_index; - } else { - new_pps->main.port_num = (qp_attr_mask & IB_QP_PORT) ? - qp_attr->port_num : - qp_pps->main.port_num; - - new_pps->main.pkey_index = - (qp_attr_mask & IB_QP_PKEY_INDEX) ? - qp_attr->pkey_index : - qp_pps->main.pkey_index; - } + if (qp_attr_mask & IB_QP_PORT) + new_pps->main.port_num = + (qp_pps) ? qp_pps->main.port_num : qp_attr->port_num; + if (qp_attr_mask & IB_QP_PKEY_INDEX) + new_pps->main.pkey_index = (qp_pps) ? qp_pps->main.pkey_index : + qp_attr->pkey_index; + if ((qp_attr_mask & IB_QP_PKEY_INDEX) && (qp_attr_mask & IB_QP_PORT)) new_pps->main.state = IB_PORT_PKEY_VALID; - } else if (qp_pps) { + + if (!(qp_attr_mask & (IB_QP_PKEY_INDEX || IB_QP_PORT)) && qp_pps) { new_pps->main.port_num = qp_pps->main.port_num; new_pps->main.pkey_index = qp_pps->main.pkey_index; if (qp_pps->main.state != IB_PORT_PKEY_NOT_VALID) -- cgit v1.2.3 From 14c9ca0583eee8df285d68a0e6ec71053efd2228 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Sun, 26 Jan 2020 15:03:34 -0700 Subject: ext4: don't assume that mmp_nodename/bdevname have NUL Don't assume that the mmp_nodename and mmp_bdevname strings are NUL terminated, since they are filled in by snprintf(), which is not guaranteed to do so. Link: https://lore.kernel.org/r/1580076215-1048-1-git-send-email-adilger@dilger.ca Signed-off-by: Andreas Dilger Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mmp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 1c44b1a32001..87f7551c5132 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -120,10 +120,10 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, { __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, - "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s", - (long long unsigned int) le64_to_cpu(mmp->mmp_time), - mmp->mmp_nodename, mmp->mmp_bdevname); + "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s", + (unsigned long long)le64_to_cpu(mmp->mmp_time), + (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename, + (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname); } /* @@ -154,6 +154,7 @@ static int kmmpd(void *data) mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, EXT4_MMP_MIN_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); bdevname(bh->b_bdev, mmp->mmp_bdevname); memcpy(mmp->mmp_nodename, init_utsname()->nodename, @@ -379,7 +380,8 @@ skip: /* * Start a kernel thread to update the MMP block periodically. */ - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s", + (int)sizeof(mmp->mmp_bdevname), bdevname(bh->b_bdev, mmp->mmp_bdevname)); if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { -- cgit v1.2.3 From 46d36880d1c6f9b9a0cbaf90235355ea1f4cab96 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Jan 2020 12:11:48 +0100 Subject: ext4: simplify checking quota limits in ext4_statfs() Coverity reports that conditions checking quota limits in ext4_statfs() contain dead code. Indeed it is right and current conditions can be simplified. Link: https://lore.kernel.org/r/20200130111148.10766-1-jack@suse.cz Reported-by: Coverity Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 88b213bd32bc..f23367a779e8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5585,10 +5585,7 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = 0; - if (dquot->dq_dqb.dqb_bsoftlimit && - (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit)) - limit = dquot->dq_dqb.dqb_bsoftlimit; + limit = dquot->dq_dqb.dqb_bsoftlimit; if (dquot->dq_dqb.dqb_bhardlimit && (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) limit = dquot->dq_dqb.dqb_bhardlimit; @@ -5603,10 +5600,7 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = 0; - if (dquot->dq_dqb.dqb_isoftlimit && - (!limit || dquot->dq_dqb.dqb_isoftlimit < limit)) - limit = dquot->dq_dqb.dqb_isoftlimit; + limit = dquot->dq_dqb.dqb_isoftlimit; if (dquot->dq_dqb.dqb_ihardlimit && (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) limit = dquot->dq_dqb.dqb_ihardlimit; -- cgit v1.2.3 From c7ff8573ad21dcdcbcffd66fbfca3b53cd67d2b1 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 22 Jan 2020 14:43:23 +0100 Subject: crypto/testmgr: enable selftests for paes-s390 ciphers This patch enables the selftests for the s390 specific protected key AES (PAES) cipher implementations: * cbc-paes-s390 * ctr-paes-s390 * ecb-paes-s390 * xts-paes-s390 PAES is an AES cipher but with encrypted ('protected') key material. However, the paes ciphers are able to derive an protected key from clear key material with the help of the pkey kernel module. So this patch now enables the generic AES tests for the paes ciphers. Under the hood the setkey() functions rearrange the clear key values as clear key token and so the pkey kernel module is able to provide protected key blobs from the given clear key values. The derived protected key blobs are then used within the paes cipers and should produce the very same results as the generic AES implementation with the clear key values. The s390-paes cipher testlist entries are surrounded by #if IS_ENABLED(CONFIG_CRYPTO_PAES_S390) because they don't make any sense on non s390 platforms or without the PAES cipher implementation. Link: http://lkml.kernel.org/r/20200213083946.zicarnnt3wizl5ty@gondor.apana.org.au Acked-by: Herbert Xu Signed-off-by: Harald Freudenberger Signed-off-by: Vasily Gorbik --- crypto/testmgr.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 88f33c0efb23..ccb3d60729fc 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4436,6 +4436,15 @@ static const struct alg_test_desc alg_test_descs[] = { .cipher = __VECS(tf_cbc_tv_template) }, }, { +#if IS_ENABLED(CONFIG_CRYPTO_PAES_S390) + .alg = "cbc-paes-s390", + .fips_allowed = 1, + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(aes_cbc_tv_template) + } + }, { +#endif .alg = "cbcmac(aes)", .fips_allowed = 1, .test = alg_test_hash, @@ -4587,6 +4596,15 @@ static const struct alg_test_desc alg_test_descs[] = { .cipher = __VECS(tf_ctr_tv_template) } }, { +#if IS_ENABLED(CONFIG_CRYPTO_PAES_S390) + .alg = "ctr-paes-s390", + .fips_allowed = 1, + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(aes_ctr_tv_template) + } + }, { +#endif .alg = "cts(cbc(aes))", .test = alg_test_skcipher, .fips_allowed = 1, @@ -4879,6 +4897,15 @@ static const struct alg_test_desc alg_test_descs[] = { .cipher = __VECS(xtea_tv_template) } }, { +#if IS_ENABLED(CONFIG_CRYPTO_PAES_S390) + .alg = "ecb-paes-s390", + .fips_allowed = 1, + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(aes_tv_template) + } + }, { +#endif .alg = "ecdh", .test = alg_test_kpp, .fips_allowed = 1, @@ -5465,6 +5492,15 @@ static const struct alg_test_desc alg_test_descs[] = { .cipher = __VECS(tf_xts_tv_template) } }, { +#if IS_ENABLED(CONFIG_CRYPTO_PAES_S390) + .alg = "xts-paes-s390", + .fips_allowed = 1, + .test = alg_test_skcipher, + .suite = { + .cipher = __VECS(aes_xts_tv_template) + } + }, { +#endif .alg = "xts4096(paes)", .test = alg_test_null, .fips_allowed = 1, -- cgit v1.2.3 From 4f97a68192bd33b9963b400759cef0ca5963af00 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 6 Feb 2020 17:35:01 -0500 Subject: ext4: fix support for inode sizes > 1024 bytes A recent commit, 9803387c55f7 ("ext4: validate the debug_want_extra_isize mount option at parse time"), moved mount-time checks around. One of those changes moved the inode size check before the blocksize variable was set to the blocksize of the file system. After 9803387c55f7 was set to the minimum allowable blocksize, which in practice on most systems would be 1024 bytes. This cuased file systems with inode sizes larger than 1024 bytes to be rejected with a message: EXT4-fs (sdXX): unsupported inode size: 4096 Fixes: 9803387c55f7 ("ext4: validate the debug_want_extra_isize mount option at parse time") Link: https://lore.kernel.org/r/20200206225252.GA3673@mit.edu Reported-by: Herbert Poetzl Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f23367a779e8..b0b9150c9773 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3814,6 +3814,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (blocksize < EXT4_MIN_BLOCK_SIZE || + blocksize > EXT4_MAX_BLOCK_SIZE) { + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d (%d log_block_size)", + blocksize, le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; @@ -3831,6 +3840,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); + ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize); goto failed_mount; } /* @@ -4033,14 +4043,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) goto failed_mount; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || - blocksize > EXT4_MAX_BLOCK_SIZE) { - ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d (%d log_block_size)", - blocksize, le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, -- cgit v1.2.3 From 48a34311953d921235f4d7bbd2111690d2e469cf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 10 Feb 2020 15:43:16 +0100 Subject: ext4: fix checksum errors with indexed dirs DIR_INDEX has been introduced as a compat ext4 feature. That means that even kernels / tools that don't understand the feature may modify the filesystem. This works because for kernels not understanding indexed dir format, internal htree nodes appear just as empty directory entries. Index dir aware kernels then check the htree structure is still consistent before using the data. This all worked reasonably well until metadata checksums were introduced. The problem is that these effectively made DIR_INDEX only ro-compatible because internal htree nodes store checksums in a different place than normal directory blocks. Thus any modification ignorant to DIR_INDEX (or just clearing EXT4_INDEX_FL from the inode) will effectively cause checksum mismatch and trigger kernel errors. So we have to be more careful when dealing with indexed directories on filesystems with checksumming enabled. 1) We just disallow loading any directory inodes with EXT4_INDEX_FL when DIR_INDEX is not enabled. This is harsh but it should be very rare (it means someone disabled DIR_INDEX on existing filesystem and didn't run e2fsck), e2fsck can fix the problem, and we don't want to answer the difficult question: "Should we rather corrupt the directory more or should we ignore that DIR_INDEX feature is not set?" 2) When we find out htree structure is corrupted (but the filesystem and the directory should in support htrees), we continue just ignoring htree information for reading but we refuse to add new entries to the directory to avoid corrupting it more. Link: https://lore.kernel.org/r/20200210144316.22081-1-jack@suse.cz Fixes: dbe89444042a ("ext4: Calculate and verify checksums for htree nodes") Reviewed-by: Andreas Dilger Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/dir.c | 14 ++++++++------ fs/ext4/ext4.h | 5 ++++- fs/ext4/inode.c | 12 ++++++++++++ fs/ext4/namei.c | 7 +++++++ 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1f340743c9a8..9aa1f75409b0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -129,12 +129,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (err != ERR_BAD_DX_DIR) { return err; } - /* - * We don't set the inode dirty flag since it's not - * critical that it get flushed back to the disk. - */ - ext4_clear_inode_flag(file_inode(file), - EXT4_INODE_INDEX); + /* Can we just clear INDEX flag to ignore htree information? */ + if (!ext4_has_metadata_csum(sb)) { + /* + * We don't set the inode dirty flag since it's not + * critical that it gets flushed back to the disk. + */ + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } } if (ext4_has_inline_data(inode)) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9a2ee2428ecc..4441331d06cc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2544,8 +2544,11 @@ void ext4_insert_dentry(struct inode *inode, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { - if (!ext4_has_feature_dir_index(inode->i_sb)) + if (!ext4_has_feature_dir_index(inode->i_sb)) { + /* ext4_iget() should have caught this... */ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } } static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3313168b680f..c04a15fc8b6a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4644,6 +4644,18 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + /* + * If dir_index is not enabled but there's dir with INDEX flag set, + * we'd normally treat htree data as empty space. But with metadata + * checksumming that corrupts checksums so forbid that. + */ + if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + ext4_error_inode(inode, function, line, 0, + "iget: Dir with htree data on filesystem without dir_index feature."); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 129d2ebae00d..ceff4b4b1877 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2213,6 +2213,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; + /* Can we just ignore htree data? */ + if (ext4_has_metadata_csum(sb)) { + EXT4_ERROR_INODE(dir, + "Directory has corrupted htree index."); + retval = -EFSCORRUPTED; + goto out; + } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); -- cgit v1.2.3 From af133ade9a40794a37104ecbcc2827c0ea373a3c Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Mon, 10 Feb 2020 20:17:52 -0500 Subject: ext4: add cond_resched() to ext4_protect_reserved_inode When journal size is set too big by "mkfs.ext4 -J size=", or when we mount a crafted image to make journal inode->i_size too big, the loop, "while (i < num)", holds cpu too long. This could cause soft lockup. [ 529.357541] Call trace: [ 529.357551] dump_backtrace+0x0/0x198 [ 529.357555] show_stack+0x24/0x30 [ 529.357562] dump_stack+0xa4/0xcc [ 529.357568] watchdog_timer_fn+0x300/0x3e8 [ 529.357574] __hrtimer_run_queues+0x114/0x358 [ 529.357576] hrtimer_interrupt+0x104/0x2d8 [ 529.357580] arch_timer_handler_virt+0x38/0x58 [ 529.357584] handle_percpu_devid_irq+0x90/0x248 [ 529.357588] generic_handle_irq+0x34/0x50 [ 529.357590] __handle_domain_irq+0x68/0xc0 [ 529.357593] gic_handle_irq+0x6c/0x150 [ 529.357595] el1_irq+0xb8/0x140 [ 529.357599] __ll_sc_atomic_add_return_acquire+0x14/0x20 [ 529.357668] ext4_map_blocks+0x64/0x5c0 [ext4] [ 529.357693] ext4_setup_system_zone+0x330/0x458 [ext4] [ 529.357717] ext4_fill_super+0x2170/0x2ba8 [ext4] [ 529.357722] mount_bdev+0x1a8/0x1e8 [ 529.357746] ext4_mount+0x44/0x58 [ext4] [ 529.357748] mount_fs+0x50/0x170 [ 529.357752] vfs_kern_mount.part.9+0x54/0x188 [ 529.357755] do_mount+0x5ac/0xd78 [ 529.357758] ksys_mount+0x9c/0x118 [ 529.357760] __arm64_sys_mount+0x28/0x38 [ 529.357764] el0_svc_common+0x78/0x130 [ 529.357766] el0_svc_handler+0x38/0x78 [ 529.357769] el0_svc+0x8/0xc [ 541.356516] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [mount:18674] Link: https://lore.kernel.org/r/20200211011752.29242-1-luoshijie1@huawei.com Reviewed-by: Jan Kara Signed-off-by: Shijie Luo Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/block_validity.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 1ee04e76bbe0..0a734ffb4310 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -207,6 +207,7 @@ static int ext4_protect_reserved_inode(struct super_block *sb, return PTR_ERR(inode); num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; while (i < num) { + cond_resched(); map.m_lblk = i; map.m_len = num - i; n = ext4_map_blocks(NULL, inode, &map, 0); -- cgit v1.2.3 From 6a66a7ded12baa6ebbb2e3e82f8cb91382814839 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 13 Feb 2020 14:38:20 +0800 Subject: jbd2: move the clearing of b_modified flag to the journal_unmap_buffer() There is no need to delay the clearing of b_modified flag to the transaction committing time when unmapping the journalled buffer, so just move it to the journal_unmap_buffer(). Link: https://lore.kernel.org/r/20200213063821.30455-2-yi.zhang@huawei.com Reviewed-by: Jan Kara Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/commit.c | 43 +++++++++++++++---------------------------- fs/jbd2/transaction.c | 10 ++++++---- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2494095e0340..6396fe70085b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -976,34 +976,21 @@ restart_loop: * it. */ /* - * A buffer which has been freed while still being journaled by - * a previous transaction. - */ - if (buffer_freed(bh)) { - /* - * If the running transaction is the one containing - * "add to orphan" operation (b_next_transaction != - * NULL), we have to wait for that transaction to - * commit before we can really get rid of the buffer. - * So just clear b_modified to not confuse transaction - * credit accounting and refile the buffer to - * BJ_Forget of the running transaction. If the just - * committed transaction contains "add to orphan" - * operation, we can completely invalidate the buffer - * now. We are rather through in that since the - * buffer may be still accessible when blocksize < - * pagesize and it is attached to the last partial - * page. - */ - jh->b_modified = 0; - if (!jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; - } + * A buffer which has been freed while still being journaled + * by a previous transaction, refile the buffer to BJ_Forget of + * the running transaction. If the just committed transaction + * contains "add to orphan" operation, we can completely + * invalidate the buffer now. We are rather through in that + * since the buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial page. + */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; } if (buffer_jbddirty(bh)) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e77a5a0b4e46..2dd848a743ed 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2329,14 +2329,16 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, return -EBUSY; } /* - * OK, buffer won't be reachable after truncate. We just set - * j_next_transaction to the running transaction (if there is - * one) and mark buffer as freed so that commit code knows it - * should clear dirty bits when it is done with the buffer. + * OK, buffer won't be reachable after truncate. We just clear + * b_modified to not confuse transaction credit accounting, and + * set j_next_transaction to the running transaction (if there + * is one) and mark buffer as freed so that commit code knows + * it should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; + jh->b_modified = 0; spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); -- cgit v1.2.3 From c96dceeabf765d0b1b1f29c3bf50a5c01315b820 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 13 Feb 2020 14:38:21 +0800 Subject: jbd2: do not clear the BH_Mapped flag when forgetting a metadata buffer Commit 904cdbd41d74 ("jbd2: clear dirty flag when revoking a buffer from an older transaction") set the BH_Freed flag when forgetting a metadata buffer which belongs to the committing transaction, it indicate the committing process clear dirty bits when it is done with the buffer. But it also clear the BH_Mapped flag at the same time, which may trigger below NULL pointer oops when block_size < PAGE_SIZE. rmdir 1 kjournald2 mkdir 2 jbd2_journal_commit_transaction commit transaction N jbd2_journal_forget set_buffer_freed(bh1) jbd2_journal_commit_transaction commit transaction N+1 ... clear_buffer_mapped(bh1) ext4_getblk(bh2 ummapped) ... grow_dev_page init_page_buffers bh1->b_private=NULL bh2->b_private=NULL jbd2_journal_put_journal_head(jh1) __journal_remove_journal_head(hb1) jh1 is NULL and trigger oops *) Dir entry block bh1 and bh2 belongs to one page, and the bh2 has already been unmapped. For the metadata buffer we forgetting, we should always keep the mapped flag and clear the dirty flags is enough, so this patch pick out the these buffers and keep their BH_Mapped flag. Link: https://lore.kernel.org/r/20200213063821.30455-3-yi.zhang@huawei.com Fixes: 904cdbd41d74 ("jbd2: clear dirty flag when revoking a buffer from an older transaction") Reviewed-by: Jan Kara Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/jbd2/commit.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6396fe70085b..27373f5792a4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -985,12 +985,29 @@ restart_loop: * pagesize and it is attached to the last partial page. */ if (buffer_freed(bh) && !jh->b_next_transaction) { + struct address_space *mapping; + clear_buffer_freed(bh); clear_buffer_jbddirty(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_req(bh); - bh->b_bdev = NULL; + + /* + * Block device buffers need to stay mapped all the + * time, so it is enough to clear buffer_jbddirty and + * buffer_freed bits. For the file mapping buffers (i.e. + * journalled data) we need to unmap buffer and clear + * more bits. We also need to be careful about the check + * because the data page mapping can get cleared under + * out hands, which alse need not to clear more bits + * because the page and buffers will be freed and can + * never be reused once we are done with them. + */ + mapping = READ_ONCE(bh->b_page->mapping); + if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { -- cgit v1.2.3 From 6e5cf31fbe651bed7ba1df768f2e123531132417 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 4 Feb 2020 13:28:41 +0100 Subject: x86/mce/amd: Publish the bank pointer only after setup has succeeded threshold_create_bank() creates a bank descriptor per MCA error thresholding counter which can be controlled over sysfs. It publishes the pointer to that bank in a per-CPU variable and then goes on to create additional thresholding blocks if the bank has such. However, that creation of additional blocks in allocate_threshold_blocks() can fail, leading to a use-after-free through the per-CPU pointer. Therefore, publish that pointer only after all blocks have been setup successfully. Fixes: 019f34fccfd5 ("x86, MCE, AMD: Move shared bank to node descriptor") Reported-by: Saar Amar Reported-by: Dan Carpenter Signed-off-by: Borislav Petkov Cc: Link: http://lkml.kernel.org/r/20200128140846.phctkvx5btiexvbx@kili.mountain --- arch/x86/kernel/cpu/mce/amd.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index b3a50d962851..e7313e5c497c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1198,8 +1198,9 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return buf_mcatype; } -static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, - unsigned int block, u32 address) +static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb, + unsigned int bank, unsigned int block, + u32 address) { struct threshold_block *b = NULL; u32 low, high; @@ -1243,16 +1244,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, INIT_LIST_HEAD(&b->miscj); - if (per_cpu(threshold_banks, cpu)[bank]->blocks) { - list_add(&b->miscj, - &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); - } else { - per_cpu(threshold_banks, cpu)[bank]->blocks = b; - } + if (tb->blocks) + list_add(&b->miscj, &tb->blocks->miscj); + else + tb->blocks = b; - err = kobject_init_and_add(&b->kobj, &threshold_ktype, - per_cpu(threshold_banks, cpu)[bank]->kobj, - get_name(bank, b)); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b)); if (err) goto out_free; recurse: @@ -1260,7 +1257,7 @@ recurse: if (!address) return 0; - err = allocate_threshold_blocks(cpu, bank, block, address); + err = allocate_threshold_blocks(cpu, tb, bank, block, address); if (err) goto out_free; @@ -1345,8 +1342,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out_free; } - per_cpu(threshold_banks, cpu)[bank] = b; - if (is_shared_bank(bank)) { refcount_set(&b->cpus, 1); @@ -1357,9 +1352,13 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); - if (!err) - goto out; + err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); + if (err) + goto out_free; + + per_cpu(threshold_banks, cpu)[bank] = b; + + return 0; out_free: kfree(b); -- cgit v1.2.3 From 4508cf76b1ecdf20a456b6b161acbe78f3b23358 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 13 Feb 2020 12:43:42 +0100 Subject: serial: cpm_uart: call cpm_muram_init before registering console Christophe reports that powerpc 8xx silently fails to 5.6-rc1. It turns out I was wrong about nobody relying on the lazy initialization of the cpm/qe muram in commit b6231ea2b3c6 (soc: fsl: qe: drop broken lazy call of cpm_muram_init()). Rather than reinstating the somewhat dubious lazy call (initializing a currently held spinlock, and implicitly doing a GFP_KERNEL under that spinlock), make sure that cpm_muram_init() is called early enough - I thought the calls from the subsys_initcalls were good enough, but when used by console drivers, that's obviously not the case. cpm_muram_init() is safe to call twice (there's an early return if it is already initialized), so keep the call from cpm_init() - in case SERIAL_CPM_CONSOLE=n. Fixes: b6231ea2b3c6 (soc: fsl: qe: drop broken lazy call of cpm_muram_init()) Reported-by: Christophe Leroy Signed-off-by: Rasmus Villemoes Tested-by: Christophe Leroy Link: https://lore.kernel.org/r/20200213114342.21712-1-linux@rasmusvillemoes.dk Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/cpm_uart/cpm_uart_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/cpm_uart/cpm_uart_core.c b/drivers/tty/serial/cpm_uart/cpm_uart_core.c index 19d5a4cf29a6..d4b81b06e0cb 100644 --- a/drivers/tty/serial/cpm_uart/cpm_uart_core.c +++ b/drivers/tty/serial/cpm_uart/cpm_uart_core.c @@ -1373,6 +1373,7 @@ static struct console cpm_scc_uart_console = { static int __init cpm_uart_console_init(void) { + cpm_muram_init(); register_console(&cpm_scc_uart_console); return 0; } -- cgit v1.2.3 From 687bff0cd08f790d540cfb7b2349f0d876cdddec Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 10 Feb 2020 09:11:30 +0100 Subject: vt: selection, handle pending signals in paste_selection When pasting a selection to a vt, the task is set as INTERRUPTIBLE while waiting for a tty to unthrottle. But signals are not handled at all. Normally, this is not a problem as tty_ldisc_receive_buf receives all the goods and a user has no reason to interrupt the task. There are two scenarios where this matters: 1) when the tty is throttled and a signal is sent to the process, it spins on a CPU until the tty is unthrottled. schedule() does not really echedule, but returns immediately, of course. 2) when the sel_buffer becomes invalid, KASAN prevents any reads from it and the loop simply does not proceed and spins forever (causing the tty to throttle, but the code never sleeps, the same as above). This sometimes happens as there is a race in the sel_buffer handling code. So add signal handling to this ioctl (TIOCL_PASTESEL) and return -EINTR in case a signal is pending. Signed-off-by: Jiri Slaby Cc: stable Link: https://lore.kernel.org/r/20200210081131.23572-1-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/selection.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index 78732feaf65b..44d974d4159f 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -29,6 +29,8 @@ #include #include +#include + /* Don't take this from : 011-015 on the screen aren't spaces */ #define isspace(c) ((c) == ' ') @@ -350,6 +352,7 @@ int paste_selection(struct tty_struct *tty) unsigned int count; struct tty_ldisc *ld; DECLARE_WAITQUEUE(wait, current); + int ret = 0; console_lock(); poke_blanked_console(); @@ -363,6 +366,10 @@ int paste_selection(struct tty_struct *tty) add_wait_queue(&vc->paste_wait, &wait); while (sel_buffer && sel_buffer_lth > pasted) { set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) { + ret = -EINTR; + break; + } if (tty_throttled(tty)) { schedule(); continue; @@ -378,6 +385,6 @@ int paste_selection(struct tty_struct *tty) tty_buffer_unlock_exclusive(&vc->port); tty_ldisc_deref(ld); - return 0; + return ret; } EXPORT_SYMBOL_GPL(paste_selection); -- cgit v1.2.3 From 07e6124a1a46b4b5a9b3cacc0c306b50da87abf5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 10 Feb 2020 09:11:31 +0100 Subject: vt: selection, close sel_buffer race syzkaller reported this UAF: BUG: KASAN: use-after-free in n_tty_receive_buf_common+0x2481/0x2940 drivers/tty/n_tty.c:1741 Read of size 1 at addr ffff8880089e40e9 by task syz-executor.1/13184 CPU: 0 PID: 13184 Comm: syz-executor.1 Not tainted 5.4.7 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Call Trace: ... kasan_report+0xe/0x20 mm/kasan/common.c:634 n_tty_receive_buf_common+0x2481/0x2940 drivers/tty/n_tty.c:1741 tty_ldisc_receive_buf+0xac/0x190 drivers/tty/tty_buffer.c:461 paste_selection+0x297/0x400 drivers/tty/vt/selection.c:372 tioclinux+0x20d/0x4e0 drivers/tty/vt/vt.c:3044 vt_ioctl+0x1bcf/0x28d0 drivers/tty/vt/vt_ioctl.c:364 tty_ioctl+0x525/0x15a0 drivers/tty/tty_io.c:2657 vfs_ioctl fs/ioctl.c:47 [inline] It is due to a race between parallel paste_selection (TIOCL_PASTESEL) and set_selection_user (TIOCL_SETSEL) invocations. One uses sel_buffer, while the other frees it and reallocates a new one for another selection. Add a mutex to close this race. The mutex takes care properly of sel_buffer and sel_buffer_lth only. The other selection global variables (like sel_start, sel_end, and sel_cons) are protected only in set_selection_user. The other functions need quite some more work to close the races of the variables there. This is going to happen later. This likely fixes (I am unsure as there is no reproducer provided) bug 206361 too. It was marked as CVE-2020-8648. Signed-off-by: Jiri Slaby Reported-by: syzbot+59997e8d5cbdc486e6f6@syzkaller.appspotmail.com References: https://bugzilla.kernel.org/show_bug.cgi?id=206361 Cc: stable Link: https://lore.kernel.org/r/20200210081131.23572-2-jslaby@suse.cz Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/selection.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index 44d974d4159f..0c50d7410b31 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,7 @@ static volatile int sel_start = -1; /* cleared by clear_selection */ static int sel_end; static int sel_buffer_lth; static char *sel_buffer; +static DEFINE_MUTEX(sel_lock); /* clear_selection, highlight and highlight_pointer can be called from interrupt (via scrollback/front) */ @@ -186,7 +188,7 @@ int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) char *bp, *obp; int i, ps, pe, multiplier; u32 c; - int mode; + int mode, ret = 0; poke_blanked_console(); @@ -212,6 +214,7 @@ int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) if (ps > pe) /* make sel_start <= sel_end */ swap(ps, pe); + mutex_lock(&sel_lock); if (sel_cons != vc_cons[fg_console].d) { clear_selection(); sel_cons = vc_cons[fg_console].d; @@ -257,9 +260,10 @@ int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) break; case TIOCL_SELPOINTER: highlight_pointer(pe); - return 0; + goto unlock; default: - return -EINVAL; + ret = -EINVAL; + goto unlock; } /* remove the pointer */ @@ -281,7 +285,7 @@ int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) else if (new_sel_start == sel_start) { if (new_sel_end == sel_end) /* no action required */ - return 0; + goto unlock; else if (new_sel_end > sel_end) /* extend to right */ highlight(sel_end + 2, new_sel_end); else /* contract from right */ @@ -309,7 +313,8 @@ int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) if (!bp) { printk(KERN_WARNING "selection: kmalloc() failed\n"); clear_selection(); - return -ENOMEM; + ret = -ENOMEM; + goto unlock; } kfree(sel_buffer); sel_buffer = bp; @@ -334,7 +339,9 @@ int set_selection_kernel(struct tiocl_selection *v, struct tty_struct *tty) } } sel_buffer_lth = bp - sel_buffer; - return 0; +unlock: + mutex_unlock(&sel_lock); + return ret; } EXPORT_SYMBOL_GPL(set_selection_kernel); @@ -364,6 +371,7 @@ int paste_selection(struct tty_struct *tty) tty_buffer_lock_exclusive(&vc->port); add_wait_queue(&vc->paste_wait, &wait); + mutex_lock(&sel_lock); while (sel_buffer && sel_buffer_lth > pasted) { set_current_state(TASK_INTERRUPTIBLE); if (signal_pending(current)) { @@ -371,7 +379,9 @@ int paste_selection(struct tty_struct *tty) break; } if (tty_throttled(tty)) { + mutex_unlock(&sel_lock); schedule(); + mutex_lock(&sel_lock); continue; } __set_current_state(TASK_RUNNING); @@ -380,6 +390,7 @@ int paste_selection(struct tty_struct *tty) count); pasted += count; } + mutex_unlock(&sel_lock); remove_wait_queue(&vc->paste_wait, &wait); __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From 3e8393630e928767aeb23f4744518de4ea5cc35a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 12 Feb 2020 14:00:40 +0000 Subject: selftests: use LDLIBS for libraries instead of LDFLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While building selftests, the following errors were observed: > tools/testing/selftests/timens' > gcc -Wall -Werror -pthread -lrt -ldl timens.c -o tools/testing/selftests/timens/timens > /usr/bin/ld: /tmp/ccGy5CST.o: in function `check_config_posix_timers': > timens.c:(.text+0x65a): undefined reference to `timer_create' > collect2: error: ld returned 1 exit status Quoting commit 870f193d48c2 ("selftests: net: use LDLIBS instead of LDFLAGS"): The default Makefile rule looks like: $(CC) $(CFLAGS) $(LDFLAGS) $@ $^ $(LDLIBS) When linking is done by gcc itself, no issue, but when it needs to be passed to proper ld, only LDLIBS follows and then ld cannot know what libs to link with. More detail: https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html LDFLAGS Extra flags to give to compilers when they are supposed to invoke the linker, ‘ld’, such as -L. Libraries (-lfoo) should be added to the LDLIBS variable instead. LDLIBS Library flags or names given to compilers when they are supposed to invoke the linker, ‘ld’. LOADLIBES is a deprecated (but still supported) alternative to LDLIBS. Non-library linker flags, such as -L, should go in the LDFLAGS variable. While at here, correct other selftests, not only timens ones. Reported-by: Shuah Khan Signed-off-by: Dmitry Safonov Tested-by: Shuah Khan Signed-off-by: Shuah Khan --- tools/testing/selftests/futex/functional/Makefile | 2 +- tools/testing/selftests/net/Makefile | 4 ++-- tools/testing/selftests/rtc/Makefile | 2 +- tools/testing/selftests/timens/Makefile | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile index 30996306cabc..23207829ec75 100644 --- a/tools/testing/selftests/futex/functional/Makefile +++ b/tools/testing/selftests/futex/functional/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 INCLUDES := -I../include -I../../ CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES) -LDFLAGS := $(LDFLAGS) -pthread -lrt +LDLIBS := -lpthread -lrt HEADERS := \ ../include/futextest.h \ diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index b5694196430a..287ae916ec0b 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -27,5 +27,5 @@ KSFT_KHDR_INSTALL := 1 include ../lib.mk $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma -$(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread -$(OUTPUT)/tcp_inq: LDFLAGS += -lpthread +$(OUTPUT)/tcp_mmap: LDLIBS += -lpthread +$(OUTPUT)/tcp_inq: LDLIBS += -lpthread diff --git a/tools/testing/selftests/rtc/Makefile b/tools/testing/selftests/rtc/Makefile index de9c8566672a..2d93d65723c9 100644 --- a/tools/testing/selftests/rtc/Makefile +++ b/tools/testing/selftests/rtc/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -O3 -Wl,-no-as-needed -Wall -LDFLAGS += -lrt -lpthread -lm +LDLIBS += -lrt -lpthread -lm TEST_GEN_PROGS = rtctest diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile index e9fb30bd8aeb..b4fd9a934654 100644 --- a/tools/testing/selftests/timens/Makefile +++ b/tools/testing/selftests/timens/Makefile @@ -2,6 +2,6 @@ TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec TEST_GEN_PROGS_EXTENDED := gettime_perf CFLAGS := -Wall -Werror -pthread -LDFLAGS := -lrt -ldl +LDLIBS := -lrt -ldl include ../lib.mk -- cgit v1.2.3 From 9a0584f05687947d5a0b87f046bcd2592a55e67c Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Thu, 13 Feb 2020 18:26:56 +1100 Subject: selftests: openat2: fix build error on newer glibc It appears that newer glibcs check that openat(O_CREAT) was provided a fourth argument (rather than passing garbage), resulting in the following build error: > In file included from /usr/include/fcntl.h:301, > from helpers.c:9: > In function 'openat', > inlined from 'touchat' at helpers.c:49:11: > /usr/include/x86_64-linux-gnu/bits/fcntl2.h:126:4: error: call to > '__openat_missing_mode' declared with attribute error: openat with O_CREAT > or O_TMPFILE in third argument needs 4 arguments > 126 | __openat_missing_mode (); > | ^~~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Shuah Khan Signed-off-by: Aleksa Sarai Tested-by: Shuah Khan Signed-off-by: Shuah Khan --- tools/testing/selftests/openat2/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/openat2/helpers.c b/tools/testing/selftests/openat2/helpers.c index e9a6557ab16f..5074681ffdc9 100644 --- a/tools/testing/selftests/openat2/helpers.c +++ b/tools/testing/selftests/openat2/helpers.c @@ -46,7 +46,7 @@ int sys_renameat2(int olddirfd, const char *oldpath, int touchat(int dfd, const char *path) { - int fd = openat(dfd, path, O_CREAT); + int fd = openat(dfd, path, O_CREAT, 0700); if (fd >= 0) close(fd); return fd; -- cgit v1.2.3 From ca1c671302825182629d3c1a60363cee6f5455bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 12 Feb 2020 11:12:30 -0500 Subject: xprtrdma: Fix DMA scatter-gather list mapping imbalance The @nents value that was passed to ib_dma_map_sg() has to be passed to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() choses to concatenate sg entries, it will return a different nents value than it was passed. The bug was exposed by recent changes to the AMD IOMMU driver, which enabled sg entry concatenation. Looking all the way back to commit 4143f34e01e9 ("xprtrdma: Port to new memory registration API") and reviewing other kernel ULPs, it's not clear that the frwr_map() logic was ever correct for this case. Reported-by: Andre Tomt Suggested-by: Robin Murphy Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org Reviewed-by: Jason Gunthorpe Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 095be887753e..125297c9aa3e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -288,8 +288,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, { struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct ib_reg_wr *reg_wr; + int i, n, dma_nents; struct ib_mr *ibmr; - int i, n; u8 key; if (nsegs > ia->ri_max_frwr_depth) @@ -313,15 +313,16 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, break; } mr->mr_dir = rpcrdma_data_dir(writing); + mr->mr_nents = i; - mr->mr_nents = - ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); - if (!mr->mr_nents) + dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + if (!dma_nents) goto out_dmamap_err; ibmr = mr->frwr.fr_mr; - n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); - if (unlikely(n != mr->mr_nents)) + n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); + if (n != dma_nents) goto out_mapmr_err; ibmr->iova &= 0x00000000ffffffff; -- cgit v1.2.3 From cd1b659d8ce7697ee9799b64f887528315b9097b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 12 Feb 2020 17:32:12 -0500 Subject: NFSv4.1 make cachethis=no for writes Turning caching off for writes on the server should improve performance. Fixes: fba83f34119a ("NFS: Pass "privileged" value to nfs4_init_sequence()") Signed-off-by: Olga Kornievskaia Reviewed-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6616a575711e..69b7ab7a5815 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5336,7 +5336,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, hdr->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; - nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1, 0); + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr); } -- cgit v1.2.3 From 8c75593c6eee0f661ddf25dfde0e6ad2a84be7a9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 13 Feb 2020 14:51:06 -0500 Subject: NFSv4: Ensure the delegation is pinned in nfs_do_return_delegation() The call to nfs_do_return_delegation() needs to be taken without any RCU locks. Add a refcount to make sure the delegation remains pinned in memory until we're done. Fixes: ee05f456772d ("NFSv4: Fix races between open and delegreturn") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/delegation.c | 33 ++++++++++++++++++++++++++------- fs/nfs/delegation.h | 1 + 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d856326836a2..c17ff826e7e9 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -47,10 +47,22 @@ static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation) } } +static struct nfs_delegation *nfs_get_delegation(struct nfs_delegation *delegation) +{ + refcount_inc(&delegation->refcount); + return delegation; +} + +static void nfs_put_delegation(struct nfs_delegation *delegation) +{ + if (refcount_dec_and_test(&delegation->refcount)) + __nfs_free_delegation(delegation); +} + static void nfs_free_delegation(struct nfs_delegation *delegation) { nfs_mark_delegation_revoked(delegation); - __nfs_free_delegation(delegation); + nfs_put_delegation(delegation); } /** @@ -275,8 +287,10 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - ret = delegation; + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } spin_unlock(&delegation->lock); if (ret) nfs_clear_verifier_delegated(&nfsi->vfs_inode); @@ -397,6 +411,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (delegation == NULL) return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, stateid); + refcount_set(&delegation->refcount, 1); delegation->type = type; delegation->pagemod_limit = pagemod_limit; delegation->change_attr = inode_peek_iversion_raw(inode); @@ -496,6 +511,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation err = nfs_do_return_delegation(inode, delegation, issync); out: + /* Refcount matched in nfs_start_delegation_return_locked() */ + nfs_put_delegation(delegation); return err; } @@ -690,7 +707,8 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode) list_empty(&NFS_I(inode)->open_files) && !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); - ret = delegation; + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); } spin_unlock(&delegation->lock); if (ret) @@ -1094,10 +1112,11 @@ restart: delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { - delegation = nfs_detach_delegation(NFS_I(inode), - delegation, server); - if (delegation != NULL) + if (nfs_detach_delegation(NFS_I(inode), delegation, + server) != NULL) nfs_free_delegation(delegation); + /* Match nfs_start_delegation_return_locked */ + nfs_put_delegation(delegation); } iput(inode); nfs_sb_deactive(server->super); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 31b84604d383..9b00a0b7f832 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -22,6 +22,7 @@ struct nfs_delegation { unsigned long pagemod_limit; __u64 change_attr; unsigned long flags; + refcount_t refcount; spinlock_t lock; struct rcu_head rcu; }; -- cgit v1.2.3 From 5d63944f8206a80636ae8cb4b9107d3b49f43d37 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 13 Feb 2020 14:51:07 -0500 Subject: NFSv4: Ensure the delegation cred is pinned when we call delegreturn Ensure we don't release the delegation cred during the call to nfs4_proc_delegreturn(). Fixes: ee05f456772d ("NFSv4: Fix races between open and delegreturn") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/delegation.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index c17ff826e7e9..1865322de142 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -255,13 +255,18 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) { + const struct cred *cred; int res = 0; - if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) - res = nfs4_proc_delegreturn(inode, - delegation->cred, + if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + spin_lock(&delegation->lock); + cred = get_cred(delegation->cred); + spin_unlock(&delegation->lock); + res = nfs4_proc_delegreturn(inode, cred, &delegation->stateid, issync); + put_cred(cred); + } return res; } -- cgit v1.2.3 From a1028dcfd0dd97884072288d0c8ed7f30399b528 Mon Sep 17 00:00:00 2001 From: Harigovindan P Date: Thu, 6 Feb 2020 14:26:15 +0530 Subject: drm/msm/dsi: save pll state before dsi host is powered off Save pll state before dsi host is powered off. Without this change some register values gets resetted. Signed-off-by: Harigovindan P Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/dsi_manager.c | 5 +++++ drivers/gpu/drm/msm/dsi/phy/dsi_phy.c | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/dsi/dsi_manager.c b/drivers/gpu/drm/msm/dsi/dsi_manager.c index acc711fd14f8..4864b9558f65 100644 --- a/drivers/gpu/drm/msm/dsi/dsi_manager.c +++ b/drivers/gpu/drm/msm/dsi/dsi_manager.c @@ -506,6 +506,7 @@ static void dsi_mgr_bridge_post_disable(struct drm_bridge *bridge) struct msm_dsi *msm_dsi1 = dsi_mgr_get_dsi(DSI_1); struct mipi_dsi_host *host = msm_dsi->host; struct drm_panel *panel = msm_dsi->panel; + struct msm_dsi_pll *src_pll; bool is_dual_dsi = IS_DUAL_DSI(); int ret; @@ -539,6 +540,10 @@ static void dsi_mgr_bridge_post_disable(struct drm_bridge *bridge) id, ret); } + /* Save PLL status if it is a clock source */ + src_pll = msm_dsi_phy_get_pll(msm_dsi->phy); + msm_dsi_pll_save_state(src_pll); + ret = msm_dsi_host_power_off(host); if (ret) pr_err("%s: host %d power off failed,%d\n", __func__, id, ret); diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c index b0cfa67d2a57..f509ebd77500 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c @@ -724,10 +724,6 @@ void msm_dsi_phy_disable(struct msm_dsi_phy *phy) if (!phy || !phy->cfg->ops.disable) return; - /* Save PLL status if it is a clock source */ - if (phy->usecase != MSM_DSI_PHY_SLAVE) - msm_dsi_pll_save_state(phy->pll); - phy->cfg->ops.disable(phy); dsi_phy_regulator_disable(phy); -- cgit v1.2.3 From c6659785dfb3f8d75f1fe637e4222ff8178f5280 Mon Sep 17 00:00:00 2001 From: Harigovindan P Date: Thu, 6 Feb 2020 14:42:01 +0530 Subject: drm/msm/dsi/pll: call vco set rate explicitly For a given byte clock, if VCO recalc value is exactly same as vco set rate value, vco_set_rate does not get called assuming VCO is already set to required value. But Due to GDSC toggle, VCO values are erased in the HW. To make sure VCO is programmed correctly, we forcefully call set_rate from vco_prepare. Signed-off-by: Harigovindan P Reviewed-by: Jeffrey Hugo Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c index 1c894548dd72..6ac04fc303f5 100644 --- a/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c +++ b/drivers/gpu/drm/msm/dsi/pll/dsi_pll_10nm.c @@ -411,6 +411,12 @@ static int dsi_pll_10nm_vco_prepare(struct clk_hw *hw) if (pll_10nm->slave) dsi_pll_enable_pll_bias(pll_10nm->slave); + rc = dsi_pll_10nm_vco_set_rate(hw,pll_10nm->vco_current_rate, 0); + if (rc) { + pr_err("vco_set_rate failed, rc=%d\n", rc); + return rc; + } + /* Start PLL */ pll_write(pll_10nm->phy_cmn_mmio + REG_DSI_10nm_PHY_CMN_PLL_CNTRL, 0x01); -- cgit v1.2.3 From 8fc7036ee652207ca992fbb9abb64090c355a9e0 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 13 Feb 2020 12:01:35 -0800 Subject: drm/msm/dpu: fix BGR565 vs RGB565 confusion The component order between the two was swapped, resulting in incorrect color when games with 565 visual hit the overlay path instead of GPU composition. Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support") Signed-off-by: Rob Clark Reviewed-by: Sean Paul Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c index 528632690f1e..a05282dede91 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_formats.c @@ -255,13 +255,13 @@ static const struct dpu_format dpu_format_map[] = { INTERLEAVED_RGB_FMT(RGB565, 0, COLOR_5BIT, COLOR_6BIT, COLOR_5BIT, - C2_R_Cr, C0_G_Y, C1_B_Cb, 0, 3, + C1_B_Cb, C0_G_Y, C2_R_Cr, 0, 3, false, 2, 0, DPU_FETCH_LINEAR, 1), INTERLEAVED_RGB_FMT(BGR565, 0, COLOR_5BIT, COLOR_6BIT, COLOR_5BIT, - C1_B_Cb, C0_G_Y, C2_R_Cr, 0, 3, + C2_R_Cr, C0_G_Y, C1_B_Cb, 0, 3, false, 2, 0, DPU_FETCH_LINEAR, 1), -- cgit v1.2.3 From 304db6cb7610eb9104e1f911b235d87dc4802558 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 11 Feb 2020 10:13:44 +0800 Subject: page_pool: refill page when alloc.count of pool is zero "do {} while" in page_pool_refill_alloc_cache will always refill page once whether refill is true or false, and whether alloc.count of pool is less than PP_ALLOC_CACHE_REFILL or not this is wrong, and will cause overflow of pool->alloc.cache the caller of __page_pool_get_cached should provide guarantee that pool->alloc.cache is safe to access, so in_serving_softirq should be removed as suggested by Jesper Dangaard Brouer in https://patchwork.ozlabs.org/patch/1233713/ so fix this issue by calling page_pool_refill_alloc_cache() only when pool->alloc.count is zero Fixes: 44768decb7c0 ("page_pool: handle page recycle for NUMA_NO_NODE condition") Signed-off-by: Li RongQing Suggested-by: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- net/core/page_pool.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b7cbe35df37..10d2b255df5e 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -99,8 +99,7 @@ EXPORT_SYMBOL(page_pool_create); static void __page_pool_return_page(struct page_pool *pool, struct page *page); noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, - bool refill) +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; struct page *page; @@ -141,8 +140,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, page = NULL; break; } - } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL && - refill); + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) @@ -155,20 +153,16 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, /* fast path */ static struct page *__page_pool_get_cached(struct page_pool *pool) { - bool refill = false; struct page *page; - /* Test for safe-context, caller should provide this guarantee */ - if (likely(in_serving_softirq())) { - if (likely(pool->alloc.count)) { - /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; - return page; - } - refill = true; + /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + } else { + page = page_pool_refill_alloc_cache(pool); } - page = page_pool_refill_alloc_cache(pool, refill); return page; } -- cgit v1.2.3 From 6ee2deb6fbed6ed343040215d10f3c73d00304df Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 11 Feb 2020 18:31:54 +0800 Subject: net/flow_dissector: remove unexist field description @thoff has moved to struct flow_dissector_key_control. Fixes: 42aecaa9bb2b ("net: Get skb hash over flow_keys structure") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index d93017a7ce5c..e9391e877f9a 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -33,7 +33,6 @@ enum flow_dissect_ret { /** * struct flow_dissector_key_basic: - * @thoff: Transport header offset * @n_proto: Network header protocol (eg. IPv4/IPv6) * @ip_proto: Transport header protocol (eg. TCP/UDP) */ -- cgit v1.2.3 From 1afa3cc90f8fb745c777884d79eaa1001d6927a6 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 11 Feb 2020 19:33:39 +0100 Subject: net/sched: matchall: add missing validation of TCA_MATCHALL_FLAGS unlike other classifiers that can be offloaded (i.e. users can set flags like 'skip_hw' and 'skip_sw'), 'cls_matchall' doesn't validate the size of netlink attribute 'TCA_MATCHALL_FLAGS' provided by user: add a proper entry to mall_policy. Fixes: b87f7936a932 ("net/sched: Add match-all classifier hw offloading.") Signed-off-by: Davide Caratti Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 039cc86974f4..610a0b728161 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -157,6 +157,7 @@ static void *mall_get(struct tcf_proto *tp, u32 handle) static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC }, [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 }, + [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 }, }; static int mall_set_parms(struct net *net, struct tcf_proto *tp, -- cgit v1.2.3 From e2debf0852c4d66ba1a8bde12869b196094c70a7 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 11 Feb 2020 19:33:40 +0100 Subject: net/sched: flower: add missing validation of TCA_FLOWER_FLAGS unlike other classifiers that can be offloaded (i.e. users can set flags like 'skip_hw' and 'skip_sw'), 'cls_flower' doesn't validate the size of netlink attribute 'TCA_FLOWER_FLAGS' provided by user: add a proper entry to fl_policy. Fixes: 5b33f48842fa ("net/flower: Introduce hardware offload support") Signed-off-by: Davide Caratti Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f9c0d1e8d380..7e54d2ab5254 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -691,6 +691,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, + [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy -- cgit v1.2.3 From 0b41713b606694257b90d61ba7e2712d8457648b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Feb 2020 20:47:05 +0100 Subject: icmp: introduce helper for nat'd source address in network device context This introduces a helper function to be called only by network drivers that wraps calls to icmp[v6]_send in a conntrack transformation, in case NAT has been used. We don't want to pollute the non-driver path, though, so we introduce this as a helper to be called by places that actually make use of this, as suggested by Florian. Signed-off-by: Jason A. Donenfeld Cc: Florian Westphal Signed-off-by: David S. Miller --- include/linux/icmpv6.h | 6 ++++++ include/net/icmp.h | 6 ++++++ net/ipv4/icmp.c | 33 +++++++++++++++++++++++++++++++++ net/ipv6/ip6_icmp.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+) diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index ef1cbb5f454f..93338fd54af8 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -31,6 +31,12 @@ static inline void icmpv6_send(struct sk_buff *skb, } #endif +#if IS_ENABLED(CONFIG_NF_NAT) +void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); +#else +#define icmpv6_ndo_send icmpv6_send +#endif + extern int icmpv6_init(void); extern int icmpv6_err_convert(u8 type, u8 code, int *err); diff --git a/include/net/icmp.h b/include/net/icmp.h index 5d4bfdba9adf..9ac2d2672a93 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -43,6 +43,12 @@ static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); } +#if IS_ENABLED(CONFIG_NF_NAT) +void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); +#else +#define icmp_ndo_send icmp_send +#endif + int icmp_rcv(struct sk_buff *skb); int icmp_err(struct sk_buff *skb, u32 info); int icmp_init(void); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 18068ed42f25..f369e7ce685b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -748,6 +748,39 @@ out:; } EXPORT_SYMBOL(__icmp_send); +#if IS_ENABLED(CONFIG_NF_NAT) +#include +void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + __be32 orig_ip; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { + icmp_send(skb_in, type, code, info); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct iphdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct iphdr)))) + goto out; + + orig_ip = ip_hdr(skb_in)->saddr; + ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; + icmp_send(skb_in, type, code, info); + ip_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmp_ndo_send); +#endif static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 02045494c24c..e0086758b6ee 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -45,4 +45,38 @@ out: rcu_read_unlock(); } EXPORT_SYMBOL(icmpv6_send); + +#if IS_ENABLED(CONFIG_NF_NAT) +#include +void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) +{ + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct in6_addr orig_ip; + struct nf_conn *ct; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { + icmpv6_send(skb_in, type, code, info); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) + goto out; + + orig_ip = ipv6_hdr(skb_in)->saddr; + ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; + icmpv6_send(skb_in, type, code, info); + ipv6_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmpv6_ndo_send); +#endif #endif -- cgit v1.2.3 From e0fce6f945a26d4e953a147fe7ca11410322c9fe Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Feb 2020 20:47:06 +0100 Subject: gtp: use icmp_ndo_send helper Because gtp is calling icmp from network device context, it should use the ndo helper so that the rate limiting applies correctly. Signed-off-by: Jason A. Donenfeld Cc: Harald Welte Signed-off-by: David S. Miller --- drivers/net/gtp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index af07ea760b35..672cd2caf2fb 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -546,8 +546,8 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, mtu < ntohs(iph->tot_len)) { netdev_dbg(dev, "packet too big, fragmentation needed\n"); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); goto err_rt; } -- cgit v1.2.3 From 67c9a7e1e3ac491b5df018803639addc36f154ba Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Feb 2020 20:47:07 +0100 Subject: sunvnet: use icmp_ndo_send helper Because sunvnet is calling icmp from network device context, it should use the ndo helper so that the rate limiting applies correctly. While we're at it, doing the additional route lookup before calling icmp_ndo_send is superfluous, since this is the job of the icmp code in the first place. Signed-off-by: Jason A. Donenfeld Cc: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sunvnet_common.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/sun/sunvnet_common.c b/drivers/net/ethernet/sun/sunvnet_common.c index c23ce838ff63..8dc6c9ff22e1 100644 --- a/drivers/net/ethernet/sun/sunvnet_common.c +++ b/drivers/net/ethernet/sun/sunvnet_common.c @@ -1350,27 +1350,12 @@ sunvnet_start_xmit_common(struct sk_buff *skb, struct net_device *dev, if (vio_version_after_eq(&port->vio, 1, 3)) localmtu -= VLAN_HLEN; - if (skb->protocol == htons(ETH_P_IP)) { - struct flowi4 fl4; - struct rtable *rt = NULL; - - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = dev->ifindex; - fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); - fl4.daddr = ip_hdr(skb)->daddr; - fl4.saddr = ip_hdr(skb)->saddr; - - rt = ip_route_output_key(dev_net(dev), &fl4); - if (!IS_ERR(rt)) { - skb_dst_set(skb, &rt->dst); - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_FRAG_NEEDED, - htonl(localmtu)); - } - } + if (skb->protocol == htons(ETH_P_IP)) + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(localmtu)); #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, localmtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, localmtu); #endif goto out_dropped; } -- cgit v1.2.3 From a12d7f3cbdc72c7625881c8dc2660fc2c979fdf2 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Feb 2020 20:47:08 +0100 Subject: wireguard: device: use icmp_ndo_send helper Because wireguard is calling icmp from network device context, it should use the ndo helper so that the rate limiting applies correctly. This commit adds a small test to the wireguard test suite to ensure that the new functions continue doing the right thing in the context of wireguard. It does this by setting up a condition that will definately evoke an icmp error message from the driver, but along a nat'd path. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- drivers/net/wireguard/device.c | 4 ++-- tools/testing/selftests/wireguard/netns.sh | 11 +++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 16b19824b9ad..43db442b1373 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -203,9 +203,9 @@ err_peer: err: ++dev->stats.tx_errors; if (skb->protocol == htons(ETH_P_IP)) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); else if (skb->protocol == htons(ETH_P_IPV6)) - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); + icmpv6_ndo_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); kfree_skb(skb); return ret; } diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index f5ab1cda8bb5..138d46b3f330 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -24,6 +24,7 @@ set -e exec 3>&1 +export LANG=C export WG_HIDE_KEYS=never netns0="wg-test-$$-0" netns1="wg-test-$$-1" @@ -297,7 +298,17 @@ ip1 -4 rule add table main suppress_prefixlength 0 n1 ping -W 1 -c 100 -f 192.168.99.7 n1 ping -W 1 -c 100 -f abab::1111 +# Have ns2 NAT into wg0 packets from ns0, but return an icmp error along the right route. +n2 iptables -t nat -A POSTROUTING -s 10.0.0.0/24 -d 192.168.241.0/24 -j SNAT --to 192.168.241.2 +n0 iptables -t filter -A INPUT \! -s 10.0.0.0/24 -i vethrs -j DROP # Manual rpfilter just to be explicit. +n2 bash -c 'printf 1 > /proc/sys/net/ipv4/ip_forward' +ip0 -4 route add 192.168.241.1 via 10.0.0.100 +n2 wg set wg0 peer "$pub1" remove +[[ $(! n0 ping -W 1 -c 1 192.168.241.1 || false) == *"From 10.0.0.100 icmp_seq=1 Destination Host Unreachable"* ]] + n0 iptables -t nat -F +n0 iptables -t filter -F +n2 iptables -t nat -F ip0 link del vethrc ip0 link del vethrs ip1 link del wg0 -- cgit v1.2.3 From 45942ba890e6f35232727a5fa33d732681f4eb9f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Feb 2020 20:47:09 +0100 Subject: xfrm: interface: use icmp_ndo_send helper Because xfrmi is calling icmp from network device context, it should use the ndo helper so that the rate limiting applies correctly. Signed-off-by: Jason A. Donenfeld Cc: Nicolas Dichtel Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_interface.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index dc651a628dcf..3361e3ac5714 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -300,10 +300,10 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } else { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); } dst_release(dst); -- cgit v1.2.3 From 3da627073b56955b4f1d028c4b8092af59375938 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 13 Feb 2020 21:48:42 +0000 Subject: Documentation/process: Swap out the ambassador for Canonical John Johansen will take over as the process ambassador for Canonical when dealing with embargoed hardware issues. Cc: John Johansen Cc: linux-kernel@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: Alex Shi Cc: Harry Wei Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Acked-by: John Johansen Signed-off-by: Tyler Hicks Link: https://lore.kernel.org/r/20200213214842.21312-1-tyhicks@canonical.com Signed-off-by: Greg Kroah-Hartman --- Documentation/process/embargoed-hardware-issues.rst | 2 +- Documentation/translations/zh_CN/process/embargoed-hardware-issues.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/process/embargoed-hardware-issues.rst b/Documentation/process/embargoed-hardware-issues.rst index 64f375c02358..a19d084f9b2c 100644 --- a/Documentation/process/embargoed-hardware-issues.rst +++ b/Documentation/process/embargoed-hardware-issues.rst @@ -254,7 +254,7 @@ an involved disclosed party. The current ambassadors list: VMware Xen Andrew Cooper - Canonical Tyler Hicks + Canonical John Johansen Debian Ben Hutchings Oracle Konrad Rzeszutek Wilk Red Hat Josh Poimboeuf diff --git a/Documentation/translations/zh_CN/process/embargoed-hardware-issues.rst b/Documentation/translations/zh_CN/process/embargoed-hardware-issues.rst index b93f1af68261..88273ebe7823 100644 --- a/Documentation/translations/zh_CN/process/embargoed-hardware-issues.rst +++ b/Documentation/translations/zh_CN/process/embargoed-hardware-issues.rst @@ -183,7 +183,7 @@ CVE分配 VMware Xen Andrew Cooper - Canonical Tyler Hicks + Canonical John Johansen Debian Ben Hutchings Oracle Konrad Rzeszutek Wilk Red Hat Josh Poimboeuf -- cgit v1.2.3 From 2ca10259b4189a433c309054496dd6af1415f992 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Feb 2020 17:17:35 -0700 Subject: io_uring: prune request from overflow list on flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Carter reported an issue where he could produce a stall on ring exit, when we're cleaning up requests that match the given file table. For this particular test case, a combination of a few things caused the issue: - The cq ring was overflown - The request being canceled was in the overflow list The combination of the above means that the cq overflow list holds a reference to the request. The request is canceled correctly, but since the overflow list holds a reference to it, the final put won't happen. Since the final put doesn't happen, the request remains in the inflight. Hence we never finish the cancelation flush. Fix this by removing requests from the overflow list if we're canceling them. Cc: stable@vger.kernel.org # 5.5 Reported-by: Carter Li 李通洲 Signed-off-by: Jens Axboe --- fs/io_uring.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6d4e20d59729..5a826017ebb8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -481,6 +481,7 @@ enum { REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, REQ_F_NEED_CLEANUP_BIT, + REQ_F_OVERFLOW_BIT, }; enum { @@ -521,6 +522,8 @@ enum { REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), /* needs cleanup */ REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + /* in overflow list */ + REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), }; /* @@ -1103,6 +1106,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, list); list_move(&req->list, &list); + req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); @@ -1155,6 +1159,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); } + req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -6463,6 +6468,29 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (!cancel_req) break; + if (cancel_req->flags & REQ_F_OVERFLOW) { + spin_lock_irq(&ctx->completion_lock); + list_del(&cancel_req->list); + cancel_req->flags &= ~REQ_F_OVERFLOW; + if (list_empty(&ctx->cq_overflow_list)) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } + spin_unlock_irq(&ctx->completion_lock); + + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); + + /* + * Put inflight ref and overflow ref. If that's + * all we had, then we're done with this request. + */ + if (refcount_sub_and_test(2, &cancel_req->refs)) { + io_put_req(cancel_req); + continue; + } + } + io_wq_cancel_work(ctx->io_wq, &cancel_req->work); io_put_req(cancel_req); schedule(); -- cgit v1.2.3 From cfb8d7811f815d17babadd87436300261fd54de7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 13 Feb 2020 16:56:48 -0800 Subject: Input: goldfish_events - replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200213002430.GA31056@embeddedor.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/goldfish_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/goldfish_events.c b/drivers/input/keyboard/goldfish_events.c index bc8c85a52a10..57d435fc5c73 100644 --- a/drivers/input/keyboard/goldfish_events.c +++ b/drivers/input/keyboard/goldfish_events.c @@ -30,7 +30,7 @@ struct event_dev { struct input_dev *input; int irq; void __iomem *addr; - char name[0]; + char name[]; }; static irqreturn_t events_interrupt(int irq, void *dev_id) -- cgit v1.2.3 From a1b9b65edfd8f195dafaebf68f7d321eb9b3ab82 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 13 Feb 2020 16:57:00 -0800 Subject: Input: gpio_keys - replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200213002600.GA31916@embeddedor.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 1f56d53454b2..53c9ff338dea 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -55,7 +55,7 @@ struct gpio_keys_drvdata { struct input_dev *input; struct mutex disable_lock; unsigned short *keymap; - struct gpio_button_data data[0]; + struct gpio_button_data data[]; }; /* -- cgit v1.2.3 From bf502391353b928e63096127e5fd8482080203f5 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Thu, 13 Feb 2020 16:59:15 -0800 Subject: Input: synaptics - switch T470s to RMI4 by default This supports RMI4 and everything seems to work, including the touchpad buttons. So, let's enable this by default. Signed-off-by: Lyude Paul Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200204194322.112638-1-lyude@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 1ae6f8bba9ae..8cb8475657ca 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -169,6 +169,7 @@ static const char * const smbus_pnp_ids[] = { "LEN004a", /* W541 */ "LEN005b", /* P50 */ "LEN005e", /* T560 */ + "LEN006c", /* T470s */ "LEN0071", /* T480 */ "LEN0072", /* X1 Carbon Gen 5 (2017) - Elan/ALPS trackpoint */ "LEN0073", /* X1 Carbon G5 (Elantech) */ -- cgit v1.2.3 From b8a3d819f872e0a3a0a6db0dbbcd48071042fb98 Mon Sep 17 00:00:00 2001 From: Gaurav Agrawal Date: Thu, 13 Feb 2020 17:06:10 -0800 Subject: Input: synaptics - enable SMBus on ThinkPad L470 Add touchpad LEN2044 to the list, as it is capable of working with psmouse.synaptics_intertouch=1 Signed-off-by: Gaurav Agrawal Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/CADdtggVzVJq5gGNmFhKSz2MBwjTpdN5YVOdr4D3Hkkv=KZRc9g@mail.gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 8cb8475657ca..36f410aa4bad 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -180,6 +180,7 @@ static const char * const smbus_pnp_ids[] = { "LEN0097", /* X280 -> ALPS trackpoint */ "LEN009b", /* T580 */ "LEN200f", /* T450s */ + "LEN2044", /* L470 */ "LEN2054", /* E480 */ "LEN2055", /* E580 */ "SYN3052", /* HP EliteBook 840 G4 */ -- cgit v1.2.3 From 5179a9dfa9440c1781816e2c9a183d1d2512dc61 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Thu, 13 Feb 2020 17:07:47 -0800 Subject: Input: synaptics - remove the LEN0049 dmi id from topbuttonpad list The Yoga 11e is using LEN0049, but it doesn't have a trackstick. Thus, there is no need to create a software top buttons row. However, it seems that the device works under SMBus, so keep it as part of the smbus_pnp_ids. Signed-off-by: Benjamin Tissoires Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200115013023.9710-1-benjamin.tissoires@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/synaptics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 36f410aa4bad..2c666fb34625 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -146,7 +146,6 @@ static const char * const topbuttonpad_pnp_ids[] = { "LEN0042", /* Yoga */ "LEN0045", "LEN0047", - "LEN0049", "LEN2000", /* S540 */ "LEN2001", /* Edge E431 */ "LEN2002", /* Edge E531 */ @@ -166,6 +165,7 @@ static const char * const smbus_pnp_ids[] = { /* all of the topbuttonpad_pnp_ids are valid, we just add some extras */ "LEN0048", /* X1 Carbon 3 */ "LEN0046", /* X250 */ + "LEN0049", /* Yoga 11e */ "LEN004a", /* W541 */ "LEN005b", /* P50 */ "LEN005e", /* T560 */ -- cgit v1.2.3 From 51dede9c05df2b78acd6dcf6a17d21f0877d2d7b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Feb 2020 19:01:34 +0100 Subject: x86/mce/amd: Fix kobject lifetime Accessing the MCA thresholding controls in sysfs concurrently with CPU hotplug can lead to a couple of KASAN-reported issues: BUG: KASAN: use-after-free in sysfs_file_ops+0x155/0x180 Read of size 8 at addr ffff888367578940 by task grep/4019 and BUG: KASAN: use-after-free in show_error_count+0x15c/0x180 Read of size 2 at addr ffff888368a05514 by task grep/4454 for example. Both result from the fact that the threshold block creation/teardown code frees the descriptor memory itself instead of defining proper ->release function and leaving it to the driver core to take care of that, after all sysfs accesses have completed. Do that and get rid of the custom freeing code, fixing the above UAFs in the process. [ bp: write commit message. ] Fixes: 95268664390b ("[PATCH] x86_64: mce_amd support for family 0x10 processors") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Link: https://lkml.kernel.org/r/20200214082801.13836-1-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index e7313e5c497c..52de616a8065 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1163,9 +1163,12 @@ static const struct sysfs_ops threshold_ops = { .store = store, }; +static void threshold_block_release(struct kobject *kobj); + static struct kobj_type threshold_ktype = { .sysfs_ops = &threshold_ops, .default_attrs = default_attrs, + .release = threshold_block_release, }; static const char *get_name(unsigned int bank, struct threshold_block *b) @@ -1367,8 +1370,12 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) return err; } -static void deallocate_threshold_block(unsigned int cpu, - unsigned int bank) +static void threshold_block_release(struct kobject *kobj) +{ + kfree(to_block(kobj)); +} + +static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { struct threshold_block *pos = NULL; struct threshold_block *tmp = NULL; @@ -1378,13 +1385,11 @@ static void deallocate_threshold_block(unsigned int cpu, return; list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_put(&pos->kobj); list_del(&pos->miscj); - kfree(pos); + kobject_put(&pos->kobj); } - kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); - per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; + kobject_put(&head->blocks->kobj); } static void __threshold_remove_blocks(struct threshold_bank *b) -- cgit v1.2.3 From cba6437a1854fde5934098ec3bd0ee83af3129f5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Feb 2020 12:19:41 +0100 Subject: genirq/proc: Reject invalid affinity masks (again) Qian Cai reported that the WARN_ON() in the x86/msi affinity setting code, which catches cases where the affinity setting is not done on the CPU which is the current target of the interrupt, triggers during CPU hotplug stress testing. It turns out that the warning which was added with the commit addressing the MSI affinity race unearthed yet another long standing bug. If user space writes a bogus affinity mask, i.e. it contains no online CPUs, then it calls irq_select_affinity_usr(). This was introduced for ALPHA in eee45269b0f5 ("[PATCH] Alpha: convert to generic irq framework (generic part)") and subsequently made available for all architectures in 18404756765c ("genirq: Expose default irq affinity mask (take 3)") which introduced the circumvention of the affinity setting restrictions for interrupt which cannot be moved in process context. The whole exercise is bogus in various aspects: 1) If the interrupt is already started up then there is absolutely no point to honour a bogus interrupt affinity setting from user space. The interrupt is already assigned to an online CPU and it does not make any sense to reassign it to some other randomly chosen online CPU. 2) If the interupt is not yet started up then there is no point either. A subsequent startup of the interrupt will invoke irq_setup_affinity() anyway which will chose a valid target CPU. So the only correct solution is to just return -EINVAL in case user space wrote an affinity mask which does not contain any online CPUs, except for ALPHA which has it's own magic sauce for this. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Tested-by: Qian Cai Link: https://lkml.kernel.org/r/878sl8xdbm.fsf@nanos.tec.linutronix.de --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 18 ++---------------- kernel/irq/proc.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3924fbe829d4..c9d8eb7f5c02 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern int irq_select_affinity_usr(unsigned int irq); - extern void irq_set_thread_affinity(struct irq_desc *desc); extern int irq_do_set_affinity(struct irq_data *data, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3089a60ea8f9..7eee98c38f25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -481,23 +481,9 @@ int irq_setup_affinity(struct irq_desc *desc) { return irq_select_affinity(irq_desc_get_irq(desc)); } -#endif +#endif /* CONFIG_AUTO_IRQ_AFFINITY */ +#endif /* CONFIG_SMP */ -/* - * Called when a bogus affinity is set via /proc/irq - */ -int irq_select_affinity_usr(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&desc->lock, flags); - ret = irq_setup_affinity(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} -#endif /** * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9e5783d98033..32c071d7bc03 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v) return show_irq_affinity(AFFINITY_LIST, m); } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +static inline int irq_select_affinity_usr(unsigned int irq) +{ + /* + * If the interrupt is started up already then this fails. The + * interrupt is assigned to an online CPU already. There is no + * point to move it around randomly. Tell user space that the + * selected mask is bogus. + * + * If not then any change to the affinity is pointless because the + * startup code invokes irq_setup_affinity() which will select + * a online CPU anyway. + */ + return -EINVAL; +} +#else +/* ALPHA magic affinity auto selector. Keep it for historical reasons. */ +static inline int irq_select_affinity_usr(unsigned int irq) +{ + return irq_select_affinity(irq); +} +#endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) -- cgit v1.2.3 From ea75080110a4c1fa011b0a73cb8f42227143ee3e Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Thu, 13 Feb 2020 13:16:16 +0000 Subject: cfg80211: add missing policy for NL80211_ATTR_STATUS_CODE The nl80211_policy is missing for NL80211_ATTR_STATUS_CODE attribute. As a result, for strictly validated commands, it's assumed to not be supported. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200213131608.10541-2-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 123b8d720a59..cedf17d4933f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -437,6 +437,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, [NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, + [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, -- cgit v1.2.3 From 33181ea7f5a62a17fbe55f0f73428ecb5e686be8 Mon Sep 17 00:00:00 2001 From: Shay Bar Date: Mon, 10 Feb 2020 15:07:28 +0200 Subject: mac80211: fix wrong 160/80+80 MHz setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this patch, STA's would set new width of 160/80+80 MHz based on AP capability only. This is wrong because STA may not support > 80MHz BW. Fix is to verify STA has 160/80+80 MHz capability before increasing its width to > 80MHz. The "support_80_80" and "support_160" setting is based on: "Table 9-272 — Setting of the Supported Channel Width Set subfield and Extended NSS BW Support subfield at a STA transmitting the VHT Capabilities Information field" From "Draft P802.11REVmd_D3.0.pdf" Signed-off-by: Aviad Brikman Signed-off-by: Shay Bar Link: https://lore.kernel.org/r/20200210130728.23674-1-shay.bar@celeno.com Signed-off-by: Johannes Berg --- net/mac80211/util.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 739e90555d8b..decd46b38393 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2993,10 +2993,22 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, int cf0, cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; + u32 vht_cap; + bool support_80_80 = false; + bool support_160 = false; if (!oper || !htop) return false; + vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; + support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); + support_80_80 = ((vht_cap & + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || + (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && + vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || + ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> + IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & @@ -3024,10 +3036,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, unsigned int diff; diff = abs(ccf1 - ccf0); - if (diff == 8) { + if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; - } else if (diff > 8) { + } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } -- cgit v1.2.3 From 62765941155e487b351a72479078bd6fec973563 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Feb 2020 09:34:43 -0300 Subject: perf llvm: Fix script used to obtain kernel make directives to work with new kbuild Before this patch: # ./perf test 39 41 39: LLVM search and compile : 39.1: Basic BPF llvm compile : Ok 39.2: kbuild searching : FAILED! 39.3: Compile source for BPF prologue generation : Skip 39.4: Compile source for BPF relocation : Skip 41: BPF filter : 41.1: Basic BPF filtering : Ok 41.2: BPF pinning : Ok 41.3: BPF prologue generation : FAILED! 41.4: BPF relocation checker : Skip # Using 'perf test -v' for these tests shows that it is not finding uapi/linux/fs.h, which ends up being because we don't setup the right header path. Fix it. After this patch: # perf test 39 41 39: LLVM search and compile : 39.1: Basic BPF llvm compile : Ok 39.2: kbuild searching : Ok 39.3: Compile source for BPF prologue generation : Ok 39.4: Compile source for BPF relocation : Ok 41: BPF filter : 41.1: Basic BPF filtering : Ok 41.2: BPF pinning : Ok 41.3: BPF prologue generation : Ok 41.4: BPF relocation checker : Ok # Longer description: In llvm-utils.c we use some techniques to obtain the kbuild make directives and that recently stopped working as now 'ar' gets called and expects to find the dummy.o used to echo these variables: $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) Add the $(CC) line to satisfy that, making sure this works with all kernels, i.e. preserving the temp directory and files in it used for this technique we can see that it works everywhere: # make -s -C /lib/modules/5.4.18-100.fc30.x86_64/build M=/tmp/tmp.qgaFHgxjZ4/ clean # ls -la /tmp/tmp.qgaFHgxjZ4/ total 4 drwx------. 2 root root 80 Feb 14 09:42 . drwxrwxrwt. 47 root root 1200 Feb 14 09:42 .. -rw-r--r--. 1 root root 0 Feb 13 17:14 dummy.c -rw-r--r--. 1 root root 121 Feb 13 17:14 Makefile # # cat /tmp/tmp.qgaFHgxjZ4/Makefile obj-y := dummy.o $(obj)/%.o: $(src)/%.c @echo -n "$(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS)" $(CC) -c -o $@ $< # Then build with an old kernel Makefile: # make -s -C /lib/modules/5.4.18-100.fc30.x86_64/build M=/tmp/tmp.qgaFHgxjZ4/ dummy.o -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/9/include -I./arch/x86/include -I./arch/x86/include/generated -I./include -I./arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I./include/uapi -I./include/generated/uapi -include ./include/linux/kconfig.h # # ls -la /tmp/tmp.qgaFHgxjZ4/ total 8 drwx------. 2 root root 100 Feb 14 09:43 . drwxrwxrwt. 47 root root 1200 Feb 14 09:43 .. -rw-r--r--. 1 root root 0 Feb 13 17:14 dummy.c -rw-r--r--. 1 root root 936 Feb 14 09:43 dummy.o -rw-r--r--. 1 root root 121 Feb 13 17:14 Makefile # And a new one: # make -s -C /lib/modules/5.4.18-100.fc30.x86_64/build M=/tmp/tmp.qgaFHgxjZ4/ clean # ls -la /tmp/tmp.qgaFHgxjZ4/ total 4 drwx------. 2 root root 80 Feb 14 09:43 . drwxrwxrwt. 47 root root 1200 Feb 14 09:43 .. -rw-r--r--. 1 root root 0 Feb 13 17:14 dummy.c -rw-r--r--. 1 root root 121 Feb 13 17:14 Makefile # make -s -C /lib/modules/5.6.0-rc1+/build M=/tmp/tmp.qgaFHgxjZ4/ dummy.o -nostdinc -isystem /usr/lib/gcc/x86_64-redhat-linux/9/include -I/home/acme/git/linux/arch/x86/include -I./arch/x86/include/generated -I/home/acme/git/linux/include -I./include -I/home/acme/git/linux/arch/x86/include/uapi -I./arch/x86/include/generated/uapi -I/home/acme/git/linux/include/uapi -I./include/generated/uapi -include /home/acme/git/linux/include/linux/kconfig.h # # ls -la /tmp/tmp.qgaFHgxjZ4/ total 16 drwx------. 2 root root 160 Feb 14 09:44 . drwxrwxrwt. 47 root root 1200 Feb 14 09:44 .. -rw-r--r--. 1 root root 158 Feb 14 09:44 built-in.a -rw-r--r--. 1 root root 149 Feb 14 09:44 .built-in.a.cmd -rw-r--r--. 1 root root 0 Feb 13 17:14 dummy.c -rw-r--r--. 1 root root 936 Feb 14 09:44 dummy.o -rw-r--r--. 1 root root 121 Feb 13 17:14 Makefile -rw-r--r--. 1 root root 0 Feb 14 09:44 modules.order # Reported-by: Thomas Richter Tested-by: Thomas Richter Cc: Adrian Hunter Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Masahiro Yamada Cc: Namhyung Kim Cc: Sumanth Korikkar Cc: Vasily Gorbik Cc: Wang Nan Cc: Zefan Li Link: https://www.spinics.net/lists/linux-perf-users/msg10600.html Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/llvm-utils.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/llvm-utils.c b/tools/perf/util/llvm-utils.c index eae47c2509eb..b5af680fc667 100644 --- a/tools/perf/util/llvm-utils.c +++ b/tools/perf/util/llvm-utils.c @@ -288,6 +288,7 @@ static const char *kinc_fetch_script = "obj-y := dummy.o\n" "\\$(obj)/%.o: \\$(src)/%.c\n" "\t@echo -n \"\\$(NOSTDINC_FLAGS) \\$(LINUXINCLUDE) \\$(EXTRA_CFLAGS)\"\n" +"\t\\$(CC) -c -o \\$@ \\$<\n" "EOF\n" "touch $TMPDIR/dummy.c\n" "make -s -C $KBUILD_DIR M=$TMPDIR $KBUILD_OPTS dummy.o 2>/dev/null\n" -- cgit v1.2.3 From bb51e669fa49feb5904f452b2991b240ef31bc97 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 14 Feb 2020 12:13:14 +0100 Subject: ALSA: seq: Avoid concurrent access to queue flags The queue flags are represented in bit fields and the concurrent access may result in unexpected results. Although the current code should be mostly OK as it's only reading a field while writing other fields as KCSAN reported, it's safer to cover both with a proper spinlock protection. This patch fixes the possible concurrent read by protecting with q->owner_lock. Also the queue owner field is protected as well since it's the field to be protected by the lock itself. Reported-by: syzbot+65c6c92d04304d0a8efc@syzkaller.appspotmail.com Reported-by: syzbot+e60ddfa48717579799dd@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20200214111316.26939-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_queue.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/sound/core/seq/seq_queue.c b/sound/core/seq/seq_queue.c index caf68bf42f13..20c552cf8398 100644 --- a/sound/core/seq/seq_queue.c +++ b/sound/core/seq/seq_queue.c @@ -392,6 +392,7 @@ int snd_seq_queue_check_access(int queueid, int client) int snd_seq_queue_set_owner(int queueid, int client, int locked) { struct snd_seq_queue *q = queueptr(queueid); + unsigned long flags; if (q == NULL) return -EINVAL; @@ -401,8 +402,10 @@ int snd_seq_queue_set_owner(int queueid, int client, int locked) return -EPERM; } + spin_lock_irqsave(&q->owner_lock, flags); q->locked = locked ? 1 : 0; q->owner = client; + spin_unlock_irqrestore(&q->owner_lock, flags); queue_access_unlock(q); queuefree(q); @@ -539,15 +542,17 @@ void snd_seq_queue_client_termination(int client) unsigned long flags; int i; struct snd_seq_queue *q; + bool matched; for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { if ((q = queueptr(i)) == NULL) continue; spin_lock_irqsave(&q->owner_lock, flags); - if (q->owner == client) + matched = (q->owner == client); + if (matched) q->klocked = 1; spin_unlock_irqrestore(&q->owner_lock, flags); - if (q->owner == client) { + if (matched) { if (q->timer->running) snd_seq_timer_stop(q->timer); snd_seq_timer_reset(q->timer); @@ -739,6 +744,8 @@ void snd_seq_info_queues_read(struct snd_info_entry *entry, int i, bpm; struct snd_seq_queue *q; struct snd_seq_timer *tmr; + bool locked; + int owner; for (i = 0; i < SNDRV_SEQ_MAX_QUEUES; i++) { if ((q = queueptr(i)) == NULL) @@ -750,9 +757,14 @@ void snd_seq_info_queues_read(struct snd_info_entry *entry, else bpm = 0; + spin_lock_irq(&q->owner_lock); + locked = q->locked; + owner = q->owner; + spin_unlock_irq(&q->owner_lock); + snd_iprintf(buffer, "queue %d: [%s]\n", q->queue, q->name); - snd_iprintf(buffer, "owned by client : %d\n", q->owner); - snd_iprintf(buffer, "lock status : %s\n", q->locked ? "Locked" : "Free"); + snd_iprintf(buffer, "owned by client : %d\n", owner); + snd_iprintf(buffer, "lock status : %s\n", locked ? "Locked" : "Free"); snd_iprintf(buffer, "queued time events : %d\n", snd_seq_prioq_avail(q->timeq)); snd_iprintf(buffer, "queued tick events : %d\n", snd_seq_prioq_avail(q->tickq)); snd_iprintf(buffer, "timer state : %s\n", tmr->running ? "Running" : "Stopped"); -- cgit v1.2.3 From dc7497795e014d84699c3b8809ed6df35352dd74 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 14 Feb 2020 12:13:15 +0100 Subject: ALSA: seq: Fix concurrent access to queue current tick/time snd_seq_check_queue() passes the current tick and time of the given queue as a pointer to snd_seq_prioq_cell_out(), but those might be updated concurrently by the seq timer update. Fix it by retrieving the current tick and time via the proper helper functions at first, and pass those values to snd_seq_prioq_cell_out() later in the loops. snd_seq_timer_get_cur_time() takes a new argument and adjusts with the current system time only when it's requested so; this update isn't needed for snd_seq_check_queue(), as it's called either from the interrupt handler or right after queuing. Also, snd_seq_timer_get_cur_tick() is changed to read the value in the spinlock for the concurrency, too. Reported-by: syzbot+fd5e0eaa1a32999173b2@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20200214111316.26939-3-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 4 ++-- sound/core/seq/seq_queue.c | 9 ++++++--- sound/core/seq/seq_timer.c | 13 ++++++++++--- sound/core/seq/seq_timer.h | 3 ++- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 6d9592f0ae1d..cc93157fa950 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -580,7 +580,7 @@ static int update_timestamp_of_queue(struct snd_seq_event *event, event->queue = queue; event->flags &= ~SNDRV_SEQ_TIME_STAMP_MASK; if (real_time) { - event->time.time = snd_seq_timer_get_cur_time(q->timer); + event->time.time = snd_seq_timer_get_cur_time(q->timer, true); event->flags |= SNDRV_SEQ_TIME_STAMP_REAL; } else { event->time.tick = snd_seq_timer_get_cur_tick(q->timer); @@ -1659,7 +1659,7 @@ static int snd_seq_ioctl_get_queue_status(struct snd_seq_client *client, tmr = queue->timer; status->events = queue->tickq->cells + queue->timeq->cells; - status->time = snd_seq_timer_get_cur_time(tmr); + status->time = snd_seq_timer_get_cur_time(tmr, true); status->tick = snd_seq_timer_get_cur_tick(tmr); status->running = tmr->running; diff --git a/sound/core/seq/seq_queue.c b/sound/core/seq/seq_queue.c index 20c552cf8398..71a6ea62c3be 100644 --- a/sound/core/seq/seq_queue.c +++ b/sound/core/seq/seq_queue.c @@ -238,6 +238,8 @@ void snd_seq_check_queue(struct snd_seq_queue *q, int atomic, int hop) { unsigned long flags; struct snd_seq_event_cell *cell; + snd_seq_tick_time_t cur_tick; + snd_seq_real_time_t cur_time; if (q == NULL) return; @@ -254,17 +256,18 @@ void snd_seq_check_queue(struct snd_seq_queue *q, int atomic, int hop) __again: /* Process tick queue... */ + cur_tick = snd_seq_timer_get_cur_tick(q->timer); for (;;) { - cell = snd_seq_prioq_cell_out(q->tickq, - &q->timer->tick.cur_tick); + cell = snd_seq_prioq_cell_out(q->tickq, &cur_tick); if (!cell) break; snd_seq_dispatch_event(cell, atomic, hop); } /* Process time queue... */ + cur_time = snd_seq_timer_get_cur_time(q->timer, false); for (;;) { - cell = snd_seq_prioq_cell_out(q->timeq, &q->timer->cur_time); + cell = snd_seq_prioq_cell_out(q->timeq, &cur_time); if (!cell) break; snd_seq_dispatch_event(cell, atomic, hop); diff --git a/sound/core/seq/seq_timer.c b/sound/core/seq/seq_timer.c index be59b59c9be4..1645e4142e30 100644 --- a/sound/core/seq/seq_timer.c +++ b/sound/core/seq/seq_timer.c @@ -428,14 +428,15 @@ int snd_seq_timer_continue(struct snd_seq_timer *tmr) } /* return current 'real' time. use timeofday() to get better granularity. */ -snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr) +snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, + bool adjust_ktime) { snd_seq_real_time_t cur_time; unsigned long flags; spin_lock_irqsave(&tmr->lock, flags); cur_time = tmr->cur_time; - if (tmr->running) { + if (adjust_ktime && tmr->running) { struct timespec64 tm; ktime_get_ts64(&tm); @@ -452,7 +453,13 @@ snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr) high PPQ values) */ snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr) { - return tmr->tick.cur_tick; + snd_seq_tick_time_t cur_tick; + unsigned long flags; + + spin_lock_irqsave(&tmr->lock, flags); + cur_tick = tmr->tick.cur_tick; + spin_unlock_irqrestore(&tmr->lock, flags); + return cur_tick; } diff --git a/sound/core/seq/seq_timer.h b/sound/core/seq/seq_timer.h index 66c3e344eae3..4bec57df8158 100644 --- a/sound/core/seq/seq_timer.h +++ b/sound/core/seq/seq_timer.h @@ -120,7 +120,8 @@ int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq); int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position); int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position); int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base); -snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr); +snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, + bool adjust_ktime); snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr); extern int seq_default_timer_class; -- cgit v1.2.3 From dfa9a5efe8b932a84b3b319250aa3ac60c20f876 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 14 Feb 2020 12:13:16 +0100 Subject: ALSA: rawmidi: Avoid bit fields for state flags The rawmidi state flags (opened, append, active_sensing) are stored in bit fields that can be potentially racy when concurrently accessed without any locks. Although the current code should be fine, there is also no any real benefit by keeping the bitfields for this kind of short number of members. This patch changes those bit fields flags to the simple bool fields. There should be no size increase of the snd_rawmidi_substream by this change. Reported-by: syzbot+576cc007eb9f2c968200@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20200214111316.26939-4-tiwai@suse.de Signed-off-by: Takashi Iwai --- include/sound/rawmidi.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h index 40ab20439fee..a36b7227a15a 100644 --- a/include/sound/rawmidi.h +++ b/include/sound/rawmidi.h @@ -77,9 +77,9 @@ struct snd_rawmidi_substream { struct list_head list; /* list of all substream for given stream */ int stream; /* direction */ int number; /* substream number */ - unsigned int opened: 1, /* open flag */ - append: 1, /* append flag (merge more streams) */ - active_sensing: 1; /* send active sensing when close */ + bool opened; /* open flag */ + bool append; /* append flag (merge more streams) */ + bool active_sensing; /* send active sensing when close */ int use_count; /* use counter (for output) */ size_t bytes; struct snd_rawmidi *rmidi; -- cgit v1.2.3 From d0db7ed397517c8b2be24a0d1abfa15df776908e Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 14 Feb 2020 09:53:41 +0800 Subject: net: hns3: add management table after IMP reset In the current process, the management table is missing after the IMP reset. This patch adds the management table to the reset process. Fixes: f5aac71c0327 ("net: hns3: add manager table initialization for hardware") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index ec5f6eeb639b..25ac57325129 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -9834,6 +9834,13 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev) return ret; } + ret = init_mgr_tbl(hdev); + if (ret) { + dev_err(&pdev->dev, + "failed to reinit manager table, ret = %d\n", ret); + return ret; + } + ret = hclge_init_fd_config(hdev); if (ret) { dev_err(&pdev->dev, "fd table init fail, ret=%d\n", ret); -- cgit v1.2.3 From 19eb1123b4e9337fe20b1763fec528f837ec6568 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 14 Feb 2020 09:53:42 +0800 Subject: net: hns3: fix VF bandwidth does not take effect in some case When enabling 4 TC after setting the bandwidth of VF, the bandwidth of VF will resume to default value, because of the qset resources changed in this case. This patch fixes it by using a fixed VF's qset resources according to HNAE3_MAX_TC macro. Fixes: ee9e44248f52 ("net: hns3: add support for configuring bandwidth of VF on the host") Signed-off-by: Yonglong Liu Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 180224eab1ca..28db13253a5e 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -566,7 +566,7 @@ static void hclge_tm_vport_tc_info_update(struct hclge_vport *vport) */ kinfo->num_tc = vport->vport_id ? 1 : min_t(u16, vport->alloc_tqps, hdev->tm_info.num_tc); - vport->qs_offset = (vport->vport_id ? hdev->tm_info.num_tc : 0) + + vport->qs_offset = (vport->vport_id ? HNAE3_MAX_TC : 0) + (vport->vport_id ? (vport->vport_id - 1) : 0); max_rss_size = min_t(u16, hdev->rss_size_max, -- cgit v1.2.3 From 47327c9315b2f3ae4ab659457977a26669631f20 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Fri, 14 Feb 2020 09:53:43 +0800 Subject: net: hns3: fix a copying IPv6 address error in hclge_fd_get_flow_tuples() The IPv6 address defined in struct in6_addr is specified as big endian, but there is no specified endian in struct hclge_fd_rule_tuples, so it will cause a problem if directly use memcpy() to copy ipv6 address between these two structures since this field in struct hclge_fd_rule_tuples is little endian. This patch fixes this problem by using be32_to_cpu() to convert endian of IPv6 address of struct in6_addr before copying. Fixes: d93ed94fbeaf ("net: hns3: add aRFS support for PF") Signed-off-by: Guangbin Huang Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 25ac57325129..492bc9446463 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6113,6 +6113,9 @@ static int hclge_get_all_rules(struct hnae3_handle *handle, static void hclge_fd_get_flow_tuples(const struct flow_keys *fkeys, struct hclge_fd_rule_tuples *tuples) { +#define flow_ip6_src fkeys->addrs.v6addrs.src.in6_u.u6_addr32 +#define flow_ip6_dst fkeys->addrs.v6addrs.dst.in6_u.u6_addr32 + tuples->ether_proto = be16_to_cpu(fkeys->basic.n_proto); tuples->ip_proto = fkeys->basic.ip_proto; tuples->dst_port = be16_to_cpu(fkeys->ports.dst); @@ -6121,12 +6124,12 @@ static void hclge_fd_get_flow_tuples(const struct flow_keys *fkeys, tuples->src_ip[3] = be32_to_cpu(fkeys->addrs.v4addrs.src); tuples->dst_ip[3] = be32_to_cpu(fkeys->addrs.v4addrs.dst); } else { - memcpy(tuples->src_ip, - fkeys->addrs.v6addrs.src.in6_u.u6_addr32, - sizeof(tuples->src_ip)); - memcpy(tuples->dst_ip, - fkeys->addrs.v6addrs.dst.in6_u.u6_addr32, - sizeof(tuples->dst_ip)); + int i; + + for (i = 0; i < IPV6_SIZE; i++) { + tuples->src_ip[i] = be32_to_cpu(flow_ip6_src[i]); + tuples->dst_ip[i] = be32_to_cpu(flow_ip6_dst[i]); + } } } -- cgit v1.2.3 From 67f562e3e147750a02b2a91d21a163fc44a1d13e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 14 Feb 2020 08:58:59 +0100 Subject: net/smc: transfer fasync_list in case of fallback SMC does not work together with FASTOPEN. If sendmsg() is called with flag MSG_FASTOPEN in SMC_INIT state, the SMC-socket switches to fallback mode. To handle the previous ioctl FIOASYNC call correctly in this case, it is necessary to transfer the socket wait queue fasync_list to the internal TCP socket. Reported-by: syzbot+4b1fe8105f8044a26162@syzkaller.appspotmail.com Fixes: ee9dfbef02d18 ("net/smc: handle sockopts forcing fallback") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cee5bf4a9bb9..90988a511cd5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -470,6 +470,8 @@ static void smc_switch_to_fallback(struct smc_sock *smc) if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; + smc->clcsock->wq.fasync_list = + smc->sk.sk_socket->wq.fasync_list; } } -- cgit v1.2.3 From 369537c97024dca99303a8d4d6ab38b4f54d3909 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 14 Feb 2020 08:59:00 +0100 Subject: net/smc: no peer ID in CLC decline for SMCD Just SMCR requires a CLC Peer ID, but not SMCD. The field should be zero for SMCD. Fixes: c758dfddc1b5 ("net/smc: add SMC-D support in CLC messages") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 0879f7bed967..86cccc24e52e 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -372,7 +372,9 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; - memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); + if (smc->conn.lgr && !smc->conn.lgr->is_smcd) + memcpy(dclc.id_for_peer, local_systemid, + sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); -- cgit v1.2.3 From 5fdcce211b3a41c3fed229333027f59a94a2f265 Mon Sep 17 00:00:00 2001 From: William Dauchy Date: Thu, 13 Feb 2020 18:19:22 +0100 Subject: net, ip6_tunnel: enhance tunnel locate with link check With ipip, it is possible to create an extra interface explicitly attached to a given physical interface: # ip link show tunl0 4: tunl0@NONE: mtu 1480 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ipip 0.0.0.0 brd 0.0.0.0 # ip link add tunl1 type ipip dev eth0 # ip link show tunl1 6: tunl1@eth0: mtu 1480 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ipip 0.0.0.0 brd 0.0.0.0 But it is not possible with ip6tnl: # ip link show ip6tnl0 5: ip6tnl0@NONE: mtu 1452 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/tunnel6 :: brd :: # ip link add ip6tnl1 type ip6tnl dev eth0 RTNETLINK answers: File exists This patch aims to make it possible by adding link comparaison in both tunnel locate and lookup functions; we also modify mtu calculation when attached to an interface with a lower mtu. This permits to make use of x-netns communication by moving the newly created tunnel in a given netns. Signed-off-by: William Dauchy Reviewed-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 68 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b5dd20c4599b..5d65436ad5ad 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -121,6 +121,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * @@ -134,37 +135,56 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) static struct ip6_tnl * -ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) +ip6_tnl_lookup(struct net *net, int link, + const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); - struct ip6_tnl *t; + struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_equal(remote, &t->parms.raddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else + cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_any(&t->parms.raddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_any(&t->parms.raddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else if (!cand) + cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(remote, &t->parms.raddr) && - ipv6_addr_any(&t->parms.laddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(remote, &t->parms.raddr) || + !ipv6_addr_any(&t->parms.laddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else if (!cand) + cand = t; } + if (cand) + return cand; + t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; @@ -351,7 +371,8 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) { + ipv6_addr_equal(remote, &t->parms.raddr) && + p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); @@ -485,7 +506,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, processing of the error. */ rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr); + t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; @@ -887,7 +908,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, int ret = -1; rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); + t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); @@ -1420,8 +1441,10 @@ tx_err: static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; + struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; + unsigned int mtu; int t_hlen; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); @@ -1457,22 +1480,25 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); + if (rt) { + tdev = rt->dst.dev; + ip6_rt_put(rt); + } - if (!rt) - return; + if (!tdev && p->link) + tdev = __dev_get_by_index(t->net, p->link); - if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + - t_hlen; + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + t_hlen; + mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); - dev->mtu = rt->dst.dev->mtu - t_hlen; + dev->mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } - ip6_rt_put(rt); } } -- cgit v1.2.3 From 04fb91243a853dbde216d829c79d9632e52aa8d9 Mon Sep 17 00:00:00 2001 From: Per Forlin Date: Thu, 13 Feb 2020 15:37:09 +0100 Subject: net: dsa: tag_qca: Make sure there is headroom for tag Passing tag size to skb_cow_head will make sure there is enough headroom for the tag data. This change does not introduce any overhead in case there is already available headroom for tag. Signed-off-by: Per Forlin Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_qca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index c8a128c9e5e0..70db7c909f74 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -33,7 +33,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) struct dsa_port *dp = dsa_slave_to_port(dev); u16 *phdr, hdr; - if (skb_cow_head(skb, 0) < 0) + if (skb_cow_head(skb, QCA_HDR_LEN) < 0) return NULL; skb_push(skb, QCA_HDR_LEN); -- cgit v1.2.3 From ddc9abaf5d9924569afe09a605c9012089d0c25b Mon Sep 17 00:00:00 2001 From: Per Forlin Date: Thu, 13 Feb 2020 15:37:10 +0100 Subject: net: dsa: tag_ar9331: Make sure there is headroom for tag Passing tag size to skb_cow_head will make sure there is enough headroom for the tag data. This change does not introduce any overhead in case there is already available headroom for tag. Signed-off-by: Per Forlin Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_ar9331.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 466ffa92a474..55b00694cdba 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -31,7 +31,7 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, __le16 *phdr; u16 hdr; - if (skb_cow_head(skb, 0) < 0) + if (skb_cow_head(skb, AR9331_HDR_LEN) < 0) return NULL; phdr = skb_push(skb, AR9331_HDR_LEN); -- cgit v1.2.3 From a1fa83bdab784fa0ff2e92870011c0dcdbd2f680 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 12 Feb 2020 22:28:20 -0800 Subject: netdevice.h: fix all kernel-doc and Sphinx warnings Eliminate all kernel-doc and Sphinx warnings in . Fixes these warnings: ../include/linux/netdevice.h:2100: warning: Function parameter or member 'gso_partial_features' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'l3mdev_ops' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'xfrmdev_ops' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'tlsdev_ops' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'name_assign_type' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'ieee802154_ptr' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'mpls_ptr' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'xdp_prog' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'gro_flush_timeout' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'xdp_bulkq' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'xps_cpus_map' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'xps_rxqs_map' not described in 'net_device' ../include/linux/netdevice.h:2100: warning: Function parameter or member 'qdisc_hash' not described in 'net_device' ../include/linux/netdevice.h:3552: WARNING: Inline emphasis start-string without end-string. ../include/linux/netdevice.h:3552: WARNING: Inline emphasis start-string without end-string. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9c6b5c61d27..9f1f633235f6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1616,6 +1616,7 @@ enum netdev_priv_flags { * and drivers will need to set them appropriately. * * @mpls_features: Mask of features inheritable by MPLS + * @gso_partial_features: value(s) from NETIF_F_GSO\* * * @ifindex: interface index * @group: The group the device belongs to @@ -1640,8 +1641,11 @@ enum netdev_priv_flags { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @ethtool_ops: Management operations + * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour * discovery handling. Necessary for e.g. 6LoWPAN. + * @xfrmdev_ops: Transformation offload operations + * @tlsdev_ops: Transport Layer Security offload operations * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * @@ -1680,6 +1684,7 @@ enum netdev_priv_flags { * @dev_port: Used to differentiate devices that share * the same function * @addr_list_lock: XXX: need comments on this one + * @name_assign_type: network interface name assignment type * @uc_promisc: Counter that indicates promiscuous mode * has been enabled due to the need to listen to * additional unicast addresses in a device that @@ -1702,6 +1707,9 @@ enum netdev_priv_flags { * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering + * @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network + * device struct + * @mpls_ptr: mpls_dev struct pointer * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -1710,6 +1718,8 @@ enum netdev_priv_flags { * @num_rx_queues: Number of RX queues * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device + * @xdp_prog: XDP sockets filter program pointer + * @gro_flush_timeout: timeout for GRO layer in NAPI * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one @@ -1731,10 +1741,14 @@ enum netdev_priv_flags { * @qdisc: Root qdisc from userspace point of view * @tx_queue_len: Max frames per queue allowed * @tx_global_lock: XXX: need comments on this one + * @xdp_bulkq: XDP device bulk queue + * @xps_cpus_map: all CPUs map for XPS device + * @xps_rxqs_map: all RXQs map for XPS device * * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for * egress processing + * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers @@ -3548,7 +3562,7 @@ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, } /** - * netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p + * netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p * @n: CPU/Rx queue index * @src1p: the first CPUs/Rx queues mask pointer * @src2p: the second CPUs/Rx queues mask pointer -- cgit v1.2.3 From 4b8a1ca4628343829f373bf0d4e087fe50c451e5 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Fri, 14 Feb 2020 18:57:42 +0800 Subject: ASoC: max98090: revert invalid fix for handling SHDN Reverts commit 62d5ae4cafb7 ("ASoC: max98090: save and restore SHDN when changing sensitive registers"). A critical side-effect was observed: when keep playing something, the recorded sound has chance to break (clipping). Signed-off-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20200214105744.82258-2-tzungbi@google.com Signed-off-by: Mark Brown --- sound/soc/codecs/max98090.c | 434 +++++++++++++------------------------------- sound/soc/codecs/max98090.h | 3 +- 2 files changed, 124 insertions(+), 313 deletions(-) diff --git a/sound/soc/codecs/max98090.c b/sound/soc/codecs/max98090.c index 5bc2c6411b33..032adc14562d 100644 --- a/sound/soc/codecs/max98090.c +++ b/sound/soc/codecs/max98090.c @@ -5,150 +5,24 @@ * Copyright 2011-2012 Maxim Integrated Products */ -#include -#include #include #include #include -#include #include #include #include #include #include +#include +#include #include -#include #include #include #include #include +#include #include "max98090.h" -static void max98090_shdn_save_locked(struct max98090_priv *max98090) -{ - int shdn = 0; - - /* saved_shdn, saved_count, SHDN are protected by card->dapm_mutex */ - regmap_read(max98090->regmap, M98090_REG_DEVICE_SHUTDOWN, &shdn); - max98090->saved_shdn |= shdn; - ++max98090->saved_count; - - if (shdn) - regmap_write(max98090->regmap, M98090_REG_DEVICE_SHUTDOWN, 0x0); -} - -static void max98090_shdn_restore_locked(struct max98090_priv *max98090) -{ - /* saved_shdn, saved_count, SHDN are protected by card->dapm_mutex */ - if (--max98090->saved_count == 0) { - if (max98090->saved_shdn) { - regmap_write(max98090->regmap, - M98090_REG_DEVICE_SHUTDOWN, - M98090_SHDNN_MASK); - max98090->saved_shdn = 0; - } - } -} - -static void max98090_shdn_save(struct max98090_priv *max98090) -{ - mutex_lock_nested(&max98090->component->card->dapm_mutex, - SND_SOC_DAPM_CLASS_RUNTIME); - max98090_shdn_save_locked(max98090); -} - -static void max98090_shdn_restore(struct max98090_priv *max98090) -{ - max98090_shdn_restore_locked(max98090); - mutex_unlock(&max98090->component->card->dapm_mutex); -} - -static int max98090_put_volsw(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - struct snd_soc_component *component = - snd_soc_kcontrol_component(kcontrol); - struct max98090_priv *max98090 = - snd_soc_component_get_drvdata(component); - int ret; - - max98090_shdn_save(max98090); - ret = snd_soc_put_volsw(kcontrol, ucontrol); - max98090_shdn_restore(max98090); - - return ret; -} - -static int max98090_dapm_put_enum_double(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - struct snd_soc_component *component = - snd_soc_dapm_kcontrol_component(kcontrol); - struct max98090_priv *max98090 = - snd_soc_component_get_drvdata(component); - int ret; - - max98090_shdn_save(max98090); - ret = snd_soc_dapm_put_enum_double_locked(kcontrol, ucontrol); - max98090_shdn_restore(max98090); - - return ret; -} - -static int max98090_put_enum_double(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - struct snd_soc_component *component = - snd_soc_kcontrol_component(kcontrol); - struct max98090_priv *max98090 = - snd_soc_component_get_drvdata(component); - int ret; - - max98090_shdn_save(max98090); - ret = snd_soc_put_enum_double(kcontrol, ucontrol); - max98090_shdn_restore(max98090); - - return ret; -} - -static int max98090_bytes_put(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - struct snd_soc_component *component = - snd_soc_kcontrol_component(kcontrol); - struct max98090_priv *max98090 = - snd_soc_component_get_drvdata(component); - int ret; - - max98090_shdn_save(max98090); - ret = snd_soc_bytes_put(kcontrol, ucontrol); - max98090_shdn_restore(max98090); - - return ret; -} - -static int max98090_dapm_event(struct snd_soc_dapm_widget *w, - struct snd_kcontrol *kcontrol, int event) -{ - struct snd_soc_component *component = - snd_soc_dapm_to_component(w->dapm); - struct max98090_priv *max98090 = - snd_soc_component_get_drvdata(component); - - switch (event) { - case SND_SOC_DAPM_PRE_PMU: - case SND_SOC_DAPM_PRE_PMD: - max98090_shdn_save_locked(max98090); - break; - case SND_SOC_DAPM_POST_PMU: - case SND_SOC_DAPM_POST_PMD: - max98090_shdn_restore_locked(max98090); - break; - } - - return 0; -} - /* Allows for sparsely populated register maps */ static const struct reg_default max98090_reg[] = { { 0x00, 0x00 }, /* 00 Software Reset */ @@ -632,13 +506,10 @@ static SOC_ENUM_SINGLE_DECL(max98090_adchp_enum, max98090_pwr_perf_text); static const struct snd_kcontrol_new max98090_snd_controls[] = { - SOC_ENUM_EXT("MIC Bias VCM Bandgap", max98090_vcmbandgap_enum, - snd_soc_get_enum_double, max98090_put_enum_double), + SOC_ENUM("MIC Bias VCM Bandgap", max98090_vcmbandgap_enum), - SOC_SINGLE_EXT("DMIC MIC Comp Filter Config", - M98090_REG_DIGITAL_MIC_CONFIG, - M98090_DMIC_COMP_SHIFT, M98090_DMIC_COMP_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), + SOC_SINGLE("DMIC MIC Comp Filter Config", M98090_REG_DIGITAL_MIC_CONFIG, + M98090_DMIC_COMP_SHIFT, M98090_DMIC_COMP_NUM - 1, 0), SOC_SINGLE_EXT_TLV("MIC1 Boost Volume", M98090_REG_MIC1_INPUT_LEVEL, M98090_MIC_PA1EN_SHIFT, @@ -693,34 +564,24 @@ static const struct snd_kcontrol_new max98090_snd_controls[] = { M98090_AVR_SHIFT, M98090_AVR_NUM - 1, 1, max98090_av_tlv), - SOC_ENUM_EXT("ADC Oversampling Rate", max98090_osr128_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - SOC_SINGLE_EXT("ADC Quantizer Dither", M98090_REG_ADC_CONTROL, - M98090_ADCDITHER_SHIFT, M98090_ADCDITHER_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_ENUM_EXT("ADC High Performance Mode", max98090_adchp_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - - SOC_SINGLE_EXT("DAC Mono Mode", M98090_REG_IO_CONFIGURATION, - M98090_DMONO_SHIFT, M98090_DMONO_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_SINGLE_EXT("SDIN Mode", M98090_REG_IO_CONFIGURATION, - M98090_SDIEN_SHIFT, M98090_SDIEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_SINGLE_EXT("SDOUT Mode", M98090_REG_IO_CONFIGURATION, - M98090_SDOEN_SHIFT, M98090_SDOEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_SINGLE_EXT("SDOUT Hi-Z Mode", M98090_REG_IO_CONFIGURATION, - M98090_HIZOFF_SHIFT, M98090_HIZOFF_NUM - 1, 1, - snd_soc_get_volsw, max98090_put_volsw), - SOC_ENUM_EXT("Filter Mode", max98090_mode_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - SOC_SINGLE_EXT("Record Path DC Blocking", M98090_REG_FILTER_CONFIG, - M98090_AHPF_SHIFT, M98090_AHPF_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_SINGLE_EXT("Playback Path DC Blocking", M98090_REG_FILTER_CONFIG, - M98090_DHPF_SHIFT, M98090_DHPF_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), + SOC_ENUM("ADC Oversampling Rate", max98090_osr128_enum), + SOC_SINGLE("ADC Quantizer Dither", M98090_REG_ADC_CONTROL, + M98090_ADCDITHER_SHIFT, M98090_ADCDITHER_NUM - 1, 0), + SOC_ENUM("ADC High Performance Mode", max98090_adchp_enum), + + SOC_SINGLE("DAC Mono Mode", M98090_REG_IO_CONFIGURATION, + M98090_DMONO_SHIFT, M98090_DMONO_NUM - 1, 0), + SOC_SINGLE("SDIN Mode", M98090_REG_IO_CONFIGURATION, + M98090_SDIEN_SHIFT, M98090_SDIEN_NUM - 1, 0), + SOC_SINGLE("SDOUT Mode", M98090_REG_IO_CONFIGURATION, + M98090_SDOEN_SHIFT, M98090_SDOEN_NUM - 1, 0), + SOC_SINGLE("SDOUT Hi-Z Mode", M98090_REG_IO_CONFIGURATION, + M98090_HIZOFF_SHIFT, M98090_HIZOFF_NUM - 1, 1), + SOC_ENUM("Filter Mode", max98090_mode_enum), + SOC_SINGLE("Record Path DC Blocking", M98090_REG_FILTER_CONFIG, + M98090_AHPF_SHIFT, M98090_AHPF_NUM - 1, 0), + SOC_SINGLE("Playback Path DC Blocking", M98090_REG_FILTER_CONFIG, + M98090_DHPF_SHIFT, M98090_DHPF_NUM - 1, 0), SOC_SINGLE_TLV("Digital BQ Volume", M98090_REG_ADC_BIQUAD_LEVEL, M98090_AVBQ_SHIFT, M98090_AVBQ_NUM - 1, 1, max98090_dv_tlv), SOC_SINGLE_EXT_TLV("Digital Sidetone Volume", @@ -733,17 +594,13 @@ static const struct snd_kcontrol_new max98090_snd_controls[] = { SOC_SINGLE_TLV("Digital Volume", M98090_REG_DAI_PLAYBACK_LEVEL, M98090_DV_SHIFT, M98090_DV_NUM - 1, 1, max98090_dv_tlv), - SND_SOC_BYTES_E("EQ Coefficients", M98090_REG_EQUALIZER_BASE, 105, - snd_soc_bytes_get, max98090_bytes_put), - SOC_SINGLE_EXT("Digital EQ 3 Band Switch", M98090_REG_DSP_FILTER_ENABLE, - M98090_EQ3BANDEN_SHIFT, M98090_EQ3BANDEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_SINGLE_EXT("Digital EQ 5 Band Switch", M98090_REG_DSP_FILTER_ENABLE, - M98090_EQ5BANDEN_SHIFT, M98090_EQ5BANDEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_SINGLE_EXT("Digital EQ 7 Band Switch", M98090_REG_DSP_FILTER_ENABLE, - M98090_EQ7BANDEN_SHIFT, M98090_EQ7BANDEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), + SND_SOC_BYTES("EQ Coefficients", M98090_REG_EQUALIZER_BASE, 105), + SOC_SINGLE("Digital EQ 3 Band Switch", M98090_REG_DSP_FILTER_ENABLE, + M98090_EQ3BANDEN_SHIFT, M98090_EQ3BANDEN_NUM - 1, 0), + SOC_SINGLE("Digital EQ 5 Band Switch", M98090_REG_DSP_FILTER_ENABLE, + M98090_EQ5BANDEN_SHIFT, M98090_EQ5BANDEN_NUM - 1, 0), + SOC_SINGLE("Digital EQ 7 Band Switch", M98090_REG_DSP_FILTER_ENABLE, + M98090_EQ7BANDEN_SHIFT, M98090_EQ7BANDEN_NUM - 1, 0), SOC_SINGLE("Digital EQ Clipping Detection", M98090_REG_DAI_PLAYBACK_LEVEL_EQ, M98090_EQCLPN_SHIFT, M98090_EQCLPN_NUM - 1, 1), @@ -751,34 +608,25 @@ static const struct snd_kcontrol_new max98090_snd_controls[] = { M98090_DVEQ_SHIFT, M98090_DVEQ_NUM - 1, 1, max98090_dv_tlv), - SOC_SINGLE_EXT("ALC Enable", M98090_REG_DRC_TIMING, - M98090_DRCEN_SHIFT, M98090_DRCEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), - SOC_ENUM_EXT("ALC Attack Time", max98090_drcatk_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - SOC_ENUM_EXT("ALC Release Time", max98090_drcrls_enum, - snd_soc_get_enum_double, max98090_put_enum_double), + SOC_SINGLE("ALC Enable", M98090_REG_DRC_TIMING, + M98090_DRCEN_SHIFT, M98090_DRCEN_NUM - 1, 0), + SOC_ENUM("ALC Attack Time", max98090_drcatk_enum), + SOC_ENUM("ALC Release Time", max98090_drcrls_enum), SOC_SINGLE_TLV("ALC Make Up Volume", M98090_REG_DRC_GAIN, M98090_DRCG_SHIFT, M98090_DRCG_NUM - 1, 0, max98090_alcmakeup_tlv), - SOC_ENUM_EXT("ALC Compression Ratio", max98090_alccmp_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - SOC_ENUM_EXT("ALC Expansion Ratio", max98090_drcexp_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - SOC_SINGLE_EXT_TLV("ALC Compression Threshold Volume", + SOC_ENUM("ALC Compression Ratio", max98090_alccmp_enum), + SOC_ENUM("ALC Expansion Ratio", max98090_drcexp_enum), + SOC_SINGLE_TLV("ALC Compression Threshold Volume", M98090_REG_DRC_COMPRESSOR, M98090_DRCTHC_SHIFT, - M98090_DRCTHC_NUM - 1, 1, - snd_soc_get_volsw, max98090_put_volsw, max98090_alccomp_tlv), - SOC_SINGLE_EXT_TLV("ALC Expansion Threshold Volume", + M98090_DRCTHC_NUM - 1, 1, max98090_alccomp_tlv), + SOC_SINGLE_TLV("ALC Expansion Threshold Volume", M98090_REG_DRC_EXPANDER, M98090_DRCTHE_SHIFT, - M98090_DRCTHE_NUM - 1, 1, - snd_soc_get_volsw, max98090_put_volsw, max98090_drcexp_tlv), + M98090_DRCTHE_NUM - 1, 1, max98090_drcexp_tlv), - SOC_ENUM_EXT("DAC HP Playback Performance Mode", - max98090_dac_perfmode_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - SOC_ENUM_EXT("DAC High Performance Mode", max98090_dachp_enum, - snd_soc_get_enum_double, max98090_put_enum_double), + SOC_ENUM("DAC HP Playback Performance Mode", + max98090_dac_perfmode_enum), + SOC_ENUM("DAC High Performance Mode", max98090_dachp_enum), SOC_SINGLE_TLV("Headphone Left Mixer Volume", M98090_REG_HP_CONTROL, M98090_MIXHPLG_SHIFT, @@ -836,12 +684,9 @@ static const struct snd_kcontrol_new max98090_snd_controls[] = { SOC_SINGLE("Volume Adjustment Smoothing", M98090_REG_LEVEL_CONTROL, M98090_VSENN_SHIFT, M98090_VSENN_NUM - 1, 1), - SND_SOC_BYTES_E("Biquad Coefficients", - M98090_REG_RECORD_BIQUAD_BASE, 15, - snd_soc_bytes_get, max98090_bytes_put), - SOC_SINGLE_EXT("Biquad Switch", M98090_REG_DSP_FILTER_ENABLE, - M98090_ADCBQEN_SHIFT, M98090_ADCBQEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), + SND_SOC_BYTES("Biquad Coefficients", M98090_REG_RECORD_BIQUAD_BASE, 15), + SOC_SINGLE("Biquad Switch", M98090_REG_DSP_FILTER_ENABLE, + M98090_ADCBQEN_SHIFT, M98090_ADCBQEN_NUM - 1, 0), }; static const struct snd_kcontrol_new max98091_snd_controls[] = { @@ -850,12 +695,10 @@ static const struct snd_kcontrol_new max98091_snd_controls[] = { M98090_DMIC34_ZEROPAD_SHIFT, M98090_DMIC34_ZEROPAD_NUM - 1, 0), - SOC_ENUM_EXT("Filter DMIC34 Mode", max98090_filter_dmic34mode_enum, - snd_soc_get_enum_double, max98090_put_enum_double), - SOC_SINGLE_EXT("DMIC34 DC Blocking", M98090_REG_FILTER_CONFIG, + SOC_ENUM("Filter DMIC34 Mode", max98090_filter_dmic34mode_enum), + SOC_SINGLE("DMIC34 DC Blocking", M98090_REG_FILTER_CONFIG, M98090_FLT_DMIC34HPF_SHIFT, - M98090_FLT_DMIC34HPF_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), + M98090_FLT_DMIC34HPF_NUM - 1, 0), SOC_SINGLE_TLV("DMIC3 Boost Volume", M98090_REG_DMIC3_VOLUME, M98090_DMIC_AV3G_SHIFT, M98090_DMIC_AV3G_NUM - 1, 0, @@ -873,9 +716,8 @@ static const struct snd_kcontrol_new max98091_snd_controls[] = { SND_SOC_BYTES("DMIC34 Biquad Coefficients", M98090_REG_DMIC34_BIQUAD_BASE, 15), - SOC_SINGLE_EXT("DMIC34 Biquad Switch", M98090_REG_DSP_FILTER_ENABLE, - M98090_DMIC34BQEN_SHIFT, M98090_DMIC34BQEN_NUM - 1, 0, - snd_soc_get_volsw, max98090_put_volsw), + SOC_SINGLE("DMIC34 Biquad Switch", M98090_REG_DSP_FILTER_ENABLE, + M98090_DMIC34BQEN_SHIFT, M98090_DMIC34BQEN_NUM - 1, 0), SOC_SINGLE_TLV("DMIC34 BQ PreAttenuation Volume", M98090_REG_DMIC34_BQ_PREATTEN, M98090_AV34BQ_SHIFT, @@ -929,6 +771,19 @@ static int max98090_micinput_event(struct snd_soc_dapm_widget *w, return 0; } +static int max98090_shdn_event(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, int event) +{ + struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); + struct max98090_priv *max98090 = snd_soc_component_get_drvdata(component); + + if (event & SND_SOC_DAPM_POST_PMU) + max98090->shdn_pending = true; + + return 0; + +} + static const char *mic1_mux_text[] = { "IN12", "IN56" }; static SOC_ENUM_SINGLE_DECL(mic1_mux_enum, @@ -1029,14 +884,10 @@ static SOC_ENUM_SINGLE_DECL(ltenr_mux_enum, lten_mux_text); static const struct snd_kcontrol_new max98090_ltenl_mux = - SOC_DAPM_ENUM_EXT("LTENL Mux", ltenl_mux_enum, - snd_soc_dapm_get_enum_double, - max98090_dapm_put_enum_double); + SOC_DAPM_ENUM("LTENL Mux", ltenl_mux_enum); static const struct snd_kcontrol_new max98090_ltenr_mux = - SOC_DAPM_ENUM_EXT("LTENR Mux", ltenr_mux_enum, - snd_soc_dapm_get_enum_double, - max98090_dapm_put_enum_double); + SOC_DAPM_ENUM("LTENR Mux", ltenr_mux_enum); static const char *lben_mux_text[] = { "Normal", "Loopback" }; @@ -1051,14 +902,10 @@ static SOC_ENUM_SINGLE_DECL(lbenr_mux_enum, lben_mux_text); static const struct snd_kcontrol_new max98090_lbenl_mux = - SOC_DAPM_ENUM_EXT("LBENL Mux", lbenl_mux_enum, - snd_soc_dapm_get_enum_double, - max98090_dapm_put_enum_double); + SOC_DAPM_ENUM("LBENL Mux", lbenl_mux_enum); static const struct snd_kcontrol_new max98090_lbenr_mux = - SOC_DAPM_ENUM_EXT("LBENR Mux", lbenr_mux_enum, - snd_soc_dapm_get_enum_double, - max98090_dapm_put_enum_double); + SOC_DAPM_ENUM("LBENR Mux", lbenr_mux_enum); static const char *stenl_mux_text[] = { "Normal", "Sidetone Left" }; @@ -1225,25 +1072,21 @@ static const struct snd_soc_dapm_widget max98090_dapm_widgets[] = { SND_SOC_DAPM_INPUT("IN56"), SND_SOC_DAPM_SUPPLY("MICBIAS", M98090_REG_INPUT_ENABLE, - M98090_MBEN_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_MBEN_SHIFT, 0, NULL, 0), SND_SOC_DAPM_SUPPLY("SHDN", M98090_REG_DEVICE_SHUTDOWN, M98090_SHDNN_SHIFT, 0, NULL, 0), SND_SOC_DAPM_SUPPLY("SDIEN", M98090_REG_IO_CONFIGURATION, - M98090_SDIEN_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_SDIEN_SHIFT, 0, NULL, 0), SND_SOC_DAPM_SUPPLY("SDOEN", M98090_REG_IO_CONFIGURATION, - M98090_SDOEN_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_SDOEN_SHIFT, 0, NULL, 0), SND_SOC_DAPM_SUPPLY("DMICL_ENA", M98090_REG_DIGITAL_MIC_ENABLE, - M98090_DIGMICL_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_DIGMICL_SHIFT, 0, max98090_shdn_event, + SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_SUPPLY("DMICR_ENA", M98090_REG_DIGITAL_MIC_ENABLE, - M98090_DIGMICR_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_DIGMICR_SHIFT, 0, max98090_shdn_event, + SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_SUPPLY("AHPF", M98090_REG_FILTER_CONFIG, - M98090_AHPF_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_AHPF_SHIFT, 0, NULL, 0), /* * Note: Sysclk and misc power supplies are taken care of by SHDN @@ -1273,12 +1116,10 @@ static const struct snd_soc_dapm_widget max98090_dapm_widgets[] = { &max98090_lineb_mixer_controls[0], ARRAY_SIZE(max98090_lineb_mixer_controls)), - SND_SOC_DAPM_PGA_E("LINEA Input", M98090_REG_INPUT_ENABLE, - M98090_LINEAEN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), - SND_SOC_DAPM_PGA_E("LINEB Input", M98090_REG_INPUT_ENABLE, - M98090_LINEBEN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + SND_SOC_DAPM_PGA("LINEA Input", M98090_REG_INPUT_ENABLE, + M98090_LINEAEN_SHIFT, 0, NULL, 0), + SND_SOC_DAPM_PGA("LINEB Input", M98090_REG_INPUT_ENABLE, + M98090_LINEBEN_SHIFT, 0, NULL, 0), SND_SOC_DAPM_MIXER("Left ADC Mixer", SND_SOC_NOPM, 0, 0, &max98090_left_adc_mixer_controls[0], @@ -1289,11 +1130,11 @@ static const struct snd_soc_dapm_widget max98090_dapm_widgets[] = { ARRAY_SIZE(max98090_right_adc_mixer_controls)), SND_SOC_DAPM_ADC_E("ADCL", NULL, M98090_REG_INPUT_ENABLE, - M98090_ADLEN_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_ADLEN_SHIFT, 0, max98090_shdn_event, + SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_ADC_E("ADCR", NULL, M98090_REG_INPUT_ENABLE, - M98090_ADREN_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_ADREN_SHIFT, 0, max98090_shdn_event, + SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_AIF_OUT("AIFOUTL", "HiFi Capture", 0, SND_SOC_NOPM, 0, 0), @@ -1321,12 +1162,10 @@ static const struct snd_soc_dapm_widget max98090_dapm_widgets[] = { SND_SOC_DAPM_AIF_IN("AIFINL", "HiFi Playback", 0, SND_SOC_NOPM, 0, 0), SND_SOC_DAPM_AIF_IN("AIFINR", "HiFi Playback", 1, SND_SOC_NOPM, 0, 0), - SND_SOC_DAPM_DAC_E("DACL", NULL, M98090_REG_OUTPUT_ENABLE, - M98090_DALEN_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), - SND_SOC_DAPM_DAC_E("DACR", NULL, M98090_REG_OUTPUT_ENABLE, - M98090_DAREN_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + SND_SOC_DAPM_DAC("DACL", NULL, M98090_REG_OUTPUT_ENABLE, + M98090_DALEN_SHIFT, 0), + SND_SOC_DAPM_DAC("DACR", NULL, M98090_REG_OUTPUT_ENABLE, + M98090_DAREN_SHIFT, 0), SND_SOC_DAPM_MIXER("Left Headphone Mixer", SND_SOC_NOPM, 0, 0, &max98090_left_hp_mixer_controls[0], @@ -1361,26 +1200,20 @@ static const struct snd_soc_dapm_widget max98090_dapm_widgets[] = { SND_SOC_DAPM_MUX("MIXHPRSEL Mux", SND_SOC_NOPM, 0, 0, &max98090_mixhprsel_mux), - SND_SOC_DAPM_PGA_E("HP Left Out", M98090_REG_OUTPUT_ENABLE, - M98090_HPLEN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), - SND_SOC_DAPM_PGA_E("HP Right Out", M98090_REG_OUTPUT_ENABLE, - M98090_HPREN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), - - SND_SOC_DAPM_PGA_E("SPK Left Out", M98090_REG_OUTPUT_ENABLE, - M98090_SPLEN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), - SND_SOC_DAPM_PGA_E("SPK Right Out", M98090_REG_OUTPUT_ENABLE, - M98090_SPREN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), - - SND_SOC_DAPM_PGA_E("RCV Left Out", M98090_REG_OUTPUT_ENABLE, - M98090_RCVLEN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), - SND_SOC_DAPM_PGA_E("RCV Right Out", M98090_REG_OUTPUT_ENABLE, - M98090_RCVREN_SHIFT, 0, NULL, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + SND_SOC_DAPM_PGA("HP Left Out", M98090_REG_OUTPUT_ENABLE, + M98090_HPLEN_SHIFT, 0, NULL, 0), + SND_SOC_DAPM_PGA("HP Right Out", M98090_REG_OUTPUT_ENABLE, + M98090_HPREN_SHIFT, 0, NULL, 0), + + SND_SOC_DAPM_PGA("SPK Left Out", M98090_REG_OUTPUT_ENABLE, + M98090_SPLEN_SHIFT, 0, NULL, 0), + SND_SOC_DAPM_PGA("SPK Right Out", M98090_REG_OUTPUT_ENABLE, + M98090_SPREN_SHIFT, 0, NULL, 0), + + SND_SOC_DAPM_PGA("RCV Left Out", M98090_REG_OUTPUT_ENABLE, + M98090_RCVLEN_SHIFT, 0, NULL, 0), + SND_SOC_DAPM_PGA("RCV Right Out", M98090_REG_OUTPUT_ENABLE, + M98090_RCVREN_SHIFT, 0, NULL, 0), SND_SOC_DAPM_OUTPUT("HPL"), SND_SOC_DAPM_OUTPUT("HPR"), @@ -1395,11 +1228,9 @@ static const struct snd_soc_dapm_widget max98091_dapm_widgets[] = { SND_SOC_DAPM_INPUT("DMIC4"), SND_SOC_DAPM_SUPPLY("DMIC3_ENA", M98090_REG_DIGITAL_MIC_ENABLE, - M98090_DIGMIC3_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_DIGMIC3_SHIFT, 0, NULL, 0), SND_SOC_DAPM_SUPPLY("DMIC4_ENA", M98090_REG_DIGITAL_MIC_ENABLE, - M98090_DIGMIC4_SHIFT, 0, max98090_dapm_event, - SND_SOC_DAPM_PRE_POST_PMU | SND_SOC_DAPM_PRE_POST_PMD), + M98090_DIGMIC4_SHIFT, 0, NULL, 0), }; static const struct snd_soc_dapm_route max98090_dapm_routes[] = { @@ -1670,11 +1501,6 @@ static void max98090_configure_bclk(struct snd_soc_component *component) return; } - /* - * Master mode: no need to save and restore SHDN for the following - * sensitive registers. - */ - /* Check for supported PCLK to LRCLK ratios */ for (i = 0; i < ARRAY_SIZE(pclk_rates); i++) { if ((pclk_rates[i] == max98090->sysclk) && @@ -1761,14 +1587,12 @@ static int max98090_dai_set_fmt(struct snd_soc_dai *codec_dai, switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) { case SND_SOC_DAIFMT_CBS_CFS: /* Set to slave mode PLL - MAS mode off */ - max98090_shdn_save(max98090); snd_soc_component_write(component, M98090_REG_CLOCK_RATIO_NI_MSB, 0x00); snd_soc_component_write(component, M98090_REG_CLOCK_RATIO_NI_LSB, 0x00); snd_soc_component_update_bits(component, M98090_REG_CLOCK_MODE, M98090_USE_M1_MASK, 0); - max98090_shdn_restore(max98090); max98090->master = false; break; case SND_SOC_DAIFMT_CBM_CFM: @@ -1794,9 +1618,7 @@ static int max98090_dai_set_fmt(struct snd_soc_dai *codec_dai, dev_err(component->dev, "DAI clock mode unsupported"); return -EINVAL; } - max98090_shdn_save(max98090); snd_soc_component_write(component, M98090_REG_MASTER_MODE, regval); - max98090_shdn_restore(max98090); regval = 0; switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) { @@ -1841,10 +1663,8 @@ static int max98090_dai_set_fmt(struct snd_soc_dai *codec_dai, if (max98090->tdm_slots > 1) regval ^= M98090_BCI_MASK; - max98090_shdn_save(max98090); snd_soc_component_write(component, M98090_REG_INTERFACE_FORMAT, regval); - max98090_shdn_restore(max98090); } return 0; @@ -1856,7 +1676,6 @@ static int max98090_set_tdm_slot(struct snd_soc_dai *codec_dai, struct snd_soc_component *component = codec_dai->component; struct max98090_priv *max98090 = snd_soc_component_get_drvdata(component); struct max98090_cdata *cdata; - cdata = &max98090->dai[0]; if (slots < 0 || slots > 4) @@ -1866,7 +1685,6 @@ static int max98090_set_tdm_slot(struct snd_soc_dai *codec_dai, max98090->tdm_width = slot_width; if (max98090->tdm_slots > 1) { - max98090_shdn_save(max98090); /* SLOTL SLOTR SLOTDLY */ snd_soc_component_write(component, M98090_REG_TDM_FORMAT, 0 << M98090_TDM_SLOTL_SHIFT | @@ -1877,7 +1695,6 @@ static int max98090_set_tdm_slot(struct snd_soc_dai *codec_dai, snd_soc_component_update_bits(component, M98090_REG_TDM_CONTROL, M98090_TDM_MASK, M98090_TDM_MASK); - max98090_shdn_restore(max98090); } /* @@ -2077,7 +1894,6 @@ static int max98090_configure_dmic(struct max98090_priv *max98090, dmic_freq = dmic_table[pclk_index].settings[micclk_index].freq; dmic_comp = dmic_table[pclk_index].settings[micclk_index].comp[i]; - max98090_shdn_save(max98090); regmap_update_bits(max98090->regmap, M98090_REG_DIGITAL_MIC_ENABLE, M98090_MICCLK_MASK, micclk_index << M98090_MICCLK_SHIFT); @@ -2086,7 +1902,6 @@ static int max98090_configure_dmic(struct max98090_priv *max98090, M98090_DMIC_COMP_MASK | M98090_DMIC_FREQ_MASK, dmic_comp << M98090_DMIC_COMP_SHIFT | dmic_freq << M98090_DMIC_FREQ_SHIFT); - max98090_shdn_restore(max98090); return 0; } @@ -2123,10 +1938,8 @@ static int max98090_dai_hw_params(struct snd_pcm_substream *substream, switch (params_width(params)) { case 16: - max98090_shdn_save(max98090); snd_soc_component_update_bits(component, M98090_REG_INTERFACE_FORMAT, M98090_WS_MASK, 0); - max98090_shdn_restore(max98090); break; default: return -EINVAL; @@ -2137,7 +1950,6 @@ static int max98090_dai_hw_params(struct snd_pcm_substream *substream, cdata->rate = max98090->lrclk; - max98090_shdn_save(max98090); /* Update filter mode */ if (max98090->lrclk < 24000) snd_soc_component_update_bits(component, M98090_REG_FILTER_CONFIG, @@ -2153,7 +1965,6 @@ static int max98090_dai_hw_params(struct snd_pcm_substream *substream, else snd_soc_component_update_bits(component, M98090_REG_FILTER_CONFIG, M98090_DHF_MASK, M98090_DHF_MASK); - max98090_shdn_restore(max98090); max98090_configure_dmic(max98090, max98090->dmic_freq, max98090->pclk, max98090->lrclk); @@ -2184,7 +1995,6 @@ static int max98090_dai_set_sysclk(struct snd_soc_dai *dai, * 0x02 (when master clk is 20MHz to 40MHz).. * 0x03 (when master clk is 40MHz to 60MHz).. */ - max98090_shdn_save(max98090); if ((freq >= 10000000) && (freq <= 20000000)) { snd_soc_component_write(component, M98090_REG_SYSTEM_CLOCK, M98090_PSCLK_DIV1); @@ -2199,10 +2009,8 @@ static int max98090_dai_set_sysclk(struct snd_soc_dai *dai, max98090->pclk = freq >> 2; } else { dev_err(component->dev, "Invalid master clock frequency\n"); - max98090_shdn_restore(max98090); return -EINVAL; } - max98090_shdn_restore(max98090); max98090->sysclk = freq; @@ -2314,12 +2122,10 @@ static void max98090_pll_work(struct max98090_priv *max98090) */ /* Toggle shutdown OFF then ON */ - mutex_lock(&component->card->dapm_mutex); snd_soc_component_update_bits(component, M98090_REG_DEVICE_SHUTDOWN, M98090_SHDNN_MASK, 0); snd_soc_component_update_bits(component, M98090_REG_DEVICE_SHUTDOWN, M98090_SHDNN_MASK, M98090_SHDNN_MASK); - mutex_unlock(&component->card->dapm_mutex); for (i = 0; i < 10; ++i) { /* Give PLL time to lock */ @@ -2642,12 +2448,7 @@ static int max98090_probe(struct snd_soc_component *component) */ snd_soc_component_read32(component, M98090_REG_DEVICE_STATUS); - /* - * SHDN should be 0 at the point, no need to save/restore for the - * following registers. - * - * High Performance is default - */ + /* High Performance is default */ snd_soc_component_update_bits(component, M98090_REG_DAC_CONTROL, M98090_DACHP_MASK, 1 << M98090_DACHP_SHIFT); @@ -2658,12 +2459,7 @@ static int max98090_probe(struct snd_soc_component *component) M98090_ADCHP_MASK, 1 << M98090_ADCHP_SHIFT); - /* - * SHDN should be 0 at the point, no need to save/restore for the - * following registers. - * - * Turn on VCM bandgap reference - */ + /* Turn on VCM bandgap reference */ snd_soc_component_write(component, M98090_REG_BIAS_CONTROL, M98090_VCM_MODE_MASK); @@ -2695,9 +2491,25 @@ static void max98090_remove(struct snd_soc_component *component) max98090->component = NULL; } +static void max98090_seq_notifier(struct snd_soc_component *component, + enum snd_soc_dapm_type event, int subseq) +{ + struct max98090_priv *max98090 = snd_soc_component_get_drvdata(component); + + if (max98090->shdn_pending) { + snd_soc_component_update_bits(component, M98090_REG_DEVICE_SHUTDOWN, + M98090_SHDNN_MASK, 0); + msleep(40); + snd_soc_component_update_bits(component, M98090_REG_DEVICE_SHUTDOWN, + M98090_SHDNN_MASK, M98090_SHDNN_MASK); + max98090->shdn_pending = false; + } +} + static const struct snd_soc_component_driver soc_component_dev_max98090 = { .probe = max98090_probe, .remove = max98090_remove, + .seq_notifier = max98090_seq_notifier, .set_bias_level = max98090_set_bias_level, .idle_bias_on = 1, .use_pmdown_time = 1, diff --git a/sound/soc/codecs/max98090.h b/sound/soc/codecs/max98090.h index 0a31708b7df7..a197114b0dad 100644 --- a/sound/soc/codecs/max98090.h +++ b/sound/soc/codecs/max98090.h @@ -1539,8 +1539,7 @@ struct max98090_priv { unsigned int pa2en; unsigned int sidetone; bool master; - int saved_count; - int saved_shdn; + bool shdn_pending; }; int max98090_mic_detect(struct snd_soc_component *component, -- cgit v1.2.3 From 8f486296459c084b106d907414540301bd9485fd Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Fri, 14 Feb 2020 18:57:43 +0800 Subject: ASoC: dapm: remove snd_soc_dapm_put_enum_double_locked Reverts commit 839284e79482 ("ASoC: dapm: add snd_soc_dapm_put_enum_double_locked"). Signed-off-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20200214105744.82258-3-tzungbi@google.com Signed-off-by: Mark Brown --- include/sound/soc-dapm.h | 2 -- sound/soc/soc-dapm.c | 54 ++++++++++++------------------------------------ 2 files changed, 13 insertions(+), 43 deletions(-) diff --git a/include/sound/soc-dapm.h b/include/sound/soc-dapm.h index 2a306c6f3fbc..1b6afbc1a4ed 100644 --- a/include/sound/soc-dapm.h +++ b/include/sound/soc-dapm.h @@ -392,8 +392,6 @@ int snd_soc_dapm_get_enum_double(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol); int snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol); -int snd_soc_dapm_put_enum_double_locked(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol); int snd_soc_dapm_info_pin_switch(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo); int snd_soc_dapm_get_pin_switch(struct snd_kcontrol *kcontrol, diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 8b24396675ec..9b130561d562 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3441,8 +3441,17 @@ int snd_soc_dapm_get_enum_double(struct snd_kcontrol *kcontrol, } EXPORT_SYMBOL_GPL(snd_soc_dapm_get_enum_double); -static int __snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol, int locked) +/** + * snd_soc_dapm_put_enum_double - dapm enumerated double mixer set callback + * @kcontrol: mixer control + * @ucontrol: control element information + * + * Callback to set the value of a dapm enumerated double mixer control. + * + * Returns 0 for success. + */ +int snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) { struct snd_soc_dapm_context *dapm = snd_soc_dapm_kcontrol_dapm(kcontrol); struct snd_soc_card *card = dapm->card; @@ -3465,9 +3474,7 @@ static int __snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol, mask |= e->mask << e->shift_r; } - if (!locked) - mutex_lock_nested(&card->dapm_mutex, - SND_SOC_DAPM_CLASS_RUNTIME); + mutex_lock_nested(&card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME); change = dapm_kcontrol_set_value(kcontrol, val); @@ -3489,50 +3496,15 @@ static int __snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol, card->update = NULL; } - if (!locked) - mutex_unlock(&card->dapm_mutex); + mutex_unlock(&card->dapm_mutex); if (ret > 0) soc_dpcm_runtime_update(card); return change; } - -/** - * snd_soc_dapm_put_enum_double - dapm enumerated double mixer set callback - * @kcontrol: mixer control - * @ucontrol: control element information - * - * Callback to set the value of a dapm enumerated double mixer control. - * - * Returns 0 for success. - */ -int snd_soc_dapm_put_enum_double(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - return __snd_soc_dapm_put_enum_double(kcontrol, ucontrol, 0); -} EXPORT_SYMBOL_GPL(snd_soc_dapm_put_enum_double); -/** - * snd_soc_dapm_put_enum_double_locked - dapm enumerated double mixer set - * callback - * @kcontrol: mixer control - * @ucontrol: control element information - * - * Callback to set the value of a dapm enumerated double mixer control. - * Must acquire dapm_mutex before calling the function. - * - * Returns 0 for success. - */ -int snd_soc_dapm_put_enum_double_locked(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - dapm_assert_locked(snd_soc_dapm_kcontrol_dapm(kcontrol)); - return __snd_soc_dapm_put_enum_double(kcontrol, ucontrol, 1); -} -EXPORT_SYMBOL_GPL(snd_soc_dapm_put_enum_double_locked); - /** * snd_soc_dapm_info_pin_switch - Info for a pin switch * -- cgit v1.2.3 From 2c6251ad91afc2e3c671c904702e8d121d3d50c0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 12 Feb 2020 22:37:08 -0600 Subject: cifs: enable change notification for SMB2.1 dialect It was originally enabled only for SMB3 or later dialects, but had requests to add it to SMB2.1 mounts as well given the large number of systems at that dialect level. Signed-off-by: Steve French Reported-by: L Walsh Acked-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index baa825f4cec0..aef33630e315 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4795,6 +4795,7 @@ struct smb_version_operations smb21_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .enum_snapshots = smb3_enum_snapshots, + .notify = smb3_notify, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, #ifdef CONFIG_CIFS_XATTR -- cgit v1.2.3 From 85db6b7ae65f33be4bb44f1c28261a3faa126437 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 13 Feb 2020 12:14:47 +1000 Subject: cifs: make sure we do not overflow the max EA buffer size RHBZ: 1752437 Before we add a new EA we should check that this will not overflow the maximum buffer we have available to read the EAs back. Otherwise we can get into a situation where the EAs are so big that we can not read them back to the client and thus we can not list EAs anymore or delete them. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French CC: Stable --- fs/cifs/smb2ops.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index aef33630e315..e47190cae163 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1116,7 +1116,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, void *data[1]; struct smb2_file_full_ea_info *ea = NULL; struct kvec close_iov[1]; - int rc; + struct smb2_query_info_rsp *rsp; + int rc, used_len = 0; if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1139,6 +1140,38 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, cifs_sb); if (rc == -ENODATA) goto sea_exit; + } else { + /* If we are adding a attribute we should first check + * if there will be enough space available to store + * the new EA. If not we should not add it since we + * would not be able to even read the EAs back. + */ + rc = smb2_query_info_compound(xid, tcon, utf16_path, + FILE_READ_EA, + FILE_FULL_EA_INFORMATION, + SMB2_O_INFO_FILE, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE, + &rsp_iov[1], &resp_buftype[1], cifs_sb); + if (rc == 0) { + rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + used_len = le32_to_cpu(rsp->OutputBufferLength); + } + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + resp_buftype[1] = CIFS_NO_BUFFER; + memset(&rsp_iov[1], 0, sizeof(rsp_iov[1])); + rc = 0; + + /* Use a fudge factor of 256 bytes in case we collide + * with a different set_EAs command. + */ + if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 < + used_len + ea_name_len + ea_value_len + 1) { + rc = -ENOSPC; + goto sea_exit; + } } } -- cgit v1.2.3 From 2d570a7c0251c594489a2c16b82b14ae30345c03 Mon Sep 17 00:00:00 2001 From: Anton Eidelman Date: Mon, 10 Feb 2020 10:37:18 -0800 Subject: nvme/tcp: fix bug on double requeue when send fails When nvme_tcp_io_work() fails to send to socket due to connection close/reset, error_recovery work is triggered from nvme_tcp_state_change() socket callback. This cancels all the active requests in the tagset, which requeues them. The failed request, however, was ended and thus requeued individually as well unless send returned -EPIPE. Another return code to be treated the same way is -ECONNRESET. Double requeue caused BUG_ON(blk_queued_rq(rq)) in blk_mq_requeue_request() from either the individual requeue of the failed request or the bulk requeue from blk_mq_tagset_busy_iter(, nvme_cancel_request, ); Signed-off-by: Anton Eidelman Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/tcp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 6d43b23a0fc8..f8fa5c5b79f1 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1054,7 +1054,12 @@ static void nvme_tcp_io_work(struct work_struct *w) } else if (unlikely(result < 0)) { dev_err(queue->ctrl->ctrl.device, "failed to send request %d\n", result); - if (result != -EPIPE) + + /* + * Fail the request unless peer closed the connection, + * in which case error recovery flow will complete all. + */ + if ((result != -EPIPE) && (result != -ECONNRESET)) nvme_tcp_fail_request(queue->request); nvme_tcp_done_send_req(queue); return; -- cgit v1.2.3 From 97b2512ad000a409b4073dd1a71e4157d76675cb Mon Sep 17 00:00:00 2001 From: Nigel Kirkland Date: Mon, 10 Feb 2020 16:01:45 -0800 Subject: nvme: prevent warning triggered by nvme_stop_keep_alive Delayed keep alive work is queued on system workqueue and may be cancelled via nvme_stop_keep_alive from nvme_reset_wq, nvme_fc_wq or nvme_wq. Check_flush_dependency detects mismatched attributes between the work-queue context used to cancel the keep alive work and system-wq. Specifically system-wq does not have the WQ_MEM_RECLAIM flag, whereas the contexts used to cancel keep alive work have WQ_MEM_RECLAIM flag. Example warning: workqueue: WQ_MEM_RECLAIM nvme-reset-wq:nvme_fc_reset_ctrl_work [nvme_fc] is flushing !WQ_MEM_RECLAIM events:nvme_keep_alive_work [nvme_core] To avoid the flags mismatch, delayed keep alive work is queued on nvme_wq. However this creates a secondary concern where work and a request to cancel that work may be in the same work queue - namely err_work in the rdma and tcp transports, which will want to flush/cancel the keep alive work which will now be on nvme_wq. After reviewing the transports, it looks like err_work can be moved to nvme_reset_wq. In fact that aligns them better with transition into RESETTING and performing related reset work in nvme_reset_wq. Change nvme-rdma and nvme-tcp to perform err_work in nvme_reset_wq. Signed-off-by: Nigel Kirkland Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 +++++----- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5dc32b72e7fa..7f05deada7f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -66,8 +66,8 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); * nvme_reset_wq - hosts nvme reset works * nvme_delete_wq - hosts nvme delete works * - * nvme_wq will host works such are scan, aen handling, fw activation, - * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq + * nvme_wq will host works such as scan, aen handling, fw activation, + * keep-alive, periodic reconnects etc. nvme_reset_wq * runs reset works which also flush works hosted on nvme_wq for * serialization purposes. nvme_delete_wq host controller deletion * works which flush reset works for serialization. @@ -976,7 +976,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) startka = true; spin_unlock_irqrestore(&ctrl->lock, flags); if (startka) - schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); } static int nvme_keep_alive(struct nvme_ctrl *ctrl) @@ -1006,7 +1006,7 @@ static void nvme_keep_alive_work(struct work_struct *work) dev_dbg(ctrl->device, "reschedule traffic based keep-alive timer\n"); ctrl->comp_seen = false; - schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); return; } @@ -1023,7 +1023,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); } void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2a47c6c5007e..3e85c5cacefd 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1088,7 +1088,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; - queue_work(nvme_wq, &ctrl->err_work); + queue_work(nvme_reset_wq, &ctrl->err_work); } static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f8fa5c5b79f1..49d4373b84eb 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -422,7 +422,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return; - queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work); + queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); } static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, -- cgit v1.2.3 From fa46c6fb5d61b1f17b06d7c6ef75478b576304c7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 13 Feb 2020 01:41:05 +0900 Subject: nvme/pci: move cqe check after device shutdown Many users have reported nvme triggered irq_startup() warnings during shutdown. The driver uses the nvme queue's irq to synchronize scanning for completions, and enabling an interrupt affined to only offline CPUs triggers the alarming warning. Move the final CQE check to after disabling the device and all registered interrupts have been torn down so that we do not have any IRQ to synchronize. Link: https://bugzilla.kernel.org/show_bug.cgi?id=206509 Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index da392b50f73e..9c80f9f08149 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1401,6 +1401,23 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) nvme_poll_irqdisable(nvmeq, -1); } +/* + * Called only on a device that has been disabled and after all other threads + * that can check this device's completion queues have synced. This is the + * last chance for the driver to see a natural completion before + * nvme_cancel_request() terminates all incomplete requests. + */ +static void nvme_reap_pending_cqes(struct nvme_dev *dev) +{ + u16 start, end; + int i; + + for (i = dev->ctrl.queue_count - 1; i > 0; i--) { + nvme_process_cq(&dev->queues[i], &start, &end, -1); + nvme_complete_cqes(&dev->queues[i], start, end); + } +} + static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, int entry_size) { @@ -2235,11 +2252,6 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) if (timeout == 0) return false; - /* handle any remaining CQEs */ - if (opcode == nvme_admin_delete_cq && - !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags)) - nvme_poll_irqdisable(nvmeq, -1); - sent--; if (nr_queues) goto retry; @@ -2428,6 +2440,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_suspend_io_queues(dev); nvme_suspend_queue(&dev->queues[0]); nvme_pci_disable(dev); + nvme_reap_pending_cqes(dev); blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); -- cgit v1.2.3 From f25372ffc3f6c2684b57fb718219137e6ee2b64c Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 14 Feb 2020 18:48:02 +0800 Subject: nvme: fix the parameter order for nvme_get_log in nvme_get_fw_slot_info nvme fw-activate operation will get bellow warning log, fix it by update the parameter order [ 113.231513] nvme nvme0: Get FW SLOT INFO log error Fixes: 0e98719b0e4b ("nvme: simplify the API for getting log pages") Reported-by: Sujith Pandel Reviewed-by: David Milburn Signed-off-by: Yi Zhang Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7f05deada7f4..ada59df642d2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3867,7 +3867,7 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log, + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); -- cgit v1.2.3 From 43064f5c8b8818e35fa254496ef00aabd63d547a Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Thu, 6 Feb 2020 14:26:38 -0500 Subject: drm/amd/display: fix backwards byte order in rx_caps. We were using incorrect byte order after we started using the drm_defines So fix it. Fixes: 02837a91ae75 ("drm/amd/display: add and use defines from drm_hdcp.h") Signed-off-by: JinZe.Xu Signed-off-by: Bhawanpreet Lakha Reviewed-by: Wenjing Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c index f730b94ac3c0..55246711700b 100644 --- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c +++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp2_execution.c @@ -46,8 +46,8 @@ static inline enum mod_hdcp_status check_hdcp2_capable(struct mod_hdcp *hdcp) enum mod_hdcp_status status; if (is_dp_hdcp(hdcp)) - status = (hdcp->auth.msg.hdcp2.rxcaps_dp[2] & HDCP_2_2_RX_CAPS_VERSION_VAL) && - HDCP_2_2_DP_HDCP_CAPABLE(hdcp->auth.msg.hdcp2.rxcaps_dp[0]) ? + status = (hdcp->auth.msg.hdcp2.rxcaps_dp[0] == HDCP_2_2_RX_CAPS_VERSION_VAL) && + HDCP_2_2_DP_HDCP_CAPABLE(hdcp->auth.msg.hdcp2.rxcaps_dp[2]) ? MOD_HDCP_STATUS_SUCCESS : MOD_HDCP_STATUS_HDCP2_NOT_CAPABLE; else -- cgit v1.2.3 From c6f8c440441029d5621ee5153676243234a4b76e Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Fri, 7 Feb 2020 10:41:20 -0500 Subject: drm/amd/display: fix dtm unloading there was a type in the terminate command. We should be calling psp_dtm_unload() instead of psp_hdcp_unload() Fixes: 143f23053333 ("drm/amdgpu: psp DTM init") Signed-off-by: Bhawanpreet Lakha Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 3a1570dafe34..146f96661b6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1013,6 +1013,30 @@ static int psp_dtm_initialize(struct psp_context *psp) return 0; } +static int psp_dtm_unload(struct psp_context *psp) +{ + int ret; + struct psp_gfx_cmd_resp *cmd; + + /* + * TODO: bypass the unloading in sriov for now + */ + if (amdgpu_sriov_vf(psp->adev)) + return 0; + + cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + psp_prep_ta_unload_cmd_buf(cmd, psp->dtm_context.session_id); + + ret = psp_cmd_submit_buf(psp, NULL, cmd, psp->fence_buf_mc_addr); + + kfree(cmd); + + return ret; +} + int psp_dtm_invoke(struct psp_context *psp, uint32_t ta_cmd_id) { /* @@ -1037,7 +1061,7 @@ static int psp_dtm_terminate(struct psp_context *psp) if (!psp->dtm_context.dtm_initialized) return 0; - ret = psp_hdcp_unload(psp); + ret = psp_dtm_unload(psp); if (ret) return ret; -- cgit v1.2.3 From aad4e2dbe543bc1633bc208ac7bddc4f0bb185ba Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Tue, 11 Feb 2020 12:39:53 +0800 Subject: drm/amd/powerplay: always refetch the enabled features status on dpm enablement Otherwise, the cached dpm features status may be inconsistent under some case(e.g. baco reset of Navi asic). Signed-off-by: Evan Quan Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index 0dc49479a7eb..b06c057a9002 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -898,6 +898,9 @@ int smu_v11_0_system_features_control(struct smu_context *smu, if (ret) return ret; + bitmap_zero(feature->enabled, feature->feature_num); + bitmap_zero(feature->supported, feature->feature_num); + if (en) { ret = smu_feature_get_enabled_mask(smu, feature_mask, 2); if (ret) @@ -907,9 +910,6 @@ int smu_v11_0_system_features_control(struct smu_context *smu, feature->feature_num); bitmap_copy(feature->supported, (unsigned long *)&feature_mask, feature->feature_num); - } else { - bitmap_zero(feature->enabled, feature->feature_num); - bitmap_zero(feature->supported, feature->feature_num); } return ret; -- cgit v1.2.3 From c657b936ea98630ef5ba4f130ab1ad5c534d0165 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2020 01:46:16 -0500 Subject: drm/amdgpu/soc15: fix xclk for raven It's 25 Mhz (refclk / 4). This fixes the interpretation of the rlc clock counter. Acked-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/soc15.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 15f3424a1ff7..2b488dfb2f21 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -272,7 +272,12 @@ static u32 soc15_get_config_memsize(struct amdgpu_device *adev) static u32 soc15_get_xclk(struct amdgpu_device *adev) { - return adev->clock.spll.reference_freq; + u32 reference_clock = adev->clock.spll.reference_freq; + + if (adev->asic_type == CHIP_RAVEN) + return reference_clock / 4; + + return reference_clock; } -- cgit v1.2.3 From 120cf959308e1bda984e40a9edd25ee2d6262efd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2020 08:51:29 -0500 Subject: drm/amdgpu/gfx9: disable gfxoff when reading rlc clock Otherwise we readback all ones. Fixes rlc counter readback while gfxoff is active. Reviewed-by: Xiaojie Yuan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index b33a4eb39193..6d6aca08d6fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3959,6 +3959,7 @@ static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev) { uint64_t clock; + amdgpu_gfx_off_ctrl(adev, false); mutex_lock(&adev->gfx.gpu_clock_mutex); if (adev->asic_type == CHIP_VEGA10 && amdgpu_sriov_runtime(adev)) { uint32_t tmp, lsb, msb, i = 0; @@ -3977,6 +3978,7 @@ static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev) ((uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL); } mutex_unlock(&adev->gfx.gpu_clock_mutex); + amdgpu_gfx_off_ctrl(adev, true); return clock; } -- cgit v1.2.3 From b08c3ed609aabc4e76e74edc4404f0c26279d7ed Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Feb 2020 08:52:32 -0500 Subject: drm/amdgpu/gfx10: disable gfxoff when reading rlc clock Otherwise we readback all ones. Fixes rlc counter readback while gfxoff is active. Reviewed-by: Xiaojie Yuan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 1785fdad6ecb..22bbb36c768e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3923,11 +3923,13 @@ static uint64_t gfx_v10_0_get_gpu_clock_counter(struct amdgpu_device *adev) { uint64_t clock; + amdgpu_gfx_off_ctrl(adev, false); mutex_lock(&adev->gfx.gpu_clock_mutex); WREG32_SOC15(GC, 0, mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1); clock = (uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_LSB) | ((uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL); mutex_unlock(&adev->gfx.gpu_clock_mutex); + amdgpu_gfx_off_ctrl(adev, true); return clock; } -- cgit v1.2.3 From 685eff513183d6d64a5f413531e683d23b8b198b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 6 Feb 2020 10:27:54 -0400 Subject: IB/mlx5: Use div64_u64 for num_var_hw_entries calculation On i386: ERROR: "__udivdi3" [drivers/infiniband/hw/mlx5/mlx5_ib.ko] undefined! ERROR: "__divdi3" [drivers/infiniband/hw/mlx5/mlx5_ib.ko] undefined! Fixes: f164be8c0366 ("IB/mlx5: Extend caps stage to handle VAR capabilities") Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Reported-by: Alexander Lobakin Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 987bfdcd12a5..e4bcfa81b70a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6545,7 +6545,7 @@ static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev) doorbell_bar_offset); bar_size = (1ULL << log_doorbell_bar_size) * 4096; var_table->stride_size = 1ULL << log_doorbell_stride; - var_table->num_var_hw_entries = bar_size / var_table->stride_size; + var_table->num_var_hw_entries = div64_u64(bar_size, var_table->stride_size); mutex_init(&var_table->bitmap_lock); var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries, GFP_KERNEL); -- cgit v1.2.3 From 726464596b5d3f10b7c655129a62168e5c17d60c Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 12 Feb 2020 23:35:03 +0000 Subject: MAINTAINERS: eCryptfs: Update maintainer address and downgrade status Adjust my email address to a personal account. Downgrade the status of eCryptfs maintenance to 'Odd Fixes' since it has not been part of my work responsibilities recently and I've had little personal time to devote to it. eCryptfs hasn't seen active development in some time. New deployments of file level encryption should use more modern solutions, such as fscrypt, where possible. Signed-off-by: Tyler Hicks --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 08176d64eed5..04ee092e1940 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5740,12 +5740,12 @@ S: Maintained F: drivers/media/dvb-frontends/ec100* ECRYPT FILE SYSTEM -M: Tyler Hicks +M: Tyler Hicks L: ecryptfs@vger.kernel.org W: http://ecryptfs.org W: https://launchpad.net/ecryptfs T: git git://git.kernel.org/pub/scm/linux/kernel/git/tyhicks/ecryptfs.git -S: Supported +S: Odd Fixes F: Documentation/filesystems/ecryptfs.txt F: fs/ecryptfs/ -- cgit v1.2.3 From f8e48a8408f5e23dd514916fda128a87e34f8ffd Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 13 Feb 2020 21:25:54 +0000 Subject: eCryptfs: Replace deactivated email address Replace a recently deactived email address with one that I'll be able to personally control and keep alive. Signed-off-by: Tyler Hicks --- fs/ecryptfs/ecryptfs_kernel.h | 2 +- fs/ecryptfs/main.c | 2 +- fs/ecryptfs/messaging.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 1c1a56be7ea2..e6ac78c62ca4 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -8,7 +8,7 @@ * Copyright (C) 2004-2008 International Business Machines Corp. * Author(s): Michael A. Halcrow * Trevor S. Highland - * Tyler Hicks + * Tyler Hicks */ #ifndef ECRYPTFS_KERNEL_H diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index b8a7ce379ffe..e63259fdef28 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -7,7 +7,7 @@ * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow * Michael C. Thompson - * Tyler Hicks + * Tyler Hicks */ #include diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index c05ca39aa449..8646ba76def3 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -4,7 +4,7 @@ * * Copyright (C) 2004-2008 International Business Machines Corp. * Author(s): Michael A. Halcrow - * Tyler Hicks + * Tyler Hicks */ #include #include -- cgit v1.2.3 From 2c2a7552dd6465e8fde6bc9cccf8d66ed1c1eb72 Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Fri, 14 Feb 2020 12:21:01 -0600 Subject: ecryptfs: replace BUG_ON with error handling code In crypt_scatterlist, if the crypt_stat argument is not set up correctly, the kernel crashes. Instead, by returning an error code upstream, the error is handled safely. The issue is detected via a static analysis tool written by us. Fixes: 237fead619984 (ecryptfs: fs/Makefile and fs/Kconfig) Signed-off-by: Aditya Pakki Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f91db24bbf3b..a064b408d841 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -311,8 +311,10 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, struct extent_crypt_result ecr; int rc = 0; - BUG_ON(!crypt_stat || !crypt_stat->tfm - || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)); + if (!crypt_stat || !crypt_stat->tfm + || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) + return -EINVAL; + if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); -- cgit v1.2.3 From 76261ada16dcc3be610396a46d35acc3efbda682 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 12 Feb 2020 21:08:59 -0800 Subject: scsi: Revert "RDMA/isert: Fix a recently introduced regression related to logout" Since commit 04060db41178 introduces soft lockups when toggling network interfaces, revert it. Link: https://marc.info/?l=target-devel&m=158157054906196 Cc: Rahul Kundu Cc: Mike Marciniszyn Cc: Sagi Grimberg Reported-by: Dakshaja Uppalapati Fixes: 04060db41178 ("scsi: RDMA/isert: Fix a recently introduced regression related to logout") Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/infiniband/ulp/isert/ib_isert.c | 12 ++++++++++++ drivers/target/iscsi/iscsi_target.c | 6 +++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index b273e421e910..a1a035270cab 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2575,6 +2575,17 @@ isert_wait4logout(struct isert_conn *isert_conn) } } +static void +isert_wait4cmds(struct iscsi_conn *conn) +{ + isert_info("iscsi_conn %p\n", conn); + + if (conn->sess) { + target_sess_cmd_list_set_waiting(conn->sess->se_sess); + target_wait_for_sess_cmds(conn->sess->se_sess); + } +} + /** * isert_put_unsol_pending_cmds() - Drop commands waiting for * unsolicitate dataout @@ -2622,6 +2633,7 @@ static void isert_wait_conn(struct iscsi_conn *conn) ib_drain_qp(isert_conn->qp); isert_put_unsol_pending_cmds(conn); + isert_wait4cmds(conn); isert_wait4logout(isert_conn); queue_work(isert_release_wq, &isert_conn->release_work); diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index b94ed4e30770..7251a87bb576 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -4149,6 +4149,9 @@ int iscsit_close_connection( iscsit_stop_nopin_response_timer(conn); iscsit_stop_nopin_timer(conn); + if (conn->conn_transport->iscsit_wait_conn) + conn->conn_transport->iscsit_wait_conn(conn); + /* * During Connection recovery drop unacknowledged out of order * commands for this connection, and prepare the other commands @@ -4234,9 +4237,6 @@ int iscsit_close_connection( target_sess_cmd_list_set_waiting(sess->se_sess); target_wait_for_sess_cmds(sess->se_sess); - if (conn->conn_transport->iscsit_wait_conn) - conn->conn_transport->iscsit_wait_conn(conn); - ahash_request_free(conn->conn_tx_hash); if (conn->conn_rx_hash) { struct crypto_ahash *tfm; -- cgit v1.2.3 From 807b9515b7d044cf77df31f1af9d842a76ecd5cb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 12 Feb 2020 21:09:00 -0800 Subject: scsi: Revert "target: iscsi: Wait for all commands to finish before freeing a session" Since commit e9d3009cb936 introduced a regression and since the fix for that regression was not perfect, revert this commit. Link: https://marc.info/?l=target-devel&m=158157054906195 Cc: Rahul Kundu Cc: Mike Marciniszyn Cc: Sagi Grimberg Reported-by: Dakshaja Uppalapati Fixes: e9d3009cb936 ("scsi: target: iscsi: Wait for all commands to finish before freeing a session") Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/target/iscsi/iscsi_target.c | 10 ++-------- include/scsi/iscsi_proto.h | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 7251a87bb576..09e55ea0bf5d 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -1165,9 +1165,7 @@ int iscsit_setup_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd, hdr->cmdsn, be32_to_cpu(hdr->data_length), payload_length, conn->cid); - if (target_get_sess_cmd(&cmd->se_cmd, true) < 0) - return iscsit_add_reject_cmd(cmd, - ISCSI_REASON_WAITING_FOR_LOGOUT, buf); + target_get_sess_cmd(&cmd->se_cmd, true); cmd->sense_reason = transport_lookup_cmd_lun(&cmd->se_cmd, scsilun_to_int(&hdr->lun)); @@ -2004,9 +2002,7 @@ iscsit_handle_task_mgt_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd, conn->sess->se_sess, 0, DMA_NONE, TCM_SIMPLE_TAG, cmd->sense_buffer + 2); - if (target_get_sess_cmd(&cmd->se_cmd, true) < 0) - return iscsit_add_reject_cmd(cmd, - ISCSI_REASON_WAITING_FOR_LOGOUT, buf); + target_get_sess_cmd(&cmd->se_cmd, true); /* * TASK_REASSIGN for ERL=2 / connection stays inside of @@ -4234,8 +4230,6 @@ int iscsit_close_connection( * must wait until they have completed. */ iscsit_check_conn_usage_count(conn); - target_sess_cmd_list_set_waiting(sess->se_sess); - target_wait_for_sess_cmds(sess->se_sess); ahash_request_free(conn->conn_tx_hash); if (conn->conn_rx_hash) { diff --git a/include/scsi/iscsi_proto.h b/include/scsi/iscsi_proto.h index 533f56733ba8..b71b5c4f418c 100644 --- a/include/scsi/iscsi_proto.h +++ b/include/scsi/iscsi_proto.h @@ -627,7 +627,6 @@ struct iscsi_reject { #define ISCSI_REASON_BOOKMARK_INVALID 9 #define ISCSI_REASON_BOOKMARK_NO_RESOURCES 10 #define ISCSI_REASON_NEGOTIATION_RESET 11 -#define ISCSI_REASON_WAITING_FOR_LOGOUT 12 /* Max. number of Key=Value pairs in a text message */ #define MAX_KEY_VALUE_PAIRS 8192 -- cgit v1.2.3 From 3d87c75d84e20d8812dbfba87e46ffca29d75d40 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 14 Feb 2020 17:01:46 -0800 Subject: Input: gpio_keys_polled - replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200214171907.GA26588@embeddedor Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys_polled.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/gpio_keys_polled.c b/drivers/input/keyboard/gpio_keys_polled.c index 6eb0a2f3f9de..c3937d2fc744 100644 --- a/drivers/input/keyboard/gpio_keys_polled.c +++ b/drivers/input/keyboard/gpio_keys_polled.c @@ -38,7 +38,7 @@ struct gpio_keys_polled_dev { const struct gpio_keys_platform_data *pdata; unsigned long rel_axis_seen[BITS_TO_LONGS(REL_CNT)]; unsigned long abs_axis_seen[BITS_TO_LONGS(ABS_CNT)]; - struct gpio_keys_button_data data[0]; + struct gpio_keys_button_data data[]; }; static void gpio_keys_button_event(struct input_dev *input, -- cgit v1.2.3 From 94bef5d57992f9e987a9b7e8fa736ee204ac4f7a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 14 Feb 2020 17:02:11 -0800 Subject: Input: tca6416-keypad - replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200214172022.GA27490@embeddedor Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/tca6416-keypad.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/keyboard/tca6416-keypad.c b/drivers/input/keyboard/tca6416-keypad.c index 2a14769de637..21758767ccf0 100644 --- a/drivers/input/keyboard/tca6416-keypad.c +++ b/drivers/input/keyboard/tca6416-keypad.c @@ -33,7 +33,7 @@ MODULE_DEVICE_TABLE(i2c, tca6416_id); struct tca6416_drv_data { struct input_dev *input; - struct tca6416_button data[0]; + struct tca6416_button data[]; }; struct tca6416_keypad_chip { @@ -48,7 +48,7 @@ struct tca6416_keypad_chip { int irqnum; u16 pinmask; bool use_polling; - struct tca6416_button buttons[0]; + struct tca6416_button buttons[]; }; static int tca6416_write_reg(struct tca6416_keypad_chip *chip, int reg, u16 val) -- cgit v1.2.3 From 3dbae15538972c9e1578cb216964c2840361a538 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 14 Feb 2020 17:03:12 -0800 Subject: Input: cyapa - replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200214172132.GA28389@embeddedor Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/cyapa_gen5.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/input/mouse/cyapa_gen5.c b/drivers/input/mouse/cyapa_gen5.c index 14239fbd72cf..7f012bfa2658 100644 --- a/drivers/input/mouse/cyapa_gen5.c +++ b/drivers/input/mouse/cyapa_gen5.c @@ -250,7 +250,7 @@ struct cyapa_tsg_bin_image_data_record { struct cyapa_tsg_bin_image { struct cyapa_tsg_bin_image_head image_head; - struct cyapa_tsg_bin_image_data_record records[0]; + struct cyapa_tsg_bin_image_data_record records[]; } __packed; struct pip_bl_packet_start { @@ -271,7 +271,7 @@ struct pip_bl_cmd_head { u8 report_id; /* Bootloader output report id, must be 40h */ u8 rsvd; /* Reserved, must be 0 */ struct pip_bl_packet_start packet_start; - u8 data[0]; /* Command data variable based on commands */ + u8 data[]; /* Command data variable based on commands */ } __packed; /* Initiate bootload command data structure. */ @@ -300,7 +300,7 @@ struct tsg_bl_metadata_row_params { struct tsg_bl_flash_row_head { u8 flash_array_id; __le16 flash_row_id; - u8 flash_data[0]; + u8 flash_data[]; } __packed; struct pip_app_cmd_head { @@ -314,7 +314,7 @@ struct pip_app_cmd_head { * Bit 6-0: command code. */ u8 cmd_code; - u8 parameter_data[0]; /* Parameter data variable based on cmd_code */ + u8 parameter_data[]; /* Parameter data variable based on cmd_code */ } __packed; /* Application get/set parameter command data structure */ -- cgit v1.2.3 From d65d87a07476aa17df2dcb3ad18c22c154315bec Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 14 Feb 2020 18:11:19 -0500 Subject: ext4: improve explanation of a mount failure caused by a misconfigured kernel If CONFIG_QFMT_V2 is not enabled, but CONFIG_QUOTA is enabled, when a user tries to mount a file system with the quota or project quota enabled, the kernel will emit a very confusing messsage: EXT4-fs warning (device vdc): ext4_enable_quotas:5914: Failed to enable quota tracking (type=0, err=-3). Please run e2fsck to fix. EXT4-fs (vdc): mount failed We will now report an explanatory message indicating which kernel configuration options have to be enabled, to avoid customer/sysadmin confusion. Link: https://lore.kernel.org/r/20200215012738.565735-1-tytso@mit.edu Google-Bug-Id: 149093531 Fixes: 7c319d328505b778 ("ext4: make quota as first class supported feature") Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b0b9150c9773..f131eaa52f22 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3009,17 +3009,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#ifndef CONFIG_QUOTA - if (ext4_has_feature_quota(sb) && !readonly) { +#if !defined(CONFIG_QUOTA) || !defined(CONFIG_QFMT_V2) + if (!readonly && (ext4_has_feature_quota(sb) || + ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, - "Filesystem with quota feature cannot be mounted RDWR " - "without CONFIG_QUOTA"); - return 0; - } - if (ext4_has_feature_project(sb) && !readonly) { - ext4_msg(sb, KERN_ERR, - "Filesystem with project quota feature cannot be mounted RDWR " - "without CONFIG_QUOTA"); + "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ -- cgit v1.2.3 From 3bf3c9744694803bd2d6f0ee70a6369b980530fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Sat, 15 Feb 2020 15:21:30 +0100 Subject: bus: moxtet: fix potential stack buffer overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The input_read function declares the size of the hex array relative to sizeof(buf), but buf is a pointer argument of the function. The hex array is meant to contain hexadecimal representation of the bin array. Link: https://lore.kernel.org/r/20200215142130.22743-1-marek.behun@nic.cz Fixes: 5bc7f990cd98 ("bus: Add support for Moxtet bus") Signed-off-by: Marek Behún Reported-by: sohu0106 Signed-off-by: Olof Johansson --- drivers/bus/moxtet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c index 15fa293819a0..b20fdcbd035b 100644 --- a/drivers/bus/moxtet.c +++ b/drivers/bus/moxtet.c @@ -465,7 +465,7 @@ static ssize_t input_read(struct file *file, char __user *buf, size_t len, { struct moxtet *moxtet = file->private_data; u8 bin[TURRIS_MOX_MAX_MODULES]; - u8 hex[sizeof(buf) * 2 + 1]; + u8 hex[sizeof(bin) * 2 + 1]; int ret, n; ret = moxtet_spi_read(moxtet, bin); -- cgit v1.2.3 From 7fbeb95d0f68e21e6ca61284f1ac681630976947 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 16 Feb 2020 01:01:18 +0300 Subject: io_uring: add missing io_req_cancelled() fallocate_finish() is missing cancellation check. Add it. It's safe to do that, as only flags setup and sqe fields copy are done before it gets into __io_fallocate(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 5a826017ebb8..29565d82291f 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2517,6 +2517,9 @@ static void io_fallocate_finish(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret; + if (io_req_cancelled(req)) + return; + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); if (ret < 0) @@ -2904,6 +2907,7 @@ static void io_close_finish(struct io_wq_work **workptr) struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); struct io_kiocb *nxt = NULL; + /* not cancellable, don't do io_req_cancelled() */ __io_close_finish(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); -- cgit v1.2.3 From 11a48a5a18c63fd7621bb050228cebf13566e4d8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Feb 2020 13:16:59 -0800 Subject: Linux 5.6-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 84b71845c43f..aab38cb02b24 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Kleptomaniac Octopus # *DOCUMENTATION* -- cgit v1.2.3 From d4f194ed9eb9841a8f978710e4d24296f791a85b Mon Sep 17 00:00:00 2001 From: Sam Bobroff Date: Fri, 7 Feb 2020 15:57:31 +1100 Subject: powerpc/eeh: Fix deadlock handling dead PHB Recovering a dead PHB can currently cause a deadlock as the PCI rescan/remove lock is taken twice. This is caused as part of an existing bug in eeh_handle_special_event(). The pe is processed while traversing the PHBs even though the pe is unrelated to the loop. This causes the pe to be, incorrectly, processed more than once. Untangling this section can move the pe processing out of the loop and also outside the locked section, correcting both problems. Fixes: 2e25505147b8 ("powerpc/eeh: Fix crash when edev->pdev changes") Cc: stable@vger.kernel.org # 5.4+ Signed-off-by: Sam Bobroff Reviewed-by: Frederic Barrat Tested-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0547e82dbf90ee0729a2979a8cac5c91665c621f.1581051445.git.sbobroff@linux.ibm.com --- arch/powerpc/kernel/eeh_driver.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index a1eaffe868de..7b048cee767c 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1184,6 +1184,17 @@ void eeh_handle_special_event(void) eeh_pe_state_mark(pe, EEH_PE_RECOVERING); eeh_handle_normal_event(pe); } else { + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) + edev->mode &= ~EEH_DEV_NO_HANDLER; + + /* Notify all devices to be down */ + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + eeh_pe_report( + "error_detected(permanent failure)", pe, + eeh_report_failure, NULL); + pci_lock_rescan_remove(); list_for_each_entry(hose, &hose_list, list_node) { phb_pe = eeh_phb_pe_get(hose); @@ -1192,16 +1203,6 @@ void eeh_handle_special_event(void) (phb_pe->state & EEH_PE_RECOVERING)) continue; - eeh_for_each_pe(pe, tmp_pe) - eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) - edev->mode &= ~EEH_DEV_NO_HANDLER; - - /* Notify all devices to be down */ - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_pe_report( - "error_detected(permanent failure)", pe, - eeh_report_failure, NULL); bus = eeh_pe_bus_get(phb_pe); if (!bus) { pr_err("%s: Cannot find PCI bus for " -- cgit v1.2.3 From f2b67ef90b0d5eca0f2255e02cf2f620bc0ddcdb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Feb 2020 13:50:28 +0000 Subject: powerpc/hugetlb: Fix 512k hugepages on 8xx with 16k page size Commit 55c8fc3f4930 ("powerpc/8xx: reintroduce 16K pages with HW assistance") redefined pte_t as a struct of 4 pte_basic_t, because in 16K pages mode there are four identical entries in the page table. But the size of hugepage tables is calculated based of the size of (void *). Therefore, we end up with page tables of size 1k instead of 4k for 512k pages. As 512k hugepage tables are the same size as standard page tables, ie 4k, use the standard page tables instead of PGT_CACHE tables. Fixes: 3fb69c6a1a13 ("powerpc/8xx: Enable 512k hugepage support with HW assistance") Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/90ec56a2315be602494619ed0223bba3b0b8d619.1580997007.git.christophe.leroy@c-s.fr --- arch/powerpc/mm/hugetlbpage.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 73d4873fc7f8..33b3461d91e8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -53,20 +53,24 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, if (pshift >= pdshift) { cachep = PGT_CACHE(PTE_T_ORDER); num_hugepd = 1 << (pshift - pdshift); + new = NULL; } else if (IS_ENABLED(CONFIG_PPC_8xx)) { - cachep = PGT_CACHE(PTE_INDEX_SIZE); + cachep = NULL; num_hugepd = 1; + new = pte_alloc_one(mm); } else { cachep = PGT_CACHE(pdshift - pshift); num_hugepd = 1; + new = NULL; } - if (!cachep) { + if (!cachep && !new) { WARN_ONCE(1, "No page table cache created for hugetlb tables"); return -ENOMEM; } - new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); + if (cachep) + new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL)); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); @@ -97,7 +101,10 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, if (i < num_hugepd) { for (i = i - 1 ; i >= 0; i--, hpdp--) *hpdp = __hugepd(0); - kmem_cache_free(cachep, new); + if (cachep) + kmem_cache_free(cachep, new); + else + pte_free(mm, new); } else { kmemleak_ignore(new); } @@ -324,8 +331,7 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif if (shift >= pdshift) hugepd_free(tlb, hugepte); else if (IS_ENABLED(CONFIG_PPC_8xx)) - pgtable_free_tlb(tlb, hugepte, - get_hugepd_cache_index(PTE_INDEX_SIZE)); + pgtable_free_tlb(tlb, hugepte, 0); else pgtable_free_tlb(tlb, hugepte, get_hugepd_cache_index(pdshift - shift)); @@ -639,12 +645,13 @@ static int __init hugetlbpage_init(void) * if we have pdshift and shift value same, we don't * use pgt cache for hugepd. */ - if (pdshift > shift && IS_ENABLED(CONFIG_PPC_8xx)) - pgtable_cache_add(PTE_INDEX_SIZE); - else if (pdshift > shift) - pgtable_cache_add(pdshift - shift); - else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx)) + if (pdshift > shift) { + if (!IS_ENABLED(CONFIG_PPC_8xx)) + pgtable_cache_add(pdshift - shift); + } else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || + IS_ENABLED(CONFIG_PPC_8xx)) { pgtable_cache_add(PTE_T_ORDER); + } configured = true; } -- cgit v1.2.3 From 50a175dd18de7a647e72aca7daf4744e3a5a81e3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 9 Feb 2020 16:02:41 +0000 Subject: powerpc/hugetlb: Fix 8M hugepages on 8xx With HW assistance all page tables must be 4k aligned, the 8xx drops the last 12 bits during the walk. Redefine HUGEPD_SHIFT_MASK to mask last 12 bits out. HUGEPD_SHIFT_MASK is used to for alignment of page table cache. Fixes: 22569b881d37 ("powerpc/8xx: Enable 8M hugepage support with HW assistance") Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/778b1a248c4c7ca79640eeff7740044da6a220a0.1581264115.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/page.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 86332080399a..080a0bf8e54b 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -295,8 +295,13 @@ static inline bool pfn_valid(unsigned long pfn) /* * Some number of bits at the level of the page table that points to * a hugepte are used to encode the size. This masks those bits. + * On 8xx, HW assistance requires 4k alignment for the hugepte. */ +#ifdef CONFIG_PPC_8xx +#define HUGEPD_SHIFT_MASK 0xfff +#else #define HUGEPD_SHIFT_MASK 0x3f +#endif #ifndef __ASSEMBLY__ -- cgit v1.2.3 From a4031afb9d10d97f4d0285844abbc0ab04245304 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 9 Feb 2020 18:14:42 +0000 Subject: powerpc/8xx: Fix clearing of bits 20-23 in ITLB miss In ITLB miss handled the line supposed to clear bits 20-23 on the L2 ITLB entry is buggy and does indeed nothing, leading to undefined value which could allow execution when it shouldn't. Properly do the clearing with the relevant instruction. Fixes: 74fabcadfd43 ("powerpc/8xx: don't use r12/SPRN_SPRG_SCRATCH2 in TLB Miss handlers") Cc: stable@vger.kernel.org # v5.0+ Signed-off-by: Christophe Leroy Reviewed-by: Leonardo Bras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4f70c2778163affce8508a210f65d140e84524b4.1581272050.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/head_8xx.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 9922306ae512..073a651787df 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -256,7 +256,7 @@ InstructionTLBMiss: * set. All other Linux PTE bits control the behavior * of the MMU. */ - rlwimi r10, r10, 0, 0x0f00 /* Clear bits 20-23 */ + rlwinm r10, r10, 0, ~0x0f00 /* Clear bits 20-23 */ rlwimi r10, r10, 4, 0x0400 /* Copy _PAGE_EXEC into bit 21 */ ori r10, r10, RPN_PATTERN | 0x200 /* Set 22 and 24-27 */ mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ -- cgit v1.2.3 From e8023b030ce1748930e2dc76353a262fe47d4745 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 11 Feb 2020 15:32:56 +0800 Subject: selftests: forwarding: use proto icmp for {gretap, ip6gretap}_mac testing For tc ip_proto filter, when we extract the flow via __skb_flow_dissect() without flag FLOW_DISSECTOR_F_STOP_AT_ENCAP, we will continue extract to the inner proto. So for GRE + ICMP messages, we should not track GRE proto, but inner ICMP proto. For test mirror_gre.sh, it may make user confused if we capture ICMP message on $h3(since the flow is GRE message). So I move the capture dev to h3-gt{4,6}, and only capture ICMP message. Before the fix: ]# ./mirror_gre.sh TEST: ingress mirror to gretap (skip_hw) [ OK ] TEST: egress mirror to gretap (skip_hw) [ OK ] TEST: ingress mirror to ip6gretap (skip_hw) [ OK ] TEST: egress mirror to ip6gretap (skip_hw) [ OK ] TEST: ingress mirror to gretap: envelope MAC (skip_hw) [FAIL] Expected to capture 10 packets, got 0. TEST: egress mirror to gretap: envelope MAC (skip_hw) [FAIL] Expected to capture 10 packets, got 0. TEST: ingress mirror to ip6gretap: envelope MAC (skip_hw) [FAIL] Expected to capture 10 packets, got 0. TEST: egress mirror to ip6gretap: envelope MAC (skip_hw) [FAIL] Expected to capture 10 packets, got 0. TEST: two simultaneously configured mirrors (skip_hw) [ OK ] WARN: Could not test offloaded functionality After fix: ]# ./mirror_gre.sh TEST: ingress mirror to gretap (skip_hw) [ OK ] TEST: egress mirror to gretap (skip_hw) [ OK ] TEST: ingress mirror to ip6gretap (skip_hw) [ OK ] TEST: egress mirror to ip6gretap (skip_hw) [ OK ] TEST: ingress mirror to gretap: envelope MAC (skip_hw) [ OK ] TEST: egress mirror to gretap: envelope MAC (skip_hw) [ OK ] TEST: ingress mirror to ip6gretap: envelope MAC (skip_hw) [ OK ] TEST: egress mirror to ip6gretap: envelope MAC (skip_hw) [ OK ] TEST: two simultaneously configured mirrors (skip_hw) [ OK ] WARN: Could not test offloaded functionality Fixes: ba8d39871a10 ("selftests: forwarding: Add test for mirror to gretap") Signed-off-by: Hangbin Liu Reviewed-by: Petr Machata Tested-by: Petr Machata Signed-off-by: David S. Miller --- .../testing/selftests/net/forwarding/mirror_gre.sh | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/mirror_gre.sh b/tools/testing/selftests/net/forwarding/mirror_gre.sh index e6fd7a18c655..0266443601bc 100755 --- a/tools/testing/selftests/net/forwarding/mirror_gre.sh +++ b/tools/testing/selftests/net/forwarding/mirror_gre.sh @@ -63,22 +63,23 @@ test_span_gre_mac() { local tundev=$1; shift local direction=$1; shift - local prot=$1; shift local what=$1; shift - local swp3mac=$(mac_get $swp3) - local h3mac=$(mac_get $h3) + case "$direction" in + ingress) local src_mac=$(mac_get $h1); local dst_mac=$(mac_get $h2) + ;; + egress) local src_mac=$(mac_get $h2); local dst_mac=$(mac_get $h1) + ;; + esac RET=0 mirror_install $swp1 $direction $tundev "matchall $tcflags" - tc filter add dev $h3 ingress pref 77 prot $prot \ - flower ip_proto 0x2f src_mac $swp3mac dst_mac $h3mac \ - action pass + icmp_capture_install h3-${tundev} "src_mac $src_mac dst_mac $dst_mac" - mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 10 + mirror_test v$h1 192.0.2.1 192.0.2.2 h3-${tundev} 100 10 - tc filter del dev $h3 ingress pref 77 + icmp_capture_uninstall h3-${tundev} mirror_uninstall $swp1 $direction log_test "$direction $what: envelope MAC ($tcflags)" @@ -120,14 +121,14 @@ test_ip6gretap() test_gretap_mac() { - test_span_gre_mac gt4 ingress ip "mirror to gretap" - test_span_gre_mac gt4 egress ip "mirror to gretap" + test_span_gre_mac gt4 ingress "mirror to gretap" + test_span_gre_mac gt4 egress "mirror to gretap" } test_ip6gretap_mac() { - test_span_gre_mac gt6 ingress ipv6 "mirror to ip6gretap" - test_span_gre_mac gt6 egress ipv6 "mirror to ip6gretap" + test_span_gre_mac gt6 ingress "mirror to ip6gretap" + test_span_gre_mac gt6 egress "mirror to ip6gretap" } test_all() -- cgit v1.2.3 From e404b8c7cfb31654c9024d497cec58a501501692 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 12 Feb 2020 10:41:06 +0900 Subject: ipv6: Fix route replacement with dev-only route After commit 27596472473a ("ipv6: fix ECMP route replacement") it is no longer possible to replace an ECMP-able route by a non ECMP-able route. For example, ip route add 2001:db8::1/128 via fe80::1 dev dummy0 ip route replace 2001:db8::1/128 dev dummy0 does not work as expected. Tweak the replacement logic so that point 3 in the log of the above commit becomes: 3. If the new route is not ECMP-able, and no matching non-ECMP-able route exists, replace matching ECMP-able route (if any) or add the new route. We can now summarize the entire replace semantics to: When doing a replace, prefer replacing a matching route of the same "ECMP-able-ness" as the replace argument. If there is no such candidate, fallback to the first route found. Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Benjamin Poirier Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 7 ++++--- tools/testing/selftests/net/fib_tests.sh | 6 ++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 58fbde244381..72abf892302f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1102,8 +1102,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, found++; break; } - if (rt_can_ecmp) - fallback_ins = fallback_ins ?: ins; + fallback_ins = fallback_ins ?: ins; goto next_iter; } @@ -1146,7 +1145,9 @@ next_iter: } if (fallback_ins && !found) { - /* No ECMP-able route found, replace first non-ECMP one */ + /* No matching route with same ecmp-able-ness found, replace + * first matching route + */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 6dd403103800..60273f1bc7d9 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -910,6 +910,12 @@ ipv6_rt_replace_mpath() check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024" log_test $? 0 "Multipath with single path via multipath attribute" + # multipath with dev-only + add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" + run_cmd "$IP -6 ro replace 2001:db8:104::/64 dev veth1" + check_route6 "2001:db8:104::/64 dev veth1 metric 1024" + log_test $? 0 "Multipath with dev-only" + # route replace fails - invalid nexthop 1 add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3" -- cgit v1.2.3 From afecdb376bd81d7e16578f0cfe82a1aec7ae18f3 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 12 Feb 2020 10:41:07 +0900 Subject: ipv6: Fix nlmsg_flags when splitting a multipath route When splitting an RTA_MULTIPATH request into multiple routes and adding the second and later components, we must not simply remove NLM_F_REPLACE but instead replace it by NLM_F_CREATE. Otherwise, it may look like the netlink message was malformed. For example, ip route add 2001:db8::1/128 dev dummy0 ip route change 2001:db8::1/128 nexthop via fe80::30:1 dev dummy0 \ nexthop via fe80::30:2 dev dummy0 results in the following warnings: [ 1035.057019] IPv6: RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE [ 1035.057517] IPv6: NLM_F_CREATE should be set when creating new route This patch makes the nlmsg sequence look equivalent for __ip6_ins_rt() to what it would get if the multipath route had been added in multiple netlink operations: ip route add 2001:db8::1/128 dev dummy0 ip route change 2001:db8::1/128 nexthop via fe80::30:1 dev dummy0 ip route append 2001:db8::1/128 nexthop via fe80::30:2 dev dummy0 Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Benjamin Poirier Reviewed-by: Michal Kubecek Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4fbdc60b4e07..2931224b674e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5198,6 +5198,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; nhn++; } -- cgit v1.2.3 From 0d4597c8c5abdeeaf50774066c16683f30184dc8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 11 Feb 2020 19:03:55 -0800 Subject: net/rds: Track user mapped pages through special API Convert net/rds to use the newly introduces pin_user_pages() API, which properly sets FOLL_PIN. Setting FOLL_PIN is now required for code that requires tracking of pinned pages. Note that this effectively changes the code's behavior: it now ultimately calls set_page_dirty_lock(), instead of set_page_dirty(). This is probably more accurate. As Christoph Hellwig put it, "set_page_dirty() is only safe if we are dealing with a file backed page where we have reference on the inode it hangs off." [1] [1] https://lore.kernel.org/r/20190723153640.GB720@lst.de Cc: Hans Westgaard Ry Cc: Santosh Shilimkar Signed-off-by: Leon Romanovsky Signed-off-by: John Hubbard Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 3341eee87bf9..585e6b3b69ce 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -162,10 +162,9 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, if (write) gup_flags |= FOLL_WRITE; - ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages); + ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { - while (ret--) - put_page(pages[ret]); + unpin_user_pages(pages, ret); ret = -EFAULT; } @@ -300,8 +299,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * to release anything. */ if (!need_odp) { - for (i = 0 ; i < nents; i++) - put_page(sg_page(&sg[i])); + unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); @@ -325,7 +323,12 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, if (cookie_ret) *cookie_ret = cookie; - if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { + if (args->cookie_addr && + put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { + if (!need_odp) { + unpin_user_pages(pages, nr_pages); + kfree(sg); + } ret = -EFAULT; goto out; } @@ -496,9 +499,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ - if (!ro->op_write) - set_page_dirty(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } @@ -515,8 +516,7 @@ void rds_atomic_free_op(struct rm_atomic_op *ao) /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ - set_page_dirty(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; @@ -944,7 +944,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, return ret; err: if (page) - put_page(page); + unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); -- cgit v1.2.3 From 540e585a79e9d643ede077b73bcc7aa2d7b4d919 Mon Sep 17 00:00:00 2001 From: Jethro Beekman Date: Wed, 12 Feb 2020 16:43:41 +0100 Subject: net: fib_rules: Correctly set table field when table number exceeds 8 bits In 709772e6e06564ed94ba740de70185ac3d792773, RT_TABLE_COMPAT was added to allow legacy software to deal with routing table numbers >= 256, but the same change to FIB rule queries was overlooked. Signed-off-by: Jethro Beekman Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3e7e15278c46..bd7eba9066f8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -974,7 +974,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh = nlmsg_data(nlh); frh->family = ops->family; - frh->table = rule->table; + frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) -- cgit v1.2.3 From e6a41c23df0d5da01540d2abef41591589c0b4be Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 12 Feb 2020 17:45:38 +0100 Subject: net: macb: ensure interface is not suspended on at91rm9200 Because of autosuspend, at91ether_start is called with clocks disabled. Ensure that pm_runtime doesn't suspend the interface as soon as it is opened as there is no pm_runtime support is the other relevant parts of the platform support for at91rm9200. Fixes: d54f89af6cc4 ("net: macb: Add pm runtime support") Signed-off-by: Alexandre Belloni Reviewed-by: Claudiu Beznea Acked-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 4508f0d150da..def94e91883a 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -3790,6 +3790,10 @@ static int at91ether_open(struct net_device *dev) u32 ctl; int ret; + ret = pm_runtime_get_sync(&lp->pdev->dev); + if (ret < 0) + return ret; + /* Clear internal statistics */ ctl = macb_readl(lp, NCR); macb_writel(lp, NCR, ctl | MACB_BIT(CLRSTAT)); @@ -3854,7 +3858,7 @@ static int at91ether_close(struct net_device *dev) q->rx_buffers, q->rx_buffers_dma); q->rx_buffers = NULL; - return 0; + return pm_runtime_put(&lp->pdev->dev); } /* Transmit packet */ -- cgit v1.2.3 From 44bfa9c5e5f06c72540273813e4c66beb5a8c213 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Feb 2020 20:58:26 -0800 Subject: net: rtnetlink: fix bugs in rtnl_alt_ifname() Since IFLA_ALT_IFNAME is an NLA_STRING, we have no guarantee it is nul terminated. We should use nla_strdup() instead of kstrdup(), since this helper will make sure not accessing out-of-bounds data. BUG: KMSAN: uninit-value in strlen+0x5e/0xa0 lib/string.c:535 CPU: 1 PID: 19157 Comm: syz-executor.5 Not tainted 5.5.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 strlen+0x5e/0xa0 lib/string.c:535 kstrdup+0x7f/0x1a0 mm/util.c:59 rtnl_alt_ifname net/core/rtnetlink.c:3495 [inline] rtnl_linkprop+0x85d/0xc00 net/core/rtnetlink.c:3553 rtnl_newlinkprop+0x9d/0xb0 net/core/rtnetlink.c:3568 rtnetlink_rcv_msg+0x1153/0x1570 net/core/rtnetlink.c:5424 netlink_rcv_skb+0x451/0x650 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:5442 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0xf9e/0x1100 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x1248/0x14d0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] ____sys_sendmsg+0x12b6/0x1350 net/socket.c:2330 ___sys_sendmsg net/socket.c:2384 [inline] __sys_sendmsg+0x451/0x5f0 net/socket.c:2417 __do_sys_sendmsg net/socket.c:2426 [inline] __se_sys_sendmsg+0x97/0xb0 net/socket.c:2424 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2424 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45b3b9 Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ff1c7b1ac78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007ff1c7b1b6d4 RCX: 000000000045b3b9 RDX: 0000000000000000 RSI: 0000000020000040 RDI: 0000000000000003 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000000009cb R14: 00000000004cb3dd R15: 000000000075bf2c Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_slab_alloc+0x8a/0xe0 mm/kmsan/kmsan_hooks.c:82 slab_alloc_node mm/slub.c:2774 [inline] __kmalloc_node_track_caller+0xb40/0x1200 mm/slub.c:4382 __kmalloc_reserve net/core/skbuff.c:141 [inline] __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:209 alloc_skb include/linux/skbuff.h:1049 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1174 [inline] netlink_sendmsg+0x7d3/0x14d0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] ____sys_sendmsg+0x12b6/0x1350 net/socket.c:2330 ___sys_sendmsg net/socket.c:2384 [inline] __sys_sendmsg+0x451/0x5f0 net/socket.c:2417 __do_sys_sendmsg net/socket.c:2426 [inline] __se_sys_sendmsg+0x97/0xb0 net/socket.c:2424 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2424 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 36fbf1e52bd3 ("net: rtnetlink: add linkprop commands to add and delete alternative ifnames") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Reported-by: syzbot Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 09c44bf2e1d2..e1152f4ffe33 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3504,27 +3504,25 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, if (err) return err; - alt_ifname = nla_data(attr); + alt_ifname = nla_strdup(attr, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + if (cmd == RTM_NEWLINKPROP) { - alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); - if (!alt_ifname) - return -ENOMEM; err = netdev_name_node_alt_create(dev, alt_ifname); - if (err) { - kfree(alt_ifname); - return err; - } + if (!err) + alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); - if (err) - return err; } else { - WARN_ON(1); - return 0; + WARN_ON_ONCE(1); + err = -EINVAL; } - *changed = true; - return 0; + kfree(alt_ifname); + if (!err) + *changed = true; + return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From 4e867c9a50ff1a07ed0b86c3b1c8bc773933d728 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 13 Feb 2020 17:40:54 +0800 Subject: selftests: forwarding: vxlan_bridge_1d: fix tos value After commit 71130f29979c ("vxlan: fix tos value before xmit") we start strict vxlan xmit tos value by RT_TOS(), which limits the tos value less than 0x1E. With current value 0x40 the test will failed with "v1: Expected to capture 10 packets, got 0". So let's choose a smaller tos value for testing. Fixes: d417ecf533fe ("selftests: forwarding: vxlan_bridge_1d: Add a TOS test") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index bb10e33690b2..353613fc1947 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -516,9 +516,9 @@ test_tos() RET=0 tc filter add dev v1 egress pref 77 prot ip \ - flower ip_tos 0x40 action pass - vxlan_ping_test $h1 192.0.2.3 "-Q 0x40" v1 egress 77 10 - vxlan_ping_test $h1 192.0.2.3 "-Q 0x30" v1 egress 77 0 + flower ip_tos 0x11 action pass + vxlan_ping_test $h1 192.0.2.3 "-Q 0x11" v1 egress 77 10 + vxlan_ping_test $h1 192.0.2.3 "-Q 0x12" v1 egress 77 0 tc filter del dev v1 egress pref 77 prot ip log_test "VXLAN: envelope TOS inheritance" -- cgit v1.2.3 From 15beab0a9d797be1b7c67458da007a62269be29a Mon Sep 17 00:00:00 2001 From: Dmitry Bezrukov Date: Fri, 14 Feb 2020 18:44:51 +0300 Subject: net: atlantic: checksum compat issue Yet another checksum offload compatibility issue was found. The known issue is that AQC HW marks tcp packets with 0xFFFF checksum as invalid (1). This is workarounded in driver, passing all the suspicious packets up to the stack for further csum validation. Another HW problem (2) is that it hides invalid csum of LRO aggregated packets inside of the individual descriptors. That was workarounded by forced scan of all LRO descriptors for checksum errors. However the scan logic was joint for both LRO and multi-descriptor packets (jumbos). And this causes the issue. We have to drop LRO packets with the detected bad checksum because of (2), but we have to pass jumbo packets to stack because of (1). When using windows tcp partner with jumbo frames but with LSO disabled driver discards such frames as bad checksummed. But only LRO frames should be dropped, not jumbos. On such a configurations tcp stream have a chance of drops and stucks. (1) 76f254d4afe2 ("net: aquantia: tcp checksum 0xffff being handled incorrectly") (2) d08b9a0a3ebd ("net: aquantia: do not pass lro session with invalid tcp checksum") Fixes: d08b9a0a3ebd ("net: aquantia: do not pass lro session with invalid tcp checksum") Signed-off-by: Dmitry Bezrukov Signed-off-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 3 ++- drivers/net/ethernet/aquantia/atlantic/aq_ring.h | 3 ++- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 951d86f8b66e..6941999ae845 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -351,7 +351,8 @@ int aq_ring_rx_clean(struct aq_ring_s *self, err = 0; goto err_exit; } - if (buff->is_error || buff->is_cso_err) { + if (buff->is_error || + (buff->is_lro && buff->is_cso_err)) { buff_ = buff; do { next_ = buff_->next, diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h index 991e4d31b094..2c96f20f6289 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.h @@ -78,7 +78,8 @@ struct __packed aq_ring_buff_s { u32 is_cleaned:1; u32 is_error:1; u32 is_vlan:1; - u32 rsvd3:4; + u32 is_lro:1; + u32 rsvd3:3; u16 eop_index; u16 rsvd4; }; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index ec041f78d063..5784da26f868 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -823,6 +823,8 @@ static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self, } } + buff->is_lro = !!(HW_ATL_B0_RXD_WB_STAT2_RSCCNT & + rxd_wb->status); if (HW_ATL_B0_RXD_WB_STAT2_EOP & rxd_wb->status) { buff->len = rxd_wb->pkt_len % AQ_CFG_RX_FRAME_MAX; @@ -835,8 +837,7 @@ static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self, rxd_wb->pkt_len > AQ_CFG_RX_FRAME_MAX ? AQ_CFG_RX_FRAME_MAX : rxd_wb->pkt_len; - if (HW_ATL_B0_RXD_WB_STAT2_RSCCNT & - rxd_wb->status) { + if (buff->is_lro) { /* LRO */ buff->next = rxd_wb->next_desc_ptr; ++ring->stats.rx.lro_packets; -- cgit v1.2.3 From e7b5f97e6574dc4918e375d5f8d24ec31653cd6d Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Fri, 14 Feb 2020 18:44:52 +0300 Subject: net: atlantic: check rpc result and wait for rpc address Artificial HW reliability tests revealed a possible hangup in the driver. Normally, when device disappears from bus, all register reads returns 0xFFFFFFFF. At remote procedure invocation towards FW there is a logic where result is compared with -1 in a loop. That caused an infinite loop if hardware due to some issues disappears from bus. Add extra result checks to prevent this. Signed-off-by: Dmitry Bogdanov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index f547baa6c954..354705f9bc49 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -22,6 +22,7 @@ #define HW_ATL_MIF_ADDR 0x0208U #define HW_ATL_MIF_VAL 0x020CU +#define HW_ATL_MPI_RPC_ADDR 0x0334U #define HW_ATL_RPC_CONTROL_ADR 0x0338U #define HW_ATL_RPC_STATE_ADR 0x033CU @@ -53,15 +54,14 @@ enum mcp_area { }; static int hw_atl_utils_ver_match(u32 ver_expected, u32 ver_actual); - static int hw_atl_utils_mpi_set_state(struct aq_hw_s *self, enum hal_atl_utils_fw_state_e state); - static u32 hw_atl_utils_get_mpi_mbox_tid(struct aq_hw_s *self); static u32 hw_atl_utils_mpi_get_state(struct aq_hw_s *self); static u32 hw_atl_utils_mif_cmd_get(struct aq_hw_s *self); static u32 hw_atl_utils_mif_addr_get(struct aq_hw_s *self); static u32 hw_atl_utils_rpc_state_get(struct aq_hw_s *self); +static u32 aq_fw1x_rpc_get(struct aq_hw_s *self); int hw_atl_utils_initfw(struct aq_hw_s *self, const struct aq_fw_ops **fw_ops) { @@ -476,6 +476,10 @@ static int hw_atl_utils_init_ucp(struct aq_hw_s *self, self, self->mbox_addr, self->mbox_addr != 0U, 1000U, 10000U); + err = readx_poll_timeout_atomic(aq_fw1x_rpc_get, self, + self->rpc_addr, + self->rpc_addr != 0U, + 1000U, 100000U); return err; } @@ -531,6 +535,12 @@ int hw_atl_utils_fw_rpc_wait(struct aq_hw_s *self, self, fw.val, sw.tid == fw.tid, 1000U, 100000U); + if (err < 0) + goto err_exit; + + err = aq_hw_err_from_flags(self); + if (err < 0) + goto err_exit; if (fw.len == 0xFFFFU) { err = hw_atl_utils_fw_rpc_call(self, sw.len); @@ -1025,6 +1035,11 @@ static u32 hw_atl_utils_rpc_state_get(struct aq_hw_s *self) return aq_hw_read_reg(self, HW_ATL_RPC_STATE_ADR); } +static u32 aq_fw1x_rpc_get(struct aq_hw_s *self) +{ + return aq_hw_read_reg(self, HW_ATL_MPI_RPC_ADDR); +} + const struct aq_fw_ops aq_fw_1x_ops = { .init = hw_atl_utils_mpi_create, .deinit = hw_atl_fw1x_deinit, -- cgit v1.2.3 From f08a464c27ca0a4050333baa271504b27ce834b7 Mon Sep 17 00:00:00 2001 From: Egor Pomozov Date: Fri, 14 Feb 2020 18:44:53 +0300 Subject: net: atlantic: ptp gpio adjustments Clock adjustment data should be passed to FW as well, otherwise in some cases a drift was observed when using GPIO features. Signed-off-by: Egor Pomozov Signed-off-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_hw.h | 2 ++ drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 4 +++- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c | 12 ++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h index cc70c606b6ef..251767c31f7e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h @@ -337,6 +337,8 @@ struct aq_fw_ops { void (*enable_ptp)(struct aq_hw_s *self, int enable); + void (*adjust_ptp)(struct aq_hw_s *self, uint64_t adj); + int (*set_eee_rate)(struct aq_hw_s *self, u32 speed); int (*get_eee_rate)(struct aq_hw_s *self, u32 *rate, diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 5784da26f868..9acdb3fbb750 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -1162,6 +1162,8 @@ static int hw_atl_b0_adj_sys_clock(struct aq_hw_s *self, s64 delta) { self->ptp_clk_offset += delta; + self->aq_fw_ops->adjust_ptp(self, self->ptp_clk_offset); + return 0; } @@ -1212,7 +1214,7 @@ static int hw_atl_b0_gpio_pulse(struct aq_hw_s *self, u32 index, fwreq.ptp_gpio_ctrl.index = index; fwreq.ptp_gpio_ctrl.period = period; /* Apply time offset */ - fwreq.ptp_gpio_ctrl.start = start - self->ptp_clk_offset; + fwreq.ptp_gpio_ctrl.start = start; size = sizeof(fwreq.msg_id) + sizeof(fwreq.ptp_gpio_ctrl); return self->aq_fw_ops->send_fw_request(self, &fwreq, size); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c index 97ebf849695f..77a4ed64830f 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c @@ -30,6 +30,9 @@ #define HW_ATL_FW3X_EXT_CONTROL_ADDR 0x378 #define HW_ATL_FW3X_EXT_STATE_ADDR 0x37c +#define HW_ATL_FW3X_PTP_ADJ_LSW_ADDR 0x50a0 +#define HW_ATL_FW3X_PTP_ADJ_MSW_ADDR 0x50a4 + #define HW_ATL_FW2X_CAP_PAUSE BIT(CAPS_HI_PAUSE) #define HW_ATL_FW2X_CAP_ASYM_PAUSE BIT(CAPS_HI_ASYMMETRIC_PAUSE) #define HW_ATL_FW2X_CAP_SLEEP_PROXY BIT(CAPS_HI_SLEEP_PROXY) @@ -475,6 +478,14 @@ static void aq_fw3x_enable_ptp(struct aq_hw_s *self, int enable) aq_hw_write_reg(self, HW_ATL_FW3X_EXT_CONTROL_ADDR, ptp_opts); } +static void aq_fw3x_adjust_ptp(struct aq_hw_s *self, uint64_t adj) +{ + aq_hw_write_reg(self, HW_ATL_FW3X_PTP_ADJ_LSW_ADDR, + (adj >> 0) & 0xffffffff); + aq_hw_write_reg(self, HW_ATL_FW3X_PTP_ADJ_MSW_ADDR, + (adj >> 32) & 0xffffffff); +} + static int aq_fw2x_led_control(struct aq_hw_s *self, u32 mode) { if (self->fw_ver_actual < HW_ATL_FW_VER_LED) @@ -633,4 +644,5 @@ const struct aq_fw_ops aq_fw_2x_ops = { .enable_ptp = aq_fw3x_enable_ptp, .led_control = aq_fw2x_led_control, .set_phyloopback = aq_fw2x_set_phyloopback, + .adjust_ptp = aq_fw3x_adjust_ptp, }; -- cgit v1.2.3 From b42726fcf76e9367e524392e0ead7e672cc0791c Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Fri, 14 Feb 2020 18:44:54 +0300 Subject: net: atlantic: better loopback mode handling Add checks to not enable multiple loopback modes simultaneously, It was also discovered that for dma loopback to function correctly promisc mode should be enabled on device. Fixes: ea4b4d7fc106 ("net: atlantic: loopback tests via private flags") Signed-off-by: Nikita Danilov Signed-off-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c | 5 +++++ drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 13 ++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c index a1f99bef4a68..7b55633d2cb9 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c @@ -722,6 +722,11 @@ static int aq_ethtool_set_priv_flags(struct net_device *ndev, u32 flags) if (flags & ~AQ_PRIV_FLAGS_MASK) return -EOPNOTSUPP; + if (hweight32((flags | priv_flags) & AQ_HW_LOOPBACK_MASK) > 1) { + netdev_info(ndev, "Can't enable more than one loopback simultaneously\n"); + return -EINVAL; + } + cfg->priv_flags = flags; if ((priv_flags ^ flags) & BIT(AQ_HW_LOOPBACK_DMA_NET)) { diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 9acdb3fbb750..d20d91cdece8 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -885,13 +885,16 @@ static int hw_atl_b0_hw_packet_filter_set(struct aq_hw_s *self, { struct aq_nic_cfg_s *cfg = self->aq_nic_cfg; unsigned int i = 0U; + u32 vlan_promisc; + u32 l2_promisc; - hw_atl_rpfl2promiscuous_mode_en_set(self, - IS_FILTER_ENABLED(IFF_PROMISC)); + l2_promisc = IS_FILTER_ENABLED(IFF_PROMISC) || + !!(cfg->priv_flags & BIT(AQ_HW_LOOPBACK_DMA_NET)); + vlan_promisc = l2_promisc || cfg->is_vlan_force_promisc; - hw_atl_rpf_vlan_prom_mode_en_set(self, - IS_FILTER_ENABLED(IFF_PROMISC) || - cfg->is_vlan_force_promisc); + hw_atl_rpfl2promiscuous_mode_en_set(self, l2_promisc); + + hw_atl_rpf_vlan_prom_mode_en_set(self, vlan_promisc); hw_atl_rpfl2multicast_flr_en_set(self, IS_FILTER_ENABLED(IFF_ALLMULTI) && -- cgit v1.2.3 From a4980919ad6a7be548d499bc5338015e1a9191c6 Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Fri, 14 Feb 2020 18:44:55 +0300 Subject: net: atlantic: fix use after free kasan warn skb->len is used to calculate statistics after xmit invocation. Under a stress load it may happen that skb will be xmited, rx interrupt will come and skb will be freed, all before xmit function is even returned. Eventually, skb->len will access unallocated area. Moving stats calculation into tx_clean routine. Fixes: 018423e90bee ("net: ethernet: aquantia: Add ring support code") Reported-by: Christophe Vu-Brugier Signed-off-by: Igor Russkikh Signed-off-by: Pavel Belous Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 4 ---- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 7 +++++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index c85e3e29012c..263beea1859c 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -655,10 +655,6 @@ int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) if (likely(frags)) { err = self->aq_hw_ops->hw_ring_tx_xmit(self->aq_hw, ring, frags); - if (err >= 0) { - ++ring->stats.tx.packets; - ring->stats.tx.bytes += skb->len; - } } else { err = NETDEV_TX_BUSY; } diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 6941999ae845..bae95a618560 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -272,9 +272,12 @@ bool aq_ring_tx_clean(struct aq_ring_s *self) } } - if (unlikely(buff->is_eop)) - dev_kfree_skb_any(buff->skb); + if (unlikely(buff->is_eop)) { + ++self->stats.rx.packets; + self->stats.tx.bytes += buff->skb->len; + dev_kfree_skb_any(buff->skb); + } buff->pa = 0U; buff->eop_index = 0xffffU; self->sw_head = aq_ring_next_dx(self, self->sw_head); -- cgit v1.2.3 From 380ec5b9af7f0d57dbf6ac067fd9f33cff2fef71 Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Fri, 14 Feb 2020 18:44:56 +0300 Subject: net: atlantic: fix potential error handling Code inspection found that in case of mapping error we do return current 'ret' value. But beside error, it is used to count number of descriptors allocated for the packet. In that case map_skb function could return '1'. Changing it to return zero (number of mapped descriptors for skb) Fixes: 018423e90bee ("net: ethernet: aquantia: Add ring support code") Signed-off-by: Pavel Belous Signed-off-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 263beea1859c..e95f6a6bef73 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -533,8 +533,10 @@ unsigned int aq_nic_map_skb(struct aq_nic_s *self, struct sk_buff *skb, dx_buff->len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) + if (unlikely(dma_mapping_error(aq_nic_get_dev(self), dx_buff->pa))) { + ret = 0; goto exit; + } first = dx_buff; dx_buff->len_pkt = skb->len; -- cgit v1.2.3 From 52a22f4d6ff95e8bdca557765c04893eb5dd83fd Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Fri, 14 Feb 2020 18:44:57 +0300 Subject: net: atlantic: possible fault in transition to hibernation during hibernation freeze, aq_nic_stop could be invoked on a stopped device. That may cause panic on access to not yet allocated vector/ring structures. Add a check to stop device if it is not yet stopped. Similiarly after freeze in hibernation thaw, aq_nic_start could be invoked on a not initialized net device. Result will be the same. Add a check to start device if it is initialized. In our case, this is the same as started. Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic") Signed-off-by: Pavel Belous Signed-off-by: Nikita Danilov Signed-off-by: Igor Russkikh Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index 6b27af0db499..78b6f3248756 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -359,7 +359,8 @@ static int aq_suspend_common(struct device *dev, bool deep) netif_device_detach(nic->ndev); netif_tx_stop_all_queues(nic->ndev); - aq_nic_stop(nic); + if (netif_running(nic->ndev)) + aq_nic_stop(nic); if (deep) { aq_nic_deinit(nic, !nic->aq_hw->aq_nic_cfg->wol); @@ -375,7 +376,7 @@ static int atl_resume_common(struct device *dev, bool deep) { struct pci_dev *pdev = to_pci_dev(dev); struct aq_nic_s *nic; - int ret; + int ret = 0; nic = pci_get_drvdata(pdev); @@ -390,9 +391,11 @@ static int atl_resume_common(struct device *dev, bool deep) goto err_exit; } - ret = aq_nic_start(nic); - if (ret) - goto err_exit; + if (netif_running(nic->ndev)) { + ret = aq_nic_start(nic); + if (ret) + goto err_exit; + } netif_device_attach(nic->ndev); netif_tx_start_all_queues(nic->ndev); -- cgit v1.2.3 From 5a292c89a84d49b598f8978f154bdda48b1072c0 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Fri, 14 Feb 2020 18:44:58 +0300 Subject: net: atlantic: fix out of range usage of active_vlans array fix static checker warning: drivers/net/ethernet/aquantia/atlantic/aq_filters.c:166 aq_check_approve_fvlan() error: passing untrusted data to 'test_bit()' Reported-by: Dan Carpenter Fixes: 7975d2aff5af: ("net: aquantia: add support of rx-vlan-filter offload") Signed-off-by: Dmitry Bogdanov Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_filters.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c index 6102251bb909..03ff92bc4a7f 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c @@ -163,7 +163,7 @@ aq_check_approve_fvlan(struct aq_nic_s *aq_nic, } if ((aq_nic->ndev->features & NETIF_F_HW_VLAN_CTAG_FILTER) && - (!test_bit(be16_to_cpu(fsp->h_ext.vlan_tci), + (!test_bit(be16_to_cpu(fsp->h_ext.vlan_tci) & VLAN_VID_MASK, aq_nic->active_vlans))) { netdev_err(aq_nic->ndev, "ethtool: unknown vlan-id specified"); -- cgit v1.2.3 From e08ad80551b4b33c02f2fce1522f6c227d3976cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Feb 2020 07:53:53 -0800 Subject: net: add strict checks in netdev_name_node_alt_destroy() netdev_name_node_alt_destroy() does a lookup over all device names of a namespace. We need to make sure the name belongs to the device of interest, and that we do not destroy its primary name, since we rely on it being not deleted : dev->name_node would indeed point to freed memory. syzbot report was the following : BUG: KASAN: use-after-free in dev_net include/linux/netdevice.h:2206 [inline] BUG: KASAN: use-after-free in mld_force_mld_version net/ipv6/mcast.c:1172 [inline] BUG: KASAN: use-after-free in mld_in_v2_mode_only net/ipv6/mcast.c:1180 [inline] BUG: KASAN: use-after-free in mld_in_v1_mode+0x203/0x230 net/ipv6/mcast.c:1190 Read of size 8 at addr ffff88809886c588 by task swapper/1/0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.6.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x32 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:641 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:135 dev_net include/linux/netdevice.h:2206 [inline] mld_force_mld_version net/ipv6/mcast.c:1172 [inline] mld_in_v2_mode_only net/ipv6/mcast.c:1180 [inline] mld_in_v1_mode+0x203/0x230 net/ipv6/mcast.c:1190 mld_send_initial_cr net/ipv6/mcast.c:2083 [inline] mld_dad_timer_expire+0x24/0x230 net/ipv6/mcast.c:2118 call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x6c3/0x1790 kernel/time/timer.c:1786 __do_softirq+0x262/0x98c kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x19b/0x1e0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:546 [inline] smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1146 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:829 RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 Code: 68 73 c5 f9 eb 8a cc cc cc cc cc cc e9 07 00 00 00 0f 00 2d 94 be 59 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 84 be 59 00 fb f4 cc 55 48 89 e5 41 57 41 56 41 55 41 54 53 e8 de 2a 74 f9 e8 09 RSP: 0018:ffffc90000d3fd68 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13 RAX: 1ffffffff136761a RBX: ffff8880a99fc340 RCX: 0000000000000000 RDX: dffffc0000000000 RSI: 0000000000000006 RDI: ffff8880a99fcbd4 RBP: ffffc90000d3fd98 R08: ffff8880a99fc340 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 R13: ffffffff8aa5a1c0 R14: 0000000000000000 R15: 0000000000000001 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:686 default_idle_call+0x84/0xb0 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x3c8/0x6e0 kernel/sched/idle.c:269 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:361 start_secondary+0x2f4/0x410 arch/x86/kernel/smpboot.c:264 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:242 Allocated by task 10229: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] __kasan_kmalloc mm/kasan/common.c:515 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:488 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:529 __do_kmalloc_node mm/slab.c:3616 [inline] __kmalloc_node+0x4e/0x70 mm/slab.c:3623 kmalloc_node include/linux/slab.h:578 [inline] kvmalloc_node+0x68/0x100 mm/util.c:574 kvmalloc include/linux/mm.h:645 [inline] kvzalloc include/linux/mm.h:653 [inline] alloc_netdev_mqs+0x98/0xe40 net/core/dev.c:9797 rtnl_create_link+0x22d/0xaf0 net/core/rtnetlink.c:3047 __rtnl_newlink+0xf9f/0x1790 net/core/rtnetlink.c:3309 rtnl_newlink+0x69/0xa0 net/core/rtnetlink.c:3377 rtnetlink_rcv_msg+0x45e/0xaf0 net/core/rtnetlink.c:5438 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5456 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 __sys_sendto+0x262/0x380 net/socket.c:1998 __do_compat_sys_socketcall net/compat.c:771 [inline] __se_compat_sys_socketcall net/compat.c:719 [inline] __ia32_compat_sys_socketcall+0x530/0x710 net/compat.c:719 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 Freed by task 10229: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] kasan_set_free_info mm/kasan/common.c:337 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:476 kasan_slab_free+0xe/0x10 mm/kasan/common.c:485 __cache_free mm/slab.c:3426 [inline] kfree+0x10a/0x2c0 mm/slab.c:3757 __netdev_name_node_alt_destroy+0x1ff/0x2a0 net/core/dev.c:322 netdev_name_node_alt_destroy+0x57/0x80 net/core/dev.c:334 rtnl_alt_ifname net/core/rtnetlink.c:3518 [inline] rtnl_linkprop.isra.0+0x575/0x6f0 net/core/rtnetlink.c:3567 rtnl_dellinkprop+0x46/0x60 net/core/rtnetlink.c:3588 rtnetlink_rcv_msg+0x45e/0xaf0 net/core/rtnetlink.c:5438 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5456 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 ____sys_sendmsg+0x753/0x880 net/socket.c:2343 ___sys_sendmsg+0x100/0x170 net/socket.c:2397 __sys_sendmsg+0x105/0x1d0 net/socket.c:2430 __compat_sys_sendmsg net/compat.c:642 [inline] __do_compat_sys_sendmsg net/compat.c:649 [inline] __se_compat_sys_sendmsg net/compat.c:646 [inline] __ia32_compat_sys_sendmsg+0x7a/0xb0 net/compat.c:646 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 The buggy address belongs to the object at ffff88809886c000 which belongs to the cache kmalloc-4k of size 4096 The buggy address is located 1416 bytes inside of 4096-byte region [ffff88809886c000, ffff88809886d000) The buggy address belongs to the page: page:ffffea0002621b00 refcount:1 mapcount:0 mapping:ffff8880aa402000 index:0x0 compound_mapcount: 0 flags: 0xfffe0000010200(slab|head) raw: 00fffe0000010200 ffffea0002610d08 ffffea0002607608 ffff8880aa402000 raw: 0000000000000000 ffff88809886c000 0000000100000001 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88809886c480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88809886c500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88809886c580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88809886c600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88809886c680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 36fbf1e52bd3 ("net: rtnetlink: add linkprop commands to add and delete alternative ifnames") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jiri Pirko Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index a6316b336128..b6d13f3f1e5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -331,6 +331,12 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; + /* lookup might have found our primary name or a name belonging + * to another device. + */ + if (name_node == dev->name_node || name_node->dev != dev) + return -EINVAL; + __netdev_name_node_alt_destroy(name_node); return 0; -- cgit v1.2.3 From 6f08e98d62799e53c89dbf2c9a49d77e20ca648c Mon Sep 17 00:00:00 2001 From: Arun Parameswaran Date: Fri, 14 Feb 2020 13:47:46 -0800 Subject: net: phy: restore mdio regs in the iproc mdio driver The mii management register in iproc mdio block does not have a retention register so it is lost on suspend. Save and restore value of register while resuming from suspend. Fixes: bb1a619735b4 ("net: phy: Initialize mdio clock at probe function") Signed-off-by: Arun Parameswaran Signed-off-by: Scott Branden Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/mdio-bcm-iproc.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/net/phy/mdio-bcm-iproc.c b/drivers/net/phy/mdio-bcm-iproc.c index 7e9975d25066..f1ded03f0229 100644 --- a/drivers/net/phy/mdio-bcm-iproc.c +++ b/drivers/net/phy/mdio-bcm-iproc.c @@ -178,6 +178,23 @@ static int iproc_mdio_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_PM_SLEEP +int iproc_mdio_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct iproc_mdio_priv *priv = platform_get_drvdata(pdev); + + /* restore the mii clock configuration */ + iproc_mdio_config_clk(priv->base); + + return 0; +} + +static const struct dev_pm_ops iproc_mdio_pm_ops = { + .resume = iproc_mdio_resume +}; +#endif /* CONFIG_PM_SLEEP */ + static const struct of_device_id iproc_mdio_of_match[] = { { .compatible = "brcm,iproc-mdio", }, { /* sentinel */ }, @@ -188,6 +205,9 @@ static struct platform_driver iproc_mdio_driver = { .driver = { .name = "iproc-mdio", .of_match_table = iproc_mdio_of_match, +#ifdef CONFIG_PM_SLEEP + .pm = &iproc_mdio_pm_ops, +#endif }, .probe = iproc_mdio_probe, .remove = iproc_mdio_remove, -- cgit v1.2.3 From b6e4a1aeeb14cad595f70b31cc376903d322c821 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 14 Feb 2020 14:14:29 -0800 Subject: mptcp: Protect subflow socket options before connection completes Userspace should not be able to directly manipulate subflow socket options before a connection is established since it is not yet known if it will be an MPTCP subflow or a TCP fallback subflow. TCP fallback subflows can be more directly controlled by userspace because they are regular TCP connections, while MPTCP subflow sockets need to be configured for the specific needs of MPTCP. Use the same logic as sendmsg/recvmsg to ensure that socket option calls are only passed through to known TCP fallback subflows. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 030dee668e0a..e9aa6807b5be 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -755,60 +755,50 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); - int ret = -EOPNOTSUPP; struct socket *ssock; - struct sock *ssk; pr_debug("msk=%p", msk); /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not defined. + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when TCP socket options are passed through + * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - release_sock(sk); - return ret; - } + ssock = __mptcp_tcp_fallback(msk); + if (ssock) + return tcp_setsockopt(ssock->sk, level, optname, optval, + optlen); - ssk = ssock->sk; - sock_hold(ssk); release_sock(sk); - ret = tcp_setsockopt(ssk, level, optname, optval, optlen); - sock_put(ssk); - - return ret; + return -EOPNOTSUPP; } static int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); - int ret = -EOPNOTSUPP; struct socket *ssock; - struct sock *ssk; pr_debug("msk=%p", msk); - /* @@ the meaning of getsockopt() when the socket is connected and - * there are multiple subflows is not defined. + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when socket options are passed through + * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - release_sock(sk); - return ret; - } + ssock = __mptcp_tcp_fallback(msk); + if (ssock) + return tcp_getsockopt(ssock->sk, level, optname, optval, + option); - ssk = ssock->sk; - sock_hold(ssk); release_sock(sk); - ret = tcp_getsockopt(ssk, level, optname, optval, option); - sock_put(ssk); - - return ret; + return -EOPNOTSUPP; } static int mptcp_get_port(struct sock *sk, unsigned short snum) -- cgit v1.2.3 From 04ddf1208f03e1dbc39a4619c40eba640051b950 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 14 Feb 2020 23:57:20 +0100 Subject: wireguard: selftests: reduce complexity and fix make races This gives us fewer dependencies and shortens build time, fixes up some hash checking race conditions, and also fixes missing directory creation that caused issues on massively parallel builds. Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- tools/testing/selftests/wireguard/qemu/Makefile | 38 +++++++++---------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile index f10aa3590adc..28d477683e8a 100644 --- a/tools/testing/selftests/wireguard/qemu/Makefile +++ b/tools/testing/selftests/wireguard/qemu/Makefile @@ -38,19 +38,17 @@ endef define file_download = $(DISTFILES_PATH)/$(1): mkdir -p $(DISTFILES_PATH) - flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp' - if echo "$(3) $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi + flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp; [ -f $$@.tmp ] || exit 1; if echo "$(3) $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi' endef $(eval $(call tar_download,MUSL,musl,1.1.24,.tar.gz,https://www.musl-libc.org/releases/,1370c9a812b2cf2a7d92802510cca0058cc37e66a7bedd70051f0a34015022a3)) -$(eval $(call tar_download,LIBMNL,libmnl,1.0.4,.tar.bz2,https://www.netfilter.org/projects/libmnl/files/,171f89699f286a5854b72b91d06e8f8e3683064c5901fb09d954a9ab6f551f81)) $(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c)) $(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d)) $(eval $(call tar_download,IPROUTE2,iproute2,5.4.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,fe97aa60a0d4c5ac830be18937e18dc3400ca713a33a89ad896ff1e3d46086ae)) $(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c)) $(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa)) $(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a)) -$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20191226,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,aa8af0fdc9872d369d8c890a84dbc2a2466b55795dccd5b47721b2d97644b04f)) +$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20200206,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,f5207248c6a3c3e3bfc9ab30b91c1897b00802ed861e1f9faaed873366078c64)) KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug) rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) @@ -295,21 +293,13 @@ $(IPERF_PATH)/src/iperf3: | $(IPERF_PATH)/.installed $(USERSPACE_DEPS) $(MAKE) -C $(IPERF_PATH) $(STRIP) -s $@ -$(LIBMNL_PATH)/.installed: $(LIBMNL_TAR) - flock -s $<.lock tar -C $(BUILD_PATH) -xf $< - touch $@ - -$(LIBMNL_PATH)/src/.libs/libmnl.a: | $(LIBMNL_PATH)/.installed $(USERSPACE_DEPS) - cd $(LIBMNL_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared - $(MAKE) -C $(LIBMNL_PATH) - sed -i 's:prefix=.*:prefix=$(LIBMNL_PATH):' $(LIBMNL_PATH)/libmnl.pc - $(WIREGUARD_TOOLS_PATH)/.installed: $(WIREGUARD_TOOLS_TAR) + mkdir -p $(BUILD_PATH) flock -s $<.lock tar -C $(BUILD_PATH) -xf $< touch $@ -$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) - LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src LIBMNL_CFLAGS="-I$(LIBMNL_PATH)/include" LIBMNL_LDLIBS="-lmnl" wg +$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(USERSPACE_DEPS) + $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src wg $(STRIP) -s $@ $(BUILD_PATH)/init: init.c | $(USERSPACE_DEPS) @@ -340,17 +330,17 @@ $(BASH_PATH)/bash: | $(BASH_PATH)/.installed $(USERSPACE_DEPS) $(IPROUTE2_PATH)/.installed: $(IPROUTE2_TAR) mkdir -p $(BUILD_PATH) flock -s $<.lock tar -C $(BUILD_PATH) -xf $< - printf 'CC:=$(CC)\nPKG_CONFIG:=pkg-config\nTC_CONFIG_XT:=n\nTC_CONFIG_ATM:=n\nTC_CONFIG_IPSET:=n\nIP_CONFIG_SETNS:=y\nHAVE_ELF:=n\nHAVE_MNL:=y\nHAVE_BERKELEY_DB:=n\nHAVE_LATEX:=n\nHAVE_PDFLATEX:=n\nCFLAGS+=-DHAVE_SETNS -DHAVE_LIBMNL -I$(LIBMNL_PATH)/include\nLDLIBS+=-lmnl' > $(IPROUTE2_PATH)/config.mk + printf 'CC:=$(CC)\nPKG_CONFIG:=pkg-config\nTC_CONFIG_XT:=n\nTC_CONFIG_ATM:=n\nTC_CONFIG_IPSET:=n\nIP_CONFIG_SETNS:=y\nHAVE_ELF:=n\nHAVE_MNL:=n\nHAVE_BERKELEY_DB:=n\nHAVE_LATEX:=n\nHAVE_PDFLATEX:=n\nCFLAGS+=-DHAVE_SETNS\n' > $(IPROUTE2_PATH)/config.mk printf 'lib: snapshot\n\t$$(MAKE) -C lib\nip/ip: lib\n\t$$(MAKE) -C ip ip\nmisc/ss: lib\n\t$$(MAKE) -C misc ss\n' >> $(IPROUTE2_PATH)/Makefile touch $@ -$(IPROUTE2_PATH)/ip/ip: | $(IPROUTE2_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) - LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ ip/ip - $(STRIP) -s $(IPROUTE2_PATH)/ip/ip +$(IPROUTE2_PATH)/ip/ip: | $(IPROUTE2_PATH)/.installed $(USERSPACE_DEPS) + $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ ip/ip + $(STRIP) -s $@ -$(IPROUTE2_PATH)/misc/ss: | $(IPROUTE2_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) - LDFLAGS="$(LDFLAGS) -L$(LIBMNL_PATH)/src/.libs" PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ misc/ss - $(STRIP) -s $(IPROUTE2_PATH)/misc/ss +$(IPROUTE2_PATH)/misc/ss: | $(IPROUTE2_PATH)/.installed $(USERSPACE_DEPS) + $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ misc/ss + $(STRIP) -s $@ $(IPTABLES_PATH)/.installed: $(IPTABLES_TAR) mkdir -p $(BUILD_PATH) @@ -358,8 +348,8 @@ $(IPTABLES_PATH)/.installed: $(IPTABLES_TAR) sed -i -e "/nfnetlink=[01]/s:=[01]:=0:" -e "/nfconntrack=[01]/s:=[01]:=0:" $(IPTABLES_PATH)/configure touch $@ -$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(LIBMNL_PATH)/src/.libs/libmnl.a $(USERSPACE_DEPS) - cd $(IPTABLES_PATH) && PKG_CONFIG_LIBDIR="$(LIBMNL_PATH)" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --with-kernel=$(BUILD_PATH)/include +$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(USERSPACE_DEPS) + cd $(IPTABLES_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --disable-connlabel --with-kernel=$(BUILD_PATH)/include $(MAKE) -C $(IPTABLES_PATH) $(STRIP) -s $@ -- cgit v1.2.3 From 2a8a4df36462aa85b0db87b7c5ea145ba67e34a8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 14 Feb 2020 23:57:21 +0100 Subject: wireguard: receive: reset last_under_load to zero This is a small optimization that prevents more expensive comparisons from happening when they are no longer necessary, by clearing the last_under_load variable whenever we wind up in a state where we were under load but we no longer are. Signed-off-by: Jason A. Donenfeld Suggested-by: Matt Dunwoodie Signed-off-by: David S. Miller --- drivers/net/wireguard/receive.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 9c6bab9c981f..4a153894cee2 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -118,10 +118,13 @@ static void wg_receive_handshake_packet(struct wg_device *wg, under_load = skb_queue_len(&wg->incoming_handshakes) >= MAX_QUEUED_INCOMING_HANDSHAKES / 8; - if (under_load) + if (under_load) { last_under_load = ktime_get_coarse_boottime_ns(); - else if (last_under_load) + } else if (last_under_load) { under_load = !wg_birthdate_has_expired(last_under_load, 1); + if (!under_load) + last_under_load = 0; + } mac_state = wg_cookie_validate_packet(&wg->cookie_checker, skb, under_load); if ((under_load && mac_state == VALID_MAC_WITH_COOKIE) || -- cgit v1.2.3 From 175f1ca9a9ed8689d2028da1a7c624bb4fb4ff7e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 14 Feb 2020 23:57:22 +0100 Subject: wireguard: send: account for mtu=0 devices It turns out there's an easy way to get packets queued up while still having an MTU of zero, and that's via persistent keep alive. This commit makes sure that in whatever condition, we don't wind up dividing by zero. Note that an MTU of zero for a wireguard interface is something quasi-valid, so I don't think the correct fix is to limit it via min_mtu. This can be reproduced easily with: ip link add wg0 type wireguard ip link add wg1 type wireguard ip link set wg0 up mtu 0 ip link set wg1 up wg set wg0 private-key <(wg genkey) wg set wg1 listen-port 1 private-key <(wg genkey) peer $(wg show wg0 public-key) wg set wg0 peer $(wg show wg1 public-key) persistent-keepalive 1 endpoint 127.0.0.1:1 However, while min_mtu=0 seems fine, it makes sense to restrict the max_mtu. This commit also restricts the maximum MTU to the greatest number for which rounding up to the padding multiple won't overflow a signed integer. Packets this large were always rejected anyway eventually, due to checks deeper in, but it seems more sound not to even let the administrator configure something that won't work anyway. We use this opportunity to clean up this function a bit so that it's clear which paths we're expecting. Signed-off-by: Jason A. Donenfeld Cc: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/wireguard/device.c | 7 ++++--- drivers/net/wireguard/send.c | 16 +++++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 43db442b1373..cdc96968b0f4 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -258,6 +258,8 @@ static void wg_setup(struct net_device *dev) enum { WG_NETDEV_FEATURES = NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_GSO | NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA }; + const int overhead = MESSAGE_MINIMUM_LENGTH + sizeof(struct udphdr) + + max(sizeof(struct ipv6hdr), sizeof(struct iphdr)); dev->netdev_ops = &netdev_ops; dev->hard_header_len = 0; @@ -271,9 +273,8 @@ static void wg_setup(struct net_device *dev) dev->features |= WG_NETDEV_FEATURES; dev->hw_features |= WG_NETDEV_FEATURES; dev->hw_enc_features |= WG_NETDEV_FEATURES; - dev->mtu = ETH_DATA_LEN - MESSAGE_MINIMUM_LENGTH - - sizeof(struct udphdr) - - max(sizeof(struct ipv6hdr), sizeof(struct iphdr)); + dev->mtu = ETH_DATA_LEN - overhead; + dev->max_mtu = round_down(INT_MAX, MESSAGE_PADDING_MULTIPLE) - overhead; SET_NETDEV_DEVTYPE(dev, &device_type); diff --git a/drivers/net/wireguard/send.c b/drivers/net/wireguard/send.c index c13260563446..7348c10cbae3 100644 --- a/drivers/net/wireguard/send.c +++ b/drivers/net/wireguard/send.c @@ -143,16 +143,22 @@ static void keep_key_fresh(struct wg_peer *peer) static unsigned int calculate_skb_padding(struct sk_buff *skb) { + unsigned int padded_size, last_unit = skb->len; + + if (unlikely(!PACKET_CB(skb)->mtu)) + return ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE) - last_unit; + /* We do this modulo business with the MTU, just in case the networking * layer gives us a packet that's bigger than the MTU. In that case, we * wouldn't want the final subtraction to overflow in the case of the - * padded_size being clamped. + * padded_size being clamped. Fortunately, that's very rarely the case, + * so we optimize for that not happening. */ - unsigned int last_unit = skb->len % PACKET_CB(skb)->mtu; - unsigned int padded_size = ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE); + if (unlikely(last_unit > PACKET_CB(skb)->mtu)) + last_unit %= PACKET_CB(skb)->mtu; - if (padded_size > PACKET_CB(skb)->mtu) - padded_size = PACKET_CB(skb)->mtu; + padded_size = min(PACKET_CB(skb)->mtu, + ALIGN(last_unit, MESSAGE_PADDING_MULTIPLE)); return padded_size - last_unit; } -- cgit v1.2.3 From 1fbc33b0a7feb6ca72bf7dc8a05d81485ee8ee2e Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 14 Feb 2020 23:57:23 +0100 Subject: wireguard: socket: remove extra call to synchronize_net synchronize_net() is a wrapper around synchronize_rcu(), so there's no point in having synchronize_net and synchronize_rcu back to back, despite the documentation comment suggesting maybe it's somewhat useful, "Wait for packets currently being received to be done." This commit removes the extra call. Signed-off-by: Jason A. Donenfeld Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/wireguard/socket.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 262f3b5c819d..b0d6541582d3 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -432,7 +432,6 @@ void wg_socket_reinit(struct wg_device *wg, struct sock *new4, wg->incoming_port = ntohs(inet_sk(new4)->inet_sport); mutex_unlock(&wg->socket_update_lock); synchronize_rcu(); - synchronize_net(); sock_free(old4); sock_free(old6); } -- cgit v1.2.3 From d965a5432d4c3e6b9c3d2bc1d4a800013bbf76f6 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 14 Feb 2020 15:26:19 -0800 Subject: net: dsa: b53: Ensure the default VID is untagged We need to ensure that the default VID is untagged otherwise the switch will be sending tagged frames and the results can be problematic. This is especially true with b53 switches that use VID 0 as their default VLAN since VID 0 has a special meaning. Fixes: fea83353177a ("net: dsa: b53: Fix default VLAN ID") Fixes: 061f6a505ac3 ("net: dsa: Add ndo_vlan_rx_{add, kill}_vid implementation") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 449a22172e07..1a69286daa8d 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1366,6 +1366,9 @@ void b53_vlan_add(struct dsa_switch *ds, int port, b53_get_vlan_entry(dev, vid, vl); + if (vid == 0 && vid == b53_default_pvid(dev)) + untagged = true; + vl->members |= BIT(port); if (untagged && !dsa_is_cpu_port(ds, port)) vl->untag |= BIT(port); -- cgit v1.2.3 From 6699170376ab941c1cc5c3bcefa766efc3575c73 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sat, 15 Feb 2020 01:55:53 +0100 Subject: ethtool: fix application of verbose no_mask bitset A bitset without mask in a _SET request means we want exactly the bits in the bitset to be set. This works correctly for compact format but when verbose format is parsed, ethnl_update_bitset32_verbose() only sets the bits present in the request bitset but does not clear the rest. This can cause incorrect results like lion:~ # ethtool eth0 | grep Wake Supports Wake-on: pumbg Wake-on: g lion:~ # ethtool -s eth0 wol u lion:~ # ethtool eth0 | grep Wake Supports Wake-on: pumbg Wake-on: ug when the second ethtool command issues request ETHTOOL_MSG_WOL_SET ETHTOOL_A_WOL_HEADER ETHTOOL_A_HEADER_DEV_NAME = "eth0" ETHTOOL_A_WOL_MODES ETHTOOL_A_BITSET_NOMASK ETHTOOL_A_BITSET_BITS ETHTOOL_A_BITSET_BITS_BIT ETHTOOL_BITSET_BIT_INDEX = 1 Fix the logic by clearing the whole target bitmap before we start iterating through the request bits. Fixes: 10b518d4e6dd ("ethtool: netlink bitset handling") Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/bitset.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index fce45dac4205..8977fe1f3946 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -447,7 +447,10 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, "mask only allowed in compact bitset"); return -EINVAL; } + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + if (no_mask) + ethnl_bitmap32_clear(bitmap, 0, nbits, mod); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; -- cgit v1.2.3 From c4c10784293ec89746721b1a40cb730b0106deea Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 15 Feb 2020 08:17:28 +0100 Subject: NFC: pn544: Fix a typo in a debug message The ending character of the string shoulb be \n, not \b. Fixes: 17936b43f0fd ("NFC: Standardize logging style") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/nfc/pn544/pn544.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nfc/pn544/pn544.c b/drivers/nfc/pn544/pn544.c index 2b83156efe3f..b788870473e8 100644 --- a/drivers/nfc/pn544/pn544.c +++ b/drivers/nfc/pn544/pn544.c @@ -682,7 +682,7 @@ static int pn544_hci_tm_send(struct nfc_hci_dev *hdev, struct sk_buff *skb) static int pn544_hci_check_presence(struct nfc_hci_dev *hdev, struct nfc_target *target) { - pr_debug("supported protocol %d\b", target->supported_protocols); + pr_debug("supported protocol %d\n", target->supported_protocols); if (target->supported_protocols & (NFC_PROTO_ISO14443_MASK | NFC_PROTO_ISO14443_B_MASK)) { return nfc_hci_send_cmd(hdev, target->hci_reader_gate, -- cgit v1.2.3 From 064ff66e2bef84f1153087612032b5b9eab005bd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 15 Feb 2020 10:50:08 +0000 Subject: bonding: add missing netdev_update_lockdep_key() After bond_release(), netdev_update_lockdep_key() should be called. But both ioctl path and attribute path don't call netdev_update_lockdep_key(). This patch adds missing netdev_update_lockdep_key(). Test commands: ip link add bond0 type bond ip link add bond1 type bond ifenslave bond0 bond1 ifenslave -d bond0 bond1 ifenslave bond1 bond0 Splat looks like: [ 29.501182][ T1046] WARNING: possible circular locking dependency detected [ 29.501945][ T1039] hardirqs last disabled at (1962): [] handle_mm_fault+0x13f/0x700 [ 29.503442][ T1046] 5.5.0+ #322 Not tainted [ 29.503447][ T1046] ------------------------------------------------------ [ 29.504277][ T1039] softirqs last enabled at (1180): [] __do_softirq+0x678/0x981 [ 29.505443][ T1046] ifenslave/1046 is trying to acquire lock: [ 29.505886][ T1039] softirqs last disabled at (1169): [] irq_exit+0x17a/0x1a0 [ 29.509997][ T1046] ffff88805d5da280 (&dev->addr_list_lock_key#3){+...}, at: dev_mc_sync_multiple+0x95/0x120 [ 29.511243][ T1046] [ 29.511243][ T1046] but task is already holding lock: [ 29.512192][ T1046] ffff8880460f2280 (&dev->addr_list_lock_key#4){+...}, at: bond_enslave+0x4482/0x47b0 [bonding] [ 29.514124][ T1046] [ 29.514124][ T1046] which lock already depends on the new lock. [ 29.514124][ T1046] [ 29.517297][ T1046] [ 29.517297][ T1046] the existing dependency chain (in reverse order) is: [ 29.518231][ T1046] [ 29.518231][ T1046] -> #1 (&dev->addr_list_lock_key#4){+...}: [ 29.519076][ T1046] _raw_spin_lock+0x30/0x70 [ 29.519588][ T1046] dev_mc_sync_multiple+0x95/0x120 [ 29.520208][ T1046] bond_enslave+0x448d/0x47b0 [bonding] [ 29.520862][ T1046] bond_option_slaves_set+0x1a3/0x370 [bonding] [ 29.521640][ T1046] __bond_opt_set+0x1ff/0xbb0 [bonding] [ 29.522438][ T1046] __bond_opt_set_notify+0x2b/0xf0 [bonding] [ 29.523251][ T1046] bond_opt_tryset_rtnl+0x92/0xf0 [bonding] [ 29.524082][ T1046] bonding_sysfs_store_option+0x8a/0xf0 [bonding] [ 29.524959][ T1046] kernfs_fop_write+0x276/0x410 [ 29.525620][ T1046] vfs_write+0x197/0x4a0 [ 29.526218][ T1046] ksys_write+0x141/0x1d0 [ 29.526818][ T1046] do_syscall_64+0x99/0x4f0 [ 29.527430][ T1046] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 29.528265][ T1046] [ 29.528265][ T1046] -> #0 (&dev->addr_list_lock_key#3){+...}: [ 29.529272][ T1046] __lock_acquire+0x2d8d/0x3de0 [ 29.529935][ T1046] lock_acquire+0x164/0x3b0 [ 29.530638][ T1046] _raw_spin_lock+0x30/0x70 [ 29.531187][ T1046] dev_mc_sync_multiple+0x95/0x120 [ 29.531790][ T1046] bond_enslave+0x448d/0x47b0 [bonding] [ 29.532451][ T1046] bond_option_slaves_set+0x1a3/0x370 [bonding] [ 29.533163][ T1046] __bond_opt_set+0x1ff/0xbb0 [bonding] [ 29.533789][ T1046] __bond_opt_set_notify+0x2b/0xf0 [bonding] [ 29.534595][ T1046] bond_opt_tryset_rtnl+0x92/0xf0 [bonding] [ 29.535500][ T1046] bonding_sysfs_store_option+0x8a/0xf0 [bonding] [ 29.536379][ T1046] kernfs_fop_write+0x276/0x410 [ 29.537057][ T1046] vfs_write+0x197/0x4a0 [ 29.537640][ T1046] ksys_write+0x141/0x1d0 [ 29.538251][ T1046] do_syscall_64+0x99/0x4f0 [ 29.538870][ T1046] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 29.539659][ T1046] [ 29.539659][ T1046] other info that might help us debug this: [ 29.539659][ T1046] [ 29.540953][ T1046] Possible unsafe locking scenario: [ 29.540953][ T1046] [ 29.541883][ T1046] CPU0 CPU1 [ 29.542540][ T1046] ---- ---- [ 29.543209][ T1046] lock(&dev->addr_list_lock_key#4); [ 29.543880][ T1046] lock(&dev->addr_list_lock_key#3); [ 29.544873][ T1046] lock(&dev->addr_list_lock_key#4); [ 29.545863][ T1046] lock(&dev->addr_list_lock_key#3); [ 29.546525][ T1046] [ 29.546525][ T1046] *** DEADLOCK *** [ 29.546525][ T1046] [ 29.547542][ T1046] 5 locks held by ifenslave/1046: [ 29.548196][ T1046] #0: ffff88806044c478 (sb_writers#5){.+.+}, at: vfs_write+0x3bb/0x4a0 [ 29.549248][ T1046] #1: ffff88805af00890 (&of->mutex){+.+.}, at: kernfs_fop_write+0x1cf/0x410 [ 29.550343][ T1046] #2: ffff88805b8b54b0 (kn->count#157){.+.+}, at: kernfs_fop_write+0x1f2/0x410 [ 29.551575][ T1046] #3: ffffffffaecf4cf0 (rtnl_mutex){+.+.}, at: bond_opt_tryset_rtnl+0x5f/0xf0 [bonding] [ 29.552819][ T1046] #4: ffff8880460f2280 (&dev->addr_list_lock_key#4){+...}, at: bond_enslave+0x4482/0x47b0 [bonding] [ 29.554175][ T1046] [ 29.554175][ T1046] stack backtrace: [ 29.554907][ T1046] CPU: 0 PID: 1046 Comm: ifenslave Not tainted 5.5.0+ #322 [ 29.555854][ T1046] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 29.557064][ T1046] Call Trace: [ 29.557504][ T1046] dump_stack+0x96/0xdb [ 29.558054][ T1046] check_noncircular+0x371/0x450 [ 29.558723][ T1046] ? print_circular_bug.isra.35+0x310/0x310 [ 29.559486][ T1046] ? hlock_class+0x130/0x130 [ 29.560100][ T1046] ? __lock_acquire+0x2d8d/0x3de0 [ 29.560761][ T1046] __lock_acquire+0x2d8d/0x3de0 [ 29.561366][ T1046] ? register_lock_class+0x14d0/0x14d0 [ 29.562045][ T1046] ? find_held_lock+0x39/0x1d0 [ 29.562641][ T1046] lock_acquire+0x164/0x3b0 [ 29.563199][ T1046] ? dev_mc_sync_multiple+0x95/0x120 [ 29.563872][ T1046] _raw_spin_lock+0x30/0x70 [ 29.564464][ T1046] ? dev_mc_sync_multiple+0x95/0x120 [ 29.565146][ T1046] dev_mc_sync_multiple+0x95/0x120 [ 29.565793][ T1046] bond_enslave+0x448d/0x47b0 [bonding] [ 29.566487][ T1046] ? bond_update_slave_arr+0x940/0x940 [bonding] [ 29.567279][ T1046] ? bstr_printf+0xc20/0xc20 [ 29.567857][ T1046] ? stack_trace_consume_entry+0x160/0x160 [ 29.568614][ T1046] ? deactivate_slab.isra.77+0x2c5/0x800 [ 29.569320][ T1046] ? check_chain_key+0x236/0x5d0 [ 29.569939][ T1046] ? sscanf+0x93/0xc0 [ 29.570442][ T1046] ? vsscanf+0x1e20/0x1e20 [ 29.571003][ T1046] bond_option_slaves_set+0x1a3/0x370 [bonding] [ ... ] Fixes: ab92d68fc22f ("net: core: add generic lockdep keys") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 ++ drivers/net/bonding/bond_options.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 48d5ec770b94..1e9d5d35fc78 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3640,6 +3640,8 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd case BOND_RELEASE_OLD: case SIOCBONDRELEASE: res = bond_release(bond_dev, slave_dev); + if (!res) + netdev_update_lockdep_key(slave_dev); break; case BOND_SETHWADDR_OLD: case SIOCBONDSETHWADDR: diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index ddb3916d3506..215c10923289 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1398,6 +1398,8 @@ static int bond_option_slaves_set(struct bonding *bond, case '-': slave_dbg(bond->dev, dev, "Releasing interface\n"); ret = bond_release(bond->dev, dev); + if (!ret) + netdev_update_lockdep_key(dev); break; default: -- cgit v1.2.3 From 7151affeef8d527f50b4b68a871fd28bd660023f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 15 Feb 2020 10:50:21 +0000 Subject: net: export netdev_next_lower_dev_rcu() netdev_next_lower_dev_rcu() will be used to implement a function, which is to walk all lower interfaces. There are already functions that they walk their lower interface. (netdev_walk_all_lower_dev_rcu, netdev_walk_all_lower_dev()). But, there would be cases that couldn't be covered by given netdev_walk_all_lower_dev_{rcu}() function. So, some modules would want to implement own function, which is to walk all lower interfaces. In the next patch, netdev_next_lower_dev_rcu() will be used. In addition, this patch removes two unused prototypes in netdevice.h. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++---- net/core/dev.c | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f1f633235f6..6c3f7032e8d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -72,6 +72,8 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ +#define MAX_NEST_DEV 8 + /* * Transmit return codes: transmit return codes originate from three different * namespaces: @@ -4389,11 +4391,8 @@ void *netdev_lower_get_next(struct net_device *dev, ldev; \ ldev = netdev_lower_get_next(dev, &(iter))) -struct net_device *netdev_all_lower_get_next(struct net_device *dev, +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter); -struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, - struct list_head **iter); - int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, void *data), diff --git a/net/core/dev.c b/net/core/dev.c index b6d13f3f1e5a..2577ebfed293 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,7 +146,6 @@ #include "net-sysfs.h" #define MAX_GRO_SKBS 8 -#define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) @@ -7207,8 +7206,8 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev, return 0; } -static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; @@ -7220,6 +7219,7 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, return lower->dev; } +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { -- cgit v1.2.3 From b3e80d44f5b1b470dd9e2dbc6816e63a5c519709 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 15 Feb 2020 10:50:40 +0000 Subject: bonding: fix lockdep warning in bond_get_stats() In the "struct bonding", there is stats_lock. This lock protects "bond_stats" in the "struct bonding". bond_stats is updated in the bond_get_stats() and this function would be executed concurrently. So, the lock is needed. Bonding interfaces would be nested. So, either stats_lock should use dynamic lockdep class key or stats_lock should be used by spin_lock_nested(). In the current code, stats_lock is using a dynamic lockdep class key. But there is no updating stats_lock_key routine So, lockdep warning will occur. Test commands: ip link add bond0 type bond ip link add bond1 type bond ip link set bond0 master bond1 ip link set bond0 nomaster ip link set bond1 master bond0 Splat looks like: [ 38.420603][ T957] 5.5.0+ #394 Not tainted [ 38.421074][ T957] ------------------------------------------------------ [ 38.421837][ T957] ip/957 is trying to acquire lock: [ 38.422399][ T957] ffff888063262cd8 (&bond->stats_lock_key#2){+.+.}, at: bond_get_stats+0x90/0x4d0 [bonding] [ 38.423528][ T957] [ 38.423528][ T957] but task is already holding lock: [ 38.424526][ T957] ffff888065fd2cd8 (&bond->stats_lock_key){+.+.}, at: bond_get_stats+0x90/0x4d0 [bonding] [ 38.426075][ T957] [ 38.426075][ T957] which lock already depends on the new lock. [ 38.426075][ T957] [ 38.428536][ T957] [ 38.428536][ T957] the existing dependency chain (in reverse order) is: [ 38.429475][ T957] [ 38.429475][ T957] -> #1 (&bond->stats_lock_key){+.+.}: [ 38.430273][ T957] _raw_spin_lock+0x30/0x70 [ 38.430812][ T957] bond_get_stats+0x90/0x4d0 [bonding] [ 38.431451][ T957] dev_get_stats+0x1ec/0x270 [ 38.432088][ T957] bond_get_stats+0x1a5/0x4d0 [bonding] [ 38.432767][ T957] dev_get_stats+0x1ec/0x270 [ 38.433322][ T957] rtnl_fill_stats+0x44/0xbe0 [ 38.433866][ T957] rtnl_fill_ifinfo+0xeb2/0x3720 [ 38.434474][ T957] rtmsg_ifinfo_build_skb+0xca/0x170 [ 38.435081][ T957] rtmsg_ifinfo_event.part.33+0x1b/0xb0 [ 38.436848][ T957] rtnetlink_event+0xcd/0x120 [ 38.437455][ T957] notifier_call_chain+0x90/0x160 [ 38.438067][ T957] netdev_change_features+0x74/0xa0 [ 38.438708][ T957] bond_compute_features.isra.45+0x4e6/0x6f0 [bonding] [ 38.439522][ T957] bond_enslave+0x3639/0x47b0 [bonding] [ 38.440225][ T957] do_setlink+0xaab/0x2ef0 [ 38.440786][ T957] __rtnl_newlink+0x9c5/0x1270 [ 38.441463][ T957] rtnl_newlink+0x65/0x90 [ 38.442075][ T957] rtnetlink_rcv_msg+0x4a8/0x890 [ 38.442774][ T957] netlink_rcv_skb+0x121/0x350 [ 38.443451][ T957] netlink_unicast+0x42e/0x610 [ 38.444282][ T957] netlink_sendmsg+0x65a/0xb90 [ 38.444992][ T957] ____sys_sendmsg+0x5ce/0x7a0 [ 38.445679][ T957] ___sys_sendmsg+0x10f/0x1b0 [ 38.446365][ T957] __sys_sendmsg+0xc6/0x150 [ 38.447007][ T957] do_syscall_64+0x99/0x4f0 [ 38.447668][ T957] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 38.448538][ T957] [ 38.448538][ T957] -> #0 (&bond->stats_lock_key#2){+.+.}: [ 38.449554][ T957] __lock_acquire+0x2d8d/0x3de0 [ 38.450148][ T957] lock_acquire+0x164/0x3b0 [ 38.450711][ T957] _raw_spin_lock+0x30/0x70 [ 38.451292][ T957] bond_get_stats+0x90/0x4d0 [bonding] [ 38.451950][ T957] dev_get_stats+0x1ec/0x270 [ 38.452425][ T957] bond_get_stats+0x1a5/0x4d0 [bonding] [ 38.453362][ T957] dev_get_stats+0x1ec/0x270 [ 38.453825][ T957] rtnl_fill_stats+0x44/0xbe0 [ 38.454390][ T957] rtnl_fill_ifinfo+0xeb2/0x3720 [ 38.456257][ T957] rtmsg_ifinfo_build_skb+0xca/0x170 [ 38.456998][ T957] rtmsg_ifinfo_event.part.33+0x1b/0xb0 [ 38.459351][ T957] rtnetlink_event+0xcd/0x120 [ 38.460086][ T957] notifier_call_chain+0x90/0x160 [ 38.460829][ T957] netdev_change_features+0x74/0xa0 [ 38.461752][ T957] bond_compute_features.isra.45+0x4e6/0x6f0 [bonding] [ 38.462705][ T957] bond_enslave+0x3639/0x47b0 [bonding] [ 38.463476][ T957] do_setlink+0xaab/0x2ef0 [ 38.464141][ T957] __rtnl_newlink+0x9c5/0x1270 [ 38.464897][ T957] rtnl_newlink+0x65/0x90 [ 38.465522][ T957] rtnetlink_rcv_msg+0x4a8/0x890 [ 38.466215][ T957] netlink_rcv_skb+0x121/0x350 [ 38.466895][ T957] netlink_unicast+0x42e/0x610 [ 38.467583][ T957] netlink_sendmsg+0x65a/0xb90 [ 38.468285][ T957] ____sys_sendmsg+0x5ce/0x7a0 [ 38.469202][ T957] ___sys_sendmsg+0x10f/0x1b0 [ 38.469884][ T957] __sys_sendmsg+0xc6/0x150 [ 38.470587][ T957] do_syscall_64+0x99/0x4f0 [ 38.471245][ T957] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 38.472093][ T957] [ 38.472093][ T957] other info that might help us debug this: [ 38.472093][ T957] [ 38.473438][ T957] Possible unsafe locking scenario: [ 38.473438][ T957] [ 38.474898][ T957] CPU0 CPU1 [ 38.476234][ T957] ---- ---- [ 38.480171][ T957] lock(&bond->stats_lock_key); [ 38.480808][ T957] lock(&bond->stats_lock_key#2); [ 38.481791][ T957] lock(&bond->stats_lock_key); [ 38.482754][ T957] lock(&bond->stats_lock_key#2); [ 38.483416][ T957] [ 38.483416][ T957] *** DEADLOCK *** [ 38.483416][ T957] [ 38.484505][ T957] 3 locks held by ip/957: [ 38.485048][ T957] #0: ffffffffbccf6230 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x457/0x890 [ 38.486198][ T957] #1: ffff888065fd2cd8 (&bond->stats_lock_key){+.+.}, at: bond_get_stats+0x90/0x4d0 [bonding] [ 38.487625][ T957] #2: ffffffffbc9254c0 (rcu_read_lock){....}, at: bond_get_stats+0x5/0x4d0 [bonding] [ 38.488897][ T957] [ 38.488897][ T957] stack backtrace: [ 38.489646][ T957] CPU: 1 PID: 957 Comm: ip Not tainted 5.5.0+ #394 [ 38.490497][ T957] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 38.492810][ T957] Call Trace: [ 38.493219][ T957] dump_stack+0x96/0xdb [ 38.493709][ T957] check_noncircular+0x371/0x450 [ 38.494344][ T957] ? lookup_address+0x60/0x60 [ 38.494923][ T957] ? print_circular_bug.isra.35+0x310/0x310 [ 38.495699][ T957] ? hlock_class+0x130/0x130 [ 38.496334][ T957] ? __lock_acquire+0x2d8d/0x3de0 [ 38.496979][ T957] __lock_acquire+0x2d8d/0x3de0 [ 38.497607][ T957] ? register_lock_class+0x14d0/0x14d0 [ 38.498333][ T957] ? check_chain_key+0x236/0x5d0 [ 38.499003][ T957] lock_acquire+0x164/0x3b0 [ 38.499800][ T957] ? bond_get_stats+0x90/0x4d0 [bonding] [ 38.500706][ T957] _raw_spin_lock+0x30/0x70 [ 38.501435][ T957] ? bond_get_stats+0x90/0x4d0 [bonding] [ 38.502311][ T957] bond_get_stats+0x90/0x4d0 [bonding] [ ... ] But, there is another problem. The dynamic lockdep class key is protected by RTNL, but bond_get_stats() would be called outside of RTNL. So, it would use an invalid dynamic lockdep class key. In order to fix this issue, stats_lock uses spin_lock_nested() instead of a dynamic lockdep key. The bond_get_stats() calls bond_get_lowest_level_rcu() to get the correct nest level value, which will be used by spin_lock_nested(). The "dev->lower_level" indicates lower nest level value, but this value is invalid outside of RTNL. So, bond_get_lowest_level_rcu() returns valid lower nest level value in the RCU critical section. bond_get_lowest_level_rcu() will be work only when LOCKDEP is enabled. Fixes: 089bca2caed0 ("bonding: use dynamic lockdep key instead of subclass") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 53 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1e9d5d35fc78..d10805e5e623 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3526,6 +3526,47 @@ static void bond_fold_stats(struct rtnl_link_stats64 *_res, } } +#ifdef CONFIG_LOCKDEP +static int bond_get_lowest_level_rcu(struct net_device *dev) +{ + struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; + struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; + int cur = 0, max = 0; + + now = dev; + iter = &dev->adj_list.lower; + + while (1) { + next = NULL; + while (1) { + ldev = netdev_next_lower_dev_rcu(now, &iter); + if (!ldev) + break; + + next = ldev; + niter = &ldev->adj_list.lower; + dev_stack[cur] = now; + iter_stack[cur++] = iter; + if (max <= cur) + max = cur; + break; + } + + if (!next) { + if (!cur) + return max; + next = dev_stack[--cur]; + niter = iter_stack[cur]; + } + + now = next; + iter = niter; + } + + return max; +} +#endif + static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats) { @@ -3533,11 +3574,17 @@ static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 temp; struct list_head *iter; struct slave *slave; + int nest_level = 0; - spin_lock(&bond->stats_lock); - memcpy(stats, &bond->bond_stats, sizeof(*stats)); rcu_read_lock(); +#ifdef CONFIG_LOCKDEP + nest_level = bond_get_lowest_level_rcu(bond_dev); +#endif + + spin_lock_nested(&bond->stats_lock, nest_level); + memcpy(stats, &bond->bond_stats, sizeof(*stats)); + bond_for_each_slave_rcu(bond, slave, iter) { const struct rtnl_link_stats64 *new = dev_get_stats(slave->dev, &temp); @@ -3547,10 +3594,10 @@ static void bond_get_stats(struct net_device *bond_dev, /* save off the slave stats for the next run */ memcpy(&slave->slave_stats, new, sizeof(*new)); } - rcu_read_unlock(); memcpy(&bond->bond_stats, stats, sizeof(*stats)); spin_unlock(&bond->stats_lock); + rcu_read_unlock(); } static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd) -- cgit v1.2.3 From 357b41caf949c57e426f1c5f18574b6b46583406 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sat, 15 Feb 2020 15:45:56 +0100 Subject: mptcp: select CRYPTO Without this modification and if CRYPTO is not selected, we have this warning: WARNING: unmet direct dependencies detected for CRYPTO_LIB_SHA256 Depends on [n]: CRYPTO [=n] Selected by [y]: - MPTCP [=y] && NET [=y] && INET [=y] MPTCP selects CRYPTO_LIB_SHA256 which seems to depend on CRYPTO. CRYPTO is now selected to avoid this issue. Even though the config system prints that warning, it looks like sha256.c is compiled and linked even without CONFIG_CRYPTO. Since MPTCP will end up needing CONFIG_CRYPTO anyway in future commits -- currently in preparation for net-next -- we propose to add it now to fix the warning. The dependency in the config system comes from the fact that CRYPTO_LIB_SHA256 is defined in "lib/crypto/Kconfig" which is sourced from "crypto/Kconfig" only if CRYPTO is selected. Fixes: 65492c5a6ab5 (mptcp: move from sha1 (v0) to sha256 (v1)) Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 49f6054e7f4e..a9ed3bf1d93f 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -4,6 +4,7 @@ config MPTCP depends on INET select SKB_EXTENSIONS select CRYPTO_LIB_SHA256 + select CRYPTO help Multipath TCP (MPTCP) connections send and receive data over multiple subflows in order to utilize multiple network paths. Each subflow -- cgit v1.2.3 From 69233bba6543a37755158ca3382765387b8078df Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 15 Feb 2020 17:54:17 +0100 Subject: net: ks8851-ml: Remove 8-bit bus accessors This driver is mixing 8-bit and 16-bit bus accessors for reasons unknown, however the speculation is that this was some sort of attempt to support the 8-bit bus mode. As per the KS8851-16MLL documentation, all two registers accessed via the 8-bit accessors are internally 16-bit registers, so reading them using 16-bit accessors is fine. The KS_CCR read can be converted to 16-bit read outright, as it is already a concatenation of two 8-bit reads of that register. The KS_RXQCR accesses are 8-bit only, however writing the top 8 bits of the register is OK as well, since the driver caches the entire 16-bit register value anyway. Finally, the driver is not used by any hardware in the kernel right now. The only hardware available to me is one with 16-bit bus, so I have no way to test the 8-bit bus mode, however it is unlikely this ever really worked anyway. If the 8-bit bus mode is ever required, it can be easily added by adjusting the 16-bit accessors to do 2 consecutive accesses, which is how this should have been done from the beginning. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851_mll.c | 45 ++++---------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c b/drivers/net/ethernet/micrel/ks8851_mll.c index a41a90c589db..e2fb20154511 100644 --- a/drivers/net/ethernet/micrel/ks8851_mll.c +++ b/drivers/net/ethernet/micrel/ks8851_mll.c @@ -156,24 +156,6 @@ static int msg_enable; * chip is busy transferring packet data (RX/TX FIFO accesses). */ -/** - * ks_rdreg8 - read 8 bit register from device - * @ks : The chip information - * @offset: The register address - * - * Read a 8bit register from the chip, returning the result - */ -static u8 ks_rdreg8(struct ks_net *ks, int offset) -{ - u16 data; - u8 shift_bit = offset & 0x03; - u8 shift_data = (offset & 1) << 3; - ks->cmd_reg_cache = (u16) offset | (u16)(BE0 << shift_bit); - iowrite16(ks->cmd_reg_cache, ks->hw_addr_cmd); - data = ioread16(ks->hw_addr); - return (u8)(data >> shift_data); -} - /** * ks_rdreg16 - read 16 bit register from device * @ks : The chip information @@ -189,22 +171,6 @@ static u16 ks_rdreg16(struct ks_net *ks, int offset) return ioread16(ks->hw_addr); } -/** - * ks_wrreg8 - write 8bit register value to chip - * @ks: The chip information - * @offset: The register address - * @value: The value to write - * - */ -static void ks_wrreg8(struct ks_net *ks, int offset, u8 value) -{ - u8 shift_bit = (offset & 0x03); - u16 value_write = (u16)(value << ((offset & 1) << 3)); - ks->cmd_reg_cache = (u16)offset | (BE0 << shift_bit); - iowrite16(ks->cmd_reg_cache, ks->hw_addr_cmd); - iowrite16(value_write, ks->hw_addr); -} - /** * ks_wrreg16 - write 16bit register value to chip * @ks: The chip information @@ -324,8 +290,7 @@ static void ks_read_config(struct ks_net *ks) u16 reg_data = 0; /* Regardless of bus width, 8 bit read should always work.*/ - reg_data = ks_rdreg8(ks, KS_CCR) & 0x00FF; - reg_data |= ks_rdreg8(ks, KS_CCR+1) << 8; + reg_data = ks_rdreg16(ks, KS_CCR); /* addr/data bus are multiplexed */ ks->sharedbus = (reg_data & CCR_SHARED) == CCR_SHARED; @@ -429,7 +394,7 @@ static inline void ks_read_qmu(struct ks_net *ks, u16 *buf, u32 len) /* 1. set sudo DMA mode */ ks_wrreg16(ks, KS_RXFDPR, RXFDPR_RXFPAI); - ks_wrreg8(ks, KS_RXQCR, (ks->rc_rxqcr | RXQCR_SDA) & 0xff); + ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); /* 2. read prepend data */ /** @@ -446,7 +411,7 @@ static inline void ks_read_qmu(struct ks_net *ks, u16 *buf, u32 len) ks_inblk(ks, buf, ALIGN(len, 4)); /* 4. reset sudo DMA Mode */ - ks_wrreg8(ks, KS_RXQCR, ks->rc_rxqcr); + ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); } /** @@ -679,13 +644,13 @@ static void ks_write_qmu(struct ks_net *ks, u8 *pdata, u16 len) ks->txh.txw[1] = cpu_to_le16(len); /* 1. set sudo-DMA mode */ - ks_wrreg8(ks, KS_RXQCR, (ks->rc_rxqcr | RXQCR_SDA) & 0xff); + ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr | RXQCR_SDA); /* 2. write status/lenth info */ ks_outblk(ks, ks->txh.txw, 4); /* 3. write pkt data */ ks_outblk(ks, (u16 *)pdata, ALIGN(len, 4)); /* 4. reset sudo-DMA mode */ - ks_wrreg8(ks, KS_RXQCR, ks->rc_rxqcr); + ks_wrreg16(ks, KS_RXQCR, ks->rc_rxqcr); /* 5. Enqueue Tx(move the pkt from TX buffer into TXQ) */ ks_wrreg16(ks, KS_TXQCR, TXQCR_METFE); /* 6. wait until TXQCR_METFE is auto-cleared */ -- cgit v1.2.3 From edacb098ea9c31589276152f09b4439052c0f2b1 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 15 Feb 2020 17:54:18 +0100 Subject: net: ks8851-ml: Fix 16-bit data access The packet data written to and read from Micrel KSZ8851-16MLLI must be byte-swapped in 16-bit mode, add this byte-swapping. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851_mll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c b/drivers/net/ethernet/micrel/ks8851_mll.c index e2fb20154511..5ae206ae5d2b 100644 --- a/drivers/net/ethernet/micrel/ks8851_mll.c +++ b/drivers/net/ethernet/micrel/ks8851_mll.c @@ -197,7 +197,7 @@ static inline void ks_inblk(struct ks_net *ks, u16 *wptr, u32 len) { len >>= 1; while (len--) - *wptr++ = (u16)ioread16(ks->hw_addr); + *wptr++ = be16_to_cpu(ioread16(ks->hw_addr)); } /** @@ -211,7 +211,7 @@ static inline void ks_outblk(struct ks_net *ks, u16 *wptr, u32 len) { len >>= 1; while (len--) - iowrite16(*wptr++, ks->hw_addr); + iowrite16(cpu_to_be16(*wptr++), ks->hw_addr); } static void ks_disable_int(struct ks_net *ks) -- cgit v1.2.3 From 58292104832fef6cb4a89f736012c0e0724c3442 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 15 Feb 2020 17:54:19 +0100 Subject: net: ks8851-ml: Fix 16-bit IO operation The Micrel KSZ8851-16MLLI datasheet DS00002357B page 12 states that BE[3:0] signals are active high. This contradicts the measurements of the behavior of the actual chip, where these signals behave as active low. For example, to read the CIDER register, the bus must expose 0xc0c0 during the address phase, which means BE[3:0]=4'b1100. Signed-off-by: Marek Vasut Cc: David S. Miller Cc: Lukas Wunner Cc: Petr Stetiar Cc: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/micrel/ks8851_mll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c b/drivers/net/ethernet/micrel/ks8851_mll.c index 5ae206ae5d2b..1c9e70c8cc30 100644 --- a/drivers/net/ethernet/micrel/ks8851_mll.c +++ b/drivers/net/ethernet/micrel/ks8851_mll.c @@ -166,7 +166,7 @@ static int msg_enable; static u16 ks_rdreg16(struct ks_net *ks, int offset) { - ks->cmd_reg_cache = (u16)offset | ((BE1 | BE0) << (offset & 0x02)); + ks->cmd_reg_cache = (u16)offset | ((BE3 | BE2) >> (offset & 0x02)); iowrite16(ks->cmd_reg_cache, ks->hw_addr_cmd); return ioread16(ks->hw_addr); } @@ -181,7 +181,7 @@ static u16 ks_rdreg16(struct ks_net *ks, int offset) static void ks_wrreg16(struct ks_net *ks, int offset, u16 value) { - ks->cmd_reg_cache = (u16)offset | ((BE1 | BE0) << (offset & 0x02)); + ks->cmd_reg_cache = (u16)offset | ((BE3 | BE2) >> (offset & 0x02)); iowrite16(ks->cmd_reg_cache, ks->hw_addr_cmd); iowrite16(value, ks->hw_addr); } -- cgit v1.2.3 From 66256e0b15bd72e1e1c24c4cef4281a95636781c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 15 Feb 2020 11:42:37 -0800 Subject: net/sock.h: fix all kernel-doc warnings Fix all kernel-doc warnings for . Fixes these warnings: ../include/net/sock.h:232: warning: Function parameter or member 'skc_addrpair' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_portpair' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_ipv6only' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_net_refcnt' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_v6_daddr' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_v6_rcv_saddr' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_cookie' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_listener' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_tw_dr' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_rcv_wnd' not described in 'sock_common' ../include/net/sock.h:232: warning: Function parameter or member 'skc_tw_rcv_nxt' not described in 'sock_common' ../include/net/sock.h:498: warning: Function parameter or member 'sk_rx_skb_cache' not described in 'sock' ../include/net/sock.h:498: warning: Function parameter or member 'sk_wq_raw' not described in 'sock' ../include/net/sock.h:498: warning: Function parameter or member 'tcp_rtx_queue' not described in 'sock' ../include/net/sock.h:498: warning: Function parameter or member 'sk_tx_skb_cache' not described in 'sock' ../include/net/sock.h:498: warning: Function parameter or member 'sk_route_forced_caps' not described in 'sock' ../include/net/sock.h:498: warning: Function parameter or member 'sk_txtime_report_errors' not described in 'sock' ../include/net/sock.h:498: warning: Function parameter or member 'sk_validate_xmit_skb' not described in 'sock' ../include/net/sock.h:498: warning: Function parameter or member 'sk_bpf_storage' not described in 'sock' ../include/net/sock.h:2024: warning: No description found for return value of 'sk_wmem_alloc_get' ../include/net/sock.h:2035: warning: No description found for return value of 'sk_rmem_alloc_get' ../include/net/sock.h:2046: warning: No description found for return value of 'sk_has_allocations' ../include/net/sock.h:2082: warning: No description found for return value of 'skwq_has_sleeper' ../include/net/sock.h:2244: warning: No description found for return value of 'sk_page_frag' ../include/net/sock.h:2444: warning: Function parameter or member 'tcp_rx_skb_cache_key' not described in 'DECLARE_STATIC_KEY_FALSE' ../include/net/sock.h:2444: warning: Excess function parameter 'sk' description in 'DECLARE_STATIC_KEY_FALSE' ../include/net/sock.h:2444: warning: Excess function parameter 'skb' description in 'DECLARE_STATIC_KEY_FALSE' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/net/sock.h | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 02162b0378f7..328564525526 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -117,19 +117,26 @@ typedef __u64 __bitwise __addrpair; * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr + * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num + * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting + * @skc_ipv6only: socket is IPV6 only + * @skc_net_refcnt: socket is using net ref counting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket + * @skc_v6_daddr: IPV6 destination address + * @skc_v6_rcv_saddr: IPV6 source address + * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection @@ -137,7 +144,15 @@ typedef __u64 __bitwise __addrpair; * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings + * @skc_listener: connection request listener socket (aka rsk_listener) + * [union with @skc_flags] + * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row + * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets + * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) + * [union with @skc_incoming_cpu] + * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number + * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header @@ -245,6 +260,7 @@ struct bpf_sk_storage; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @sk_rx_skb_cache: cache copy of recently accessed RX skb * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -265,6 +281,8 @@ struct bpf_sk_storage; * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) + * @sk_route_forced_caps: static, forced route capabilities + * (set in tcp_init_sock()) * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments @@ -303,6 +321,8 @@ struct bpf_sk_storage; * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit + * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] + * @sk_tx_skb_cache: cache copy of recently accessed TX skb * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup @@ -313,11 +333,14 @@ struct bpf_sk_storage; * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog + * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0 * @sk_reuseport_cb: reuseport group container + * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME + * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags */ struct sock { @@ -393,7 +416,9 @@ struct sock { struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; + /* private: */ struct socket_wq *sk_wq_raw; + /* public: */ }; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; @@ -2017,7 +2042,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro * sk_wmem_alloc_get - returns write allocations * @sk: socket * - * Returns sk_wmem_alloc minus initial offset of one + * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { @@ -2028,7 +2053,7 @@ static inline int sk_wmem_alloc_get(const struct sock *sk) * sk_rmem_alloc_get - returns read allocations * @sk: socket * - * Returns sk_rmem_alloc + * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { @@ -2039,7 +2064,7 @@ static inline int sk_rmem_alloc_get(const struct sock *sk) * sk_has_allocations - check if allocations are outstanding * @sk: socket * - * Returns true if socket has write or read allocations + * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { @@ -2050,7 +2075,7 @@ static inline bool sk_has_allocations(const struct sock *sk) * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * - * Returns true if socket_wq has waiting processes + * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. @@ -2238,6 +2263,9 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest * inside other socket operations and end up recursing into sk_page_frag() * while it's already in use. + * + * Return: a per task page_frag if context allows that, + * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { @@ -2432,6 +2460,7 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) &skb_shinfo(skb)->tskey); } +DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from @@ -2440,7 +2469,6 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ -DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); -- cgit v1.2.3 From 8955b4357d6fc98734b53855b76ee37014a7e492 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 15 Feb 2020 13:41:12 -0800 Subject: skbuff: remove stale bit mask comments Remove stale comments since this flag is no longer a bit mask but is a bit field. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 864cb9e9622f..1365a556152c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -467,7 +467,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, return NULL; } - /* use OR instead of assignment to avoid clearing of bits in mask */ if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -527,7 +526,6 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, return NULL; } - /* use OR instead of assignment to avoid clearing of bits in mask */ if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; -- cgit v1.2.3 From d2f273f0a9205257b91af1d3d461ee29688c2f24 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 15 Feb 2020 15:34:07 -0800 Subject: skbuff.h: fix all kernel-doc warnings Fix all kernel-doc warnings in . Fixes these warnings: ../include/linux/skbuff.h:890: warning: Function parameter or member 'list' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'dev_scratch' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'ip_defrag_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'skb_mstamp_ns' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member '__cloned_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'head_frag' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member '__pkt_type_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'encapsulation' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'encap_hdr_csum' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'csum_valid' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member '__pkt_vlan_present_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'vlan_present' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'csum_complete_sw' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'csum_level' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'inner_protocol_type' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'remcsum_offload' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'sender_cpu' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'reserved_tailroom' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'inner_ipproto' not described in 'sk_buff' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8806b69388..5b50278c4bc8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -611,9 +611,15 @@ typedef unsigned char *sk_buff_data_t; * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left + * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point + * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp + * @list: queue head * @sk: Socket we are owned by + * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in + * fragmentation management * @dev: Device we arrived on/are leaving by + * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @sp: the security path, used for xfrm @@ -632,6 +638,9 @@ typedef unsigned char *sk_buff_data_t; * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs + * @inner_protocol_type: whether the inner protocol is + * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO + * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device @@ -650,6 +659,8 @@ typedef unsigned char *sk_buff_data_t; * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices + * @head_frag: skb was allocated from page fragments, + * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) @@ -660,15 +671,28 @@ typedef unsigned char *sk_buff_data_t; * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS + * @encapsulation: indicates the inner headers in the skbuff are valid + * @encap_hdr_csum: software checksum is needed + * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL + * @csum_complete_sw: checksum was completed by software + * @csum_level: indicates the number of consecutive checksums found in + * the packet minus one that have been verified as + * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @napi_id: id of the NAPI struct this skb came from + * @sender_cpu: (aka @napi_id) source CPU in XPS * @secmark: security marking * @mark: Generic packet mark + * @reserved_tailroom: (aka @mark) number of bytes of free space available + * at the tail of an sk_buff + * @vlan_present: VLAN tag is present * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) + * @inner_ipproto: (aka @inner_protocol) stores ipproto when + * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) @@ -750,7 +774,9 @@ struct sk_buff { #endif #define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + /* private: */ __u8 __cloned_offset[0]; + /* public: */ __u8 cloned:1, nohdr:1, fclone:2, @@ -775,7 +801,9 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) + /* private: */ __u8 __pkt_type_offset[0]; + /* public: */ __u8 pkt_type:3; __u8 ignore_df:1; __u8 nf_trace:1; @@ -798,7 +826,9 @@ struct sk_buff { #define PKT_VLAN_PRESENT_BIT 0 #endif #define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset) + /* private: */ __u8 __pkt_vlan_present_offset[0]; + /* public: */ __u8 vlan_present:1; __u8 csum_complete_sw:1; __u8 csum_level:2; -- cgit v1.2.3 From 9a6a0dea16177ccaecc116f560232e63bec115f1 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 16 Feb 2020 16:39:43 -0300 Subject: net: ethernet: dm9000: Handle -EPROBE_DEFER in dm9000_parse_dt() The call to of_get_mac_address() can return -EPROBE_DEFER, for instance when the MAC address is read from a NVMEM driver that did not probe yet. Cc: H. Nikolaus Schaller Cc: Mathieu Malaterre Signed-off-by: Paul Cercueil Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/ethernet/davicom/dm9000.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index 1ea3372775e6..e94ae9b94dbf 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1405,6 +1405,8 @@ static struct dm9000_plat_data *dm9000_parse_dt(struct device *dev) mac_addr = of_get_mac_address(np); if (!IS_ERR(mac_addr)) ether_addr_copy(pdata->dev_addr, mac_addr); + else if (PTR_ERR(mac_addr) == -EPROBE_DEFER) + return ERR_CAST(mac_addr); return pdata; } -- cgit v1.2.3 From 072663f86d62571fe540d9e1d24eb873a1b1182f Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 15 Jan 2020 06:34:22 +1000 Subject: drm/nouveau/acr/tu11x: initial support Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 2 ++ drivers/gpu/drm/nouveau/nvkm/subdev/acr/tu102.c | 14 ++++++++++++++ drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c | 2 ++ 3 files changed, 18 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index c7d700916eae..4fe9b38db459 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2579,6 +2579,7 @@ nv166_chipset = { static const struct nvkm_device_chip nv167_chipset = { .name = "TU117", + .acr = tu102_acr_new, .bar = tu102_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, @@ -2615,6 +2616,7 @@ nv167_chipset = { static const struct nvkm_device_chip nv168_chipset = { .name = "TU116", + .acr = tu102_acr_new, .bar = tu102_bar_new, .bios = nvkm_bios_new, .bus = gf100_bus_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/acr/tu102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/acr/tu102.c index 7f4b89d82d32..d28d8f36ae24 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/acr/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/acr/tu102.c @@ -107,6 +107,12 @@ MODULE_FIRMWARE("nvidia/tu104/acr/ucode_unload.bin"); MODULE_FIRMWARE("nvidia/tu106/acr/unload_bl.bin"); MODULE_FIRMWARE("nvidia/tu106/acr/ucode_unload.bin"); +MODULE_FIRMWARE("nvidia/tu116/acr/unload_bl.bin"); +MODULE_FIRMWARE("nvidia/tu116/acr/ucode_unload.bin"); + +MODULE_FIRMWARE("nvidia/tu117/acr/unload_bl.bin"); +MODULE_FIRMWARE("nvidia/tu117/acr/ucode_unload.bin"); + static const struct nvkm_acr_hsf_fwif tu102_acr_unload_fwif[] = { { 0, nvkm_acr_hsfw_load, &gp108_acr_unload_0 }, @@ -130,6 +136,8 @@ tu102_acr_asb_0 = { MODULE_FIRMWARE("nvidia/tu102/acr/ucode_asb.bin"); MODULE_FIRMWARE("nvidia/tu104/acr/ucode_asb.bin"); MODULE_FIRMWARE("nvidia/tu106/acr/ucode_asb.bin"); +MODULE_FIRMWARE("nvidia/tu116/acr/ucode_asb.bin"); +MODULE_FIRMWARE("nvidia/tu117/acr/ucode_asb.bin"); static const struct nvkm_acr_hsf_fwif tu102_acr_asb_fwif[] = { @@ -154,6 +162,12 @@ MODULE_FIRMWARE("nvidia/tu104/acr/ucode_ahesasc.bin"); MODULE_FIRMWARE("nvidia/tu106/acr/bl.bin"); MODULE_FIRMWARE("nvidia/tu106/acr/ucode_ahesasc.bin"); +MODULE_FIRMWARE("nvidia/tu116/acr/bl.bin"); +MODULE_FIRMWARE("nvidia/tu116/acr/ucode_ahesasc.bin"); + +MODULE_FIRMWARE("nvidia/tu117/acr/bl.bin"); +MODULE_FIRMWARE("nvidia/tu117/acr/ucode_ahesasc.bin"); + static const struct nvkm_acr_hsf_fwif tu102_acr_ahesasc_fwif[] = { { 0, nvkm_acr_hsfw_load, &tu102_acr_ahesasc_0 }, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c index 389bad312bf2..10ff5d053f7e 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c @@ -51,3 +51,5 @@ MODULE_FIRMWARE("nvidia/gv100/nvdec/scrubber.bin"); MODULE_FIRMWARE("nvidia/tu102/nvdec/scrubber.bin"); MODULE_FIRMWARE("nvidia/tu104/nvdec/scrubber.bin"); MODULE_FIRMWARE("nvidia/tu106/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu116/nvdec/scrubber.bin"); +MODULE_FIRMWARE("nvidia/tu117/nvdec/scrubber.bin"); -- cgit v1.2.3 From b99ef12b80cfe48a14e7918c2f799c37d2195aca Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 15 Jan 2020 06:34:22 +1000 Subject: drm/nouveau/gr/tu11x: initial support Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 2 ++ drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c | 26 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 4fe9b38db459..8ebbe1656008 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2608,6 +2608,7 @@ nv167_chipset = { .disp = tu102_disp_new, .dma = gv100_dma_new, .fifo = tu102_fifo_new, + .gr = tu102_gr_new, .nvdec[0] = gm107_nvdec_new, .nvenc[0] = gm107_nvenc_new, .sec2 = tu102_sec2_new, @@ -2645,6 +2646,7 @@ nv168_chipset = { .disp = tu102_disp_new, .dma = gv100_dma_new, .fifo = tu102_fifo_new, + .gr = tu102_gr_new, .nvdec[0] = gm107_nvdec_new, .nvenc[0] = gm107_nvenc_new, .sec2 = tu102_sec2_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c index 454668b1cf54..a9efa4d78be9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/gr/tu102.c @@ -164,6 +164,32 @@ MODULE_FIRMWARE("nvidia/tu106/gr/sw_nonctx.bin"); MODULE_FIRMWARE("nvidia/tu106/gr/sw_bundle_init.bin"); MODULE_FIRMWARE("nvidia/tu106/gr/sw_method_init.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/fecs_bl.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/fecs_inst.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/fecs_data.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/fecs_sig.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/gpccs_bl.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/gpccs_inst.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/gpccs_data.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/gpccs_sig.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/sw_ctx.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/sw_nonctx.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/sw_bundle_init.bin"); +MODULE_FIRMWARE("nvidia/tu117/gr/sw_method_init.bin"); + +MODULE_FIRMWARE("nvidia/tu116/gr/fecs_bl.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/fecs_inst.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/fecs_data.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/fecs_sig.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/gpccs_bl.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/gpccs_inst.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/gpccs_data.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/gpccs_sig.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/sw_ctx.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/sw_nonctx.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/sw_bundle_init.bin"); +MODULE_FIRMWARE("nvidia/tu116/gr/sw_method_init.bin"); + static const struct gf100_gr_fwif tu102_gr_fwif[] = { { 0, gm200_gr_load, &tu102_gr, &gp108_gr_fecs_acr, &gp108_gr_gpccs_acr }, -- cgit v1.2.3 From f287d3d19769b1d22cba4e51fa0487f2697713c9 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Wed, 12 Feb 2020 18:11:49 -0500 Subject: drm/nouveau/kms/gv100-: Re-set LUT after clearing for modesets While certain modeset operations on gv100+ need us to temporarily disable the LUT, we make the mistake of sometimes neglecting to reprogram the LUT after such modesets. In particular, moving a head from one encoder to another seems to trigger this quite often. GV100+ is very picky about having a LUT in most scenarios, so this causes the display engine to hang with the following error code: disp: chid 1 stat 00005080 reason 5 [INVALID_STATE] mthd 0200 data 00000001 code 0000002d) So, fix this by always re-programming the LUT if we're clearing it in a state where the wndw is still visible, and has a XLUT handle programmed. Signed-off-by: Lyude Paul Fixes: facaed62b4cb ("drm/nouveau/kms/gv100: initial support") Cc: # v4.18+ Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/wndw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index 890315291b01..bb737f9281e6 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -458,6 +458,8 @@ nv50_wndw_atomic_check(struct drm_plane *plane, struct drm_plane_state *state) asyw->clr.ntfy = armw->ntfy.handle != 0; asyw->clr.sema = armw->sema.handle != 0; asyw->clr.xlut = armw->xlut.handle != 0; + if (asyw->clr.xlut && asyw->visible) + asyw->set.xlut = asyw->xlut.handle != 0; asyw->clr.csc = armw->csc.valid; if (wndw->func->image_clr) asyw->clr.image = armw->image.handle[0] != 0; -- cgit v1.2.3 From bab5417f5f0118ce914bc5b2f8381e959e891155 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 14 Feb 2020 08:11:48 -0800 Subject: USB: misc: iowarrior: add support for the 100 device Add a new device id for the 100 devie. It has 4 interfaces like the 28 and 28L devices but a larger endpoint so more I/O pins. Cc: Christoph Jung Cc: stable Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20200214161148.GA3963518@kroah.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/iowarrior.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index d20b60acfe8a..dce20301e367 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -36,6 +36,7 @@ /* fuller speed iowarrior */ #define USB_DEVICE_ID_CODEMERCS_IOW28 0x1504 #define USB_DEVICE_ID_CODEMERCS_IOW28L 0x1505 +#define USB_DEVICE_ID_CODEMERCS_IOW100 0x1506 /* OEMed devices */ #define USB_DEVICE_ID_CODEMERCS_IOW24SAG 0x158a @@ -144,6 +145,7 @@ static const struct usb_device_id iowarrior_ids[] = { {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW56AM)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28)}, {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW28L)}, + {USB_DEVICE(USB_VENDOR_ID_CODEMERCS, USB_DEVICE_ID_CODEMERCS_IOW100)}, {} /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, iowarrior_ids); @@ -386,6 +388,7 @@ static ssize_t iowarrior_write(struct file *file, case USB_DEVICE_ID_CODEMERCS_IOW56AM: case USB_DEVICE_ID_CODEMERCS_IOW28: case USB_DEVICE_ID_CODEMERCS_IOW28L: + case USB_DEVICE_ID_CODEMERCS_IOW100: /* The IOW56 uses asynchronous IO and more urbs */ if (atomic_read(&dev->write_busy) == MAX_WRITES_IN_FLIGHT) { /* Wait until we are below the limit for submitted urbs */ @@ -786,7 +789,8 @@ static int iowarrior_probe(struct usb_interface *interface, if ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28) || - (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L)) { + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW100)) { res = usb_find_last_int_out_endpoint(iface_desc, &dev->int_out_endpoint); if (res) { @@ -802,7 +806,8 @@ static int iowarrior_probe(struct usb_interface *interface, ((dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56AM) || (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28) || - (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L))) + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW28L) || + (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW100))) /* IOWarrior56 has wMaxPacketSize different from report size */ dev->report_size = 7; -- cgit v1.2.3 From 6a757c07e51f80ac34325fcd558490d2d1439e1b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:07 +0100 Subject: netfilter: conntrack: allow insertion of clashing entries This patch further relaxes the need to drop an skb due to a clash with an existing conntrack entry. Current clash resolution handles the case where the clash occurs between two identical entries (distinct nf_conn objects with same tuples), i.e.: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 ... existing handling will discard the unconfirmed clashing entry and makes skb->_nfct point to the existing one. The skb can then be processed normally just as if the clash would not have existed in the first place. For other clashes, the skb needs to be dropped. This frequently happens with DNS resolvers that send A and AAAA queries back-to-back when NAT rules are present that cause packets to get different DNAT transformations applied, for example: -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.6:5353 -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.7:5353 In this case the A or AAAA query is dropped which incurs a costly delay during name resolution. This patch also allows this collision type: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.7:5353 In this case, clash is in original direction -- the reply direction is still unique. The change makes it so that when the 2nd colliding packet is received, the clashing conntrack is tagged with new IPS_NAT_CLASH_BIT, gets a fixed 1 second timeout and is inserted in the reply direction only. The entry is hidden from 'conntrack -L', it will time out quickly and it can be early dropped because it will never progress to the ASSURED state. To avoid special-casing the delete code path to special case the ORIGINAL hlist_nulls node, a new helper, "hlist_nulls_add_fake", is added so hlist_nulls_del() will work. Example: CPU A: CPU B: 1. 10.2.3.4:42 -> 10.8.8.8:53 (A) 2. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 3. Apply DNAT, reply changed to 10.0.0.6 4. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 5. Apply DNAT, reply changed to 10.0.0.7 6. confirm/commit to conntrack table, no collisions 7. commit clashing entry Reply comes in: 10.2.3.4:42 <- 10.0.0.6:5353 (A) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 10.2.3.4:42 <- 10.0.0.7:5353 (AAAA) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 The conntrack entry is deleted from table, as it has the NAT_CLASH bit set. In case of a retransmit from ORIGINAL dir, all further packets will get the DNAT transformation to 10.0.0.6. I tried to come up with other solutions but they all have worse problems. Alternatives considered were: 1. Confirm ct entries at allocation time, not in postrouting. a. will cause uneccesarry work when the skb that creates the conntrack is dropped by ruleset. b. in case nat is applied, ct entry would need to be moved in the table, which requires another spinlock pair to be taken. c. breaks the 'unconfirmed entry is private to cpu' assumption: we would need to guard all nfct->ext allocation requests with ct->lock spinlock. 2. Make the unconfirmed list a hash table instead of a pcpu list. Shares drawback c) of the first alternative. 3. Document this is expected and force users to rearrange their ruleset (e.g. by using "-m cluster" instead of "-m statistics"). nft has the 'jhash' expression which can be used instead of 'numgen'. Major drawback: doesn't fix what I consider a bug, not very realistic and I believe its reasonable to have the existing rulesets to 'just work'. 4. Document this is expected and force users to steer problematic packets to the same CPU -- this would serialize the "allocate new conntrack entry/nat table evaluation/perform nat/confirm entry", so no race can occur. Similar drawback to 3. Another advantage of this patch compared to 1) and 2) is that there are no changes to the hot path; things are handled in the udp tracker and the clash resolution path. Cc: rcu@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist_nulls.h | 7 ++ include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++- net/netfilter/nf_conntrack_core.c | 76 +++++++++++++++++++++- net/netfilter/nf_conntrack_proto_udp.c | 20 ++++-- 4 files changed, 108 insertions(+), 7 deletions(-) diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index e5b752027a03..9670b54b484a 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -145,6 +145,13 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, } } +/* after that hlist_nulls_del will work */ +static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) +{ + n->pprev = &n->next; + n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 336014bf8868..b6f0bb1dc799 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -97,6 +97,15 @@ enum ip_conntrack_status { IPS_UNTRACKED_BIT = 12, IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT), +#ifdef __KERNEL__ + /* Re-purposed for in-kernel use: + * Tags a conntrack entry that clashed with an existing entry + * on insert. + */ + IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT, + IPS_NAT_CLASH = IPS_UNTRACKED, +#endif + /* Conntrack got a helper explicitly attached via CT target. */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), @@ -110,7 +119,8 @@ enum ip_conntrack_status { */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | - IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD), + IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED | + IPS_OFFLOAD), __IPS_MAX_BIT = 15, }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3f069eb0f0fc..1927fc296f95 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -940,11 +940,71 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, return NF_DROP; } +/** + * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry + * + * @skb: skb that causes the collision + * @repl_idx: hash slot for reply direction + * + * Called when origin or reply direction had a clash. + * The skb can be handled without packet drop provided the reply direction + * is unique or there the existing entry has the identical tuple in both + * directions. + * + * Caller must hold conntrack table locks to prevent concurrent updates. + * + * Returns NF_DROP if the clash could not be handled. + */ +static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) +{ + struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); + const struct nf_conntrack_zone *zone; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct net *net; + + zone = nf_ct_zone(loser_ct); + net = nf_ct_net(loser_ct); + + /* Reply direction must never result in a clash, unless both origin + * and reply tuples are identical. + */ + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { + if (nf_ct_key_equal(h, + &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) + return __nf_ct_resolve_clash(skb, h); + } + + /* We want the clashing entry to go away real soon: 1 second timeout. */ + loser_ct->timeout = nfct_time_stamp + HZ; + + /* IPS_NAT_CLASH removes the entry automatically on the first + * reply. Also prevents UDP tracker from moving the entry to + * ASSURED state, i.e. the entry can always be evicted under + * pressure. + */ + loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; + + __nf_conntrack_insert_prepare(loser_ct); + + /* fake add for ORIGINAL dir: we want lookups to only find the entry + * already in the table. This also hides the clashing entry from + * ctnetlink iteration, i.e. conntrack -L won't show them. + */ + hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + + hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, + &nf_conntrack_hash[repl_idx]); + return NF_ACCEPT; +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table + * @hash_reply: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. @@ -963,10 +1023,18 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, * exactly the same, only the to-be-confirmed conntrack entry is discarded * and @skb is associated with the conntrack entry already in the table. * + * Failing that, the new, unconfirmed conntrack is still added to the table + * provided that the collision only occurs in the ORIGINAL direction. + * The new entry will be added after the existing one in the hash list, + * so packets in the ORIGINAL direction will continue to match the existing + * entry. The new entry will also have a fixed timeout so it expires -- + * due to the collision, it will not see bidirectional traffic. + * * Returns NF_DROP if the clash could not be resolved. */ static __cold noinline int -nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) +nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, + u32 reply_hash) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -987,6 +1055,10 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) if (ret == NF_ACCEPT) return ret; + ret = nf_ct_resolve_clash_harder(skb, reply_hash); + if (ret == NF_ACCEPT) + return ret; + drop: nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); @@ -1101,7 +1173,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - ret = nf_ct_resolve_clash(skb, h); + ret = nf_ct_resolve_clash(skb, h, reply_hash); dying: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 7365b43f8f98..760ca2422816 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -81,6 +81,18 @@ static bool udp_error(struct sk_buff *skb, return false; } +static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct, + struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + u32 extra_jiffies) +{ + if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY && + ct->status & IPS_NAT_CLASH)) + nf_ct_kill(ct); + else + nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies); +} + /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -116,8 +128,8 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -198,8 +210,8 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } -- cgit v1.2.3 From 5eee7c625d414fb62985439ed58ab755d8988c76 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 3 Feb 2020 08:10:29 -0800 Subject: watchdog: fix mtk_wdt.c RESET_CONTROLLER build error Fix build error when CONFIG_RESET_CONTROLLER is not set by selecting RESET_CONTROLLER. ld: drivers/watchdog/mtk_wdt.o: in function `mtk_wdt_probe': mtk_wdt.c:(.text+0x3ec): undefined reference to `devm_reset_controller_register' Signed-off-by: Randy Dunlap Fixes: c254e103082b74e ("watchdog: mtk_wdt: mt8183: Add reset controller") Cc: Wim Van Sebroeck Cc: Guenter Roeck Cc: Matthias Brugger Cc: linux-watchdog@vger.kernel.org Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/77c1e557-4941-3806-2933-6c3583576390@infradead.org Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index cec868f8db3f..c3c8e0786a99 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -841,6 +841,7 @@ config MEDIATEK_WATCHDOG tristate "Mediatek SoCs watchdog support" depends on ARCH_MEDIATEK || COMPILE_TEST select WATCHDOG_CORE + select RESET_CONTROLLER help Say Y here to include support for the watchdog timer in Mediatek SoCs. -- cgit v1.2.3 From e9a0e65eda3f78d0b04ec6136c591c000cbc3b76 Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Mon, 20 Jan 2020 10:17:29 +0100 Subject: watchdog: da9062: do not ping the hw during stop() The da9062 hw has a minimum ping cool down phase of at least 200ms. The driver takes that into account by setting the min_hw_heartbeat_ms to 300ms and the core guarantees that the hw limit is observed for the ping() calls. But the core can't guarantee the required minimum ping cool down phase if a stop() command is send immediately after the ping() command. So it is not allowed to ping the watchdog within the stop() command as the driver does. Remove the ping can be done without doubts because the watchdog gets disabled anyway and a (re)start resets the watchdog counter too. Signed-off-by: Marco Felsch Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20200120091729.16256-1-m.felsch@pengutronix.de [groeck: Updated description] Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/da9062_wdt.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/watchdog/da9062_wdt.c b/drivers/watchdog/da9062_wdt.c index 47eefe072b40..777d7eec7f2e 100644 --- a/drivers/watchdog/da9062_wdt.c +++ b/drivers/watchdog/da9062_wdt.c @@ -95,13 +95,6 @@ static int da9062_wdt_stop(struct watchdog_device *wdd) struct da9062_watchdog *wdt = watchdog_get_drvdata(wdd); int ret; - ret = da9062_reset_watchdog_timer(wdt); - if (ret) { - dev_err(wdt->hw->dev, "Failed to ping the watchdog (err = %d)\n", - ret); - return ret; - } - ret = regmap_update_bits(wdt->hw->regmap, DA9062AA_CONTROL_D, DA9062AA_TWDSCALE_MASK, -- cgit v1.2.3 From 8541673d2a5f2faccff345e35991e2f9887779ea Mon Sep 17 00:00:00 2001 From: Marco Felsch Date: Fri, 7 Feb 2020 08:15:18 +0100 Subject: watchdog: da9062: fix power management ops This fixes commit f6c98b08381c ("watchdog: da9062: add power management ops"). During discussion [1] we agreed that this should be configurable because it is a device quirk if we can't use the hw watchdog auto suspend function. [1] https://lore.kernel.org/linux-watchdog/20191128171931.22563-1-m.felsch@pengutronix.de/ Signed-off-by: Marco Felsch Fixes: f6c98b08381c ("watchdog: da9062: add power management ops") Reviewed-by: Guenter Roeck Reviewed-by: Adam Thomson Link: https://lore.kernel.org/r/20200207071518.5559-1-m.felsch@pengutronix.de Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/da9062_wdt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/watchdog/da9062_wdt.c b/drivers/watchdog/da9062_wdt.c index 777d7eec7f2e..0ad15d55071c 100644 --- a/drivers/watchdog/da9062_wdt.c +++ b/drivers/watchdog/da9062_wdt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,7 @@ static const unsigned int wdt_timeout[] = { 0, 2, 4, 8, 16, 32, 65, 131 }; struct da9062_watchdog { struct da9062 *hw; struct watchdog_device wdtdev; + bool use_sw_pm; }; static unsigned int da9062_wdt_timeout_to_sel(unsigned int secs) @@ -193,6 +195,8 @@ static int da9062_wdt_probe(struct platform_device *pdev) if (!wdt) return -ENOMEM; + wdt->use_sw_pm = device_property_present(dev, "dlg,use-sw-pm"); + wdt->hw = chip; wdt->wdtdev.info = &da9062_watchdog_info; @@ -219,6 +223,10 @@ static int da9062_wdt_probe(struct platform_device *pdev) static int __maybe_unused da9062_wdt_suspend(struct device *dev) { struct watchdog_device *wdd = dev_get_drvdata(dev); + struct da9062_watchdog *wdt = watchdog_get_drvdata(wdd); + + if (!wdt->use_sw_pm) + return 0; if (watchdog_active(wdd)) return da9062_wdt_stop(wdd); @@ -229,6 +237,10 @@ static int __maybe_unused da9062_wdt_suspend(struct device *dev) static int __maybe_unused da9062_wdt_resume(struct device *dev) { struct watchdog_device *wdd = dev_get_drvdata(dev); + struct da9062_watchdog *wdt = watchdog_get_drvdata(wdd); + + if (!wdt->use_sw_pm) + return 0; if (watchdog_active(wdd)) return da9062_wdt_start(wdd); -- cgit v1.2.3 From 44144c809e39d64ff9931c7e8956c42b2baa89e6 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 8 Feb 2020 05:08:03 -0800 Subject: watchdog: da9062: Add dependency on I2C Since commit 057b52b4b3d58 ("watchdog: da9062: make restart handler atomic safe"), the driver calls i2c functions directly. It now therefore depends on I2C. This is a hard dependency which overrides COMPILE_TEST. Reported-by: kbuild test robot Reported-by: Randy Dunlap Fixes: 057b52b4b3d58 ("watchdog: da9062: make restart handler atomic safe") Cc: Marco Felsch Cc: Adam Thomson Cc: Stefan Lengfeld Reviewed-by: Marco Felsch Acked-by: Randy Dunlap Acked-by: Geert Uytterhoeven Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index c3c8e0786a99..9ea2b43d4b01 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -207,6 +207,7 @@ config DA9063_WATCHDOG config DA9062_WATCHDOG tristate "Dialog DA9062/61 Watchdog" depends on MFD_DA9062 || COMPILE_TEST + depends on I2C select WATCHDOG_CORE help Support for the watchdog in the DA9062 and DA9061 PMICs. -- cgit v1.2.3 From 4aadf4b49ec7d80c5db842ca28479d07108c9484 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Mon, 17 Feb 2020 11:16:52 +0800 Subject: ASoC: hdmi-codec: set plugged_cb to NULL when component removing Sets plugged_cb to NULL when component removing to notify its consumers : no further plugged status report is required. Signed-off-by: Tzung-Bi Shih Link: https://lore.kernel.org/r/20200217105513.1.Icc323daaf71ad02f191fd8d91136b01b61eca5e3@changeid Signed-off-by: Mark Brown --- sound/soc/codecs/hdmi-codec.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sound/soc/codecs/hdmi-codec.c b/sound/soc/codecs/hdmi-codec.c index 543363102d03..bc2903d27e6e 100644 --- a/sound/soc/codecs/hdmi-codec.c +++ b/sound/soc/codecs/hdmi-codec.c @@ -779,7 +779,17 @@ static int hdmi_of_xlate_dai_id(struct snd_soc_component *component, return ret; } +static void hdmi_remove(struct snd_soc_component *component) +{ + struct hdmi_codec_priv *hcp = snd_soc_component_get_drvdata(component); + + if (hcp->hcd.ops->hook_plugged_cb) + hcp->hcd.ops->hook_plugged_cb(component->dev->parent, + hcp->hcd.data, NULL, NULL); +} + static const struct snd_soc_component_driver hdmi_driver = { + .remove = hdmi_remove, .dapm_widgets = hdmi_widgets, .num_dapm_widgets = ARRAY_SIZE(hdmi_widgets), .of_xlate_dai_id = hdmi_of_xlate_dai_id, -- cgit v1.2.3 From 3bc7b6c15fffdf3f818df31198c8c040ad8f7ea9 Mon Sep 17 00:00:00 2001 From: Ravulapati Vishnu vardhan rao Date: Mon, 17 Feb 2020 16:09:19 +0530 Subject: ASoC: amd: ACP needs to be powered off in BIOS. Removed this logic because It is BIOS which needs to power off the ACP power domian through ACP_PGFSM_CTRL register when you De-initialize ACP Engine. Signed-off-by: Ravulapati Vishnu vardhan rao Link: https://lore.kernel.org/r/1581935964-15059-1-git-send-email-Vishnuvardhanrao.Ravulapati@amd.com Signed-off-by: Mark Brown --- sound/soc/amd/raven/pci-acp3x.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/sound/soc/amd/raven/pci-acp3x.c b/sound/soc/amd/raven/pci-acp3x.c index 65330bb50e74..da60e2ec5535 100644 --- a/sound/soc/amd/raven/pci-acp3x.c +++ b/sound/soc/amd/raven/pci-acp3x.c @@ -45,23 +45,6 @@ static int acp3x_power_on(void __iomem *acp3x_base) return -ETIMEDOUT; } -static int acp3x_power_off(void __iomem *acp3x_base) -{ - u32 val; - int timeout; - - rv_writel(ACP_PGFSM_CNTL_POWER_OFF_MASK, - acp3x_base + mmACP_PGFSM_CONTROL); - timeout = 0; - while (++timeout < 500) { - val = rv_readl(acp3x_base + mmACP_PGFSM_STATUS); - if ((val & ACP_PGFSM_STATUS_MASK) == ACP_POWERED_OFF) - return 0; - udelay(1); - } - return -ETIMEDOUT; -} - static int acp3x_reset(void __iomem *acp3x_base) { u32 val; @@ -115,12 +98,6 @@ static int acp3x_deinit(void __iomem *acp3x_base) pr_err("ACP3x reset failed\n"); return ret; } - /* power off */ - ret = acp3x_power_off(acp3x_base); - if (ret) { - pr_err("ACP3x power off failed\n"); - return ret; - } return 0; } -- cgit v1.2.3 From a655e2b107d463ce2745188ce050d07daed09a71 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 17 Feb 2020 16:19:47 +0100 Subject: ALSA: hda/realtek - Apply quirk for MSI GP63, too The same quirk that was applied to MSI GL73 is needed for MSI GP63, too. Adding the entry with the SSID 1462:1228. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=206503 Cc: Link: https://lore.kernel.org/r/20200217151947.17528-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 6c8cb4ce517e..82485e06dde1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2447,6 +2447,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1071, 0x8258, "Evesham Voyaeger", ALC882_FIXUP_EAPD), SND_PCI_QUIRK(0x1458, 0xa002, "Gigabyte EP45-DS3/Z87X-UD3H", ALC889_FIXUP_FRONT_HP_NO_PRESENCE), SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), + SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1276, "MSI-GL73", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x7350, "MSI-7350", ALC889_FIXUP_CD), SND_PCI_QUIRK(0x1462, 0xda57, "MSI Z270-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), -- cgit v1.2.3 From 52e29e331070cd7d52a64cbf1b0958212a340e28 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 17 Jan 2020 09:02:20 -0500 Subject: btrfs: don't set path->leave_spinning for truncate The only time we actually leave the path spinning is if we're truncating a small amount and don't actually free an extent, which is not a common occurrence. We have to set the path blocking in order to add the delayed ref anyway, so the first extent we find we set the path to blocking and stay blocking for the duration of the operation. With the upcoming file extent map stuff there will be another case that we have to have the path blocking, so just swap to blocking always. Note: this patch also fixes a warning after 28553fa992cb ("Btrfs: fix race between shrinking truncate and fiemap") got merged that inserts extent locks around truncation so the path must not leave spinning locks after btrfs_search_slot. [70.794783] BUG: sleeping function called from invalid context at mm/slab.h:565 [70.794834] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1141, name: rsync [70.794863] 5 locks held by rsync/1141: [70.794876] #0: ffff888417b9c408 (sb_writers#17){.+.+}, at: mnt_want_write+0x20/0x50 [70.795030] #1: ffff888428de28e8 (&type->i_mutex_dir_key#13/1){+.+.}, at: lock_rename+0xf1/0x100 [70.795051] #2: ffff888417b9c608 (sb_internal#2){.+.+}, at: start_transaction+0x394/0x560 [70.795124] #3: ffff888403081768 (btrfs-fs-01){++++}, at: btrfs_try_tree_write_lock+0x2f/0x160 [70.795203] #4: ffff888403086568 (btrfs-fs-00){++++}, at: btrfs_try_tree_write_lock+0x2f/0x160 [70.795222] CPU: 5 PID: 1141 Comm: rsync Not tainted 5.6.0-rc2-backup+ #2 [70.795362] Call Trace: [70.795374] dump_stack+0x71/0xa0 [70.795445] ___might_sleep.part.96.cold.106+0xa6/0xb6 [70.795459] kmem_cache_alloc+0x1d3/0x290 [70.795471] alloc_extent_state+0x22/0x1c0 [70.795544] __clear_extent_bit+0x3ba/0x580 [70.795557] ? _raw_spin_unlock_irq+0x24/0x30 [70.795569] btrfs_truncate_inode_items+0x339/0xe50 [70.795647] btrfs_evict_inode+0x269/0x540 [70.795659] ? dput.part.38+0x29/0x460 [70.795671] evict+0xcd/0x190 [70.795682] __dentry_kill+0xd6/0x180 [70.795754] dput.part.38+0x2ad/0x460 [70.795765] do_renameat2+0x3cb/0x540 [70.795777] __x64_sys_rename+0x1c/0x20 Reported-by: Dave Jones Fixes: 28553fa992cb ("Btrfs: fix race between shrinking truncate and fiemap") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d26b4bfb2c6..36deef69f847 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4142,7 +4142,6 @@ search_again: goto out; } - path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; @@ -4294,7 +4293,6 @@ delete: root == fs_info->tree_root)) { struct btrfs_ref ref = { 0 }; - btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, -- cgit v1.2.3 From e20d3a055a457a10a4c748ce5b7c2ed3173a1324 Mon Sep 17 00:00:00 2001 From: Johannes Krude Date: Wed, 12 Feb 2020 20:32:27 +0100 Subject: bpf, offload: Replace bitwise AND by logical AND in bpf_prog_offload_info_fill This if guards whether user-space wants a copy of the offload-jited bytecode and whether this bytecode exists. By erroneously doing a bitwise AND instead of a logical AND on user- and kernel-space buffer-size can lead to no data being copied to user-space especially when user-space size is a power of two and bigger then the kernel-space buffer. Fixes: fcfb126defda ("bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info") Signed-off-by: Johannes Krude Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20200212193227.GA3769@phlox.h.transitiv.net --- kernel/bpf/offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c5dc6541ece..bd09290e3648 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { -- cgit v1.2.3 From 8b101a5e14f2161869636ff9cb4907b7749dc0c2 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 24 Jan 2020 08:48:55 +0300 Subject: s390/cio: cio_ignore_proc_seq_next should increase position index if seq_file .next fuction does not change position index, read after some lseek can generate unexpected output. Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283 Link: https://lore.kernel.org/r/d44c53a7-9bc1-15c7-6d4a-0c10cb9dffce@virtuozzo.com Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Averin Signed-off-by: Vasily Gorbik --- drivers/s390/cio/blacklist.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index da642e811f7f..4dd2eb634856 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -303,8 +303,10 @@ static void * cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset) { struct ccwdev_iter *iter; + loff_t p = *offset; - if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1)) + (*offset)++; + if (p >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1)) return NULL; iter = it; if (iter->devno == __MAX_SUBCHANNEL) { @@ -314,7 +316,6 @@ cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset) return NULL; } else iter->devno++; - (*offset)++; return iter; } -- cgit v1.2.3 From b16c3724dd717a354d0f0257fd647ef5518982aa Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 13 Feb 2020 11:13:13 -0500 Subject: s390/defconfig: enable CONFIG_PROTECTED_VIRTUALIZATION_GUEST The guest support for protected virtualization is default on most distributions. Also refresh defconfig and debug_defconfig. Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 28 +++++++++++++--------------- arch/s390/configs/defconfig | 11 +++++------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 2e60c80395ab..0c86ba19fa2b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -53,6 +53,7 @@ CONFIG_VFIO_AP=m CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m @@ -474,7 +475,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_EMULEX is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set @@ -684,7 +684,6 @@ CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m @@ -748,7 +747,6 @@ CONFIG_DEBUG_INFO_DWARF4=y CONFIG_GDB_SCRIPTS=y CONFIG_FRAME_WARN=1024 CONFIG_HEADERS_INSTALL=y -CONFIG_HEADERS_CHECK=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_PAGEALLOC=y @@ -772,9 +770,9 @@ CONFIG_DEBUG_MEMORY_INIT=y CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m CONFIG_DEBUG_PER_CPU_MAPS=y CONFIG_DEBUG_SHIRQ=y +CONFIG_PANIC_ON_OOPS=y CONFIG_DETECT_HUNG_TASK=y CONFIG_WQ_WATCHDOG=y -CONFIG_PANIC_ON_OOPS=y CONFIG_DEBUG_TIMEKEEPING=y CONFIG_PROVE_LOCKING=y CONFIG_LOCK_STAT=y @@ -783,9 +781,20 @@ CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_DEBUG_CREDENTIALS=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=300 +CONFIG_LATENCYTOP=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +CONFIG_IRQSOFF_TRACER=y +CONFIG_PREEMPT_TRACER=y +CONFIG_SCHED_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_HIST_TRIGGERS=y +CONFIG_S390_PTDUMP=y CONFIG_NOTIFIER_ERROR_INJECTION=m CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m CONFIG_FAULT_INJECTION=y @@ -796,15 +805,6 @@ CONFIG_FAIL_IO_TIMEOUT=y CONFIG_FAIL_FUTEX=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y -CONFIG_LATENCYTOP=y -CONFIG_IRQSOFF_TRACER=y -CONFIG_PREEMPT_TRACER=y -CONFIG_SCHED_TRACER=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_HIST_TRIGGERS=y CONFIG_LKDTM=m CONFIG_TEST_LIST_SORT=y CONFIG_TEST_SORT=y @@ -814,5 +814,3 @@ CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 25f799849582..6b27d861a9a3 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -53,6 +53,7 @@ CONFIG_VFIO_AP=m CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y +CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m @@ -470,7 +471,6 @@ CONFIG_NLMON=m # CONFIG_NET_VENDOR_EMULEX is not set # CONFIG_NET_VENDOR_EZCHIP is not set # CONFIG_NET_VENDOR_GOOGLE is not set -# CONFIG_NET_VENDOR_HP is not set # CONFIG_NET_VENDOR_HUAWEI is not set # CONFIG_NET_VENDOR_INTEL is not set # CONFIG_NET_VENDOR_MARVELL is not set @@ -677,7 +677,6 @@ CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_XXHASH=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD128=m CONFIG_CRYPTO_RMD160=m @@ -739,18 +738,18 @@ CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_PANIC_ON_OOPS=y +CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_RCU_TORTURE_TEST=m CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y -CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y +CONFIG_S390_PTDUMP=y CONFIG_LKDTM=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y CONFIG_TEST_BPF=m -CONFIG_BUG_ON_DATA_CORRUPTION=y -CONFIG_S390_PTDUMP=y -- cgit v1.2.3 From 0d730b57b95f7e703bb5789de91a7ece6d3a68ac Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Fri, 14 Feb 2020 21:51:33 +0800 Subject: s390/cio: use kobj_to_dev() API Use kobj_to_dev() API instead of container_of(). Reviewed-by: Cornelia Huck Signed-off-by: chenqiwu Signed-off-by: chenqiwu Message-Id: <1581688293-17283-1-git-send-email-qiwuchen55@gmail.com> Signed-off-by: Vasily Gorbik --- drivers/s390/cio/chp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 51038ec309c1..dfcbe54591fb 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -135,7 +135,7 @@ static ssize_t chp_measurement_chars_read(struct file *filp, struct channel_path *chp; struct device *device; - device = container_of(kobj, struct device, kobj); + device = kobj_to_dev(kobj); chp = to_channelpath(device); if (chp->cmg == -1) return 0; @@ -184,7 +184,7 @@ static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj, struct device *device; unsigned int size; - device = container_of(kobj, struct device, kobj); + device = kobj_to_dev(kobj); chp = to_channelpath(device); css = to_css(chp->dev.parent); -- cgit v1.2.3 From 05ccaca003e4f26aceb7f075c073a49159af6e9e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 12 Feb 2020 09:46:22 +0100 Subject: s390/pkey/zcrypt: spelling s/crytp/crypt/ Fix typos in a comments. Link: https://lkml.kernel.org/r/20200212084622.9219-1-geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Vasily Gorbik --- drivers/s390/crypto/zcrypt_ep11misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c index d4caf46ff9df..2afe2153b34e 100644 --- a/drivers/s390/crypto/zcrypt_ep11misc.c +++ b/drivers/s390/crypto/zcrypt_ep11misc.c @@ -887,7 +887,7 @@ static int ep11_unwrapkey(u16 card, u16 domain, /* empty pin tag */ *p++ = 0x04; *p++ = 0; - /* encrytped key value tag and bytes */ + /* encrypted key value tag and bytes */ p += asn1tag_write(p, 0x04, enckey, enckeysize); /* reply cprb and payload */ @@ -1095,7 +1095,7 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags, /* Step 1: generate AES 256 bit random kek key */ rc = ep11_genaeskey(card, domain, 256, - 0x00006c00, /* EN/DECRYTP, WRAP/UNWRAP */ + 0x00006c00, /* EN/DECRYPT, WRAP/UNWRAP */ kek, &keklen); if (rc) { DEBUG_ERR( -- cgit v1.2.3 From 380324734956c64cd060e1db4304f3117ac15809 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 13 Feb 2020 23:42:07 -0700 Subject: s390/mm: Explicitly compare PAGE_DEFAULT_KEY against zero in storage_key_init_range Clang warns: In file included from ../arch/s390/purgatory/purgatory.c:10: In file included from ../include/linux/kexec.h:18: In file included from ../include/linux/crash_core.h:6: In file included from ../include/linux/elfcore.h:5: In file included from ../include/linux/user.h:1: In file included from ../arch/s390/include/asm/user.h:11: ../arch/s390/include/asm/page.h:45:6: warning: converting the result of '<<' to a boolean always evaluates to false [-Wtautological-constant-compare] if (PAGE_DEFAULT_KEY) ^ ../arch/s390/include/asm/page.h:23:44: note: expanded from macro 'PAGE_DEFAULT_KEY' #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) ^ 1 warning generated. Explicitly compare this against zero to silence the warning as it is intended to be used in a boolean context. Fixes: de3fa841e429 ("s390/mm: fix compile for PAGE_DEFAULT_KEY != 0") Link: https://github.com/ClangBuiltLinux/linux/issues/860 Link: https://lkml.kernel.org/r/20200214064207.10381-1-natechancellor@gmail.com Acked-by: Christian Borntraeger Signed-off-by: Nathan Chancellor Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 85e944f04c70..1019efd85b9d 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -42,7 +42,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end); static inline void storage_key_init_range(unsigned long start, unsigned long end) { - if (PAGE_DEFAULT_KEY) + if (PAGE_DEFAULT_KEY != 0) __storage_key_init_range(start, end); } -- cgit v1.2.3 From 788d671517b5c81efbed9310ccbadb8cca86a08e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 8 Feb 2020 07:10:52 -0700 Subject: s390/kaslr: Fix casts in get_random Clang warns: ../arch/s390/boot/kaslr.c:78:25: warning: passing 'char *' to parameter of type 'const u8 *' (aka 'const unsigned char *') converts between pointers to integer types with different sign [-Wpointer-sign] (char *) entropy, (char *) entropy, ^~~~~~~~~~~~~~~~ ../arch/s390/include/asm/cpacf.h:280:28: note: passing argument to parameter 'src' here u8 *dest, const u8 *src, long src_len) ^ 2 warnings generated. Fix the cast to match what else is done in this function. Fixes: b2d24b97b2a9 ("s390/kernel: add support for kernel address space layout randomization (KASLR)") Link: https://github.com/ClangBuiltLinux/linux/issues/862 Link: https://lkml.kernel.org/r/20200208141052.48476-1-natechancellor@gmail.com Signed-off-by: Nathan Chancellor Signed-off-by: Vasily Gorbik --- arch/s390/boot/kaslr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c index 5d12352545c5..5591243d673e 100644 --- a/arch/s390/boot/kaslr.c +++ b/arch/s390/boot/kaslr.c @@ -75,7 +75,7 @@ static unsigned long get_random(unsigned long limit) *(unsigned long *) prng.parm_block ^= seed; for (i = 0; i < 16; i++) { cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, - (char *) entropy, (char *) entropy, + (u8 *) entropy, (u8 *) entropy, sizeof(entropy)); memcpy(prng.parm_block, entropy, sizeof(entropy)); } -- cgit v1.2.3 From 94e90f727f7424d827256023cace829cad6896f4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 16 Feb 2020 23:48:29 +0900 Subject: s390: make 'install' not depend on vmlinux For the same reason as commit 19514fc665ff ("arm, kbuild: make "make install" not depend on vmlinux"), the install targets should never trigger the rebuild of the kernel. The variable, CONFIGURE, is not set by anyone. Remove it as well. Link: https://lkml.kernel.org/r/20200216144829.27023-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Vasily Gorbik --- arch/s390/Makefile | 2 +- arch/s390/boot/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e0e3a465bbfd..8dfa2cf1f05c 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -146,7 +146,7 @@ all: bzImage #KBUILD_IMAGE is necessary for packaging targets like rpm-pkg, deb-pkg... KBUILD_IMAGE := $(boot)/bzImage -install: vmlinux +install: $(Q)$(MAKE) $(build)=$(boot) $@ bzImage: vmlinux diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index e2c47d3a1c89..0ff9261c915e 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -70,7 +70,7 @@ $(obj)/compressed/vmlinux: $(obj)/startup.a FORCE $(obj)/startup.a: $(OBJECTS) FORCE $(call if_changed,ar) -install: $(CONFIGURE) $(obj)/bzImage +install: sh -x $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ System.map "$(INSTALL_PATH)" -- cgit v1.2.3 From dea8d5ce46d7e7f7270b9804df7d1174f88bfd99 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 30 Jan 2020 16:45:53 +0000 Subject: drm/i915/gem: Require per-engine reset support for non-persistent contexts To enable non-persistent contexts, we require a means of cancelling any inflight work from that context. This is first done "gracefully" by using preemption to kick the active context off the engine, and then forcefully by resetting the engine if it is active. If we are unable to reset the engine to remove hostile userspace, we should not allow userspace to opt into using non-persistent contexts. If the per-engine reset fails, we still do a full GPU reset, but that is rare and usually indicative of much deeper issues. The damage is already done. However, the goal of the interface to allow long running compute jobs without causing collateral damage elsewhere, and if we are unable to support that we should make that known by not providing the interface (and falsely pretending we can). Fixes: a0e047156cde ("drm/i915/gem: Make context persistence optional") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Jon Bloomfield Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200130164553.1937718-1-chris@chris-wilson.co.uk (cherry picked from commit d1b9b5f127bc3797fc274cfa4f363e039f045c3a) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a2e57e62af30..151a1e8ae36a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -565,6 +565,22 @@ static int __context_set_persistence(struct i915_gem_context *ctx, bool state) if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION)) return -ENODEV; + /* + * If the cancel fails, we then need to reset, cleanly! + * + * If the per-engine reset fails, all hope is lost! We resort + * to a full GPU reset in that unlikely case, but realistically + * if the engine could not reset, the full reset does not fare + * much better. The damage has been done. + * + * However, if we cannot reset an engine by itself, we cannot + * cleanup a hanging persistent context without causing + * colateral damage, and we should not pretend we can by + * exposing the interface. + */ + if (!intel_has_reset_engine(&ctx->i915->gt)) + return -ENODEV; + i915_gem_context_clear_persistence(ctx); } -- cgit v1.2.3 From c01e8da2cdb99303547d25b3dbffa3afec56738a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 3 Feb 2020 09:41:48 +0000 Subject: drm/i915: Initialise basic fence before acquiring seqno Inside the intel_timeline_get_seqno(), we currently track the retirement of the old cachelines by listening to the new request. This requires that the new request is ready to be used and so requires a minimum bit of initialisation prior to getting the new seqno. Fixes: b1e3177bd1d8 ("drm/i915: Coordinate i915_active with its own mutex") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Matthew Auld Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200203094152.4150550-2-chris@chris-wilson.co.uk (cherry picked from commit 855e39e65cfc33a73724f1cc644ffc5754864a20) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_request.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 78a5f5d3c070..f56b046a32de 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -595,6 +595,8 @@ static void __i915_request_ctor(void *arg) i915_sw_fence_init(&rq->submit, submit_notify); i915_sw_fence_init(&rq->semaphore, semaphore_notify); + dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0); + rq->file_priv = NULL; rq->capture_list = NULL; @@ -653,25 +655,30 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) } } - ret = intel_timeline_get_seqno(tl, rq, &seqno); - if (ret) - goto err_free; - rq->i915 = ce->engine->i915; rq->context = ce; rq->engine = ce->engine; rq->ring = ce->ring; rq->execution_mask = ce->engine->mask; + kref_init(&rq->fence.refcount); + rq->fence.flags = 0; + rq->fence.error = 0; + INIT_LIST_HEAD(&rq->fence.cb_list); + + ret = intel_timeline_get_seqno(tl, rq, &seqno); + if (ret) + goto err_free; + + rq->fence.context = tl->fence_context; + rq->fence.seqno = seqno; + RCU_INIT_POINTER(rq->timeline, tl); RCU_INIT_POINTER(rq->hwsp_cacheline, tl->hwsp_cacheline); rq->hwsp_seqno = tl->hwsp_seqno; rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */ - dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, - tl->fence_context, seqno); - /* We bump the ref for the fence chain */ i915_sw_fence_reinit(&i915_request_get(rq)->submit); i915_sw_fence_reinit(&i915_request_get(rq)->semaphore); -- cgit v1.2.3 From faaca0a0d48e7b122f6e7e2521f4f6fc487d0451 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 4 Feb 2020 14:16:27 +0200 Subject: tpm: Revert tpm_tis_spi_mod.ko to tpm_tis_spi.ko. Revert tpm_tis_spi_mod.ko back to tpm_tis_spi.ko as the rename could break user space scripts. This can be achieved by renaming tpm_tis_spi.c as tpm_tis_spi_main.c. Then tpm_tis_spi-y can be used inside the makefile. Cc: Andrey Pronin Cc: Stephen Boyd Cc: stable@vger.kernel.org # 5.5.x Fixes: 797c0113c9a4 ("tpm: tpm_tis_spi: Support cr50 devices") Reported-by: Alexander Steffen Tested-by: Alexander Steffen Reviewed-by: Stephen Boyd Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/Makefile | 8 +- drivers/char/tpm/tpm_tis_spi.c | 298 ------------------------------------ drivers/char/tpm/tpm_tis_spi_main.c | 298 ++++++++++++++++++++++++++++++++++++ 3 files changed, 303 insertions(+), 301 deletions(-) delete mode 100644 drivers/char/tpm/tpm_tis_spi.c create mode 100644 drivers/char/tpm/tpm_tis_spi_main.c diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index 5a0d99d4fec0..9567e5197f74 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -21,9 +21,11 @@ tpm-$(CONFIG_EFI) += eventlog/efi.o tpm-$(CONFIG_OF) += eventlog/of.o obj-$(CONFIG_TCG_TIS_CORE) += tpm_tis_core.o obj-$(CONFIG_TCG_TIS) += tpm_tis.o -obj-$(CONFIG_TCG_TIS_SPI) += tpm_tis_spi_mod.o -tpm_tis_spi_mod-y := tpm_tis_spi.o -tpm_tis_spi_mod-$(CONFIG_TCG_TIS_SPI_CR50) += tpm_tis_spi_cr50.o + +obj-$(CONFIG_TCG_TIS_SPI) += tpm_tis_spi.o +tpm_tis_spi-y := tpm_tis_spi_main.o +tpm_tis_spi-$(CONFIG_TCG_TIS_SPI_CR50) += tpm_tis_spi_cr50.o + obj-$(CONFIG_TCG_TIS_I2C_ATMEL) += tpm_i2c_atmel.o obj-$(CONFIG_TCG_TIS_I2C_INFINEON) += tpm_i2c_infineon.o obj-$(CONFIG_TCG_TIS_I2C_NUVOTON) += tpm_i2c_nuvoton.o diff --git a/drivers/char/tpm/tpm_tis_spi.c b/drivers/char/tpm/tpm_tis_spi.c deleted file mode 100644 index d1754fd6c573..000000000000 --- a/drivers/char/tpm/tpm_tis_spi.c +++ /dev/null @@ -1,298 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015 Infineon Technologies AG - * Copyright (C) 2016 STMicroelectronics SAS - * - * Authors: - * Peter Huewe - * Christophe Ricard - * - * Maintained by: - * - * Device driver for TCG/TCPA TPM (trusted platform module). - * Specifications at www.trustedcomputinggroup.org - * - * This device driver implements the TPM interface as defined in - * the TCG TPM Interface Spec version 1.3, revision 27 via _raw/native - * SPI access_. - * - * It is based on the original tpm_tis device driver from Leendert van - * Dorn and Kyleen Hall and Jarko Sakkinnen. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "tpm.h" -#include "tpm_tis_core.h" -#include "tpm_tis_spi.h" - -#define MAX_SPI_FRAMESIZE 64 - -/* - * TCG SPI flow control is documented in section 6.4 of the spec[1]. In short, - * keep trying to read from the device until MISO goes high indicating the - * wait state has ended. - * - * [1] https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/ - */ -static int tpm_tis_spi_flow_control(struct tpm_tis_spi_phy *phy, - struct spi_transfer *spi_xfer) -{ - struct spi_message m; - int ret, i; - - if ((phy->iobuf[3] & 0x01) == 0) { - // handle SPI wait states - phy->iobuf[0] = 0; - - for (i = 0; i < TPM_RETRY; i++) { - spi_xfer->len = 1; - spi_message_init(&m); - spi_message_add_tail(spi_xfer, &m); - ret = spi_sync_locked(phy->spi_device, &m); - if (ret < 0) - return ret; - if (phy->iobuf[0] & 0x01) - break; - } - - if (i == TPM_RETRY) - return -ETIMEDOUT; - } - - return 0; -} - -int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, - u8 *in, const u8 *out) -{ - struct tpm_tis_spi_phy *phy = to_tpm_tis_spi_phy(data); - int ret = 0; - struct spi_message m; - struct spi_transfer spi_xfer; - u8 transfer_len; - - spi_bus_lock(phy->spi_device->master); - - while (len) { - transfer_len = min_t(u16, len, MAX_SPI_FRAMESIZE); - - phy->iobuf[0] = (in ? 0x80 : 0) | (transfer_len - 1); - phy->iobuf[1] = 0xd4; - phy->iobuf[2] = addr >> 8; - phy->iobuf[3] = addr; - - memset(&spi_xfer, 0, sizeof(spi_xfer)); - spi_xfer.tx_buf = phy->iobuf; - spi_xfer.rx_buf = phy->iobuf; - spi_xfer.len = 4; - spi_xfer.cs_change = 1; - - spi_message_init(&m); - spi_message_add_tail(&spi_xfer, &m); - ret = spi_sync_locked(phy->spi_device, &m); - if (ret < 0) - goto exit; - - ret = phy->flow_control(phy, &spi_xfer); - if (ret < 0) - goto exit; - - spi_xfer.cs_change = 0; - spi_xfer.len = transfer_len; - spi_xfer.delay_usecs = 5; - - if (in) { - spi_xfer.tx_buf = NULL; - } else if (out) { - spi_xfer.rx_buf = NULL; - memcpy(phy->iobuf, out, transfer_len); - out += transfer_len; - } - - spi_message_init(&m); - spi_message_add_tail(&spi_xfer, &m); - reinit_completion(&phy->ready); - ret = spi_sync_locked(phy->spi_device, &m); - if (ret < 0) - goto exit; - - if (in) { - memcpy(in, phy->iobuf, transfer_len); - in += transfer_len; - } - - len -= transfer_len; - } - -exit: - spi_bus_unlock(phy->spi_device->master); - return ret; -} - -static int tpm_tis_spi_read_bytes(struct tpm_tis_data *data, u32 addr, - u16 len, u8 *result) -{ - return tpm_tis_spi_transfer(data, addr, len, result, NULL); -} - -static int tpm_tis_spi_write_bytes(struct tpm_tis_data *data, u32 addr, - u16 len, const u8 *value) -{ - return tpm_tis_spi_transfer(data, addr, len, NULL, value); -} - -int tpm_tis_spi_read16(struct tpm_tis_data *data, u32 addr, u16 *result) -{ - __le16 result_le; - int rc; - - rc = data->phy_ops->read_bytes(data, addr, sizeof(u16), - (u8 *)&result_le); - if (!rc) - *result = le16_to_cpu(result_le); - - return rc; -} - -int tpm_tis_spi_read32(struct tpm_tis_data *data, u32 addr, u32 *result) -{ - __le32 result_le; - int rc; - - rc = data->phy_ops->read_bytes(data, addr, sizeof(u32), - (u8 *)&result_le); - if (!rc) - *result = le32_to_cpu(result_le); - - return rc; -} - -int tpm_tis_spi_write32(struct tpm_tis_data *data, u32 addr, u32 value) -{ - __le32 value_le; - int rc; - - value_le = cpu_to_le32(value); - rc = data->phy_ops->write_bytes(data, addr, sizeof(u32), - (u8 *)&value_le); - - return rc; -} - -int tpm_tis_spi_init(struct spi_device *spi, struct tpm_tis_spi_phy *phy, - int irq, const struct tpm_tis_phy_ops *phy_ops) -{ - phy->iobuf = devm_kmalloc(&spi->dev, MAX_SPI_FRAMESIZE, GFP_KERNEL); - if (!phy->iobuf) - return -ENOMEM; - - phy->spi_device = spi; - - return tpm_tis_core_init(&spi->dev, &phy->priv, irq, phy_ops, NULL); -} - -static const struct tpm_tis_phy_ops tpm_spi_phy_ops = { - .read_bytes = tpm_tis_spi_read_bytes, - .write_bytes = tpm_tis_spi_write_bytes, - .read16 = tpm_tis_spi_read16, - .read32 = tpm_tis_spi_read32, - .write32 = tpm_tis_spi_write32, -}; - -static int tpm_tis_spi_probe(struct spi_device *dev) -{ - struct tpm_tis_spi_phy *phy; - int irq; - - phy = devm_kzalloc(&dev->dev, sizeof(struct tpm_tis_spi_phy), - GFP_KERNEL); - if (!phy) - return -ENOMEM; - - phy->flow_control = tpm_tis_spi_flow_control; - - /* If the SPI device has an IRQ then use that */ - if (dev->irq > 0) - irq = dev->irq; - else - irq = -1; - - init_completion(&phy->ready); - return tpm_tis_spi_init(dev, phy, irq, &tpm_spi_phy_ops); -} - -typedef int (*tpm_tis_spi_probe_func)(struct spi_device *); - -static int tpm_tis_spi_driver_probe(struct spi_device *spi) -{ - const struct spi_device_id *spi_dev_id = spi_get_device_id(spi); - tpm_tis_spi_probe_func probe_func; - - probe_func = of_device_get_match_data(&spi->dev); - if (!probe_func && spi_dev_id) - probe_func = (tpm_tis_spi_probe_func)spi_dev_id->driver_data; - if (!probe_func) - return -ENODEV; - - return probe_func(spi); -} - -static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_spi_resume); - -static int tpm_tis_spi_remove(struct spi_device *dev) -{ - struct tpm_chip *chip = spi_get_drvdata(dev); - - tpm_chip_unregister(chip); - tpm_tis_remove(chip); - return 0; -} - -static const struct spi_device_id tpm_tis_spi_id[] = { - { "tpm_tis_spi", (unsigned long)tpm_tis_spi_probe }, - { "cr50", (unsigned long)cr50_spi_probe }, - {} -}; -MODULE_DEVICE_TABLE(spi, tpm_tis_spi_id); - -static const struct of_device_id of_tis_spi_match[] = { - { .compatible = "st,st33htpm-spi", .data = tpm_tis_spi_probe }, - { .compatible = "infineon,slb9670", .data = tpm_tis_spi_probe }, - { .compatible = "tcg,tpm_tis-spi", .data = tpm_tis_spi_probe }, - { .compatible = "google,cr50", .data = cr50_spi_probe }, - {} -}; -MODULE_DEVICE_TABLE(of, of_tis_spi_match); - -static const struct acpi_device_id acpi_tis_spi_match[] = { - {"SMO0768", 0}, - {} -}; -MODULE_DEVICE_TABLE(acpi, acpi_tis_spi_match); - -static struct spi_driver tpm_tis_spi_driver = { - .driver = { - .name = "tpm_tis_spi", - .pm = &tpm_tis_pm, - .of_match_table = of_match_ptr(of_tis_spi_match), - .acpi_match_table = ACPI_PTR(acpi_tis_spi_match), - }, - .probe = tpm_tis_spi_driver_probe, - .remove = tpm_tis_spi_remove, - .id_table = tpm_tis_spi_id, -}; -module_spi_driver(tpm_tis_spi_driver); - -MODULE_DESCRIPTION("TPM Driver for native SPI access"); -MODULE_LICENSE("GPL"); diff --git a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c new file mode 100644 index 000000000000..d1754fd6c573 --- /dev/null +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Infineon Technologies AG + * Copyright (C) 2016 STMicroelectronics SAS + * + * Authors: + * Peter Huewe + * Christophe Ricard + * + * Maintained by: + * + * Device driver for TCG/TCPA TPM (trusted platform module). + * Specifications at www.trustedcomputinggroup.org + * + * This device driver implements the TPM interface as defined in + * the TCG TPM Interface Spec version 1.3, revision 27 via _raw/native + * SPI access_. + * + * It is based on the original tpm_tis device driver from Leendert van + * Dorn and Kyleen Hall and Jarko Sakkinnen. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "tpm.h" +#include "tpm_tis_core.h" +#include "tpm_tis_spi.h" + +#define MAX_SPI_FRAMESIZE 64 + +/* + * TCG SPI flow control is documented in section 6.4 of the spec[1]. In short, + * keep trying to read from the device until MISO goes high indicating the + * wait state has ended. + * + * [1] https://trustedcomputinggroup.org/resource/pc-client-platform-tpm-profile-ptp-specification/ + */ +static int tpm_tis_spi_flow_control(struct tpm_tis_spi_phy *phy, + struct spi_transfer *spi_xfer) +{ + struct spi_message m; + int ret, i; + + if ((phy->iobuf[3] & 0x01) == 0) { + // handle SPI wait states + phy->iobuf[0] = 0; + + for (i = 0; i < TPM_RETRY; i++) { + spi_xfer->len = 1; + spi_message_init(&m); + spi_message_add_tail(spi_xfer, &m); + ret = spi_sync_locked(phy->spi_device, &m); + if (ret < 0) + return ret; + if (phy->iobuf[0] & 0x01) + break; + } + + if (i == TPM_RETRY) + return -ETIMEDOUT; + } + + return 0; +} + +int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, + u8 *in, const u8 *out) +{ + struct tpm_tis_spi_phy *phy = to_tpm_tis_spi_phy(data); + int ret = 0; + struct spi_message m; + struct spi_transfer spi_xfer; + u8 transfer_len; + + spi_bus_lock(phy->spi_device->master); + + while (len) { + transfer_len = min_t(u16, len, MAX_SPI_FRAMESIZE); + + phy->iobuf[0] = (in ? 0x80 : 0) | (transfer_len - 1); + phy->iobuf[1] = 0xd4; + phy->iobuf[2] = addr >> 8; + phy->iobuf[3] = addr; + + memset(&spi_xfer, 0, sizeof(spi_xfer)); + spi_xfer.tx_buf = phy->iobuf; + spi_xfer.rx_buf = phy->iobuf; + spi_xfer.len = 4; + spi_xfer.cs_change = 1; + + spi_message_init(&m); + spi_message_add_tail(&spi_xfer, &m); + ret = spi_sync_locked(phy->spi_device, &m); + if (ret < 0) + goto exit; + + ret = phy->flow_control(phy, &spi_xfer); + if (ret < 0) + goto exit; + + spi_xfer.cs_change = 0; + spi_xfer.len = transfer_len; + spi_xfer.delay_usecs = 5; + + if (in) { + spi_xfer.tx_buf = NULL; + } else if (out) { + spi_xfer.rx_buf = NULL; + memcpy(phy->iobuf, out, transfer_len); + out += transfer_len; + } + + spi_message_init(&m); + spi_message_add_tail(&spi_xfer, &m); + reinit_completion(&phy->ready); + ret = spi_sync_locked(phy->spi_device, &m); + if (ret < 0) + goto exit; + + if (in) { + memcpy(in, phy->iobuf, transfer_len); + in += transfer_len; + } + + len -= transfer_len; + } + +exit: + spi_bus_unlock(phy->spi_device->master); + return ret; +} + +static int tpm_tis_spi_read_bytes(struct tpm_tis_data *data, u32 addr, + u16 len, u8 *result) +{ + return tpm_tis_spi_transfer(data, addr, len, result, NULL); +} + +static int tpm_tis_spi_write_bytes(struct tpm_tis_data *data, u32 addr, + u16 len, const u8 *value) +{ + return tpm_tis_spi_transfer(data, addr, len, NULL, value); +} + +int tpm_tis_spi_read16(struct tpm_tis_data *data, u32 addr, u16 *result) +{ + __le16 result_le; + int rc; + + rc = data->phy_ops->read_bytes(data, addr, sizeof(u16), + (u8 *)&result_le); + if (!rc) + *result = le16_to_cpu(result_le); + + return rc; +} + +int tpm_tis_spi_read32(struct tpm_tis_data *data, u32 addr, u32 *result) +{ + __le32 result_le; + int rc; + + rc = data->phy_ops->read_bytes(data, addr, sizeof(u32), + (u8 *)&result_le); + if (!rc) + *result = le32_to_cpu(result_le); + + return rc; +} + +int tpm_tis_spi_write32(struct tpm_tis_data *data, u32 addr, u32 value) +{ + __le32 value_le; + int rc; + + value_le = cpu_to_le32(value); + rc = data->phy_ops->write_bytes(data, addr, sizeof(u32), + (u8 *)&value_le); + + return rc; +} + +int tpm_tis_spi_init(struct spi_device *spi, struct tpm_tis_spi_phy *phy, + int irq, const struct tpm_tis_phy_ops *phy_ops) +{ + phy->iobuf = devm_kmalloc(&spi->dev, MAX_SPI_FRAMESIZE, GFP_KERNEL); + if (!phy->iobuf) + return -ENOMEM; + + phy->spi_device = spi; + + return tpm_tis_core_init(&spi->dev, &phy->priv, irq, phy_ops, NULL); +} + +static const struct tpm_tis_phy_ops tpm_spi_phy_ops = { + .read_bytes = tpm_tis_spi_read_bytes, + .write_bytes = tpm_tis_spi_write_bytes, + .read16 = tpm_tis_spi_read16, + .read32 = tpm_tis_spi_read32, + .write32 = tpm_tis_spi_write32, +}; + +static int tpm_tis_spi_probe(struct spi_device *dev) +{ + struct tpm_tis_spi_phy *phy; + int irq; + + phy = devm_kzalloc(&dev->dev, sizeof(struct tpm_tis_spi_phy), + GFP_KERNEL); + if (!phy) + return -ENOMEM; + + phy->flow_control = tpm_tis_spi_flow_control; + + /* If the SPI device has an IRQ then use that */ + if (dev->irq > 0) + irq = dev->irq; + else + irq = -1; + + init_completion(&phy->ready); + return tpm_tis_spi_init(dev, phy, irq, &tpm_spi_phy_ops); +} + +typedef int (*tpm_tis_spi_probe_func)(struct spi_device *); + +static int tpm_tis_spi_driver_probe(struct spi_device *spi) +{ + const struct spi_device_id *spi_dev_id = spi_get_device_id(spi); + tpm_tis_spi_probe_func probe_func; + + probe_func = of_device_get_match_data(&spi->dev); + if (!probe_func && spi_dev_id) + probe_func = (tpm_tis_spi_probe_func)spi_dev_id->driver_data; + if (!probe_func) + return -ENODEV; + + return probe_func(spi); +} + +static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_spi_resume); + +static int tpm_tis_spi_remove(struct spi_device *dev) +{ + struct tpm_chip *chip = spi_get_drvdata(dev); + + tpm_chip_unregister(chip); + tpm_tis_remove(chip); + return 0; +} + +static const struct spi_device_id tpm_tis_spi_id[] = { + { "tpm_tis_spi", (unsigned long)tpm_tis_spi_probe }, + { "cr50", (unsigned long)cr50_spi_probe }, + {} +}; +MODULE_DEVICE_TABLE(spi, tpm_tis_spi_id); + +static const struct of_device_id of_tis_spi_match[] = { + { .compatible = "st,st33htpm-spi", .data = tpm_tis_spi_probe }, + { .compatible = "infineon,slb9670", .data = tpm_tis_spi_probe }, + { .compatible = "tcg,tpm_tis-spi", .data = tpm_tis_spi_probe }, + { .compatible = "google,cr50", .data = cr50_spi_probe }, + {} +}; +MODULE_DEVICE_TABLE(of, of_tis_spi_match); + +static const struct acpi_device_id acpi_tis_spi_match[] = { + {"SMO0768", 0}, + {} +}; +MODULE_DEVICE_TABLE(acpi, acpi_tis_spi_match); + +static struct spi_driver tpm_tis_spi_driver = { + .driver = { + .name = "tpm_tis_spi", + .pm = &tpm_tis_pm, + .of_match_table = of_match_ptr(of_tis_spi_match), + .acpi_match_table = ACPI_PTR(acpi_tis_spi_match), + }, + .probe = tpm_tis_spi_driver_probe, + .remove = tpm_tis_spi_remove, + .id_table = tpm_tis_spi_id, +}; +module_spi_driver(tpm_tis_spi_driver); + +MODULE_DESCRIPTION("TPM Driver for native SPI access"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From dc10e4181c05a2315ddc375e963b7c763b5ee0df Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Mon, 10 Feb 2020 11:00:41 +0100 Subject: tpm: Initialize crypto_id of allocated_banks to HASH_ALGO__LAST chip->allocated_banks, an array of tpm_bank_info structures, contains the list of TPM algorithm IDs of allocated PCR banks. It also contains the corresponding ID of the crypto subsystem, so that users of the TPM driver can calculate a digest for a PCR extend operation. However, if there is no mapping between TPM algorithm ID and crypto ID, the crypto_id field of tpm_bank_info remains set to zero (the array is allocated and initialized with kcalloc() in tpm2_get_pcr_allocation()). Zero should not be used as value for unknown mappings, as it is a valid crypto ID (HASH_ALGO_MD4). Thus, initialize crypto_id to HASH_ALGO__LAST. Cc: stable@vger.kernel.org # 5.1.x Fixes: 879b589210a9 ("tpm: retrieve digest size of unknown algorithms with PCR read") Signed-off-by: Roberto Sassu Reviewed-by: Petr Vorel Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-cmd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 13696deceae8..760329598b99 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -525,6 +525,8 @@ static int tpm2_init_bank_info(struct tpm_chip *chip, u32 bank_index) return 0; } + bank->crypto_id = HASH_ALGO__LAST; + return tpm2_pcr_read(chip, 0, &digest, &bank->digest_size); } -- cgit v1.2.3 From 96228b7df33f8eb9006f8ae96949400aed9bd303 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 12 Feb 2020 18:04:33 +0200 Subject: MAINTAINERS: Update drm/i915 bug filing URL We've moved from bugzilla to gitlab. Cc: stable@vger.kernel.org Reviewed-by: Chris Wilson Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200212160434.6437-1-jani.nikula@intel.com (cherry picked from commit 3a6a4f0810c8ade6f1ff63c34aa9834176b9d88b) Signed-off-by: Jani Nikula --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a0d86490c2c6..19dd0d4ffdcc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8392,7 +8392,7 @@ M: Joonas Lahtinen M: Rodrigo Vivi L: intel-gfx@lists.freedesktop.org W: https://01.org/linuxgraphics/ -B: https://01.org/linuxgraphics/documentation/how-report-bugs +B: https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs C: irc://chat.freenode.net/intel-gfx Q: http://patchwork.freedesktop.org/project/intel-gfx/ T: git git://anongit.freedesktop.org/drm-intel -- cgit v1.2.3 From 7ddc7005a0aa2f43a826b71f5d6bd7d4b90f8f2a Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 12 Feb 2020 18:04:34 +0200 Subject: drm/i915: Update drm/i915 bug filing URL We've moved from bugzilla to gitlab. Cc: stable@vger.kernel.org Reviewed-by: Chris Wilson Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200212160434.6437-2-jani.nikula@intel.com (cherry picked from commit ddae4d7af0bbe3b2051f1603459a8b24e9a19324) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/Kconfig | 5 ++--- drivers/gpu/drm/i915/i915_gpu_error.c | 3 ++- drivers/gpu/drm/i915/i915_utils.c | 5 ++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index ba9595960bbe..907c4471f591 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -75,9 +75,8 @@ config DRM_I915_CAPTURE_ERROR help This option enables capturing the GPU state when a hang is detected. This information is vital for triaging hangs and assists in debugging. - Please report any hang to - https://bugs.freedesktop.org/enter_bug.cgi?product=DRI - for triaging. + Please report any hang for triaging according to: + https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs If in doubt, say "Y". diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 594341e27a47..9e401a5fcae8 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1852,7 +1852,8 @@ void i915_error_state_store(struct i915_gpu_coredump *error) if (!xchg(&warned, true) && ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); - pr_info("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); + pr_info("Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/intel/issues/new.\n"); + pr_info("Please see https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs for details.\n"); pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n"); pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n", diff --git a/drivers/gpu/drm/i915/i915_utils.c b/drivers/gpu/drm/i915/i915_utils.c index c47261ae86ea..632d6953c78d 100644 --- a/drivers/gpu/drm/i915/i915_utils.c +++ b/drivers/gpu/drm/i915/i915_utils.c @@ -8,9 +8,8 @@ #include "i915_drv.h" #include "i915_utils.h" -#define FDO_BUG_URL "https://bugs.freedesktop.org/enter_bug.cgi?product=DRI" -#define FDO_BUG_MSG "Please file a bug at " FDO_BUG_URL " against DRM/Intel " \ - "providing the dmesg log by booting with drm.debug=0xf" +#define FDO_BUG_URL "https://gitlab.freedesktop.org/drm/intel/-/wikis/How-to-file-i915-bugs" +#define FDO_BUG_MSG "Please file a bug on drm/i915; see " FDO_BUG_URL " for details." void __i915_printk(struct drm_i915_private *dev_priv, const char *level, -- cgit v1.2.3 From 58e9121c32a245fab47f29ab4ad29dd62470a7e8 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Thu, 6 Feb 2020 16:14:16 -0800 Subject: drm/i915/ehl: Update port clock voltage level requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Voltage level depends not only on the cdclk, but also on the DDI clock. Last time the bspec voltage level table for EHL was updated, we only updated the cdclk requirements, but forgot to account for the new port clock criteria. Bspec: 21809 Fixes: d147483884ed ("drm/i915/ehl: Update voltage level checks") Cc: José Roberto de Souza Signed-off-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20200207001417.1229251-1-matthew.d.roper@intel.com Reviewed-by: José Roberto de Souza (cherry picked from commit 9d5fd37ed7e26efdbe90f492d7eb8b53dcdb61d6) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_ddi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 33f1dc3d7c1a..d9a61f341070 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -4251,7 +4251,9 @@ static bool intel_ddi_is_audio_enabled(struct drm_i915_private *dev_priv, void intel_ddi_compute_min_voltage_level(struct drm_i915_private *dev_priv, struct intel_crtc_state *crtc_state) { - if (INTEL_GEN(dev_priv) >= 11 && crtc_state->port_clock > 594000) + if (IS_ELKHARTLAKE(dev_priv) && crtc_state->port_clock > 594000) + crtc_state->min_voltage_level = 3; + else if (INTEL_GEN(dev_priv) >= 11 && crtc_state->port_clock > 594000) crtc_state->min_voltage_level = 1; else if (IS_CANNONLAKE(dev_priv) && crtc_state->port_clock > 594000) crtc_state->min_voltage_level = 2; -- cgit v1.2.3 From 2e0a576511f656933adfe56ef03b9cf3e64b21b7 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 13 Feb 2020 16:04:11 +0200 Subject: drm/i915/dsc: force full modeset whenever DSC is enabled at probe We lack full state readout of DSC config, which may lead to DSC enable using a config that's all zeros, failing spectacularly. Force full modeset and thus compute config at probe to get a sane state, until we implement DSC state readout. Any fastset that did appear to work with DSC at probe, worked by coincidence. [1] is an example of a change that triggered the issue on TGL DSI DSC. [1] http://patchwork.freedesktop.org/patch/msgid/20200212150102.7600-1-ville.syrjala@linux.intel.com Cc: Manasi Navare Cc: Vandita Kulkarni Cc: Ville Syrjala Cc: stable@vger.kernel.org Fixes: fbacb15ea814 ("drm/i915/dsc: add basic hardware state readout support") Acked-by: Matt Roper Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200213140412.32697-3-stanislav.lisovskiy@intel.com (cherry picked from commit a4277aa398d76db109d6b8420934f68daf69a6c3) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 064dd99bbc49..e68ec25fc97c 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -17433,6 +17433,24 @@ retry: * have readout for pipe gamma enable. */ crtc_state->uapi.color_mgmt_changed = true; + + /* + * FIXME hack to force full modeset when DSC is being + * used. + * + * As long as we do not have full state readout and + * config comparison of crtc_state->dsc, we have no way + * to ensure reliable fastset. Remove once we have + * readout for DSC. + */ + if (crtc_state->dsc.compression_enable) { + ret = drm_atomic_add_affected_connectors(state, + &crtc->base); + if (ret) + goto out; + crtc_state->uapi.mode_changed = true; + drm_dbg_kms(dev, "Force full modeset for DSC\n"); + } } } -- cgit v1.2.3 From e543e370ec3160c06c2cd897477150dfb23f1afd Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 6 Feb 2020 20:49:12 +0000 Subject: drm/i915/gt: Prevent queuing retire workers on the virtual engine Virtual engines are fleeting. They carry a reference count and may be freed when their last request is retired. This makes them unsuitable for the task of housing engine->retire.work so assert that it is not used. Tvrtko tracked down an instance where we did indeed violate this rule. In virtual_submit_request, we flush a completed request directly with __i915_request_submit and this causes us to queue that request on the veng's breadcrumb list and signal it. Leading us down a path where we should not attach the retire. Reported-by: Tvrtko Ursulin Fixes: dc93c9b69315 ("drm/i915/gt: Schedule request retirement when signaler idles") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200206204915.2636606-1-chris@chris-wilson.co.uk (cherry picked from commit f91d8156ab8afb32447cd2bf3189219bab943f18) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 3 +++ drivers/gpu/drm/i915/gt/intel_gt_requests.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index 0ba524a414c6..cbad7fe722ce 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -136,6 +136,9 @@ static void add_retire(struct intel_breadcrumbs *b, struct intel_timeline *tl) struct intel_engine_cs *engine = container_of(b, struct intel_engine_cs, breadcrumbs); + if (unlikely(intel_engine_is_virtual(engine))) + engine = intel_virtual_engine_get_sibling(engine, 0); + intel_engine_add_retire(engine, tl); } diff --git a/drivers/gpu/drm/i915/gt/intel_gt_requests.c b/drivers/gpu/drm/i915/gt/intel_gt_requests.c index 7ef1d37970f6..8a5054f21bf8 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_requests.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_requests.c @@ -99,6 +99,9 @@ static bool add_retire(struct intel_engine_cs *engine, void intel_engine_add_retire(struct intel_engine_cs *engine, struct intel_timeline *tl) { + /* We don't deal well with the engine disappearing beneath us */ + GEM_BUG_ON(intel_engine_is_virtual(engine)); + if (add_retire(engine, tl)) schedule_work(&engine->retire_work); } -- cgit v1.2.3 From 19b5f3b419a61808ff2713f1f30b8a88fe14ac9b Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 6 Feb 2020 20:49:13 +0000 Subject: drm/i915/gt: Protect defer_request() from new waiters Mika spotted <4>[17436.705441] general protection fault: 0000 [#1] PREEMPT SMP PTI <4>[17436.705447] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.5.0+ #1 <4>[17436.705449] Hardware name: System manufacturer System Product Name/Z170M-PLUS, BIOS 3805 05/16/2018 <4>[17436.705512] RIP: 0010:__execlists_submission_tasklet+0xc4d/0x16e0 [i915] <4>[17436.705516] Code: c5 4c 8d 60 e0 75 17 e9 8c 07 00 00 49 8b 44 24 20 49 39 c5 4c 8d 60 e0 0f 84 7a 07 00 00 49 8b 5c 24 08 49 8b 87 80 00 00 00 <48> 39 83 d8 fe ff ff 75 d9 48 8b 83 88 fe ff ff a8 01 0f 84 b6 05 <4>[17436.705518] RSP: 0018:ffffc9000012ce80 EFLAGS: 00010083 <4>[17436.705521] RAX: ffff88822ae42000 RBX: 5a5a5a5a5a5a5a5a RCX: dead000000000122 <4>[17436.705523] RDX: ffff88822ae42588 RSI: ffff8881e32a7908 RDI: ffff8881c429fd48 <4>[17436.705525] RBP: ffffc9000012cf00 R08: ffff88822ae42588 R09: 00000000fffffffe <4>[17436.705527] R10: ffff8881c429fb80 R11: 00000000a677cf08 R12: ffff8881c42a0aa8 <4>[17436.705529] R13: ffff8881c429fd38 R14: ffff88822ae42588 R15: ffff8881c429fb80 <4>[17436.705532] FS: 0000000000000000(0000) GS:ffff88822ed00000(0000) knlGS:0000000000000000 <4>[17436.705534] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4>[17436.705536] CR2: 00007f858c76d000 CR3: 0000000005610003 CR4: 00000000003606e0 <4>[17436.705538] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 <4>[17436.705540] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 <4>[17436.705542] Call Trace: <4>[17436.705545] <4>[17436.705603] execlists_submission_tasklet+0xc0/0x130 [i915] which is us consuming a partially initialised new waiter in defer_requests(). We can prevent this by initialising the i915_dependency prior to making it visible, and since we are using a concurrent list_add/iterator mark them up to the compiler. Fixes: 8ee36e048c98 ("drm/i915/execlists: Minimalistic timeslicing") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200206204915.2636606-2-chris@chris-wilson.co.uk (cherry picked from commit f14f27b1663269a81ed62d3961fe70250a1a0623) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 7 ++++++- drivers/gpu/drm/i915/i915_scheduler.c | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index a13a8c4b65ab..0a8a2c8026f1 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1605,6 +1605,11 @@ last_active(const struct intel_engine_execlists *execlists) return *last; } +#define for_each_waiter(p__, rq__) \ + list_for_each_entry_lockless(p__, \ + &(rq__)->sched.waiters_list, \ + wait_link) + static void defer_request(struct i915_request *rq, struct list_head * const pl) { LIST_HEAD(list); @@ -1622,7 +1627,7 @@ static void defer_request(struct i915_request *rq, struct list_head * const pl) GEM_BUG_ON(i915_request_is_active(rq)); list_move_tail(&rq->sched.link, pl); - list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { + for_each_waiter(p, rq) { struct i915_request *w = container_of(p->waiter, typeof(*w), sched); diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c index 5d96cfba40f8..34b654b4e58a 100644 --- a/drivers/gpu/drm/i915/i915_scheduler.c +++ b/drivers/gpu/drm/i915/i915_scheduler.c @@ -423,8 +423,6 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node, if (!node_signaled(signal)) { INIT_LIST_HEAD(&dep->dfs_link); - list_add(&dep->wait_link, &signal->waiters_list); - list_add(&dep->signal_link, &node->signalers_list); dep->signaler = signal; dep->waiter = node; dep->flags = flags; @@ -434,6 +432,10 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node, !node_started(signal)) node->flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN; + /* All set, now publish. Beware the lockless walkers. */ + list_add(&dep->signal_link, &node->signalers_list); + list_add_rcu(&dep->wait_link, &signal->waiters_list); + /* * As we do not allow WAIT to preempt inflight requests, * once we have executed a request, along with triggering -- cgit v1.2.3 From 96781fd941b39e1f78098009344ebcd7af861c67 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Mon, 17 Feb 2020 00:42:22 -0600 Subject: ASoC: sun8i-codec: Fix setting DAI data format Use the correct mask for this two-bit field. This fixes setting the DAI data format to RIGHT_J or DSP_A. Fixes: 36c684936fae ("ASoC: Add sun8i digital audio codec") Signed-off-by: Samuel Holland Acked-by: Chen-Yu Tsai Cc: stable@kernel.org Link: https://lore.kernel.org/r/20200217064250.15516-7-samuel@sholland.org Signed-off-by: Mark Brown --- sound/soc/sunxi/sun8i-codec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/sunxi/sun8i-codec.c b/sound/soc/sunxi/sun8i-codec.c index 55798bc8eae2..686561df8e13 100644 --- a/sound/soc/sunxi/sun8i-codec.c +++ b/sound/soc/sunxi/sun8i-codec.c @@ -80,6 +80,7 @@ #define SUN8I_SYS_SR_CTRL_AIF1_FS_MASK GENMASK(15, 12) #define SUN8I_SYS_SR_CTRL_AIF2_FS_MASK GENMASK(11, 8) +#define SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT_MASK GENMASK(3, 2) #define SUN8I_AIF1CLK_CTRL_AIF1_WORD_SIZ_MASK GENMASK(5, 4) #define SUN8I_AIF1CLK_CTRL_AIF1_LRCK_DIV_MASK GENMASK(8, 6) #define SUN8I_AIF1CLK_CTRL_AIF1_BCLK_DIV_MASK GENMASK(12, 9) @@ -241,7 +242,7 @@ static int sun8i_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) return -EINVAL; } regmap_update_bits(scodec->regmap, SUN8I_AIF1CLK_CTRL, - BIT(SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT), + SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT_MASK, value << SUN8I_AIF1CLK_CTRL_AIF1_DATA_FMT); return 0; -- cgit v1.2.3 From a81541041ceb55bcec9a8bb8ad3482263f0a205a Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 17 Feb 2020 09:31:33 +0100 Subject: net: mscc: fix in frame extraction Each extracted frame on Ocelot has an IFH. The frame and IFH are extracted by reading chuncks of 4 bytes from a register. In case the IFH and frames were read corretly it would try to read the next frame. In case there are no more frames in the queue, it checks if there were any previous errors and in that case clear the queue. But this check will always succeed also when there are no errors. Because when extracting the IFH the error is checked against 4(number of bytes read) and then the error is set only if the extraction of the frame failed. So in a happy case where there are no errors the err variable is still 4. So it could be a case where after the check that there are no more frames in the queue, a frame will arrive in the queue but because the error is not reseted, it would try to flush the queue. So the frame will be lost. The fix consist in resetting the error after reading the IFH. Signed-off-by: Horatiu Vultur Acked-by: Alexandre Belloni Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_board.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index b38820849faa..1135a18019c7 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -114,6 +114,14 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) if (err != 4) break; + /* At this point the IFH was read correctly, so it is safe to + * presume that there is no error. The err needs to be reset + * otherwise a frame could come in CPU queue between the while + * condition and the check for error later on. And in that case + * the new frame is just removed and not processed. + */ + err = 0; + ocelot_parse_ifh(ifh, &info); ocelot_port = ocelot->ports[info.port]; -- cgit v1.2.3 From 0a923a76d615b4f54456ce6c52865bb0d5b5516d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 16 Feb 2020 20:44:09 -0800 Subject: Documentation/hwmon: fix xdpe12284 Sphinx warnings Fix Sphinx format warnings by adding a blank line. Documentation/hwmon/xdpe12284.rst:28: WARNING: Unexpected indentation. Documentation/hwmon/xdpe12284.rst:29: WARNING: Block quote ends without a blank line; unexpected unindent. Signed-off-by: Randy Dunlap Cc: Jean Delvare Cc: Guenter Roeck Cc: linux-hwmon@vger.kernel.org Cc: Vadim Pasternak Link: https://lore.kernel.org/r/0094c570-dd4c-dc0e-386d-ea1c39b6a582@infradead.org Signed-off-by: Guenter Roeck --- Documentation/hwmon/xdpe12284.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/hwmon/xdpe12284.rst b/Documentation/hwmon/xdpe12284.rst index 6b7ae98cc536..67d1f87808e5 100644 --- a/Documentation/hwmon/xdpe12284.rst +++ b/Documentation/hwmon/xdpe12284.rst @@ -24,6 +24,7 @@ This driver implements support for Infineon Multi-phase XDPE122 family dual loop voltage regulators. The family includes XDPE12284 and XDPE12254 devices. The devices from this family complaint with: + - Intel VR13 and VR13HC rev 1.3, IMVP8 rev 1.2 and IMPVP9 rev 1.3 DC-DC converter specification. - Intel SVID rev 1.9. protocol. -- cgit v1.2.3 From 8a9093c79863b58cc2f9874d7ae788f0d622a596 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 17 Feb 2020 15:38:09 -0500 Subject: net: sched: correct flower port blocking tc flower rules that are based on src or dst port blocking are sometimes ineffective due to uninitialized stack data. __skb_flow_dissect() extracts ports from the skb for tc flower to match against. However, the port dissection is not done when when the FLOW_DIS_IS_FRAGMENT bit is set in key_control->flags. All callers of __skb_flow_dissect(), zero-out the key_control field except for fl_classify() as used by the flower classifier. Thus, the FLOW_DIS_IS_FRAGMENT may be set on entry to __skb_flow_dissect(), since key_control is allocated on the stack and may not be initialized. Since key_basic and key_control are present for all flow keys, let's make sure they are initialized. Fixes: 62230715fd24 ("flow_dissector: do not dissect l4 ports for fragments") Co-developed-by: Eric Dumazet Signed-off-by: Eric Dumazet Acked-by: Cong Wang Signed-off-by: Jason Baron Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 9 +++++++++ net/sched/cls_flower.c | 1 + 2 files changed, 10 insertions(+) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index e9391e877f9a..628383915827 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -5,6 +5,7 @@ #include #include #include +#include #include struct sk_buff; @@ -348,4 +349,12 @@ struct bpf_flow_dissector { void *data_end; }; +static inline void +flow_dissector_init_keys(struct flow_dissector_key_control *key_control, + struct flow_dissector_key_basic *key_basic) +{ + memset(key_control, 0, sizeof(*key_control)); + memset(key_basic, 0, sizeof(*key_basic)); +} + #endif diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 7e54d2ab5254..d32d4233d337 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -305,6 +305,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; list_for_each_entry_rcu(mask, &head->masks, list) { + flow_dissector_init_keys(&skb_key.control, &skb_key.basic); fl_clear_masked_range(&skb_key, mask); skb_flow_dissect_meta(skb, &mask->dissector, &skb_key); -- cgit v1.2.3 From 245709ec8be89af46ea7ef0444c9c80913999d99 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 18 Feb 2020 12:07:53 +0800 Subject: sctp: move the format error check out of __sctp_sf_do_9_1_abort When T2 timer is to be stopped, the asoc should also be deleted, otherwise, there will be no chance to call sctp_association_free and the asoc could last in memory forever. However, in sctp_sf_shutdown_sent_abort(), after adding the cmd SCTP_CMD_TIMER_STOP for T2 timer, it may return error due to the format error from __sctp_sf_do_9_1_abort() and miss adding SCTP_CMD_ASSOC_FAILED where the asoc will be deleted. This patch is to fix it by moving the format error check out of __sctp_sf_do_9_1_abort(), and do it before adding the cmd SCTP_CMD_TIMER_STOP for T2 timer. Thanks Hangbin for reporting this issue by the fuzz testing. v1->v2: - improve the comment in the code as Marcelo's suggestion. Fixes: 96ca468b86b0 ("sctp: check invalid value of length parameter in error cause") Reported-by: Hangbin Liu Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 748e3b19ec1d..6a16af4b1ef6 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -170,6 +170,16 @@ static inline bool sctp_chunk_length_valid(struct sctp_chunk *chunk, return true; } +/* Check for format error in an ABORT chunk */ +static inline bool sctp_err_chunk_valid(struct sctp_chunk *chunk) +{ + struct sctp_errhdr *err; + + sctp_walk_errors(err, chunk->chunk_hdr); + + return (void *)err == (void *)chunk->chunk_end; +} + /********************************************************** * These are the state functions for handling chunk events. **********************************************************/ @@ -2255,6 +2265,9 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort( sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands); } @@ -2298,6 +2311,9 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort( sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Stop the T2-shutdown timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); @@ -2565,6 +2581,9 @@ enum sctp_disposition sctp_sf_do_9_1_abort( sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands); } @@ -2582,16 +2601,8 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort( /* See if we have an error cause code in the chunk. */ len = ntohs(chunk->chunk_hdr->length); - if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) { - struct sctp_errhdr *err; - - sctp_walk_errors(err, chunk->chunk_hdr); - if ((void *)err != (void *)chunk->chunk_end) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, - commands); - + if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) error = ((struct sctp_errhdr *)chunk->skb->data)->cause; - } sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET)); /* ASSOC_FAILED will DELETE_TCB. */ -- cgit v1.2.3 From 82969e6ef0430a31d58342009e64c7634512eb53 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 14 Feb 2020 15:32:24 +0100 Subject: net: cnic: fix spelling mistake "reserverd" -> "reserved" The reserved member should be named reserved3. Signed-off-by: Alexandre Belloni Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/cnic_defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/cnic_defs.h b/drivers/net/ethernet/broadcom/cnic_defs.h index b38499774071..99e2c6d4d8c3 100644 --- a/drivers/net/ethernet/broadcom/cnic_defs.h +++ b/drivers/net/ethernet/broadcom/cnic_defs.h @@ -543,13 +543,13 @@ struct l4_kwq_update_pg { #define L4_KWQ_UPDATE_PG_RESERVERD2_SHIFT 2 #endif #if defined(__BIG_ENDIAN) - u16 reserverd3; + u16 reserved3; u8 da0; u8 da1; #elif defined(__LITTLE_ENDIAN) u8 da1; u8 da0; - u16 reserverd3; + u16 reserved3; #endif #if defined(__BIG_ENDIAN) u8 da2; -- cgit v1.2.3 From 9b64208f74fbd0e920475ecfe9326f8443fdc3a5 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 17 Feb 2020 11:43:15 +0800 Subject: selftests: forwarding: vxlan_bridge_1d: use more proper tos value 0x11 and 0x12 set the ECN bits based on RFC2474, it would be better to avoid that. 0x14 and 0x18 would be better and works as well. Reported-by: Petr Machata Fixes: 4e867c9a50ff ("selftests: forwarding: vxlan_bridge_1d: fix tos value") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index 353613fc1947..ce6bea9675c0 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -516,9 +516,9 @@ test_tos() RET=0 tc filter add dev v1 egress pref 77 prot ip \ - flower ip_tos 0x11 action pass - vxlan_ping_test $h1 192.0.2.3 "-Q 0x11" v1 egress 77 10 - vxlan_ping_test $h1 192.0.2.3 "-Q 0x12" v1 egress 77 0 + flower ip_tos 0x14 action pass + vxlan_ping_test $h1 192.0.2.3 "-Q 0x14" v1 egress 77 10 + vxlan_ping_test $h1 192.0.2.3 "-Q 0x18" v1 egress 77 0 tc filter del dev v1 egress pref 77 prot ip log_test "VXLAN: envelope TOS inheritance" -- cgit v1.2.3 From aa3146193ae25d0fe4b96d815169a135db2e8f01 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sun, 2 Feb 2020 15:39:34 +0000 Subject: drm/i915: Wean off drm_pci_alloc/drm_pci_free drm_pci_alloc and drm_pci_free are just very thin wrappers around dma_alloc_coherent, with a note that we should be removing them. Furthermore since commit de09d31dd38a50fdce106c15abd68432eebbd014 Author: Kirill A. Shutemov Date: Fri Jan 15 16:51:42 2016 -0800 page-flags: define PG_reserved behavior on compound pages As far as I can see there's no users of PG_reserved on compound pages. Let's use PF_NO_COMPOUND here. drm_pci_alloc has been declared broken since it mixes GFP_COMP and SetPageReserved. Avoid this conflict by weaning ourselves off using the abstraction and using the dma functions directly. Reported-by: Taketo Kabe Closes: https://gitlab.freedesktop.org/drm/intel/issues/1027 Fixes: de09d31dd38a ("page-flags: define PG_reserved behavior on compound pages") Signed-off-by: Chris Wilson Cc: # v4.5+ Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200202153934.3899472-1-chris@chris-wilson.co.uk (cherry picked from commit c6790dc22312f592c1434577258b31c48c72d52a) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 3 - drivers/gpu/drm/i915/gem/i915_gem_phys.c | 98 ++++++++++++------------ drivers/gpu/drm/i915/i915_gem.c | 8 +- 4 files changed, 55 insertions(+), 56 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index e68ec25fc97c..aa453953908b 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -11087,7 +11087,7 @@ static u32 intel_cursor_base(const struct intel_plane_state *plane_state) u32 base; if (INTEL_INFO(dev_priv)->display.cursor_needs_physical) - base = obj->phys_handle->busaddr; + base = sg_dma_address(obj->mm.pages->sgl); else base = intel_plane_ggtt_offset(plane_state); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index f64ad77e6b1e..c2174da35bb0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -285,9 +285,6 @@ struct drm_i915_gem_object { void *gvt_info; }; - - /** for phys allocated objects */ - struct drm_dma_handle *phys_handle; }; static inline struct drm_i915_gem_object * diff --git a/drivers/gpu/drm/i915/gem/i915_gem_phys.c b/drivers/gpu/drm/i915/gem/i915_gem_phys.c index b1b7c1b3038a..b07bb40edd5a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_phys.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_phys.c @@ -22,88 +22,87 @@ static int i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj) { struct address_space *mapping = obj->base.filp->f_mapping; - struct drm_dma_handle *phys; - struct sg_table *st; struct scatterlist *sg; - char *vaddr; + struct sg_table *st; + dma_addr_t dma; + void *vaddr; + void *dst; int i; - int err; if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj))) return -EINVAL; - /* Always aligning to the object size, allows a single allocation + /* + * Always aligning to the object size, allows a single allocation * to handle all possible callers, and given typical object sizes, * the alignment of the buddy allocation will naturally match. */ - phys = drm_pci_alloc(obj->base.dev, - roundup_pow_of_two(obj->base.size), - roundup_pow_of_two(obj->base.size)); - if (!phys) + vaddr = dma_alloc_coherent(&obj->base.dev->pdev->dev, + roundup_pow_of_two(obj->base.size), + &dma, GFP_KERNEL); + if (!vaddr) return -ENOMEM; - vaddr = phys->vaddr; + st = kmalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto err_pci; + + if (sg_alloc_table(st, 1, GFP_KERNEL)) + goto err_st; + + sg = st->sgl; + sg->offset = 0; + sg->length = obj->base.size; + + sg_assign_page(sg, (struct page *)vaddr); + sg_dma_address(sg) = dma; + sg_dma_len(sg) = obj->base.size; + + dst = vaddr; for (i = 0; i < obj->base.size / PAGE_SIZE; i++) { struct page *page; - char *src; + void *src; page = shmem_read_mapping_page(mapping, i); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto err_phys; - } + if (IS_ERR(page)) + goto err_st; src = kmap_atomic(page); - memcpy(vaddr, src, PAGE_SIZE); - drm_clflush_virt_range(vaddr, PAGE_SIZE); + memcpy(dst, src, PAGE_SIZE); + drm_clflush_virt_range(dst, PAGE_SIZE); kunmap_atomic(src); put_page(page); - vaddr += PAGE_SIZE; + dst += PAGE_SIZE; } intel_gt_chipset_flush(&to_i915(obj->base.dev)->gt); - st = kmalloc(sizeof(*st), GFP_KERNEL); - if (!st) { - err = -ENOMEM; - goto err_phys; - } - - if (sg_alloc_table(st, 1, GFP_KERNEL)) { - kfree(st); - err = -ENOMEM; - goto err_phys; - } - - sg = st->sgl; - sg->offset = 0; - sg->length = obj->base.size; - - sg_dma_address(sg) = phys->busaddr; - sg_dma_len(sg) = obj->base.size; - - obj->phys_handle = phys; - __i915_gem_object_set_pages(obj, st, sg->length); return 0; -err_phys: - drm_pci_free(obj->base.dev, phys); - - return err; +err_st: + kfree(st); +err_pci: + dma_free_coherent(&obj->base.dev->pdev->dev, + roundup_pow_of_two(obj->base.size), + vaddr, dma); + return -ENOMEM; } static void i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj, struct sg_table *pages) { + dma_addr_t dma = sg_dma_address(pages->sgl); + void *vaddr = sg_page(pages->sgl); + __i915_gem_object_release_shmem(obj, pages, false); if (obj->mm.dirty) { struct address_space *mapping = obj->base.filp->f_mapping; - char *vaddr = obj->phys_handle->vaddr; + void *src = vaddr; int i; for (i = 0; i < obj->base.size / PAGE_SIZE; i++) { @@ -115,15 +114,16 @@ i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj, continue; dst = kmap_atomic(page); - drm_clflush_virt_range(vaddr, PAGE_SIZE); - memcpy(dst, vaddr, PAGE_SIZE); + drm_clflush_virt_range(src, PAGE_SIZE); + memcpy(dst, src, PAGE_SIZE); kunmap_atomic(dst); set_page_dirty(page); if (obj->mm.madv == I915_MADV_WILLNEED) mark_page_accessed(page); put_page(page); - vaddr += PAGE_SIZE; + + src += PAGE_SIZE; } obj->mm.dirty = false; } @@ -131,7 +131,9 @@ i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj, sg_free_table(pages); kfree(pages); - drm_pci_free(obj->base.dev, obj->phys_handle); + dma_free_coherent(&obj->base.dev->pdev->dev, + roundup_pow_of_two(obj->base.size), + vaddr, dma); } static void phys_release(struct drm_i915_gem_object *obj) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index c2de2f45b459..5f6e63952821 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -180,7 +180,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, struct drm_i915_gem_pwrite *args, struct drm_file *file) { - void *vaddr = obj->phys_handle->vaddr + args->offset; + void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset; char __user *user_data = u64_to_user_ptr(args->data_ptr); /* @@ -844,10 +844,10 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, ret = i915_gem_gtt_pwrite_fast(obj, args); if (ret == -EFAULT || ret == -ENOSPC) { - if (obj->phys_handle) - ret = i915_gem_phys_pwrite(obj, args, file); - else + if (i915_gem_object_has_struct_page(obj)) ret = i915_gem_shmem_pwrite(obj, args); + else + ret = i915_gem_phys_pwrite(obj, args, file); } i915_gem_object_unpin_pages(obj); -- cgit v1.2.3 From b1339ecac661e1cf3e1dc78ac56bff3aeeaeb92c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 7 Feb 2020 21:14:52 +0000 Subject: drm/i915/execlists: Always force a context reload when rewinding RING_TAIL If we rewind the RING_TAIL on a context, due to a preemption event, we must force the context restore for the RING_TAIL update to be properly handled. Rather than note which preemption events may cause us to rewind the tail, compare the new request's tail with the previously submitted RING_TAIL, as it turns out that timeslicing was causing unexpected rewinds. -0 0d.s2 1280851190us : __execlists_submission_tasklet: 0000:00:02.0 rcs0: expired last=130:4698, prio=3, hint=3 -0 0d.s2 1280851192us : __i915_request_unsubmit: 0000:00:02.0 rcs0: fence 66:119966, current 119964 -0 0d.s2 1280851195us : __i915_request_unsubmit: 0000:00:02.0 rcs0: fence 130:4698, current 4695 -0 0d.s2 1280851198us : __i915_request_unsubmit: 0000:00:02.0 rcs0: fence 130:4696, current 4695 ^---- Note we unwind 2 requests from the same context -0 0d.s2 1280851208us : __i915_request_submit: 0000:00:02.0 rcs0: fence 130:4696, current 4695 -0 0d.s2 1280851213us : __i915_request_submit: 0000:00:02.0 rcs0: fence 134:1508, current 1506 ^---- But to apply the new timeslice, we have to replay the first request before the new client can start -- the unexpected RING_TAIL rewind -0 0d.s2 1280851219us : trace_ports: 0000:00:02.0 rcs0: submit { 130:4696*, 134:1508 } synmark2-5425 2..s. 1280851239us : process_csb: 0000:00:02.0 rcs0: cs-irq head=5, tail=0 synmark2-5425 2..s. 1280851240us : process_csb: 0000:00:02.0 rcs0: csb[0]: status=0x00008002:0x00000000 ^---- Preemption event for the ELSP update; note the lite-restore synmark2-5425 2..s. 1280851243us : trace_ports: 0000:00:02.0 rcs0: preempted { 130:4698, 66:119966 } synmark2-5425 2..s. 1280851246us : trace_ports: 0000:00:02.0 rcs0: promote { 130:4696*, 134:1508 } synmark2-5425 2.... 1280851462us : __i915_request_commit: 0000:00:02.0 rcs0: fence 130:4700, current 4695 synmark2-5425 2.... 1280852111us : __i915_request_commit: 0000:00:02.0 rcs0: fence 130:4702, current 4695 synmark2-5425 2.Ns1 1280852296us : process_csb: 0000:00:02.0 rcs0: cs-irq head=0, tail=2 synmark2-5425 2.Ns1 1280852297us : process_csb: 0000:00:02.0 rcs0: csb[1]: status=0x00000814:0x00000000 synmark2-5425 2.Ns1 1280852299us : trace_ports: 0000:00:02.0 rcs0: completed { 130:4696!, 134:1508 } synmark2-5425 2.Ns1 1280852301us : process_csb: 0000:00:02.0 rcs0: csb[2]: status=0x00000818:0x00000040 synmark2-5425 2.Ns1 1280852302us : trace_ports: 0000:00:02.0 rcs0: completed { 134:1508, 0:0 } synmark2-5425 2.Ns1 1280852313us : process_csb: process_csb:2336 GEM_BUG_ON(!i915_request_completed(*execlists->active) && !reset_in_progress(execlists)) Fixes: 8ee36e048c98 ("drm/i915/execlists: Minimalistic timeslicing") Referenecs: 82c69bf58650 ("drm/i915/gt: Detect if we miss WaIdleLiteRestore") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Cc: # v5.4+ Link: https://patchwork.freedesktop.org/patch/msgid/20200207211452.2860634-1-chris@chris-wilson.co.uk (cherry picked from commit 5ba32c7be81e53ea8a27190b0f6be98e6c6779af) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 18 ++++++++---------- drivers/gpu/drm/i915/gt/intel_ring.c | 1 + drivers/gpu/drm/i915/gt/intel_ring.h | 8 ++++++++ drivers/gpu/drm/i915/gt/intel_ring_types.h | 1 + 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 0a8a2c8026f1..438d7a97d45c 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1321,7 +1321,7 @@ static u64 execlists_update_context(struct i915_request *rq) { struct intel_context *ce = rq->context; u64 desc = ce->lrc_desc; - u32 tail; + u32 tail, prev; /* * WaIdleLiteRestore:bdw,skl @@ -1334,9 +1334,15 @@ static u64 execlists_update_context(struct i915_request *rq) * subsequent resubmissions (for lite restore). Should that fail us, * and we try and submit the same tail again, force the context * reload. + * + * If we need to return to a preempted context, we need to skip the + * lite-restore and force it to reload the RING_TAIL. Otherwise, the + * HW has a tendency to ignore us rewinding the TAIL to the end of + * an earlier request. */ tail = intel_ring_set_tail(rq->ring, rq->tail); - if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail)) + prev = ce->lrc_reg_state[CTX_RING_TAIL]; + if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) desc |= CTX_DESC_FORCE_RESTORE; ce->lrc_reg_state[CTX_RING_TAIL] = tail; rq->tail = rq->wa_tail; @@ -1839,14 +1845,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) */ __unwind_incomplete_requests(engine); - /* - * If we need to return to the preempted context, we - * need to skip the lite-restore and force it to - * reload the RING_TAIL. Otherwise, the HW has a - * tendency to ignore us rewinding the TAIL to the - * end of an earlier request. - */ - last->context->lrc_desc |= CTX_DESC_FORCE_RESTORE; last = NULL; } else if (need_timeslice(engine, last) && timer_expired(&engine->execlists.timer)) { diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c index 374b28f13ca0..6ff803f397c4 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.c +++ b/drivers/gpu/drm/i915/gt/intel_ring.c @@ -145,6 +145,7 @@ intel_engine_create_ring(struct intel_engine_cs *engine, int size) kref_init(&ring->ref); ring->size = size; + ring->wrap = BITS_PER_TYPE(ring->size) - ilog2(size); /* * Workaround an erratum on the i830 which causes a hang if diff --git a/drivers/gpu/drm/i915/gt/intel_ring.h b/drivers/gpu/drm/i915/gt/intel_ring.h index ea2839d9e044..5bdce24994aa 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.h +++ b/drivers/gpu/drm/i915/gt/intel_ring.h @@ -56,6 +56,14 @@ static inline u32 intel_ring_wrap(const struct intel_ring *ring, u32 pos) return pos & (ring->size - 1); } +static inline int intel_ring_direction(const struct intel_ring *ring, + u32 next, u32 prev) +{ + typecheck(typeof(ring->size), next); + typecheck(typeof(ring->size), prev); + return (next - prev) << ring->wrap; +} + static inline bool intel_ring_offset_valid(const struct intel_ring *ring, unsigned int pos) diff --git a/drivers/gpu/drm/i915/gt/intel_ring_types.h b/drivers/gpu/drm/i915/gt/intel_ring_types.h index d9f17f38e0cc..3cd7fec7fd8d 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_types.h +++ b/drivers/gpu/drm/i915/gt/intel_ring_types.h @@ -45,6 +45,7 @@ struct intel_ring { u32 space; u32 size; + u32 wrap; u32 effective_size; }; -- cgit v1.2.3 From 15de9cb5c9c83a23be92b8f7a1178cead1486587 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 11 Feb 2020 12:01:31 +0000 Subject: drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex We manipulate ring->head while active in i915_request_retire underneath the timeline manipulation. We cannot rely on a stable ring->head outside of the timeline->mutex, in particular while setting up the context for resume and reset. Closes: https://gitlab.freedesktop.org/drm/intel/issues/1126 Fixes: 0881954965e3 ("drm/i915: Introduce intel_context.pin_mutex for pin management") Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex") References: f3c0efc9fe7a ("drm/i915/execlists: Leave resetting ring to intel_ring") Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Tvrtko Ursulin Cc: Mika Kuoppala Reviewed-by: Andi Shyti Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200211120131.958949-1-chris@chris-wilson.co.uk (cherry picked from commit 42827350f75c56d0fe9f15d8425a1390528958b6) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_lrc.c | 36 ++++++++++++++---------------- drivers/gpu/drm/i915/gt/intel_ring_types.h | 6 ++--- drivers/gpu/drm/i915/gt/selftest_lrc.c | 2 +- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 438d7a97d45c..fe8a59aaa629 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -237,7 +237,8 @@ static void execlists_init_reg_state(u32 *reg_state, bool close); static void __execlists_update_reg_state(const struct intel_context *ce, - const struct intel_engine_cs *engine); + const struct intel_engine_cs *engine, + u32 head); static void mark_eio(struct i915_request *rq) { @@ -1186,12 +1187,11 @@ static void reset_active(struct i915_request *rq, head = rq->tail; else head = active_request(ce->timeline, rq)->head; - ce->ring->head = intel_ring_wrap(ce->ring, head); - intel_ring_update_space(ce->ring); + head = intel_ring_wrap(ce->ring, head); /* Scrub the context image to prevent replaying the previous batch */ restore_default_state(ce, engine); - __execlists_update_reg_state(ce, engine); + __execlists_update_reg_state(ce, engine, head); /* We've switched away, so this should be a no-op, but intent matters */ ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; @@ -2863,16 +2863,17 @@ static void execlists_context_unpin(struct intel_context *ce) static void __execlists_update_reg_state(const struct intel_context *ce, - const struct intel_engine_cs *engine) + const struct intel_engine_cs *engine, + u32 head) { struct intel_ring *ring = ce->ring; u32 *regs = ce->lrc_reg_state; - GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); + GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); - regs[CTX_RING_HEAD] = ring->head; + regs[CTX_RING_HEAD] = head; regs[CTX_RING_TAIL] = ring->tail; /* RPCS */ @@ -2901,7 +2902,7 @@ __execlists_context_pin(struct intel_context *ce, ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; - __execlists_update_reg_state(ce, engine); + __execlists_update_reg_state(ce, engine, ce->ring->tail); return 0; } @@ -2942,7 +2943,7 @@ static void execlists_context_reset(struct intel_context *ce) /* Scrub away the garbage */ execlists_init_reg_state(ce->lrc_reg_state, ce, ce->engine, ce->ring, true); - __execlists_update_reg_state(ce, ce->engine); + __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; } @@ -3497,6 +3498,7 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) struct intel_engine_execlists * const execlists = &engine->execlists; struct intel_context *ce; struct i915_request *rq; + u32 head; mb(); /* paranoia: read the CSB pointers from after the reset */ clflush(execlists->csb_write); @@ -3524,15 +3526,15 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) if (i915_request_completed(rq)) { /* Idle context; tidy up the ring so we can restart afresh */ - ce->ring->head = intel_ring_wrap(ce->ring, rq->tail); + head = intel_ring_wrap(ce->ring, rq->tail); goto out_replay; } /* Context has requests still in-flight; it should not be idle! */ GEM_BUG_ON(i915_active_is_idle(&ce->active)); rq = active_request(ce->timeline, rq); - ce->ring->head = intel_ring_wrap(ce->ring, rq->head); - GEM_BUG_ON(ce->ring->head == ce->ring->tail); + head = intel_ring_wrap(ce->ring, rq->head); + GEM_BUG_ON(head == ce->ring->tail); /* * If this request hasn't started yet, e.g. it is waiting on a @@ -3577,10 +3579,9 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) out_replay: ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", - ce->ring->head, ce->ring->tail); - intel_ring_update_space(ce->ring); + head, ce->ring->tail); __execlists_reset_reg_state(ce, engine); - __execlists_update_reg_state(ce, engine); + __execlists_update_reg_state(ce, engine, head); ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ unwind: @@ -5223,10 +5224,7 @@ void intel_lr_context_reset(struct intel_engine_cs *engine, restore_default_state(ce, engine); /* Rerun the request; its payload has been neutered (if guilty). */ - ce->ring->head = head; - intel_ring_update_space(ce->ring); - - __execlists_update_reg_state(ce, engine); + __execlists_update_reg_state(ce, engine, head); } bool diff --git a/drivers/gpu/drm/i915/gt/intel_ring_types.h b/drivers/gpu/drm/i915/gt/intel_ring_types.h index 3cd7fec7fd8d..1a189ea00fd8 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_types.h +++ b/drivers/gpu/drm/i915/gt/intel_ring_types.h @@ -39,9 +39,9 @@ struct intel_ring { */ atomic_t pin_count; - u32 head; - u32 tail; - u32 emit; + u32 head; /* updated during retire, loosely tracks RING_HEAD */ + u32 tail; /* updated on submission, used for RING_TAIL */ + u32 emit; /* updated during request construction */ u32 space; u32 size; diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 65718ca2326e..b292f8cbd0bf 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -186,7 +186,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio) } GEM_BUG_ON(!ce[1]->ring->size); intel_ring_reset(ce[1]->ring, ce[1]->ring->size / 2); - __execlists_update_reg_state(ce[1], engine); + __execlists_update_reg_state(ce[1], engine, ce[1]->ring->head); rq[0] = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK); if (IS_ERR(rq[0])) { -- cgit v1.2.3 From cc5049ae4d457194796f854eb2e38b9727ad8c2d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 18 Feb 2020 09:09:15 +0100 Subject: ALSA: hda/realtek - Apply quirk for yet another MSI laptop MSI GP65 laptop with SSID 1462:1293 requires the same quirk as other MSI models. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=204159 Cc: Link: https://lore.kernel.org/r/20200218080915.3433-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 82485e06dde1..477589e7ec1d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2449,6 +2449,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1458, 0xa0b8, "Gigabyte AZ370-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK(0x1462, 0x1228, "MSI-GP63", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x1276, "MSI-GL73", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1462, 0x1293, "MSI-GP65", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1462, 0x7350, "MSI-7350", ALC889_FIXUP_CD), SND_PCI_QUIRK(0x1462, 0xda57, "MSI Z270-Gaming", ALC1220_FIXUP_GB_DUAL_CODECS), SND_PCI_QUIRK_VENDOR(0x1462, "MSI", ALC882_FIXUP_GPIO3), -- cgit v1.2.3 From 44eeb081b8630bb3ad3cd381d1ae1831463e48bb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 18 Feb 2020 10:14:09 +0100 Subject: ALSA: hda: Use scnprintf() for printing texts for sysfs/procfs Some code in HD-audio driver calls snprintf() in a loop and still expects that the return value were actually written size, while snprintf() returns the expected would-be length instead. When the given buffer limit were small, this leads to a buffer overflow. Use scnprintf() for addressing those issues. It returns the actually written size unlike snprintf(). Cc: Link: https://lore.kernel.org/r/20200218091409.27162-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/hda/hdmi_chmap.c | 2 +- sound/pci/hda/hda_codec.c | 2 +- sound/pci/hda/hda_eld.c | 2 +- sound/pci/hda/hda_sysfs.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sound/hda/hdmi_chmap.c b/sound/hda/hdmi_chmap.c index 5fd6d575e123..aad5c4bf4d34 100644 --- a/sound/hda/hdmi_chmap.c +++ b/sound/hda/hdmi_chmap.c @@ -250,7 +250,7 @@ void snd_hdac_print_channel_allocation(int spk_alloc, char *buf, int buflen) for (i = 0, j = 0; i < ARRAY_SIZE(cea_speaker_allocation_names); i++) { if (spk_alloc & (1 << i)) - j += snprintf(buf + j, buflen - j, " %s", + j += scnprintf(buf + j, buflen - j, " %s", cea_speaker_allocation_names[i]); } buf[j] = '\0'; /* necessary when j == 0 */ diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 5dc42f932739..53e7732ef752 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -4022,7 +4022,7 @@ void snd_print_pcm_bits(int pcm, char *buf, int buflen) for (i = 0, j = 0; i < ARRAY_SIZE(bits); i++) if (pcm & (AC_SUPPCM_BITS_8 << i)) - j += snprintf(buf + j, buflen - j, " %d", bits[i]); + j += scnprintf(buf + j, buflen - j, " %d", bits[i]); buf[j] = '\0'; /* necessary when j == 0 */ } diff --git a/sound/pci/hda/hda_eld.c b/sound/pci/hda/hda_eld.c index bb46c89b7f63..136477ed46ae 100644 --- a/sound/pci/hda/hda_eld.c +++ b/sound/pci/hda/hda_eld.c @@ -360,7 +360,7 @@ static void hdmi_print_pcm_rates(int pcm, char *buf, int buflen) for (i = 0, j = 0; i < ARRAY_SIZE(alsa_rates); i++) if (pcm & (1 << i)) - j += snprintf(buf + j, buflen - j, " %d", + j += scnprintf(buf + j, buflen - j, " %d", alsa_rates[i]); buf[j] = '\0'; /* necessary when j == 0 */ diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c index 0607ed5d1959..eb8ec109d7ad 100644 --- a/sound/pci/hda/hda_sysfs.c +++ b/sound/pci/hda/hda_sysfs.c @@ -222,7 +222,7 @@ static ssize_t init_verbs_show(struct device *dev, int i, len = 0; mutex_lock(&codec->user_mutex); snd_array_for_each(&codec->init_verbs, i, v) { - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "0x%02x 0x%03x 0x%04x\n", v->nid, v->verb, v->param); } @@ -272,7 +272,7 @@ static ssize_t hints_show(struct device *dev, int i, len = 0; mutex_lock(&codec->user_mutex); snd_array_for_each(&codec->hints, i, hint) { - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "%s = %s\n", hint->key, hint->val); } mutex_unlock(&codec->user_mutex); -- cgit v1.2.3 From 2464cc4c345699adea52c7aef75707207cb8a2f6 Mon Sep 17 00:00:00 2001 From: Gustavo Luiz Duarte Date: Tue, 11 Feb 2020 00:38:29 -0300 Subject: powerpc/tm: Fix clearing MSR[TS] in current when reclaiming on signal delivery After a treclaim, we expect to be in non-transactional state. If we don't clear the current thread's MSR[TS] before we get preempted, then tm_recheckpoint_new_task() will recheckpoint and we get rescheduled in suspended transaction state. When handling a signal caught in transactional state, handle_rt_signal64() calls get_tm_stackpointer() that treclaims the transaction using tm_reclaim_current() but without clearing the thread's MSR[TS]. This can cause the TM Bad Thing exception below if later we pagefault and get preempted trying to access the user's sigframe, using __put_user(). Afterwards, when we are rescheduled back into do_page_fault() (but now in suspended state since the thread's MSR[TS] was not cleared), upon executing 'rfid' after completion of the page fault handling, the exception is raised because a transition from suspended to non-transactional state is invalid. Unexpected TM Bad Thing exception at c00000000000de44 (msr 0x8000000302a03031) tm_scratch=800000010280b033 Oops: Unrecoverable exception, sig: 6 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries CPU: 25 PID: 15547 Comm: a.out Not tainted 5.4.0-rc2 #32 NIP: c00000000000de44 LR: c000000000034728 CTR: 0000000000000000 REGS: c00000003fe7bd70 TRAP: 0700 Not tainted (5.4.0-rc2) MSR: 8000000302a03031 CR: 44000884 XER: 00000000 CFAR: c00000000000dda4 IRQMASK: 0 PACATMSCRATCH: 800000010280b033 GPR00: c000000000034728 c000000f65a17c80 c000000001662800 00007fffacf3fd78 GPR04: 0000000000001000 0000000000001000 0000000000000000 c000000f611f8af0 GPR08: 0000000000000000 0000000078006001 0000000000000000 000c000000000000 GPR12: c000000f611f84b0 c00000003ffcb200 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000f611f8140 GPR24: 0000000000000000 00007fffacf3fd68 c000000f65a17d90 c000000f611f7800 GPR28: c000000f65a17e90 c000000f65a17e90 c000000001685e18 00007fffacf3f000 NIP [c00000000000de44] fast_exception_return+0xf4/0x1b0 LR [c000000000034728] handle_rt_signal64+0x78/0xc50 Call Trace: [c000000f65a17c80] [c000000000034710] handle_rt_signal64+0x60/0xc50 (unreliable) [c000000f65a17d30] [c000000000023640] do_notify_resume+0x330/0x460 [c000000f65a17e20] [c00000000000dcc4] ret_from_except_lite+0x70/0x74 Instruction dump: 7c4ff120 e8410170 7c5a03a6 38400000 f8410060 e8010070 e8410080 e8610088 60000000 60000000 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed0989 ---[ end trace 93094aa44b442f87 ]--- The simplified sequence of events that triggers the above exception is: ... # userspace in NON-TRANSACTIONAL state tbegin # userspace in TRANSACTIONAL state signal delivery # kernelspace in SUSPENDED state handle_rt_signal64() get_tm_stackpointer() treclaim # kernelspace in NON-TRANSACTIONAL state __put_user() page fault happens. We will never get back here because of the TM Bad Thing exception. page fault handling kicks in and we voluntarily preempt ourselves do_page_fault() __schedule() __switch_to(other_task) our task is rescheduled and we recheckpoint because the thread's MSR[TS] was not cleared __switch_to(our_task) switch_to_tm() tm_recheckpoint_new_task() trechkpt # kernelspace in SUSPENDED state The page fault handling resumes, but now we are in suspended transaction state do_page_fault() completes rfid <----- trying to get back where the page fault happened (we were non-transactional back then) TM Bad Thing # illegal transition from suspended to non-transactional This patch fixes that issue by clearing the current thread's MSR[TS] just after treclaim in get_tm_stackpointer() so that we stay in non-transactional state in case we are preempted. In order to make treclaim and clearing the thread's MSR[TS] atomic from a preemption perspective when CONFIG_PREEMPT is set, preempt_disable/enable() is used. It's also necessary to save the previous value of the thread's MSR before get_tm_stackpointer() is called so that it can be exposed to the signal handler later in setup_tm_sigcontexts() to inform the userspace MSR at the moment of the signal delivery. Found with tm-signal-context-force-tm kernel selftest. Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context") Cc: stable@vger.kernel.org # v3.9 Signed-off-by: Gustavo Luiz Duarte Acked-by: Michael Neuling Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200211033831.11165-1-gustavold@linux.ibm.com --- arch/powerpc/kernel/signal.c | 17 +++++++++++++++-- arch/powerpc/kernel/signal_32.c | 28 ++++++++++++++-------------- arch/powerpc/kernel/signal_64.c | 22 ++++++++++------------ 3 files changed, 39 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e6c30cee6abf..d215f9554553 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -200,14 +200,27 @@ unsigned long get_tm_stackpointer(struct task_struct *tsk) * normal/non-checkpointed stack pointer. */ + unsigned long ret = tsk->thread.regs->gpr[1]; + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BUG_ON(tsk != current); if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) { + preempt_disable(); tm_reclaim_current(TM_CAUSE_SIGNAL); if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr)) - return tsk->thread.ckpt_regs.gpr[1]; + ret = tsk->thread.ckpt_regs.gpr[1]; + + /* + * If we treclaim, we must clear the current thread's TM bits + * before re-enabling preemption. Otherwise we might be + * preempted and have the live MSR[TS] changed behind our back + * (tm_recheckpoint_new_task() would recheckpoint). Besides, we + * enter the signal handler in non-transactional state. + */ + tsk->thread.regs->msr &= ~MSR_TS_MASK; + preempt_enable(); } #endif - return tsk->thread.regs->gpr[1]; + return ret; } diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 98600b276f76..1b090a76b444 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -489,19 +489,11 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, */ static int save_tm_user_regs(struct pt_regs *regs, struct mcontext __user *frame, - struct mcontext __user *tm_frame, int sigret) + struct mcontext __user *tm_frame, int sigret, + unsigned long msr) { - unsigned long msr = regs->msr; - WARN_ON(tm_suspend_disabled); - /* Remove TM bits from thread's MSR. The MSR in the sigcontext - * just indicates to userland that we were doing a transaction, but we - * don't want to return in transactional state. This also ensures - * that flush_fp_to_thread won't set TIF_RESTORE_TM again. - */ - regs->msr &= ~MSR_TS_MASK; - /* Save both sets of general registers */ if (save_general_regs(¤t->thread.ckpt_regs, frame) || save_general_regs(regs, tm_frame)) @@ -912,6 +904,10 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, int sigret; unsigned long tramp; struct pt_regs *regs = tsk->thread.regs; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* Save the thread's msr before get_tm_stackpointer() changes it */ + unsigned long msr = regs->msr; +#endif BUG_ON(tsk != current); @@ -944,13 +940,13 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset, #ifdef CONFIG_PPC_TRANSACTIONAL_MEM tm_frame = &rt_sf->uc_transact.uc_mcontext; - if (MSR_TM_ACTIVE(regs->msr)) { + if (MSR_TM_ACTIVE(msr)) { if (__put_user((unsigned long)&rt_sf->uc_transact, &rt_sf->uc.uc_link) || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs)) goto badframe; - if (save_tm_user_regs(regs, frame, tm_frame, sigret)) + if (save_tm_user_regs(regs, frame, tm_frame, sigret, msr)) goto badframe; } else @@ -1369,6 +1365,10 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, int sigret; unsigned long tramp; struct pt_regs *regs = tsk->thread.regs; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* Save the thread's msr before get_tm_stackpointer() changes it */ + unsigned long msr = regs->msr; +#endif BUG_ON(tsk != current); @@ -1402,9 +1402,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, #ifdef CONFIG_PPC_TRANSACTIONAL_MEM tm_mctx = &frame->mctx_transact; - if (MSR_TM_ACTIVE(regs->msr)) { + if (MSR_TM_ACTIVE(msr)) { if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact, - sigret)) + sigret, msr)) goto badframe; } else diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 117515564ec7..84ed2e77ef9c 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -192,7 +192,8 @@ static long setup_sigcontext(struct sigcontext __user *sc, static long setup_tm_sigcontexts(struct sigcontext __user *sc, struct sigcontext __user *tm_sc, struct task_struct *tsk, - int signr, sigset_t *set, unsigned long handler) + int signr, sigset_t *set, unsigned long handler, + unsigned long msr) { /* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the * process never used altivec yet (MSR_VEC is zero in pt_regs of @@ -207,12 +208,11 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc); #endif struct pt_regs *regs = tsk->thread.regs; - unsigned long msr = tsk->thread.regs->msr; long err = 0; BUG_ON(tsk != current); - BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + BUG_ON(!MSR_TM_ACTIVE(msr)); WARN_ON(tm_suspend_disabled); @@ -222,13 +222,6 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, */ msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX); - /* Remove TM bits from thread's MSR. The MSR in the sigcontext - * just indicates to userland that we were doing a transaction, but we - * don't want to return in transactional state. This also ensures - * that flush_fp_to_thread won't set TIF_RESTORE_TM again. - */ - regs->msr &= ~MSR_TS_MASK; - #ifdef CONFIG_ALTIVEC err |= __put_user(v_regs, &sc->v_regs); err |= __put_user(tm_v_regs, &tm_sc->v_regs); @@ -824,6 +817,10 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, unsigned long newsp = 0; long err = 0; struct pt_regs *regs = tsk->thread.regs; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* Save the thread's msr before get_tm_stackpointer() changes it */ + unsigned long msr = regs->msr; +#endif BUG_ON(tsk != current); @@ -841,7 +838,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, err |= __put_user(0, &frame->uc.uc_flags); err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM - if (MSR_TM_ACTIVE(regs->msr)) { + if (MSR_TM_ACTIVE(msr)) { /* The ucontext_t passed to userland points to the second * ucontext_t (for transactional state) with its uc_link ptr. */ @@ -849,7 +846,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext, &frame->uc_transact.uc_mcontext, tsk, ksig->sig, NULL, - (unsigned long)ksig->ka.sa.sa_handler); + (unsigned long)ksig->ka.sa.sa_handler, + msr); } else #endif { -- cgit v1.2.3 From 232ca1eecafed8c54491017f0612c33d8c742d74 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 15 Feb 2020 10:14:25 +0000 Subject: powerpc/32s: Fix DSI and ISI exceptions for CONFIG_VMAP_STACK hash_page() needs to read page tables from kernel memory. When entire kernel memory is mapped by BATs, which is normally the case when CONFIG_STRICT_KERNEL_RWX is not set, it works even if the page hosting the page table is not referenced in the MMU hash table. However, if the page where the page table resides is not covered by a BAT, a DSI fault can be encountered from hash_page(), and it loops forever. This can happen when CONFIG_STRICT_KERNEL_RWX is selected and the alignment of the different regions is too small to allow covering the entire memory with BATs. This also happens when CONFIG_DEBUG_PAGEALLOC is selected or when booting with 'nobats' flag. Also, if the page containing the kernel stack is not present in the MMU hash table, registers cannot be saved and a recursive DSI fault is encountered. To allow hash_page() to properly do its job at all time and load the MMU hash table whenever needed, it must run with data MMU disabled. This means it must be called before re-enabling data MMU. To allow this, registers clobbered by hash_page() and create_hpte() have to be saved in the thread struct together with SRR0, SSR1, DAR and DSISR. It is also necessary to ensure that DSI prolog doesn't overwrite regs saved by prolog of the current running exception. That means: - DSI can only use SPRN_SPRG_SCRATCH0 - Exceptions must free SPRN_SPRG_SCRATCH0 before writing to the stack. This also fixes the Oops reported by Erhard when create_hpte() is called by add_hash_page(). Due to prolog size increase, a few more exceptions had to get split in two parts. Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK") Reported-by: Erhard F. Signed-off-by: Christophe Leroy Tested-by: Erhard F. Tested-by: Larry Finger Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=206501 Link: https://lore.kernel.org/r/64a4aa44686e9fd4b01333401367029771d9b231.1581761633.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/processor.h | 4 + arch/powerpc/kernel/asm-offsets.c | 12 +++ arch/powerpc/kernel/head_32.S | 155 ++++++++++++++++++++++++++++++++-- arch/powerpc/kernel/head_32.h | 21 ++++- arch/powerpc/mm/book3s32/hash_low.S | 52 +++++------- arch/powerpc/mm/book3s32/mmu.c | 10 +-- arch/powerpc/mm/kasan/kasan_init_32.c | 3 +- 7 files changed, 212 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 8387698bd5b6..eedcbfb9a6ff 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -168,6 +168,10 @@ struct thread_struct { unsigned long srr1; unsigned long dar; unsigned long dsisr; +#ifdef CONFIG_PPC_BOOK3S_32 + unsigned long r0, r3, r4, r5, r6, r8, r9, r11; + unsigned long lr, ctr; +#endif #endif /* Debug Registers */ struct debug_reg debug; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c25e562f1cd9..fcf24a365fc0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -132,6 +132,18 @@ int main(void) OFFSET(SRR1, thread_struct, srr1); OFFSET(DAR, thread_struct, dar); OFFSET(DSISR, thread_struct, dsisr); +#ifdef CONFIG_PPC_BOOK3S_32 + OFFSET(THR0, thread_struct, r0); + OFFSET(THR3, thread_struct, r3); + OFFSET(THR4, thread_struct, r4); + OFFSET(THR5, thread_struct, r5); + OFFSET(THR6, thread_struct, r6); + OFFSET(THR8, thread_struct, r8); + OFFSET(THR9, thread_struct, r9); + OFFSET(THR11, thread_struct, r11); + OFFSET(THLR, thread_struct, lr); + OFFSET(THCTR, thread_struct, ctr); +#endif #endif #ifdef CONFIG_SPE OFFSET(THREAD_EVR0, thread_struct, evr[0]); diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 0493fcac6409..97c887950c3c 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -290,17 +290,55 @@ MachineCheck: 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP - bne cr1,1f +#ifdef CONFIG_VMAP_STACK + mfspr r4, SPRN_SPRG_THREAD + tovirt(r4, r4) + lwz r4, RTAS_SP(r4) + cmpwi cr1, r4, 0 #endif - EXC_XFER_STD(0x200, machine_check_exception) -#ifdef CONFIG_PPC_CHRP -1: b machine_check_in_rtas + beq cr1, machine_check_tramp + b machine_check_in_rtas +#else + b machine_check_tramp #endif /* Data access exception. */ . = 0x300 DO_KVM 0x300 DataAccess: +#ifdef CONFIG_VMAP_STACK + mtspr SPRN_SPRG_SCRATCH0,r10 + mfspr r10, SPRN_SPRG_THREAD +BEGIN_MMU_FTR_SECTION + stw r11, THR11(r10) + mfspr r10, SPRN_DSISR + mfcr r11 +#ifdef CONFIG_PPC_KUAP + andis. r10, r10, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h +#else + andis. r10, r10, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h +#endif + mfspr r10, SPRN_SPRG_THREAD + beq hash_page_dsi +.Lhash_page_dsi_cont: + mtcr r11 + lwz r11, THR11(r10) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) + mtspr SPRN_SPRG_SCRATCH1,r11 + mfspr r11, SPRN_DAR + stw r11, DAR(r10) + mfspr r11, SPRN_DSISR + stw r11, DSISR(r10) + mfspr r11, SPRN_SRR0 + stw r11, SRR0(r10) + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ + stw r11, SRR1(r10) + mfcr r10 + andi. r11, r11, MSR_PR + + EXCEPTION_PROLOG_1 + b handle_page_fault_tramp_1 +#else /* CONFIG_VMAP_STACK */ EXCEPTION_PROLOG handle_dar_dsisr=1 get_and_save_dar_dsisr_on_stack r4, r5, r11 BEGIN_MMU_FTR_SECTION @@ -316,11 +354,32 @@ BEGIN_MMU_FTR_SECTION FTR_SECTION_ELSE b handle_page_fault_tramp_2 ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) +#endif /* CONFIG_VMAP_STACK */ /* Instruction access exception. */ . = 0x400 DO_KVM 0x400 InstructionAccess: +#ifdef CONFIG_VMAP_STACK + mtspr SPRN_SPRG_SCRATCH0,r10 + mtspr SPRN_SPRG_SCRATCH1,r11 + mfspr r10, SPRN_SPRG_THREAD + mfspr r11, SPRN_SRR0 + stw r11, SRR0(r10) + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ + stw r11, SRR1(r10) + mfcr r10 +BEGIN_MMU_FTR_SECTION + andis. r11, r11, SRR1_ISI_NOPT@h /* no pte found? */ + bne hash_page_isi +.Lhash_page_isi_cont: + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) + andi. r11, r11, MSR_PR + + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 +#else /* CONFIG_VMAP_STACK */ EXCEPTION_PROLOG andis. r0,r9,SRR1_ISI_NOPT@h /* no pte found? */ beq 1f /* if so, try to put a PTE */ @@ -329,6 +388,7 @@ InstructionAccess: BEGIN_MMU_FTR_SECTION bl hash_page END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) +#endif /* CONFIG_VMAP_STACK */ 1: mr r4,r12 andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ stw r4, _DAR(r11) @@ -344,7 +404,7 @@ Alignment: EXCEPTION_PROLOG handle_dar_dsisr=1 save_dar_dsisr_on_stack r4, r5, r11 addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x600, alignment_exception) + b alignment_exception_tramp /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) @@ -645,15 +705,100 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) . = 0x3000 +machine_check_tramp: + EXC_XFER_STD(0x200, machine_check_exception) + +alignment_exception_tramp: + EXC_XFER_STD(0x600, alignment_exception) + handle_page_fault_tramp_1: +#ifdef CONFIG_VMAP_STACK + EXCEPTION_PROLOG_2 handle_dar_dsisr=1 +#endif lwz r4, _DAR(r11) lwz r5, _DSISR(r11) /* fall through */ handle_page_fault_tramp_2: EXC_XFER_LITE(0x300, handle_page_fault) +#ifdef CONFIG_VMAP_STACK +.macro save_regs_thread thread + stw r0, THR0(\thread) + stw r3, THR3(\thread) + stw r4, THR4(\thread) + stw r5, THR5(\thread) + stw r6, THR6(\thread) + stw r8, THR8(\thread) + stw r9, THR9(\thread) + mflr r0 + stw r0, THLR(\thread) + mfctr r0 + stw r0, THCTR(\thread) +.endm + +.macro restore_regs_thread thread + lwz r0, THLR(\thread) + mtlr r0 + lwz r0, THCTR(\thread) + mtctr r0 + lwz r0, THR0(\thread) + lwz r3, THR3(\thread) + lwz r4, THR4(\thread) + lwz r5, THR5(\thread) + lwz r6, THR6(\thread) + lwz r8, THR8(\thread) + lwz r9, THR9(\thread) +.endm + +hash_page_dsi: + save_regs_thread r10 + mfdsisr r3 + mfdar r4 + mfsrr0 r5 + mfsrr1 r9 + rlwinm r3, r3, 32 - 15, _PAGE_RW /* DSISR_STORE -> _PAGE_RW */ + bl hash_page + mfspr r10, SPRN_SPRG_THREAD + restore_regs_thread r10 + b .Lhash_page_dsi_cont + +hash_page_isi: + mr r11, r10 + mfspr r10, SPRN_SPRG_THREAD + save_regs_thread r10 + li r3, 0 + lwz r4, SRR0(r10) + lwz r9, SRR1(r10) + bl hash_page + mfspr r10, SPRN_SPRG_THREAD + restore_regs_thread r10 + mr r10, r11 + b .Lhash_page_isi_cont + + .globl fast_hash_page_return +fast_hash_page_return: + andis. r10, r9, SRR1_ISI_NOPT@h /* Set on ISI, cleared on DSI */ + mfspr r10, SPRN_SPRG_THREAD + restore_regs_thread r10 + bne 1f + + /* DSI */ + mtcr r11 + lwz r11, THR11(r10) + mfspr r10, SPRN_SPRG_SCRATCH0 + SYNC + RFI + +1: /* ISI */ + mtcr r11 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 + SYNC + RFI + stack_overflow: vmap_stack_overflow_exception +#endif AltiVecUnavailable: EXCEPTION_PROLOG diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index a6a5fbbf8504..9db162f79fe6 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -64,11 +64,25 @@ .endm .macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0 +#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S) +BEGIN_MMU_FTR_SECTION + mtcr r10 +FTR_SECTION_ELSE + stw r10, _CCR(r11) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) +#else stw r10,_CCR(r11) /* save registers */ +#endif + mfspr r10, SPRN_SPRG_SCRATCH0 stw r12,GPR12(r11) stw r9,GPR9(r11) - mfspr r10,SPRN_SPRG_SCRATCH0 stw r10,GPR10(r11) +#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S) +BEGIN_MMU_FTR_SECTION + mfcr r10 + stw r10, _CCR(r11) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) +#endif mfspr r12,SPRN_SPRG_SCRATCH1 stw r12,GPR11(r11) mflr r10 @@ -83,6 +97,11 @@ stw r10, _DSISR(r11) .endif lwz r9, SRR1(r12) +#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S) +BEGIN_MMU_FTR_SECTION + andi. r10, r9, MSR_PR +END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) +#endif lwz r12, SRR0(r12) #else mfspr r12,SPRN_SRR0 diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index c11b0a005196..2015c4f96238 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -25,12 +25,6 @@ #include #include -#ifdef CONFIG_VMAP_STACK -#define ADDR_OFFSET 0 -#else -#define ADDR_OFFSET PAGE_OFFSET -#endif - #ifdef CONFIG_SMP .section .bss .align 2 @@ -53,8 +47,8 @@ mmu_hash_lock: .text _GLOBAL(hash_page) #ifdef CONFIG_SMP - lis r8, (mmu_hash_lock - ADDR_OFFSET)@h - ori r8, r8, (mmu_hash_lock - ADDR_OFFSET)@l + lis r8, (mmu_hash_lock - PAGE_OFFSET)@h + ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l lis r0,0x0fff b 10f 11: lwz r6,0(r8) @@ -72,12 +66,9 @@ _GLOBAL(hash_page) cmplw 0,r4,r0 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ -#ifdef CONFIG_VMAP_STACK - tovirt(r5, r5) -#endif blt+ 112f /* assume user more likely */ - lis r5, (swapper_pg_dir - ADDR_OFFSET)@ha /* if kernel address, use */ - addi r5 ,r5 ,(swapper_pg_dir - ADDR_OFFSET)@l /* kernel page table */ + lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ 112: #ifndef CONFIG_PTE_64BIT @@ -89,9 +80,6 @@ _GLOBAL(hash_page) lwzx r8,r8,r5 /* Get L1 entry */ rlwinm. r8,r8,0,0,20 /* extract pt base address */ #endif -#ifdef CONFIG_VMAP_STACK - tovirt(r8, r8) -#endif #ifdef CONFIG_SMP beq- hash_page_out /* return if no mapping */ #else @@ -143,30 +131,36 @@ retry: bne- retry /* retry if someone got there first */ mfsrin r3,r4 /* get segment reg for segment */ +#ifndef CONFIG_VMAP_STACK mfctr r0 stw r0,_CTR(r11) +#endif bl create_hpte /* add the hash table entry */ #ifdef CONFIG_SMP eieio - lis r8, (mmu_hash_lock - ADDR_OFFSET)@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0, (mmu_hash_lock - ADDR_OFFSET)@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) #endif +#ifdef CONFIG_VMAP_STACK + b fast_hash_page_return +#else /* Return from the exception */ lwz r5,_CTR(r11) mtctr r5 lwz r0,GPR0(r11) lwz r8,GPR8(r11) b fast_exception_return +#endif #ifdef CONFIG_SMP hash_page_out: eieio - lis r8, (mmu_hash_lock - ADDR_OFFSET)@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0, (mmu_hash_lock - ADDR_OFFSET)@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) blr #endif /* CONFIG_SMP */ @@ -341,7 +335,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) patch_site 1f, patch__hash_page_A1 patch_site 2f, patch__hash_page_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: lis r0, (Hash_base - ADDR_OFFSET)@h /* base address of hash table */ +0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ 1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r3,r3,r0 /* make primary hash */ @@ -355,10 +349,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ 10f /* no PTE: go look for an empty slot */ tlbie r4 - lis r4, (htab_hash_searches - ADDR_OFFSET)@ha - lwz r6, (htab_hash_searches - ADDR_OFFSET)@l(r4) + lis r4, (htab_hash_searches - PAGE_OFFSET)@ha + lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) addi r6,r6,1 /* count how many searches we do */ - stw r6, (htab_hash_searches - ADDR_OFFSET)@l(r4) + stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 @@ -390,10 +384,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ found_empty /* update counter of times that the primary PTEG is full */ - lis r4, (primary_pteg_full - ADDR_OFFSET)@ha - lwz r6, (primary_pteg_full - ADDR_OFFSET)@l(r4) + lis r4, (primary_pteg_full - PAGE_OFFSET)@ha + lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) addi r6,r6,1 - stw r6, (primary_pteg_full - ADDR_OFFSET)@l(r4) + stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) patch_site 0f, patch__hash_page_C /* Search the secondary PTEG for an empty slot */ @@ -427,8 +421,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) * lockup here but that shouldn't happen */ -1: lis r4, (next_slot - ADDR_OFFSET)@ha /* get next evict slot */ - lwz r6, (next_slot - ADDR_OFFSET)@l(r4) +1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - PAGE_OFFSET)@l(r4) addi r6,r6,HPTE_SIZE /* search for candidate */ andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 0a1c65a2c565..f888cbb109b9 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -413,7 +413,7 @@ void __init MMU_init_hw(void) void __init MMU_init_hw_patch(void) { unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); - unsigned int hash; + unsigned int hash = (unsigned int)Hash - PAGE_OFFSET; if (ppc_md.progress) ppc_md.progress("hash:patch", 0x345); @@ -425,11 +425,6 @@ void __init MMU_init_hw_patch(void) /* * Patch up the instructions in hashtable.S:create_hpte */ - if (IS_ENABLED(CONFIG_VMAP_STACK)) - hash = (unsigned int)Hash; - else - hash = (unsigned int)Hash - PAGE_OFFSET; - modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16); modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6); modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6); @@ -439,8 +434,7 @@ void __init MMU_init_hw_patch(void) /* * Patch up the instructions in hashtable.S:flush_hash_page */ - modify_instruction_site(&patch__flush_hash_A0, 0xffff, - ((unsigned int)Hash - PAGE_OFFSET) >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16); modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6); modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 16dd95bd0749..db5664dde5ff 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -185,8 +185,7 @@ u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0}; static void __init kasan_early_hash_table(void) { - unsigned int hash = IS_ENABLED(CONFIG_VMAP_STACK) ? (unsigned int)early_hash : - __pa(early_hash); + unsigned int hash = __pa(early_hash); modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16); modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16); -- cgit v1.2.3 From 5a528eb67908bcf493f3583cb4726fbd8e129605 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Feb 2020 08:39:50 +0000 Subject: powerpc/chrp: Fix enter_rtas() with CONFIG_VMAP_STACK With CONFIG_VMAP_STACK, data MMU has to be enabled to read data on the stack. Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d2330584f8c42d3039896e2b56f5d39676dc919c.1581669558.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0713daa651d9..bc056d906b51 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1354,12 +1354,17 @@ _GLOBAL(enter_rtas) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI -1: tophys(r9,r1) +1: tophys_novmstack r9, r1 +#ifdef CONFIG_VMAP_STACK + li r0, MSR_KERNEL & ~MSR_IR /* can take DTLB miss */ + mtmsr r0 + isync +#endif lwz r8,INT_FRAME_SIZE+4(r9) /* get return address */ lwz r9,8(r9) /* original msr value */ addi r1,r1,INT_FRAME_SIZE li r0,0 - tophys(r7, r2) + tophys_novmstack r7, r2 stw r0, THREAD + RTAS_SP(r7) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 -- cgit v1.2.3 From 477f3488a94e35380c82a7498d46f10fa5f3edd2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Feb 2020 06:53:00 +0000 Subject: powerpc/6xx: Fix power_save_ppc32_restore() with CONFIG_VMAP_STACK power_save_ppc32_restore() is called during exception entry, before re-enabling the MMU. It substracts KERNELBASE from the address of nap_save_msscr0 to access it. With CONFIG_VMAP_STACK enabled, data MMU translation has already been re-enabled, so power_save_ppc32_restore() has to access nap_save_msscr0 by its virtual address. Reported-by: Larry Finger Signed-off-by: Christophe Leroy Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK") Tested-by: Larry Finger Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7bce32ccbab3ba3e3e0f27da6961bf6313df97ed.1581663140.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/idle_6xx.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index 0ffdd18b9f26..433d97bea1f3 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -166,7 +166,11 @@ BEGIN_FTR_SECTION mfspr r9,SPRN_HID0 andis. r9,r9,HID0_NAP@h beq 1f +#ifdef CONFIG_VMAP_STACK + addis r9, r11, nap_save_msscr0@ha +#else addis r9,r11,(nap_save_msscr0-KERNELBASE)@ha +#endif lwz r9,nap_save_msscr0@l(r9) mtspr SPRN_MSSCR0, r9 sync @@ -174,7 +178,11 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR) BEGIN_FTR_SECTION +#ifdef CONFIG_VMAP_STACK + addis r9, r11, nap_save_hid1@ha +#else addis r9,r11,(nap_save_hid1-KERNELBASE)@ha +#endif lwz r9,nap_save_hid1@l(r9) mtspr SPRN_HID1, r9 END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX) -- cgit v1.2.3 From 066bc3576e653b615ee3f5230a89d69c8ebeeb71 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 17 Feb 2020 15:13:43 +1100 Subject: powerpc/xmon: Fix whitespace handling in getstring() The ls (lookup symbol) and zr (reboot) commands use xmon's getstring() helper to read a string argument from the xmon prompt. This function skips over leading whitespace, but doesn't check if the first "non-whitespace" character is a newline which causes some odd behaviour ( indicates a the enter key was pressed): 0:mon> ls printk printk: c0000000001680c4 0:mon> ls printk Symbol ' printk' not found. 0:mon> With commit 2d9b332d99b ("powerpc/xmon: Allow passing an argument to ppc_md.restart()") we have a similar problem with the zr command. Previously zr took no arguments so "zr would trigger a reboot. With that patch applied a second newline needs to be sent in order for the reboot to occur. Fix this by checking if the leading whitespace ended on a newline: 0:mon> ls Symbol '' not found. Fixes: 2d9b332d99b2 ("powerpc/xmon: Allow passing an argument to ppc_md.restart()") Reported-by: Michael Ellerman Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200217041343.2454-1-oohall@gmail.com --- arch/powerpc/xmon/xmon.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e8c84d265602..0ec9640335bb 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3435,6 +3435,11 @@ getstring(char *s, int size) int c; c = skipbl(); + if (c == '\n') { + *s = 0; + return; + } + do { if( size > 1 ){ *s++ = c; -- cgit v1.2.3 From 3be54d558c75562e42bc83d665df024bd79d399b Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Mon, 17 Feb 2020 12:39:47 +0100 Subject: efi: Only print errors about failing to get certs if EFI vars are found If CONFIG_LOAD_UEFI_KEYS is enabled, the kernel attempts to load the certs from the db, dbx and MokListRT EFI variables into the appropriate keyrings. But it just assumes that the variables will be present and prints an error if the certs can't be loaded, even when is possible that the variables may not exist. For example the MokListRT variable will only be present if shim is used. So only print an error message about failing to get the certs list from an EFI variable if this is found. Otherwise these printed errors just pollute the kernel log ring buffer with confusing messages like the following: [ 5.427251] Couldn't get size: 0x800000000000000e [ 5.427261] MODSIGN: Couldn't get UEFI db list [ 5.428012] Couldn't get size: 0x800000000000000e [ 5.428023] Couldn't get UEFI MokListRT Reported-by: Hans de Goede Signed-off-by: Javier Martinez Canillas Tested-by: Hans de Goede Acked-by: Ard Biesheuvel Signed-off-by: Mimi Zohar --- security/integrity/platform_certs/load_uefi.c | 40 +++++++++++++++++---------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/security/integrity/platform_certs/load_uefi.c b/security/integrity/platform_certs/load_uefi.c index 111898aad56e..f0c908241966 100644 --- a/security/integrity/platform_certs/load_uefi.c +++ b/security/integrity/platform_certs/load_uefi.c @@ -35,16 +35,18 @@ static __init bool uefi_check_ignore_db(void) * Get a certificate list blob from the named EFI variable. */ static __init void *get_cert_list(efi_char16_t *name, efi_guid_t *guid, - unsigned long *size) + unsigned long *size, efi_status_t *status) { - efi_status_t status; unsigned long lsize = 4; unsigned long tmpdb[4]; void *db; - status = efi.get_variable(name, guid, NULL, &lsize, &tmpdb); - if (status != EFI_BUFFER_TOO_SMALL) { - pr_err("Couldn't get size: 0x%lx\n", status); + *status = efi.get_variable(name, guid, NULL, &lsize, &tmpdb); + if (*status == EFI_NOT_FOUND) + return NULL; + + if (*status != EFI_BUFFER_TOO_SMALL) { + pr_err("Couldn't get size: 0x%lx\n", *status); return NULL; } @@ -52,10 +54,10 @@ static __init void *get_cert_list(efi_char16_t *name, efi_guid_t *guid, if (!db) return NULL; - status = efi.get_variable(name, guid, NULL, &lsize, db); - if (status != EFI_SUCCESS) { + *status = efi.get_variable(name, guid, NULL, &lsize, db); + if (*status != EFI_SUCCESS) { kfree(db); - pr_err("Error reading db var: 0x%lx\n", status); + pr_err("Error reading db var: 0x%lx\n", *status); return NULL; } @@ -74,6 +76,7 @@ static int __init load_uefi_certs(void) efi_guid_t mok_var = EFI_SHIM_LOCK_GUID; void *db = NULL, *dbx = NULL, *mok = NULL; unsigned long dbsize = 0, dbxsize = 0, moksize = 0; + efi_status_t status; int rc = 0; if (!efi.get_variable) @@ -83,9 +86,12 @@ static int __init load_uefi_certs(void) * an error if we can't get them. */ if (!uefi_check_ignore_db()) { - db = get_cert_list(L"db", &secure_var, &dbsize); + db = get_cert_list(L"db", &secure_var, &dbsize, &status); if (!db) { - pr_err("MODSIGN: Couldn't get UEFI db list\n"); + if (status == EFI_NOT_FOUND) + pr_debug("MODSIGN: db variable wasn't found\n"); + else + pr_err("MODSIGN: Couldn't get UEFI db list\n"); } else { rc = parse_efi_signature_list("UEFI:db", db, dbsize, get_handler_for_db); @@ -96,9 +102,12 @@ static int __init load_uefi_certs(void) } } - mok = get_cert_list(L"MokListRT", &mok_var, &moksize); + mok = get_cert_list(L"MokListRT", &mok_var, &moksize, &status); if (!mok) { - pr_info("Couldn't get UEFI MokListRT\n"); + if (status == EFI_NOT_FOUND) + pr_debug("MokListRT variable wasn't found\n"); + else + pr_info("Couldn't get UEFI MokListRT\n"); } else { rc = parse_efi_signature_list("UEFI:MokListRT", mok, moksize, get_handler_for_db); @@ -107,9 +116,12 @@ static int __init load_uefi_certs(void) kfree(mok); } - dbx = get_cert_list(L"dbx", &secure_var, &dbxsize); + dbx = get_cert_list(L"dbx", &secure_var, &dbxsize, &status); if (!dbx) { - pr_info("Couldn't get UEFI dbx list\n"); + if (status == EFI_NOT_FOUND) + pr_debug("dbx variable wasn't found\n"); + else + pr_info("Couldn't get UEFI dbx list\n"); } else { rc = parse_efi_signature_list("UEFI:dbx", dbx, dbxsize, -- cgit v1.2.3 From 6a30e1b1dcad0ba94fae757f797812d7d8dcb72c Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 10 Feb 2020 20:44:39 +0800 Subject: crypto: rename sm3-256 to sm3 in hash_algo_name The name sm3-256 is defined in hash_algo_name in hash_info, but the algorithm name implemented in sm3_generic.c is sm3, which will cause the sm3-256 algorithm to be not found in some application scenarios of the hash algorithm, and an ENOENT error will occur. For example, IMA, keys, and other subsystems that reference hash_algo_name all use the hash algorithm of sm3. Fixes: 5ca4c20cfd37 ("keys, trusted: select hash algorithm for TPM2 chips") Signed-off-by: Tianjia Zhang Reviewed-by: Pascal van Leeuwen Signed-off-by: Mimi Zohar --- crypto/hash_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/hash_info.c b/crypto/hash_info.c index c754cb75dd1a..a49ff96bde77 100644 --- a/crypto/hash_info.c +++ b/crypto/hash_info.c @@ -26,7 +26,7 @@ const char *const hash_algo_name[HASH_ALGO__LAST] = { [HASH_ALGO_TGR_128] = "tgr128", [HASH_ALGO_TGR_160] = "tgr160", [HASH_ALGO_TGR_192] = "tgr192", - [HASH_ALGO_SM3_256] = "sm3-256", + [HASH_ALGO_SM3_256] = "sm3", [HASH_ALGO_STREEBOG_256] = "streebog256", [HASH_ALGO_STREEBOG_512] = "streebog512", }; -- cgit v1.2.3 From 5780b9abd530982c2bb1018e2c52c05ab3c30b45 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 10 Feb 2020 20:44:40 +0800 Subject: ima: add sm3 algorithm to hash algorithm configuration list sm3 has been supported by the ima hash algorithm, but it is not yet in the Kconfig configuration list. After adding, both ima and tpm2 can support sm3 well. Signed-off-by: Tianjia Zhang Signed-off-by: Mimi Zohar --- security/integrity/ima/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig index 711ff10fa36e..3f3ee4e2eb0d 100644 --- a/security/integrity/ima/Kconfig +++ b/security/integrity/ima/Kconfig @@ -112,6 +112,10 @@ choice config IMA_DEFAULT_HASH_WP512 bool "WP512" depends on CRYPTO_WP512=y && !IMA_TEMPLATE + + config IMA_DEFAULT_HASH_SM3 + bool "SM3" + depends on CRYPTO_SM3=y && !IMA_TEMPLATE endchoice config IMA_DEFAULT_HASH @@ -121,6 +125,7 @@ config IMA_DEFAULT_HASH default "sha256" if IMA_DEFAULT_HASH_SHA256 default "sha512" if IMA_DEFAULT_HASH_SHA512 default "wp512" if IMA_DEFAULT_HASH_WP512 + default "sm3" if IMA_DEFAULT_HASH_SM3 config IMA_WRITE_POLICY bool "Enable multiple writes to the IMA policy" -- cgit v1.2.3 From f25975f42f2f8f2a01303054d6a70c7ceb1fcf54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 18 Feb 2020 14:03:34 +0100 Subject: bpf, uapi: Remove text about bpf_redirect_map() giving higher performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The performance of bpf_redirect() is now roughly the same as that of bpf_redirect_map(). However, David Ahern pointed out that the header file has not been updated to reflect this, and still says that a significant performance increase is possible when using bpf_redirect_map(). Remove this text from the bpf_redirect_map() description, and reword the description in bpf_redirect() slightly. Also fix the 'Return' section of the bpf_redirect_map() documentation. Fixes: 1d233886dd90 ("xdp: Use bulking for non-map XDP_REDIRECT and consolidate code paths") Reported-by: David Ahern Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20200218130334.29889-1-toke@redhat.com --- include/uapi/linux/bpf.h | 16 +++++++--------- tools/include/uapi/linux/bpf.h | 16 +++++++--------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1d74a2bd234..22f235260a3a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1045,9 +1045,9 @@ union bpf_attr { * supports redirection to the egress interface, and accepts no * flag at all. * - * The same effect can be attained with the more generic - * **bpf_redirect_map**\ (), which requires specific maps to be - * used but offers better performance. + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. * Return * For XDP, the helper returns **XDP_REDIRECT** on success or * **XDP_ABORTED** on error. For other program types, the values @@ -1611,13 +1611,11 @@ union bpf_attr { * the caller. Any higher bits in the *flags* argument must be * unset. * - * When used to redirect packets to net devices, this helper - * provides a high performance increase over **bpf_redirect**\ (). - * This is due to various implementation details of the underlying - * mechanisms, one of which is the fact that **bpf_redirect_map**\ - * () tries to send packet as a "bulk" to the device. + * See also bpf_redirect(), which only supports redirecting to an + * ifindex, but doesn't require a map to do so. * Return - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the **flags* argument on error. * * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f1d74a2bd234..22f235260a3a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1045,9 +1045,9 @@ union bpf_attr { * supports redirection to the egress interface, and accepts no * flag at all. * - * The same effect can be attained with the more generic - * **bpf_redirect_map**\ (), which requires specific maps to be - * used but offers better performance. + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. * Return * For XDP, the helper returns **XDP_REDIRECT** on success or * **XDP_ABORTED** on error. For other program types, the values @@ -1611,13 +1611,11 @@ union bpf_attr { * the caller. Any higher bits in the *flags* argument must be * unset. * - * When used to redirect packets to net devices, this helper - * provides a high performance increase over **bpf_redirect**\ (). - * This is due to various implementation details of the underlying - * mechanisms, one of which is the fact that **bpf_redirect_map**\ - * () tries to send packet as a "bulk" to the device. + * See also bpf_redirect(), which only supports redirecting to an + * ifindex, but doesn't require a map to do so. * Return - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the **flags* argument on error. * * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description -- cgit v1.2.3 From 113e6b7e15e23dc45d5c66eb66bb91a627812e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 17 Feb 2020 18:17:01 +0100 Subject: libbpf: Sanitise internal map names so they are not rejected by the kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel only accepts map names with alphanumeric characters, underscores and periods in their name. However, the auto-generated internal map names used by libbpf takes their prefix from the user-supplied BPF object name, which has no such restriction. This can lead to "Invalid argument" errors when trying to load a BPF program using global variables. Fix this by sanitising the map names, replacing any non-allowed characters with underscores. Fixes: d859900c4c56 ("bpf, libbpf: support global data/bss/rodata sections") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200217171701.215215-1-toke@redhat.com --- tools/lib/bpf/libbpf.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 514b1a524abb..7469c7dcc15e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1283,7 +1284,7 @@ static size_t bpf_map_mmap_sz(const struct bpf_map *map) static char *internal_map_name(struct bpf_object *obj, enum libbpf_map_type type) { - char map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN], *p; const char *sfx = libbpf_type_to_btf_name[type]; int sfx_len = max((size_t)7, strlen(sfx)); int pfx_len = min((size_t)BPF_OBJ_NAME_LEN - sfx_len - 1, @@ -1292,6 +1293,11 @@ static char *internal_map_name(struct bpf_object *obj, snprintf(map_name, sizeof(map_name), "%.*s%.*s", pfx_len, obj->name, sfx_len, libbpf_type_to_btf_name[type]); + /* sanitise map name to characters allowed by kernel */ + for (p = map_name; *p && p < map_name + sizeof(map_name); p++) + if (!isalnum(*p) && *p != '_' && *p != '.') + *p = '_'; + return strdup(map_name); } -- cgit v1.2.3 From 1d4615978f525b769990a4a4ef22fb1b9a04cdf1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:12:37 +0100 Subject: iommu/vt-d: Add attach_deferred() helper Implement a helper function to check whether a device's attach process is deferred. Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 9dc37672bf89..80f2332a5466 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -762,6 +762,11 @@ static int iommu_dummy(struct device *dev) return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; } +static bool attach_deferred(struct device *dev) +{ + return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO; +} + /** * is_downstream_to_pci_bridge - test if a device belongs to the PCI * sub-hierarchy of a candidate PCI-PCI bridge @@ -2510,8 +2515,7 @@ struct dmar_domain *find_domain(struct device *dev) { struct device_domain_info *info; - if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO || - dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)) + if (unlikely(attach_deferred(dev) || iommu_dummy(dev))) return NULL; if (dev_is_pci(dev)) @@ -2527,7 +2531,7 @@ struct dmar_domain *find_domain(struct device *dev) static struct dmar_domain *deferred_attach_domain(struct device *dev) { - if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) { + if (unlikely(attach_deferred(dev))) { struct iommu_domain *domain; dev->archdata.iommu = NULL; @@ -6133,7 +6137,7 @@ intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev) static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain, struct device *dev) { - return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO; + return attach_deferred(dev); } static int -- cgit v1.2.3 From 034d98cc0cdcde2415c6f598fa9125e3eaa02569 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:16:19 +0100 Subject: iommu/vt-d: Move deferred device attachment into helper function Move the code that does the deferred device attachment into a separate helper function. Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 80f2332a5466..42cdcce1602e 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2529,16 +2529,20 @@ struct dmar_domain *find_domain(struct device *dev) return NULL; } -static struct dmar_domain *deferred_attach_domain(struct device *dev) +static void do_deferred_attach(struct device *dev) { - if (unlikely(attach_deferred(dev))) { - struct iommu_domain *domain; + struct iommu_domain *domain; - dev->archdata.iommu = NULL; - domain = iommu_get_domain_for_dev(dev); - if (domain) - intel_iommu_attach_device(domain, dev); - } + dev->archdata.iommu = NULL; + domain = iommu_get_domain_for_dev(dev); + if (domain) + intel_iommu_attach_device(domain, dev); +} + +static struct dmar_domain *deferred_attach_domain(struct device *dev) +{ + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); return find_domain(dev); } -- cgit v1.2.3 From a11bfde9c77df1fd350ea27169ab921f511bf5d0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:20:59 +0100 Subject: iommu/vt-d: Do deferred attachment in iommu_need_mapping() The attachment of deferred devices needs to happen before the check whether the device is identity mapped or not. Otherwise the check will return wrong results, cause warnings boot failures in kdump kernels, like WARNING: CPU: 0 PID: 318 at ../drivers/iommu/intel-iommu.c:592 domain_get_iommu+0x61/0x70 [...] Call Trace: __intel_map_single+0x55/0x190 intel_alloc_coherent+0xac/0x110 dmam_alloc_attrs+0x50/0xa0 ahci_port_start+0xfb/0x1f0 [libahci] ata_host_start.part.39+0x104/0x1e0 [libata] With the earlier check the kdump boot succeeds and a crashdump is written. Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 42cdcce1602e..723f615c6e84 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2541,9 +2541,6 @@ static void do_deferred_attach(struct device *dev) static struct dmar_domain *deferred_attach_domain(struct device *dev) { - if (unlikely(attach_deferred(dev))) - do_deferred_attach(dev); - return find_domain(dev); } @@ -3595,6 +3592,9 @@ static bool iommu_need_mapping(struct device *dev) if (iommu_dummy(dev)) return false; + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); + ret = identity_mapping(dev); if (ret) { u64 dma_mask = *dev->dma_mask; @@ -3958,7 +3958,11 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, int prot = 0; int ret; + if (unlikely(attach_deferred(dev))) + do_deferred_attach(dev); + domain = deferred_attach_domain(dev); + if (WARN_ON(dir == DMA_NONE || !domain)) return DMA_MAPPING_ERROR; -- cgit v1.2.3 From 96d170f3b1a607612caf3618c534d5c64fc2d61b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:27:44 +0100 Subject: iommu/vt-d: Remove deferred_attach_domain() The function is now only a wrapper around find_domain(). Remove the function and call find_domain() directly at the call-sites. Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 723f615c6e84..69f1c6b8dfcf 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2539,11 +2539,6 @@ static void do_deferred_attach(struct device *dev) intel_iommu_attach_device(domain, dev); } -static struct dmar_domain *deferred_attach_domain(struct device *dev) -{ - return find_domain(dev); -} - static inline struct device_domain_info * dmar_search_domain_by_dev_info(int segment, int bus, int devfn) { @@ -3643,7 +3638,7 @@ static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr, BUG_ON(dir == DMA_NONE); - domain = deferred_attach_domain(dev); + domain = find_domain(dev); if (!domain) return DMA_MAPPING_ERROR; @@ -3863,7 +3858,7 @@ static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nele if (!iommu_need_mapping(dev)) return dma_direct_map_sg(dev, sglist, nelems, dir, attrs); - domain = deferred_attach_domain(dev); + domain = find_domain(dev); if (!domain) return 0; @@ -3961,7 +3956,7 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, if (unlikely(attach_deferred(dev))) do_deferred_attach(dev); - domain = deferred_attach_domain(dev); + domain = find_domain(dev); if (WARN_ON(dir == DMA_NONE || !domain)) return DMA_MAPPING_ERROR; -- cgit v1.2.3 From 1ddb32da4a629fa7f87873d0b6836c2e1feb7518 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 17 Feb 2020 17:29:55 +0100 Subject: iommu/vt-d: Simplify check in identity_mapping() The function only has one call-site and there it is never called with dummy or deferred devices. Simplify the check in the function to account for that. Fixes: 1ee0186b9a12 ("iommu/vt-d: Refactor find_domain() helper") Cc: stable@vger.kernel.org # v5.5 Reviewed-by: Jerry Snitselaar Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 69f1c6b8dfcf..6fa6de2b6ad5 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -2916,7 +2916,7 @@ static int identity_mapping(struct device *dev) struct device_domain_info *info; info = dev->archdata.iommu; - if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO) + if (info) return (info->domain == si_domain); return 0; -- cgit v1.2.3 From dd1f6308b28edf0452dd5dc7877992903ec61e69 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Tue, 18 Feb 2020 16:49:06 +0000 Subject: arm64: lse: Fix LSE atomics with LLVM Commit e0d5896bd356 ("arm64: lse: fix LSE atomics with LLVM's integrated assembler") broke the build when clang is used in connjunction with the binutils assembler ("-no-integrated-as"). This happens because __LSE_PREAMBLE is defined as ".arch armv8-a+lse", which overrides the version of the CPU architecture passed via the "-march" paramter to gas: $ aarch64-none-linux-gnu-as -EL -I ./arch/arm64/include -I ./arch/arm64/include/generated -I ./include -I ./include -I ./arch/arm64/include/uapi -I ./arch/arm64/include/generated/uapi -I ./include/uapi -I ./include/generated/uapi -I ./init -I ./init -march=armv8.3-a -o init/do_mounts.o /tmp/do_mounts-d7992a.s /tmp/do_mounts-d7992a.s: Assembler messages: /tmp/do_mounts-d7992a.s:1959: Error: selected processor does not support `autiasp' /tmp/do_mounts-d7992a.s:2021: Error: selected processor does not support `paciasp' /tmp/do_mounts-d7992a.s:2157: Error: selected processor does not support `autiasp' /tmp/do_mounts-d7992a.s:2175: Error: selected processor does not support `paciasp' /tmp/do_mounts-d7992a.s:2494: Error: selected processor does not support `autiasp' Fix the issue by replacing ".arch armv8-a+lse" with ".arch_extension lse". Sami confirms that the clang integrated assembler does now support the '.arch_extension' directive, so this change will be fine even for LTO builds in future. Fixes: e0d5896bd356cd ("arm64: lse: fix LSE atomics with LLVM's integrated assembler") Cc: Catalin Marinas Cc: Will Deacon Reported-by: Amit Kachhap Tested-by: Sami Tolvanen Signed-off-by: Vincenzo Frascino Signed-off-by: Will Deacon --- arch/arm64/include/asm/lse.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h index d429f7701c36..5d10051c3e62 100644 --- a/arch/arm64/include/asm/lse.h +++ b/arch/arm64/include/asm/lse.h @@ -6,7 +6,7 @@ #ifdef CONFIG_ARM64_LSE_ATOMICS -#define __LSE_PREAMBLE ".arch armv8-a+lse\n" +#define __LSE_PREAMBLE ".arch_extension lse\n" #include #include -- cgit v1.2.3 From 297a31e3e8318f533cff4fe33ffaefb74f72c6e2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Feb 2020 17:39:45 +0300 Subject: io_uring: remove unnecessary NULL checks The "kmsg" pointer can't be NULL and we have already dereferenced it so a check here would be useless. Reviewed-by: Stefano Garzarella Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 29565d82291f..d35b45696c73 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3075,7 +3075,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (req->io) return -EAGAIN; if (io_alloc_async_ctx(req)) { - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); return -ENOMEM; } @@ -3229,7 +3229,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (req->io) return -EAGAIN; if (io_alloc_async_ctx(req)) { - if (kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); return -ENOMEM; } -- cgit v1.2.3 From af6565adb02d3129d3fae4d9d5da945abaf4417a Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 17 Feb 2020 13:37:18 +0200 Subject: qede: Fix race between rdma destroy workqueue and link change event If an event is added while the rdma workqueue is being destroyed it could lead to several races, list corruption, null pointer dereference during queue_work or init_queue. This fixes the race between the two flows which can occur during shutdown. A kref object and a completion object are added to the rdma_dev structure, these are initialized before the workqueue is created. The refcnt is used to indicate work is being added to the workqueue and ensures the cleanup flow won't start while we're in the middle of adding the event. Once the work is added, the refcnt is decreased and the cleanup flow is safe to run. Fixes: cee9fbd8e2e ("qede: Add qedr framework") Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qede/qede.h | 2 ++ drivers/net/ethernet/qlogic/qede/qede_rdma.c | 29 +++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index e8a1b27db84d..234c6f30effb 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -163,6 +163,8 @@ struct qede_rdma_dev { struct list_head entry; struct list_head rdma_event_list; struct workqueue_struct *rdma_wq; + struct kref refcnt; + struct completion event_comp; bool exp_recovery; }; diff --git a/drivers/net/ethernet/qlogic/qede/qede_rdma.c b/drivers/net/ethernet/qlogic/qede/qede_rdma.c index ffabc2d2f082..2d873ae8a234 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_rdma.c +++ b/drivers/net/ethernet/qlogic/qede/qede_rdma.c @@ -59,6 +59,9 @@ static void _qede_rdma_dev_add(struct qede_dev *edev) static int qede_rdma_create_wq(struct qede_dev *edev) { INIT_LIST_HEAD(&edev->rdma_info.rdma_event_list); + kref_init(&edev->rdma_info.refcnt); + init_completion(&edev->rdma_info.event_comp); + edev->rdma_info.rdma_wq = create_singlethread_workqueue("rdma_wq"); if (!edev->rdma_info.rdma_wq) { DP_NOTICE(edev, "qedr: Could not create workqueue\n"); @@ -83,8 +86,23 @@ static void qede_rdma_cleanup_event(struct qede_dev *edev) } } +static void qede_rdma_complete_event(struct kref *ref) +{ + struct qede_rdma_dev *rdma_dev = + container_of(ref, struct qede_rdma_dev, refcnt); + + /* no more events will be added after this */ + complete(&rdma_dev->event_comp); +} + static void qede_rdma_destroy_wq(struct qede_dev *edev) { + /* Avoid race with add_event flow, make sure it finishes before + * we start accessing the list and cleaning up the work + */ + kref_put(&edev->rdma_info.refcnt, qede_rdma_complete_event); + wait_for_completion(&edev->rdma_info.event_comp); + qede_rdma_cleanup_event(edev); destroy_workqueue(edev->rdma_info.rdma_wq); } @@ -310,15 +328,24 @@ static void qede_rdma_add_event(struct qede_dev *edev, if (!edev->rdma_info.qedr_dev) return; + /* We don't want the cleanup flow to start while we're allocating and + * scheduling the work + */ + if (!kref_get_unless_zero(&edev->rdma_info.refcnt)) + return; /* already being destroyed */ + event_node = qede_rdma_get_free_event_node(edev); if (!event_node) - return; + goto out; event_node->event = event; event_node->ptr = edev; INIT_WORK(&event_node->work, qede_rdma_handle_event); queue_work(edev->rdma_info.rdma_wq, &event_node->work); + +out: + kref_put(&edev->rdma_info.refcnt, qede_rdma_complete_event); } void qede_rdma_dev_event_open(struct qede_dev *edev) -- cgit v1.2.3 From d99bfed58d9698c0ea1dbf47e4fdf4b87cc7203f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Feb 2020 16:54:38 +0100 Subject: mptcp: fix bogus socket flag values Dan Carpenter reports static checker warnings due to bogus BIT() usage: net/mptcp/subflow.c:571 subflow_write_space() warn: test_bit() takes a bit number net/mptcp/subflow.c:694 subflow_state_change() warn: test_bit() takes a bit number net/mptcp/protocol.c:261 ssk_check_wmem() warn: test_bit() takes a bit number [..] This is harmless (we use bits 1 & 2 instead of 0 and 1), but would break eventually when adding BIT(5) (or 6, depends on size of 'long'). Just use 0 and 1, the values are only passed to test/set/clear_bit functions. Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Reported-by: Dan Carpenter Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 8a99a2930284..9f8663b30456 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -56,8 +56,8 @@ #define MPTCP_DSS_FLAG_MASK (0x1F) /* MPTCP socket flags */ -#define MPTCP_DATA_READY BIT(0) -#define MPTCP_SEND_SPACE BIT(1) +#define MPTCP_DATA_READY 0 +#define MPTCP_SEND_SPACE 1 /* MPTCP connection sock */ struct mptcp_sock { -- cgit v1.2.3 From 29f20dd6258a6f9c434992a0f1fc522caecda7ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Tue, 18 Feb 2020 16:47:01 +0100 Subject: net: phy: broadcom: Fix a typo ("firsly") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jonathan Neuschäfer Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 7d68b28bb893..a62229a8b1a4 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -410,7 +410,7 @@ static int bcm5481_config_aneg(struct phy_device *phydev) struct device_node *np = phydev->mdio.dev.of_node; int ret; - /* Aneg firsly. */ + /* Aneg firstly. */ ret = genphy_config_aneg(phydev); /* Then we can set up the delay. */ @@ -463,7 +463,7 @@ static int bcm54616s_config_aneg(struct phy_device *phydev) { int ret; - /* Aneg firsly. */ + /* Aneg firstly. */ if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX) ret = genphy_c37_config_aneg(phydev); else -- cgit v1.2.3 From 379349e9bc3b42b8b2f8f7a03f64a97623fff323 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2020 18:15:44 +0100 Subject: Revert "net: dev: introduce support for sch BYPASS for lockless qdisc" This reverts commit ba27b4cdaaa66561aaedb2101876e563738d36fe Ahmed reported ouf-of-order issues bisected to commit ba27b4cdaaa6 ("net: dev: introduce support for sch BYPASS for lockless qdisc"). I can't find any working solution other than a plain revert. This will introduce some minor performance regressions for pfifo_fast qdisc. I plan to address them in net-next with more indirect call wrapper boilerplate for qdiscs. Reported-by: Ahmad Fatoum Fixes: ba27b4cdaaa6 ("net: dev: introduce support for sch BYPASS for lockless qdisc") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 2577ebfed293..e10bd680dc03 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3662,26 +3662,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && - qdisc_run_begin(q)) { - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, - &q->state))) { - __qdisc_drop(skb, &to_free); - rc = NET_XMIT_DROP; - goto end_run; - } - qdisc_bstats_cpu_update(q, skb); - - rc = NET_XMIT_SUCCESS; - if (sch_direct_xmit(skb, q, dev, txq, NULL, true)) - __qdisc_run(q); - -end_run: - qdisc_run_end(q); - } else { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - qdisc_run(q); - } + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + qdisc_run(q); if (unlikely(to_free)) kfree_skb_list(to_free); -- cgit v1.2.3 From 8c70c3d72833a08214ff8c8df1f7d9778509888d Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 18 Feb 2020 23:47:18 +0530 Subject: net: netlabel: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/netlabel/netlabel_unlabeled.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index d2e4ab8d1cb1..77bb1bb22c3b 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -207,7 +207,8 @@ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) bkt = netlbl_unlhsh_hash(ifindex); bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; - list_for_each_entry_rcu(iter, bkt_list, list) + list_for_each_entry_rcu(iter, bkt_list, list, + lockdep_is_held(&netlbl_unlhsh_lock)) if (iter->valid && iter->ifindex == ifindex) return iter; -- cgit v1.2.3 From 9facfdb5467382a21fc0bd4211ced26c06f28832 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 00:11:32 +0530 Subject: netlabel_domainhash.c: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/netlabel/netlabel_domainhash.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index f5d34da0646e..a1f2320ecc16 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -143,7 +143,8 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, if (domain != NULL) { bkt = netlbl_domhsh_hash(domain); bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; - list_for_each_entry_rcu(iter, bkt_list, list) + list_for_each_entry_rcu(iter, bkt_list, list, + lockdep_is_held(&netlbl_domhsh_lock)) if (iter->valid && netlbl_family_match(iter->family, family) && strcmp(iter->domain, domain) == 0) -- cgit v1.2.3 From 7790614616458b6dd3d90652acfa6b7443ee7041 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:24:25 +0530 Subject: meter.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 3323b79ff548..5010d1ddd4bd 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -61,7 +61,8 @@ static struct dp_meter *lookup_meter(const struct datapath *dp, struct hlist_head *head; head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node) { + hlist_for_each_entry_rcu(meter, head, dp_hash_node, + lockdep_ovsl_is_held()) { if (meter->id == meter_id) return meter; } -- cgit v1.2.3 From fed48423f14d9fa184b262d7c35d9dc1c3698500 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:27:42 +0530 Subject: vport.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/vport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 5da9392b03d6..47febb4504f0 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -96,7 +96,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; - hlist_for_each_entry_rcu(vport, bucket, hash_node) + hlist_for_each_entry_rcu(vport, bucket, hash_node, + lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; -- cgit v1.2.3 From 53742e69e85d2eb7ed56f58d277bc3e682f8949e Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:28:02 +0530 Subject: datapath.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 659c2a790fe7..c047afd12116 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -179,7 +179,8 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) struct hlist_head *head; head = vport_hash_bucket(dp, port_no); - hlist_for_each_entry_rcu(vport, head, dp_hash_node) { + hlist_for_each_entry_rcu(vport, head, dp_hash_node, + lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } @@ -2042,7 +2043,8 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp) int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) @@ -2061,7 +2063,8 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) dp->max_headroom = new_headroom; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) netdev_set_rx_headroom(vport->dev, new_headroom); } -- cgit v1.2.3 From a2cfb96cc3654c6d451020480a4bcfbbca564350 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:28:20 +0530 Subject: flow_table.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/flow_table.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 5904e93e5765..fd8a01ca7a2d 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -585,7 +585,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, head = find_bucket(ti, hash); (*n_mask_hit)++; - hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver], + lockdep_ovsl_is_held()) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) return flow; @@ -769,7 +770,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, hash = ufid_hash(ufid); head = find_bucket(ti, hash); - hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) { + hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver], + lockdep_ovsl_is_held()) { if (flow->ufid_table.hash == hash && ovs_flow_cmp_ufid(flow, ufid)) return flow; -- cgit v1.2.3 From bd97ad51a7eb1b02049deca56bc26d96cabbac8a Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 14 Feb 2020 18:14:13 +0100 Subject: netfilter: nft_set_pipapo: Fix mapping table example in comments In both insertion and lookup examples, the two element pointers of rule mapping tables were swapped. Fix that. Reported-by: Pablo Neira Ayuso Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index f0cb1e13af50..579600b39f39 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -203,7 +203,7 @@ * :: * * rule indices in last field: 0 1 - * map to elements: 0x42 0x66 + * map to elements: 0x66 0x42 * * * Matching @@ -298,7 +298,7 @@ * :: * * rule indices in last field: 0 1 - * map to elements: 0x42 0x66 + * map to elements: 0x66 0x42 * * the matching element is at 0x42. * -- cgit v1.2.3 From 9a7712048f9d43da5022e75eca3d6b81080e76d3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 14 Feb 2020 18:14:14 +0100 Subject: netfilter: nft_set_pipapo: Don't abuse unlikely() in pipapo_refill() I originally used unlikely() in the if (match_only) clause, which we hit on the mapping table for the last field in a set, to ensure we avoid branching to the rest of for loop body, which is executed more frequently. However, Pablo reports, this is confusing as it gives the impression that this is not a common case, and it's actually not the intended usage of unlikely(). I couldn't observe any statistical difference in matching rates on x864_64 and aarch64 without it, so just drop it. Reported-by: Pablo Neira Ayuso Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 579600b39f39..feac8553f6d9 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -503,7 +503,7 @@ static int pipapo_refill(unsigned long *map, int len, int rules, return -1; } - if (unlikely(match_only)) { + if (match_only) { bitmap_clear(map, i, 1); return i; } -- cgit v1.2.3 From 6551d5c56eb0d02db2274d7a8d26c333deba7fd2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Feb 2020 10:12:58 -0800 Subject: pipe: make sure to wake up everybody when the last reader/writer closes Andrei Vagin reported that commit 0ddad21d3e99 ("pipe: use exclusive waits when reading or writing") broke one of the CRIU tests. He even has a trivial reproducer: #include #include #include int main() { int p[2]; pid_t p1, p2; int status; if (pipe(p) == -1) return 1; p1 = fork(); if (p1 == 0) { close(p[1]); read(p[0], &status, sizeof(status)); return 0; } p2 = fork(); if (p2 == 0) { close(p[1]); read(p[0], &status, sizeof(status)); return 0; } sleep(1); close(p[1]); wait(&status); wait(&status); return 0; } and the problem - once he points it out - is obvious. We use these nice exclusive waits, but when the last writer goes away, it then needs to wake up _every_ reader (and conversely, the last reader disappearing needs to wake every writer, of course). In fact, when going through this, we had several small oddities around how to wake things. We did in fact wake every reader when we changed the size of the pipe buffers. But that's entirely pointless, since that just acts as a possible source of new space - no new data to read. And when we change the size of the buffer, we don't need to wake all writers even when we add space - that case acts just as if somebody made space by reading, and any writer that finds itself not filling it up entirely will wake the next one. On the other hand, on the exit path, we tried to limit the wakeups with the proper poll keys etc, which is entirely pointless, because at that point we obviously need to wake up everybody. So don't do that: just wake up everybody - but only do that if the counts changed to zero. So fix those non-IO wakeups to be more proper: space change doesn't add any new data, but it might make room for writers, so it wakes up a writer. And the actual changes to reader/writer counts should wake up everybody, since everybody is affected (ie readers will all see EOF if the writers have gone away, and writers will all get EPIPE if all readers have gone away). Fixes: 0ddad21d3e99 ("pipe: use exclusive waits when reading or writing") Reported-and-tested-by: Andrei Vagin Cc: Josh Triplett Cc: Matthew Wilcox Signed-off-by: Linus Torvalds --- fs/pipe.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 5a34d6c22d4c..2144507447c5 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -722,9 +722,10 @@ pipe_release(struct inode *inode, struct file *file) if (file->f_mode & FMODE_WRITE) pipe->writers--; - if (pipe->readers || pipe->writers) { - wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLHUP); - wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM | EPOLLERR | EPOLLHUP); + /* Was that the last reader or writer, but not the other side? */ + if (!pipe->readers != !pipe->writers) { + wake_up_interruptible_all(&pipe->rd_wait); + wake_up_interruptible_all(&pipe->wr_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } @@ -1026,8 +1027,8 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) static void wake_up_partner(struct pipe_inode_info *pipe) { - wake_up_interruptible(&pipe->rd_wait); - wake_up_interruptible(&pipe->wr_wait); + wake_up_interruptible_all(&pipe->rd_wait); + wake_up_interruptible_all(&pipe->wr_wait); } static int fifo_open(struct inode *inode, struct file *filp) @@ -1144,7 +1145,7 @@ err_rd: err_wr: if (!--pipe->writers) - wake_up_interruptible(&pipe->rd_wait); + wake_up_interruptible_all(&pipe->rd_wait); ret = -ERESTARTSYS; goto err; @@ -1271,8 +1272,9 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; - wake_up_interruptible_all(&pipe->rd_wait); - wake_up_interruptible_all(&pipe->wr_wait); + + /* This might have made more room for writers */ + wake_up_interruptible(&pipe->wr_wait); return pipe->max_usage * PAGE_SIZE; out_revert_acct: -- cgit v1.2.3 From 9eb425b2e04e0e3006adffea5bf5f227a896f128 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 18 Feb 2020 14:09:29 +0000 Subject: powerpc/entry: Fix an #if which should be an #ifdef in entry_32.S Fixes: 12c3f1fd87bf ("powerpc/32s: get rid of CPU_FTR_601 feature") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a99fc0ad65b87a1ba51cfa3e0e9034ee294c3e07.1582034961.git.christophe.leroy@c-s.fr --- arch/powerpc/kernel/entry_32.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index bc056d906b51..16af0d8d90a8 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -783,7 +783,7 @@ fast_exception_return: 1: lis r3,exc_exit_restart_end@ha addi r3,r3,exc_exit_restart_end@l cmplw r12,r3 -#if CONFIG_PPC_BOOK3S_601 +#ifdef CONFIG_PPC_BOOK3S_601 bge 2b #else bge 3f @@ -791,7 +791,7 @@ fast_exception_return: lis r4,exc_exit_restart@ha addi r4,r4,exc_exit_restart@l cmplw r12,r4 -#if CONFIG_PPC_BOOK3S_601 +#ifdef CONFIG_PPC_BOOK3S_601 blt 2b #else blt 3f -- cgit v1.2.3 From 81f7eb00ff5bb8326e82503a32809421d14abb8a Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 11 Feb 2020 15:25:37 +0800 Subject: btrfs: destroy qgroup extent records on transaction abort We clean up the delayed references when we abort a transaction but we leave the pending qgroup extent records behind, leaking memory. This patch destroys the extent records when we destroy the delayed refs and makes sure ensure they're gone before releasing the transaction. Fixes: 3368d001ba5d ("btrfs: qgroup: Record possible quota-related extent for qgroup.") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Jeff Mahoney [ Rebased to latest upstream, remove to_qgroup() helper, use rbtree_postorder_for_each_entry_safe() wrapper ] Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 13 +++++++++++++ fs/btrfs/qgroup.h | 1 + fs/btrfs/transaction.c | 2 ++ 4 files changed, 17 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 89422aa8e9d1..d7fec89974cb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4276,6 +4276,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, cond_resched(); spin_lock(&delayed_refs->lock); } + btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 98d9a50352d6..ff1870ff3474 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4002,3 +4002,16 @@ out: } return ret; } + +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) +{ + struct btrfs_qgroup_extent_record *entry; + struct btrfs_qgroup_extent_record *next; + struct rb_root *root; + + root = &trans->delayed_refs.dirty_extent_root; + rbtree_postorder_for_each_entry_safe(entry, next, root, node) { + ulist_free(entry->old_roots); + kfree(entry); + } +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 236f12224d52..1bc654459469 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, u64 last_snapshot); int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); +void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 33dcc88b428a..beb6c69cd1e5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -121,6 +121,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); + WARN_ON(!RB_EMPTY_ROOT( + &transaction->delayed_refs.dirty_extent_root)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", -- cgit v1.2.3 From 315bf8ef914f31d51d084af950703aa1e09a728c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Feb 2020 10:47:28 -0500 Subject: btrfs: reset fs_root to NULL on error in open_ctree While running my error injection script I hit a panic when we tried to clean up the fs_root when freeing the fs_root. This is because fs_info->fs_root == PTR_ERR(-EIO), which isn't great. Fix this by setting fs_info->fs_root = NULL; if we fail to read the root. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d7fec89974cb..197352f23534 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3200,6 +3200,7 @@ int __cold open_ctree(struct super_block *sb, if (IS_ERR(fs_info->fs_root)) { err = PTR_ERR(fs_info->fs_root); btrfs_warn(fs_info, "failed to read fs tree: %d", err); + fs_info->fs_root = NULL; goto fail_qgroup; } -- cgit v1.2.3 From 1e90315149f3fe148e114a5de86f0196d1c21fa5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Feb 2020 10:47:29 -0500 Subject: btrfs: do not check delayed items are empty for single transaction cleanup btrfs_assert_delayed_root_empty() will check if the delayed root is completely empty, but this is a filesystem-wide check. On cleanup we may have allowed other transactions to begin, for whatever reason, and thus the delayed root is not empty. So remove this check from cleanup_one_transation(). This however can stay in btrfs_cleanup_transaction(), because it checks only after all of the transactions have been properly cleaned up, and thus is valid. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 197352f23534..c6c9a6a8e6c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4503,7 +4503,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, wake_up(&fs_info->transaction_wait); btrfs_destroy_delayed_inodes(fs_info); - btrfs_assert_delayed_root_empty(fs_info); btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages, EXTENT_DIRTY); -- cgit v1.2.3 From bd727173e4432fe6cb70ba108dc1f3602c5409d7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Feb 2020 10:47:30 -0500 Subject: btrfs: handle logged extent failure properly If we're allocating a logged extent we attempt to insert an extent record for the file extent directly. We increase space_info->bytes_reserved, because the extent entry addition will call btrfs_update_block_group(), which will convert the ->bytes_reserved to ->bytes_used. However if we fail at any point while inserting the extent entry we will bail and leave space on ->bytes_reserved, which will trigger a WARN_ON() on umount. Fix this by pinning the space if we fail to insert, which is what happens in every other failure case that involves adding the extent entry. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0163fdd59f8f..a7bc66121330 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4430,6 +4430,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1); + if (ret) + btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1); btrfs_put_block_group(block_group); return ret; } -- cgit v1.2.3 From b778cf962d71a0e737923d55d0432f3bd287258e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 13 Feb 2020 10:47:31 -0500 Subject: btrfs: fix bytes_may_use underflow in prealloc error condtition I hit the following warning while running my error injection stress testing: WARNING: CPU: 3 PID: 1453 at fs/btrfs/space-info.h:108 btrfs_free_reserved_data_space_noquota+0xfd/0x160 [btrfs] RIP: 0010:btrfs_free_reserved_data_space_noquota+0xfd/0x160 [btrfs] Call Trace: btrfs_free_reserved_data_space+0x4f/0x70 [btrfs] __btrfs_prealloc_file_range+0x378/0x470 [btrfs] elfcorehdr_read+0x40/0x40 ? elfcorehdr_read+0x40/0x40 ? btrfs_commit_transaction+0xca/0xa50 [btrfs] ? dput+0xb4/0x2a0 ? btrfs_log_dentry_safe+0x55/0x70 [btrfs] ? btrfs_sync_file+0x30e/0x420 [btrfs] ? do_fsync+0x38/0x70 ? __x64_sys_fdatasync+0x13/0x20 ? do_syscall_64+0x5b/0x1b0 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happens if we fail to insert our reserved file extent. At this point we've already converted our reservation from ->bytes_may_use to ->bytes_reserved. However once we break we will attempt to free everything from [cur_offset, end] from ->bytes_may_use, but our extent reservation will overlap part of this. Fix this problem by adding ins.offset (our extent allocation size) to cur_offset so we remove the actual remaining part from ->bytes_may_use. I validated this fix using my inject-error.py script python inject-error.py -o should_fail_bio -t cache_save_setup -t \ __btrfs_prealloc_file_range \ -t insert_reserved_file_extent.constprop.0 \ -r "-5" ./run-fsstress.sh where run-fsstress.sh simply mounts and runs fsstress on a disk. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 36deef69f847..4f47ba652b31 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9824,6 +9824,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; + u64 clear_offset = start; u64 i_size; u64 cur_bytes; u64 last_alloc = (u64)-1; @@ -9858,6 +9859,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans); break; } + + /* + * We've reserved this space, and thus converted it from + * ->bytes_may_use to ->bytes_reserved. Any error that happens + * from here on out we will only need to clear our reservation + * for the remaining unreserved area, so advance our + * clear_offset by our extent size. + */ + clear_offset += ins.offset; btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; @@ -9937,9 +9947,9 @@ next: if (own_trans) btrfs_end_transaction(trans); } - if (cur_offset < end) - btrfs_free_reserved_data_space(inode, NULL, cur_offset, - end - cur_offset + 1); + if (clear_offset < end) + btrfs_free_reserved_data_space(inode, NULL, clear_offset, + end - clear_offset + 1); return ret; } -- cgit v1.2.3 From e75fd33b3f744f644061a4f9662bd63f5434f806 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Feb 2020 12:29:50 +0000 Subject: Btrfs: fix btrfs_wait_ordered_range() so that it waits for all ordered extents In btrfs_wait_ordered_range() once we find an ordered extent that has finished with an error we exit the loop and don't wait for any other ordered extents that might be still in progress. All the users of btrfs_wait_ordered_range() expect that there are no more ordered extents in progress after that function returns. So past fixes such like the ones from the two following commits: ff612ba7849964 ("btrfs: fix panic during relocation after ENOSPC before writeback happens") 28aeeac1dd3080 ("Btrfs: fix panic when starting bg cache writeout after IO error") don't work when there are multiple ordered extents in the range. Fix that by making btrfs_wait_ordered_range() wait for all ordered extents even after it finds one that had an error. Link: https://github.com/kdave/btrfs-progs/issues/228#issuecomment-569777554 CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ecb9fb6a6fe0..a65f189a5b94 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -679,10 +679,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) } btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; + /* + * If the ordered extent had an error save the error but don't + * exit without waiting first for all other ordered extents in + * the range to complete. + */ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) ret = -EIO; btrfs_put_ordered_extent(ordered); - if (ret || end == 0 || end == start) + if (end == 0 || end == start) break; end--; } -- cgit v1.2.3 From 929a3af90f0f4bd7132d83552c1a98c83f60ef7e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Feb 2020 00:19:09 +0300 Subject: io_uring: fix use-after-free by io_cleanup_req() io_cleanup_req() should be called before req->io is freed, and so shouldn't be after __io_free_req() -> __io_req_aux_free(). Also, it will be ignored for in io_free_req_many(), which use __io_req_aux_free(). Place cleanup_req() into __io_req_aux_free(). Fixes: 99bc4c38537d774 ("io_uring: fix iovec leaks") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index d35b45696c73..6e249aa97ba3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1260,6 +1260,9 @@ static void __io_req_aux_free(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + if (req->flags & REQ_F_NEED_CLEANUP) + io_cleanup_req(req); + kfree(req->io); if (req->file) { if (req->flags & REQ_F_FIXED_FILE) @@ -1275,9 +1278,6 @@ static void __io_free_req(struct io_kiocb *req) { __io_req_aux_free(req); - if (req->flags & REQ_F_NEED_CLEANUP) - io_cleanup_req(req); - if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; -- cgit v1.2.3 From 3d9c5e023a0dbf3e117bb416cfefd9405bf5af0c Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 3 Feb 2020 16:32:18 -0600 Subject: net/mlx5: Fix sleep while atomic in mlx5_eswitch_get_vepa rtnl_bridge_getlink is protected by rcu lock, so mlx5_eswitch_get_vepa cannot take mutex lock. Two possible issues can happen: 1. User at the same time change vepa mode via RTM_SETLINK command. 2. User at the same time change the switchdev mode via devlink netlink interface. Case 1 cannot happen because rtnl executes one message in order. Case 2 can happen but we do not expect user to change the switchdev mode when changing vepa. Even if a user does it, so he will read a value which is no longer valid. Fixes: 8da202b24913 ("net/mlx5: E-Switch, Add support for VEPA in legacy mode.") Signed-off-by: Huy Nguyen Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 5acf60b1bbfe..564d42605892 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -2452,25 +2452,17 @@ out: int mlx5_eswitch_get_vepa(struct mlx5_eswitch *esw, u8 *setting) { - int err = 0; - if (!esw) return -EOPNOTSUPP; if (!ESW_ALLOWED(esw)) return -EPERM; - mutex_lock(&esw->state_lock); - if (esw->mode != MLX5_ESWITCH_LEGACY) { - err = -EOPNOTSUPP; - goto out; - } + if (esw->mode != MLX5_ESWITCH_LEGACY) + return -EOPNOTSUPP; *setting = esw->fdb_table.legacy.vepa_uplink_rule ? 1 : 0; - -out: - mutex_unlock(&esw->state_lock); - return err; + return 0; } int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw, -- cgit v1.2.3 From 5ee090ed0da649b1febae2b7c285ac77d1e55a0c Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 9 Dec 2019 14:08:18 +0200 Subject: net/mlx5e: Reset RQ doorbell counter before moving RQ state from RST to RDY Initialize RQ doorbell counters to zero prior to moving an RQ from RST to RDY state. Per HW spec, when RQ is back to RDY state, the descriptor ID on the completion is reset. The doorbell record must comply. Fixes: 8276ea1353a4 ("net/mlx5e: Report and recover from CQE with error on RQ") Signed-off-by: Aya Levin Reported-by: Tariq Toukan Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 8 +++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++ drivers/net/ethernet/mellanox/mlx5/core/wq.c | 39 +++++++++++++++++------ drivers/net/ethernet/mellanox/mlx5/core/wq.h | 2 ++ 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 7c8796d9743f..a226277b0980 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -179,6 +179,14 @@ mlx5e_tx_dma_unmap(struct device *pdev, struct mlx5e_sq_dma *dma) } } +static inline void mlx5e_rqwq_reset(struct mlx5e_rq *rq) +{ + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) + mlx5_wq_ll_reset(&rq->mpwqe.wq); + else + mlx5_wq_cyc_reset(&rq->wqe.wq); +} + /* SW parser related functions */ struct mlx5e_swp_spec { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 454d3459bd8b..966983674663 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -712,6 +712,9 @@ int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state) if (!in) return -ENOMEM; + if (curr_state == MLX5_RQC_STATE_RST && next_state == MLX5_RQC_STATE_RDY) + mlx5e_rqwq_reset(rq); + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(modify_rq_in, in, rq_state, curr_state); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index 02f7e4a39578..01f075fac276 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -94,6 +94,13 @@ void mlx5_wq_cyc_wqe_dump(struct mlx5_wq_cyc *wq, u16 ix, u8 nstrides) print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, wqe, len, false); } +void mlx5_wq_cyc_reset(struct mlx5_wq_cyc *wq) +{ + wq->wqe_ctr = 0; + wq->cur_sz = 0; + mlx5_wq_cyc_update_db_record(wq); +} + int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *qpc, struct mlx5_wq_qp *wq, struct mlx5_wq_ctrl *wq_ctrl) @@ -192,6 +199,19 @@ err_db_free: return err; } +static void mlx5_wq_ll_init_list(struct mlx5_wq_ll *wq) +{ + struct mlx5_wqe_srq_next_seg *next_seg; + int i; + + for (i = 0; i < wq->fbc.sz_m1; i++) { + next_seg = mlx5_wq_ll_get_wqe(wq, i); + next_seg->next_wqe_index = cpu_to_be16(i + 1); + } + next_seg = mlx5_wq_ll_get_wqe(wq, i); + wq->tail_next = &next_seg->next_wqe_index; +} + int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *wqc, struct mlx5_wq_ll *wq, struct mlx5_wq_ctrl *wq_ctrl) @@ -199,9 +219,7 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, u8 log_wq_stride = MLX5_GET(wq, wqc, log_wq_stride); u8 log_wq_sz = MLX5_GET(wq, wqc, log_wq_sz); struct mlx5_frag_buf_ctrl *fbc = &wq->fbc; - struct mlx5_wqe_srq_next_seg *next_seg; int err; - int i; err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node); if (err) { @@ -220,13 +238,7 @@ int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, mlx5_init_fbc(wq_ctrl->buf.frags, log_wq_stride, log_wq_sz, fbc); - for (i = 0; i < fbc->sz_m1; i++) { - next_seg = mlx5_wq_ll_get_wqe(wq, i); - next_seg->next_wqe_index = cpu_to_be16(i + 1); - } - next_seg = mlx5_wq_ll_get_wqe(wq, i); - wq->tail_next = &next_seg->next_wqe_index; - + mlx5_wq_ll_init_list(wq); wq_ctrl->mdev = mdev; return 0; @@ -237,6 +249,15 @@ err_db_free: return err; } +void mlx5_wq_ll_reset(struct mlx5_wq_ll *wq) +{ + wq->head = 0; + wq->wqe_ctr = 0; + wq->cur_sz = 0; + mlx5_wq_ll_init_list(wq); + mlx5_wq_ll_update_db_record(wq); +} + void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl) { mlx5_frag_buf_free(wq_ctrl->mdev, &wq_ctrl->buf); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h index d9a94bc223c0..4cadc336593f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h @@ -80,6 +80,7 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *wqc, struct mlx5_wq_cyc *wq, struct mlx5_wq_ctrl *wq_ctrl); void mlx5_wq_cyc_wqe_dump(struct mlx5_wq_cyc *wq, u16 ix, u8 nstrides); +void mlx5_wq_cyc_reset(struct mlx5_wq_cyc *wq); int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *qpc, struct mlx5_wq_qp *wq, @@ -92,6 +93,7 @@ int mlx5_cqwq_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, int mlx5_wq_ll_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *wqc, struct mlx5_wq_ll *wq, struct mlx5_wq_ctrl *wq_ctrl); +void mlx5_wq_ll_reset(struct mlx5_wq_ll *wq); void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl); -- cgit v1.2.3 From 1ad6c43c6a7b8627240c6cc19c69e31fedc596a7 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 12 Feb 2020 15:17:25 +0200 Subject: net/mlx5e: Fix crash in recovery flow without devlink reporter When health reporters are not supported, recovery function is invoked directly, not via devlink health reporters. In this direct flow, the recover function input parameter was passed incorrectly and is causing a kernel oops. This patch is fixing the input parameter. Following call trace is observed on rx error health reporting. Internal error: Oops: 96000007 [#1] PREEMPT SMP Process kworker/u16:4 (pid: 4584, stack limit = 0x00000000c9e45703) Call trace: mlx5e_rx_reporter_err_rq_cqe_recover+0x30/0x164 [mlx5_core] mlx5e_health_report+0x60/0x6c [mlx5_core] mlx5e_reporter_rq_cqe_err+0x6c/0x90 [mlx5_core] mlx5e_rq_err_cqe_work+0x20/0x2c [mlx5_core] process_one_work+0x168/0x3d0 worker_thread+0x58/0x3d0 kthread+0x108/0x134 Fixes: c50de4af1d63 ("net/mlx5e: Generalize tx reporter's functionality") Signed-off-by: Aya Levin Signed-off-by: Parav Pandit Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/health.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c index 3a975641f902..20b907dc1e29 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c @@ -200,7 +200,7 @@ int mlx5e_health_report(struct mlx5e_priv *priv, netdev_err(priv->netdev, err_str); if (!reporter) - return err_ctx->recover(&err_ctx->ctx); + return err_ctx->recover(err_ctx->ctx); return devlink_health_report(reporter, err_str, err_ctx); } -- cgit v1.2.3 From 52d214976d4f64504c1bbb52d47b46a5a3d5ee42 Mon Sep 17 00:00:00 2001 From: Hamdan Igbaria Date: Wed, 5 Feb 2020 14:31:12 +0200 Subject: net/mlx5: DR, Fix matching on vport gvmi Set vport gvmi in the tag, only when source gvmi is set in the bit mask. Fixes: 26d688e3 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Hamdan Igbaria Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index c6c7d1defbd7..aade62a9ee5c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -2307,7 +2307,9 @@ static int dr_ste_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, struct mlx5dr_cmd_vport_cap *vport_cap; struct mlx5dr_domain *dmn = sb->dmn; struct mlx5dr_cmd_caps *caps; + u8 *bit_mask = sb->bit_mask; u8 *tag = hw_ste->tag; + bool source_gvmi_set; DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn); @@ -2328,7 +2330,8 @@ static int dr_ste_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, if (!vport_cap) return -EINVAL; - if (vport_cap->vport_gvmi) + source_gvmi_set = MLX5_GET(ste_src_gvmi_qp, bit_mask, source_gvmi); + if (vport_cap->vport_gvmi && source_gvmi_set) MLX5_SET(ste_src_gvmi_qp, tag, source_gvmi, vport_cap->vport_gvmi); misc->source_eswitch_owner_vhca_id = 0; -- cgit v1.2.3 From 383de108157c881074f32914b61125e299820bd2 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 12 Feb 2020 11:32:39 +0200 Subject: net/mlx5e: Don't clear the whole vf config when switching modes There is no need to reset all vf config (except link state) between legacy and switchdev modes changes. Also, set link state to AUTO, when legacy enabled. Fixes: 3b83b6c2e024 ("net/mlx5e: Clear VF config when switching modes") Signed-off-by: Dmytro Linkin Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 6 +++++- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 564d42605892..e49acd0c5da5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -459,12 +459,16 @@ static void esw_destroy_legacy_table(struct mlx5_eswitch *esw) static int esw_legacy_enable(struct mlx5_eswitch *esw) { - int ret; + struct mlx5_vport *vport; + int ret, i; ret = esw_create_legacy_table(esw); if (ret) return ret; + mlx5_esw_for_each_vf_vport(esw, i, vport, esw->esw_funcs.num_vfs) + vport->info.link_state = MLX5_VPORT_ADMIN_STATE_AUTO; + ret = mlx5_eswitch_enable_pf_vf_vports(esw, MLX5_LEGACY_SRIOV_VPORT_EVENTS); if (ret) esw_destroy_legacy_table(esw); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 979f13bdc203..1a57b2bd74b8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1172,7 +1172,7 @@ static int esw_offloads_start(struct mlx5_eswitch *esw, return -EINVAL; } - mlx5_eswitch_disable(esw, true); + mlx5_eswitch_disable(esw, false); mlx5_eswitch_update_num_of_vfs(esw, esw->dev->priv.sriov.num_vfs); err = mlx5_eswitch_enable(esw, MLX5_ESWITCH_OFFLOADS); if (err) { @@ -2065,7 +2065,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw, { int err, err1; - mlx5_eswitch_disable(esw, true); + mlx5_eswitch_disable(esw, false); err = mlx5_eswitch_enable(esw, MLX5_ESWITCH_LEGACY); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed setting eswitch to legacy"); -- cgit v1.2.3 From 76781623f009d5615b67f0675230ef90eaa9272a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 31 Dec 2019 17:04:15 +0200 Subject: net/mlx5: Fix lowest FDB pool size The pool sizes represent the pool sizes in the fw. when we request a pool size from fw, it will return the next possible group. We track how many pools the fw has left and start requesting groups from the big to the small. When we start request 4k group, which doesn't exists in fw, fw wants to allocate the next possible size, 64k, but will fail since its exhausted. The correct smallest pool size in fw is 128 and not 4k. Fixes: 39ac237ce009 ("net/mlx5: E-Switch, Refactor chains and priorities") Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c index c5a446e295aa..4276194b633f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads_chains.c @@ -35,7 +35,7 @@ static const unsigned int ESW_POOLS[] = { 4 * 1024 * 1024, 1 * 1024 * 1024, 64 * 1024, - 4 * 1024, }; + 128 }; struct mlx5_esw_chains_priv { struct rhashtable chains_ht; -- cgit v1.2.3 From 13a7e459a41a56d788ab33d825c6205379bbb711 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 14 Jan 2020 09:27:27 +0200 Subject: net/mlx5: DR, Handle reformat capability over sw-steering tables On flow table creation, send the relevant flags according to what the FW currently supports. When FW doesn't support reformat option over SW-steering managed table, the driver shouldn't pass this. Fixes: 988fd6b32d07 ("net/mlx5: DR, Pass table flags at creation to lower layer") Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c | 9 +++++++-- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 3abfc8125926..c2027192e21e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -66,15 +66,20 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *next_ft) { struct mlx5dr_table *tbl; + u32 flags; int err; if (mlx5_dr_is_fw_table(ft->flags)) return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft, log_size, next_ft); + flags = ft->flags; + /* turn off encap/decap if not supported for sw-str by fw */ + if (!MLX5_CAP_FLOWTABLE(ns->dev, sw_owner_reformat_supported)) + flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); - tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, - ft->level, ft->flags); + tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating dr flow_table\n"); return -EINVAL; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ff8c9d527bb4..bfdf41537cf1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -688,7 +688,10 @@ struct mlx5_ifc_flow_table_nic_cap_bits { u8 nic_rx_multi_path_tirs[0x1]; u8 nic_rx_multi_path_tirs_fts[0x1]; u8 allow_sniffer_and_nic_rx_shared_tir[0x1]; - u8 reserved_at_3[0x1d]; + u8 reserved_at_3[0x4]; + u8 sw_owner_reformat_supported[0x1]; + u8 reserved_at_8[0x18]; + u8 encap_general_header[0x1]; u8 reserved_at_21[0xa]; u8 log_max_packet_reformat_context[0x5]; -- cgit v1.2.3 From 3dfee47b215e49788cfc80e474820ea2e948c031 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 10 Feb 2020 15:51:15 +0800 Subject: iommu/amd: Disable IOMMU on Stoney Ridge systems Serious screen flickering when Stoney Ridge outputs to a 4K monitor. Use identity-mapping and PCI ATS doesn't help this issue. According to Alex Deucher, IOMMU isn't enabled on Windows, so let's do the same here to avoid screen flickering on 4K monitor. Cc: Alex Deucher Bug: https://gitlab.freedesktop.org/drm/amd/issues/961 Signed-off-by: Kai-Heng Feng Acked-by: Alex Deucher Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu_init.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 2759a8d57b7f..6be3853a5d97 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -2523,6 +2523,7 @@ static int __init early_amd_iommu_init(void) struct acpi_table_header *ivrs_base; acpi_status status; int i, remap_cache_sz, ret = 0; + u32 pci_id; if (!amd_iommu_detected) return -ENODEV; @@ -2610,6 +2611,16 @@ static int __init early_amd_iommu_init(void) if (ret) goto out; + /* Disable IOMMU if there's Stoney Ridge graphics */ + for (i = 0; i < 32; i++) { + pci_id = read_pci_config(0, i, 0, 0); + if ((pci_id & 0xffff) == 0x1002 && (pci_id >> 16) == 0x98e4) { + pr_info("Disable IOMMU on Stoney Ridge\n"); + amd_iommu_disabled = true; + break; + } + } + /* Disable any previously enabled IOMMUs */ if (!is_kdump_kernel() || amd_iommu_disabled) disable_iommus(); @@ -2718,7 +2729,7 @@ static int __init state_next(void) ret = early_amd_iommu_init(); init_state = ret ? IOMMU_INIT_ERROR : IOMMU_ACPI_FINISHED; if (init_state == IOMMU_ACPI_FINISHED && amd_iommu_disabled) { - pr_info("AMD IOMMU disabled on kernel command-line\n"); + pr_info("AMD IOMMU disabled\n"); init_state = IOMMU_CMDLINE_DISABLED; ret = -EINVAL; } -- cgit v1.2.3 From faf305c51aeabd1ea2d7131e798ef5f55f4a7750 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 18 Feb 2020 18:12:41 +0000 Subject: iommu/qcom: Fix bogus detach logic Currently, the implementation of qcom_iommu_domain_free() is guaranteed to do one of two things: WARN() and leak everything, or dereference NULL and crash. That alone is terrible, but in fact the whole idea of trying to track the liveness of a domain via the qcom_domain->iommu pointer as a sanity check is full of fundamentally flawed assumptions. Make things robust and actually functional by not trying to be quite so clever. Reported-by: Brian Masney Tested-by: Brian Masney Reported-by: Naresh Kamboju Fixes: 0ae349a0f33f ("iommu/qcom: Add qcom_iommu") Signed-off-by: Robin Murphy Tested-by: Stephan Gerhold Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Joerg Roedel --- drivers/iommu/qcom_iommu.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/qcom_iommu.c b/drivers/iommu/qcom_iommu.c index 39759db4f003..4328da0b0a9f 100644 --- a/drivers/iommu/qcom_iommu.c +++ b/drivers/iommu/qcom_iommu.c @@ -344,21 +344,19 @@ static void qcom_iommu_domain_free(struct iommu_domain *domain) { struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); - if (WARN_ON(qcom_domain->iommu)) /* forgot to detach? */ - return; - iommu_put_dma_cookie(domain); - /* NOTE: unmap can be called after client device is powered off, - * for example, with GPUs or anything involving dma-buf. So we - * cannot rely on the device_link. Make sure the IOMMU is on to - * avoid unclocked accesses in the TLB inv path: - */ - pm_runtime_get_sync(qcom_domain->iommu->dev); - - free_io_pgtable_ops(qcom_domain->pgtbl_ops); - - pm_runtime_put_sync(qcom_domain->iommu->dev); + if (qcom_domain->iommu) { + /* + * NOTE: unmap can be called after client device is powered + * off, for example, with GPUs or anything involving dma-buf. + * So we cannot rely on the device_link. Make sure the IOMMU + * is on to avoid unclocked accesses in the TLB inv path: + */ + pm_runtime_get_sync(qcom_domain->iommu->dev); + free_io_pgtable_ops(qcom_domain->pgtbl_ops); + pm_runtime_put_sync(qcom_domain->iommu->dev); + } kfree(qcom_domain); } @@ -404,7 +402,7 @@ static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *de struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); unsigned i; - if (!qcom_domain->iommu) + if (WARN_ON(!qcom_domain->iommu)) return; pm_runtime_get_sync(qcom_iommu->dev); @@ -417,8 +415,6 @@ static void qcom_iommu_detach_dev(struct iommu_domain *domain, struct device *de ctx->domain = NULL; } pm_runtime_put_sync(qcom_iommu->dev); - - qcom_domain->iommu = NULL; } static int qcom_iommu_map(struct iommu_domain *domain, unsigned long iova, -- cgit v1.2.3 From ab362fffa0feb0da23191111e60b641d39130053 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 18 Feb 2020 17:27:56 +0000 Subject: iommu/arm-smmu: Restore naming of driver parameter prefix Extending the Arm SMMU driver to allow for modular builds changed KBUILD_MODNAME to be "arm_smmu_mod" so that a single module could be built from the multiple existing object files without the need to rename any source files. This inadvertently changed the name of the driver parameters, which may lead to runtime issues if bootloaders are relying on the old names for correctness (e.g. "arm-smmu.disable_bypass=0"). Although MODULE_PARAM_PREFIX can be overridden to restore the old naming for builtin parameters, only the new name is matched by modprobe and so loading the driver as a module would cause parameters specified on the kernel command line to be ignored. Instead, rename "arm_smmu_mod" to "arm_smmu". Whilst it's a bit of a bodge, this allows us to create a single module without renaming any files and makes use of the fact that underscores and hyphens can be used interchangeably in parameter names. Cc: Robin Murphy Cc: Russell King Reported-by: Li Yang Fixes: cd221bd24ff5 ("iommu/arm-smmu: Allow building as a module") Signed-off-by: Will Deacon Reviewed-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 2104fb8afc06..9f33fdb3bb05 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -14,8 +14,8 @@ obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o amd_iommu_quirks.o obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += amd_iommu_debugfs.o obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o -obj-$(CONFIG_ARM_SMMU) += arm-smmu-mod.o -arm-smmu-mod-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o +obj-$(CONFIG_ARM_SMMU) += arm_smmu.o +arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-qcom.o obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o -- cgit v1.2.3 From 140588bfed2727e9a4814211617a79401d74922f Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 14 Feb 2020 18:26:28 +0100 Subject: s390: remove obsolete ieee_emulation_warnings s390 math emulation was removed with commit 5a79859ae0f3 ("s390: remove 31 bit support"), rendering ieee_emulation_warnings useless. The code still built because it was protected by CONFIG_MATHEMU, which was no longer selectable. This patch removes the sysctl_ieee_emulation_warnings declaration and the sysctl entry declaration. Link: https://lkml.kernel.org/r/20200214172628.3598516-1-steve@sk2.org Reviewed-by: Vasily Gorbik Signed-off-by: Stephen Kitt Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/processor.h | 1 - kernel/sysctl.c | 9 --------- 2 files changed, 10 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 361ef5eda468..aadb3d0e2adc 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -84,7 +84,6 @@ void s390_update_cpu_mhz(void); void cpu_detect_mhz_feature(void); extern const struct seq_operations cpuinfo_op; -extern int sysctl_ieee_emulation_warnings; extern void execve_tail(void); extern void __bpon(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d396aaaf19a3..ad5b88a53c5a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = { .extra2 = &maxolduid, }, #ifdef CONFIG_S390 -#ifdef CONFIG_MATHEMU - { - .procname = "ieee_emulation_warnings", - .data = &sysctl_ieee_emulation_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "userprocess_debug", .data = &show_unhandled_signals, -- cgit v1.2.3 From d0022c0ef29b78bcbe8a5c5894bd2307143afce1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 19 Feb 2020 10:19:13 +0000 Subject: arm64: memory: Add missing brackets to untagged_addr() macro Add brackets around the evaluation of the 'addr' parameter to the untagged_addr() macro so that the cast to 'u64' applies to the result of the expression. Cc: Fixes: 597399d0cb91 ("arm64: tags: Preserve tags for addresses translated via TTBR1") Reported-by: Linus Torvalds Signed-off-by: Will Deacon --- arch/arm64/include/asm/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index a4f9ca5479b0..4d94676e5a8b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -213,7 +213,7 @@ static inline unsigned long kaslr_offset(void) ((__force __typeof__(addr))sign_extend64((__force u64)(addr), 55)) #define untagged_addr(addr) ({ \ - u64 __addr = (__force u64)addr; \ + u64 __addr = (__force u64)(addr); \ __addr &= __untagged_addr(__addr); \ (__force __typeof__(addr))__addr; \ }) -- cgit v1.2.3 From 1fae37accfc5872af3905d4ba71dc6ab15829be7 Mon Sep 17 00:00:00 2001 From: Shyjumon N Date: Thu, 6 Feb 2020 13:17:25 -0700 Subject: nvme/pci: Add sleep quirk for Samsung and Toshiba drives The Samsung SSD SM981/PM981 and Toshiba SSD KBG40ZNT256G on the Lenovo C640 platform experience runtime resume issues when the SSDs are kept in sleep/suspend mode for long time. This patch applies the 'Simple Suspend' quirk to these configurations. With this patch, the issue had not been observed in a 1+ day test. Reviewed-by: Jon Derrick Reviewed-by: Christoph Hellwig Signed-off-by: Shyjumon N Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9c80f9f08149..b0434b687b17 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2747,6 +2747,18 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") || dmi_match(DMI_BOARD_NAME, "PRIME Z370-A"))) return NVME_QUIRK_NO_APST; + } else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 || + pdev->device == 0xa808 || pdev->device == 0xa809)) || + (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) { + /* + * Forcing to use host managed nvme power settings for + * lowest idle power with quick resume latency on + * Samsung and Toshiba SSDs based on suspend behavior + * on Coffee Lake board for LENOVO C640 + */ + if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) && + dmi_match(DMI_BOARD_NAME, "LNVNB161216")) + return NVME_QUIRK_SIMPLE_SUSPEND; } return 0; -- cgit v1.2.3 From 98f7b86a0becc1154b1a6df6e75c9695dfd87e0d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 12 Feb 2020 12:32:18 +0200 Subject: nvme-pci: Use single IRQ vector for old Apple models People reported that old Apple machines are not working properly if the non-first IRQ vector is in use. Set quirk for that models to limit IRQ to use first vector only. Based on original patch by GitHub user npx001. Link: https://github.com/Dunedan/mbp-2016-linux/issues/9 Cc: Benjamin Herrenschmidt Cc: Leif Liddy Signed-off-by: Andy Shevchenko Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b0434b687b17..ace4dd9e953c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3121,7 +3121,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, - { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), + .driver_data = NVME_QUIRK_SINGLE_VECTOR }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), .driver_data = NVME_QUIRK_SINGLE_VECTOR | -- cgit v1.2.3 From debcf83770073f90c9b075134650fdc758ff3033 Mon Sep 17 00:00:00 2001 From: changzhu Date: Fri, 14 Feb 2020 10:57:17 +0800 Subject: drm/amdgpu: add is_raven_kicker judgement for raven1 The rlc version of raven_kicer_rlc is different from the legacy rlc version of raven_rlc. So it needs to add a judgement function for raven_kicer_rlc and avoid disable GFXOFF when loading raven_kicer_rlc. Signed-off-by: changzhu Reviewed-by: Huang Rui Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6d6aca08d6fa..3afdbbd6aaad 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1193,6 +1193,14 @@ static bool gfx_v9_0_should_disable_gfxoff(struct pci_dev *pdev) return false; } +static bool is_raven_kicker(struct amdgpu_device *adev) +{ + if (adev->pm.fw_version >= 0x41e2b) + return true; + else + return false; +} + static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev) { if (gfx_v9_0_should_disable_gfxoff(adev->pdev)) @@ -1205,9 +1213,8 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev) break; case CHIP_RAVEN: if (!(adev->rev_id >= 0x8 || adev->pdev->device == 0x15d8) && - ((adev->gfx.rlc_fw_version != 106 && + ((!is_raven_kicker(adev) && adev->gfx.rlc_fw_version < 531) || - (adev->gfx.rlc_fw_version == 53815) || (adev->gfx.rlc_feature_version < 1) || !adev->gfx.rlc.is_rlc_v2_1)) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; -- cgit v1.2.3 From 6c62ce8073daf27ae3fd03b6929d6cea3887eeb2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 18 Feb 2020 13:20:30 -0500 Subject: drm/amdgpu/display: clean up hdcp workqueue handling Use the existence of the workqueue itself to determine when to enable HDCP features rather than sprinkling asic checks all over the code. Also add a check for the existence of the hdcp workqueue in the irq handling on the off chance we get and HPD RX interrupt with the CP bit set. This avoids a crash if the driver doesn't support HDCP for a particular asic. Fixes: 96a3b32e67236f ("drm/amd/display: only enable HDCP for DCN+") Bug: https://bugzilla.kernel.org/show_bug.cgi?id=206519 Reviewed-by: Bhawanpreet Lakha Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 63e8a12a74bc..e8f66fbf399e 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1911,7 +1911,7 @@ static void handle_hpd_irq(void *param) mutex_lock(&aconnector->hpd_lock); #ifdef CONFIG_DRM_AMD_DC_HDCP - if (adev->asic_type >= CHIP_RAVEN) + if (adev->dm.hdcp_workqueue) hdcp_reset_display(adev->dm.hdcp_workqueue, aconnector->dc_link->link_index); #endif if (aconnector->fake_enable) @@ -2088,8 +2088,10 @@ static void handle_hpd_rx_irq(void *param) } } #ifdef CONFIG_DRM_AMD_DC_HDCP - if (hpd_irq_data.bytes.device_service_irq.bits.CP_IRQ) - hdcp_handle_cpirq(adev->dm.hdcp_workqueue, aconnector->base.index); + if (hpd_irq_data.bytes.device_service_irq.bits.CP_IRQ) { + if (adev->dm.hdcp_workqueue) + hdcp_handle_cpirq(adev->dm.hdcp_workqueue, aconnector->base.index); + } #endif if ((dc_link->cur_link_settings.lane_count != LANE_COUNT_UNKNOWN) || (dc_link->type == dc_connection_mst_branch)) @@ -5702,7 +5704,7 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, drm_connector_attach_vrr_capable_property( &aconnector->base); #ifdef CONFIG_DRM_AMD_DC_HDCP - if (adev->asic_type >= CHIP_RAVEN) + if (adev->dm.hdcp_workqueue) drm_connector_attach_content_protection_property(&aconnector->base, true); #endif } -- cgit v1.2.3 From df6d4f9db79c1a5d6f48b59db35ccd1e9ff9adfc Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 16 Jan 2020 12:46:51 -0800 Subject: x86/boot/compressed: Don't declare __force_order in kaslr_64.c GCC 10 changed the default to -fno-common, which leads to LD arch/x86/boot/compressed/vmlinux ld: arch/x86/boot/compressed/pgtable_64.o:(.bss+0x0): multiple definition of `__force_order'; \ arch/x86/boot/compressed/kaslr_64.o:(.bss+0x0): first defined here make[2]: *** [arch/x86/boot/compressed/Makefile:119: arch/x86/boot/compressed/vmlinux] Error 1 Since __force_order is already provided in pgtable_64.c, there is no need to declare __force_order in kaslr_64.c. Signed-off-by: H.J. Lu Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200124181811.4780-1-hjl.tools@gmail.com --- arch/x86/boot/compressed/kaslr_64.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c index 748456c365f4..9557c5a15b91 100644 --- a/arch/x86/boot/compressed/kaslr_64.c +++ b/arch/x86/boot/compressed/kaslr_64.c @@ -29,9 +29,6 @@ #define __PAGE_OFFSET __PAGE_OFFSET_BASE #include "../../mm/ident_map.c" -/* Used by pgtable.h asm code to force instruction serialization. */ -unsigned long __force_order; - /* Used to track our page table allocation area. */ struct alloc_pgt_data { unsigned char *pgt_buf; -- cgit v1.2.3 From e9091ffd6a0aaced111b5d6ead5eaab5cd7101bc Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 10 Feb 2020 10:48:11 +0100 Subject: s390/qdio: fill SL with absolute addresses As the comment says, sl->sbal holds an absolute address. qeth currently solves this through wild casting, while zfcp doesn't care. Handle this properly in the code that actually builds the SL. Signed-off-by: Julian Wiedmann Reviewed-by: Alexandra Winter Reviewed-by: Steffen Maier [for qdio] Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/qdio.h | 2 +- drivers/s390/cio/qdio_setup.c | 3 ++- drivers/s390/net/qeth_core_main.c | 23 +++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 71e3f0146cda..7870cf834533 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -227,7 +227,7 @@ struct qdio_buffer { * @sbal: absolute SBAL address */ struct sl_element { - unsigned long sbal; + u64 sbal; } __attribute__ ((packed)); /** diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index 3ab8e80d7bbc..e115623b86b2 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "cio.h" @@ -205,7 +206,7 @@ static void setup_storage_lists(struct qdio_q *q, struct qdio_irq *irq_ptr, /* fill in sl */ for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++) - q->sl->element[j].sbal = (unsigned long)q->sbal[j]; + q->sl->element[j].sbal = virt_to_phys(q->sbal[j]); } static void setup_queues(struct qdio_irq *irq_ptr, diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 9639938581f5..2c83d489f099 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -4746,10 +4746,10 @@ static void qeth_qdio_establish_cq(struct qeth_card *card, if (card->options.cq == QETH_CQ_ENABLED) { int offset = QDIO_MAX_BUFFERS_PER_Q * (card->qdio.no_in_queues - 1); - for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i) { - in_sbal_ptrs[offset + i] = (struct qdio_buffer *) - virt_to_phys(card->qdio.c_q->bufs[i].buffer); - } + + for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++) + in_sbal_ptrs[offset + i] = + card->qdio.c_q->bufs[i].buffer; queue_start_poll[card->qdio.no_in_queues - 1] = NULL; } @@ -4783,10 +4783,9 @@ static int qeth_qdio_establish(struct qeth_card *card) rc = -ENOMEM; goto out_free_qib_param; } - for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i) { - in_sbal_ptrs[i] = (struct qdio_buffer *) - virt_to_phys(card->qdio.in_q->bufs[i].buffer); - } + + for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++) + in_sbal_ptrs[i] = card->qdio.in_q->bufs[i].buffer; queue_start_poll = kcalloc(card->qdio.no_in_queues, sizeof(void *), GFP_KERNEL); @@ -4807,11 +4806,11 @@ static int qeth_qdio_establish(struct qeth_card *card) rc = -ENOMEM; goto out_free_queue_start_poll; } + for (i = 0, k = 0; i < card->qdio.no_out_queues; ++i) - for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j, ++k) { - out_sbal_ptrs[k] = (struct qdio_buffer *)virt_to_phys( - card->qdio.out_qs[i]->bufs[j]->buffer); - } + for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++, k++) + out_sbal_ptrs[k] = + card->qdio.out_qs[i]->bufs[j]->buffer; memset(&init_data, 0, sizeof(struct qdio_initialize)); init_data.cdev = CARD_DDEV(card); -- cgit v1.2.3 From 2db01da8d25f0420c411e788a9e1ba39269ae37b Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 11 Feb 2020 09:27:38 +0100 Subject: s390/qdio: fill SBALEs with absolute addresses sbale->addr holds an absolute address (or for some FCP usage, an opaque request ID), and should only be used with proper virt/phys translation. Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/qdio.h | 4 ++-- drivers/s390/net/qeth_core_main.c | 23 ++++++++++++----------- drivers/s390/scsi/zfcp_fsf.c | 2 +- drivers/s390/scsi/zfcp_qdio.c | 6 +++--- drivers/s390/scsi/zfcp_qdio.h | 6 +++--- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 7870cf834533..1e3517b0518b 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -201,7 +201,7 @@ struct slib { * @scount: SBAL count * @sflags: whole SBAL flags * @length: length - * @addr: address + * @addr: absolute data address */ struct qdio_buffer_element { u8 eflags; @@ -211,7 +211,7 @@ struct qdio_buffer_element { u8 scount; u8 sflags; u32 length; - void *addr; + u64 addr; } __attribute__ ((packed, aligned(16))); /** diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 2c83d489f099..03935466265c 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1128,9 +1128,10 @@ static void qeth_clear_output_buffer(struct qeth_qdio_out_q *queue, qeth_tx_complete_buf(buf, error, budget); for (i = 0; i < queue->max_elements; ++i) { - if (buf->buffer->element[i].addr && buf->is_header[i]) - kmem_cache_free(qeth_core_header_cache, - buf->buffer->element[i].addr); + void *data = phys_to_virt(buf->buffer->element[i].addr); + + if (data && buf->is_header[i]) + kmem_cache_free(qeth_core_header_cache, data); buf->is_header[i] = 0; } @@ -2641,7 +2642,8 @@ static int qeth_init_input_buffer(struct qeth_card *card, buf->pool_entry = pool_entry; for (i = 0; i < QETH_MAX_BUFFER_ELEMENTS(card); ++i) { buf->buffer->element[i].length = PAGE_SIZE; - buf->buffer->element[i].addr = pool_entry->elements[i]; + buf->buffer->element[i].addr = + virt_to_phys(pool_entry->elements[i]); if (i == QETH_MAX_BUFFER_ELEMENTS(card) - 1) buf->buffer->element[i].eflags = SBAL_EFLAGS_LAST_ENTRY; else @@ -3459,9 +3461,8 @@ static void qeth_qdio_cq_handler(struct qeth_card *card, unsigned int qdio_err, while ((e < QDIO_MAX_ELEMENTS_PER_BUFFER) && buffer->element[e].addr) { - unsigned long phys_aob_addr; + unsigned long phys_aob_addr = buffer->element[e].addr; - phys_aob_addr = (unsigned long) buffer->element[e].addr; qeth_qdio_handle_aob(card, phys_aob_addr); ++e; } @@ -3750,7 +3751,7 @@ static unsigned int __qeth_fill_buffer(struct sk_buff *skb, elem_length = min_t(unsigned int, length, PAGE_SIZE - offset_in_page(data)); - buffer->element[element].addr = data; + buffer->element[element].addr = virt_to_phys(data); buffer->element[element].length = elem_length; length -= elem_length; if (is_first_elem) { @@ -3780,7 +3781,7 @@ static unsigned int __qeth_fill_buffer(struct sk_buff *skb, elem_length = min_t(unsigned int, length, PAGE_SIZE - offset_in_page(data)); - buffer->element[element].addr = data; + buffer->element[element].addr = virt_to_phys(data); buffer->element[element].length = elem_length; buffer->element[element].eflags = SBAL_EFLAGS_MIDDLE_FRAG; @@ -3820,7 +3821,7 @@ static unsigned int qeth_fill_buffer(struct qeth_qdio_out_buffer *buf, int element = buf->next_element_to_fill; is_first_elem = false; - buffer->element[element].addr = hdr; + buffer->element[element].addr = virt_to_phys(hdr); buffer->element[element].length = hd_len; buffer->element[element].eflags = SBAL_EFLAGS_FIRST_FRAG; /* remember to free cache-allocated qeth_hdr: */ @@ -5288,7 +5289,7 @@ next_packet: offset = 0; } - hdr = element->addr + offset; + hdr = phys_to_virt(element->addr) + offset; offset += sizeof(*hdr); skb = NULL; @@ -5387,7 +5388,7 @@ use_skb: walk_packet: while (skb_len) { int data_len = min(skb_len, (int)(element->length - offset)); - char *data = element->addr + offset; + char *data = phys_to_virt(element->addr) + offset; skb_len -= data_len; offset += data_len; diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 223a805f0b0b..cae9b7ff79b0 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -2510,7 +2510,7 @@ void zfcp_fsf_reqid_check(struct zfcp_qdio *qdio, int sbal_idx) for (idx = 0; idx < QDIO_MAX_ELEMENTS_PER_BUFFER; idx++) { sbale = &sbal->element[idx]; - req_id = (unsigned long) sbale->addr; + req_id = sbale->addr; fsf_req = zfcp_reqlist_find_rm(adapter->req_list, req_id); if (!fsf_req) { diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c index 661436a92f8e..f0d6296e673b 100644 --- a/drivers/s390/scsi/zfcp_qdio.c +++ b/drivers/s390/scsi/zfcp_qdio.c @@ -98,7 +98,7 @@ static void zfcp_qdio_int_resp(struct ccw_device *cdev, unsigned int qdio_err, memset(pl, 0, ZFCP_QDIO_MAX_SBALS_PER_REQ * sizeof(void *)); sbale = qdio->res_q[idx]->element; - req_id = (u64) sbale->addr; + req_id = sbale->addr; scount = min(sbale->scount + 1, ZFCP_QDIO_MAX_SBALS_PER_REQ + 1); /* incl. signaling SBAL */ @@ -199,7 +199,7 @@ int zfcp_qdio_sbals_from_sg(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req, q_req->sbal_number); return -EINVAL; } - sbale->addr = sg_virt(sg); + sbale->addr = sg_phys(sg); sbale->length = sg->length; } return 0; @@ -418,7 +418,7 @@ int zfcp_qdio_open(struct zfcp_qdio *qdio) sbale->length = 0; sbale->eflags = SBAL_EFLAGS_LAST_ENTRY; sbale->sflags = 0; - sbale->addr = NULL; + sbale->addr = 0; } if (do_QDIO(cdev, QDIO_FLAG_SYNC_INPUT, 0, 0, QDIO_MAX_BUFFERS_PER_Q)) diff --git a/drivers/s390/scsi/zfcp_qdio.h b/drivers/s390/scsi/zfcp_qdio.h index 2a816a37b3c0..6b43d6b254be 100644 --- a/drivers/s390/scsi/zfcp_qdio.h +++ b/drivers/s390/scsi/zfcp_qdio.h @@ -122,14 +122,14 @@ void zfcp_qdio_req_init(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req, % QDIO_MAX_BUFFERS_PER_Q; sbale = zfcp_qdio_sbale_req(qdio, q_req); - sbale->addr = (void *) req_id; + sbale->addr = req_id; sbale->eflags = 0; sbale->sflags = SBAL_SFLAGS0_COMMAND | sbtype; if (unlikely(!data)) return; sbale++; - sbale->addr = data; + sbale->addr = virt_to_phys(data); sbale->length = len; } @@ -152,7 +152,7 @@ void zfcp_qdio_fill_next(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req, BUG_ON(q_req->sbale_curr == qdio->max_sbale_per_sbal - 1); q_req->sbale_curr++; sbale = zfcp_qdio_sbale_curr(qdio, q_req); - sbale->addr = data; + sbale->addr = virt_to_phys(data); sbale->length = len; } -- cgit v1.2.3 From 15755854d53b4bbb0bb37a0fce66f0156cfc8a17 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 20 Feb 2020 00:59:36 +0900 Subject: nvme: Fix uninitialized-variable warning gcc may detect a false positive on nvme using an unintialized variable if setting features fails. Since this is not a fast path, explicitly initialize this variable to suppress the warning. Reported-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ada59df642d2..a4d8c90ee7cc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1165,8 +1165,8 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, u32 *result) { + union nvme_result res = { 0 }; struct nvme_command c; - union nvme_result res; int ret; memset(&c, 0, sizeof(c)); -- cgit v1.2.3 From a7a9456e8d28e81030f7cf6f1f59f907089916a9 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Wed, 19 Feb 2020 15:30:11 +0530 Subject: net: hsr: Pass lockdep expression to RCU lists node_db is traversed using list_for_each_entry_rcu outside an RCU read-side critical section but under the protection of hsr->list_lock. Hence, add corresponding lockdep expression to silence false-positive warnings, and harden RCU lists. Signed-off-by: Amol Grover Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 364ea2cc028e..3ba7f61be107 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -155,7 +155,8 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, new_node->seq_out[i] = seq_out; spin_lock_bh(&hsr->list_lock); - list_for_each_entry_rcu(node, node_db, mac_list) { + list_for_each_entry_rcu(node, node_db, mac_list, + lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) -- cgit v1.2.3 From 21b5ee59ef18e27d85810584caf1f7ddc705ea83 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Wed, 19 Feb 2020 18:52:43 +0100 Subject: x86/cpu/amd: Enable the fixed Instructions Retired counter IRPERF Commit aaf248848db50 ("perf/x86/msr: Add AMD IRPERF (Instructions Retired) performance counter") added support for access to the free-running counter via 'perf -e msr/irperf/', but when exercised, it always returns a 0 count: BEFORE: $ perf stat -e instructions,msr/irperf/ true Performance counter stats for 'true': 624,833 instructions 0 msr/irperf/ Simply set its enable bit - HWCR bit 30 - to make it start counting. Enablement is restricted to all machines advertising IRPERF capability, except those susceptible to an erratum that makes the IRPERF return bad values. That erratum occurs in Family 17h models 00-1fh [1], but not in F17h models 20h and above [2]. AFTER (on a family 17h model 31h machine): $ perf stat -e instructions,msr/irperf/ true Performance counter stats for 'true': 621,690 instructions 622,490 msr/irperf/ [1] Revision Guide for AMD Family 17h Models 00h-0Fh Processors [2] Revision Guide for AMD Family 17h Models 30h-3Fh Processors The revision guides are available from the bugzilla Link below. [ bp: Massage commit message. ] Fixes: aaf248848db50 ("perf/x86/msr: Add AMD IRPERF (Instructions Retired) performance counter") Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 Link: http://lkml.kernel.org/r/20200214201805.13830-1-kim.phillips@amd.com --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ebe1685e92dd..d5e517d1c3dd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -512,6 +512,8 @@ #define MSR_K7_HWCR 0xc0010015 #define MSR_K7_HWCR_SMMLOCK_BIT 0 #define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) +#define MSR_K7_HWCR_IRPERF_EN_BIT 30 +#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ac83a0fef628..1f875fbe1384 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -28,6 +28,7 @@ static const int amd_erratum_383[]; static const int amd_erratum_400[]; +static const int amd_erratum_1054[]; static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); /* @@ -972,6 +973,15 @@ static void init_amd(struct cpuinfo_x86 *c) /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + /* + * Turn on the Instructions Retired free counter on machines not + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (cpu_has(c, X86_FEATURE_IRPERF) && + !cpu_has_amd_erratum(c, amd_erratum_1054)) + msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); } #ifdef CONFIG_X86_32 @@ -1099,6 +1109,10 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ +static const int amd_erratum_1054[] = + AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); + static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { -- cgit v1.2.3 From c3331d2fe3fd4d5e321f2467d01f72de7edfb5d0 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 19 Feb 2020 18:01:22 +0300 Subject: nfc: pn544: Fix occasional HW initialization failure The PN544 driver checks the "enable" polarity during of driver's probe and it's doing that by turning ON and OFF NFC with different polarities until enabling succeeds. It takes some time for the hardware to power-down, and thus, to deassert the IRQ that is raised by turning ON the hardware. Since the delay after last power-down of the polarity-checking process is missed in the code, the interrupt may trigger immediately after installing the IRQ handler (right after the checking is done), which results in IRQ handler trying to touch the disabled HW and ends with marking NFC as 'DEAD' during of the driver's probe: pn544_hci_i2c 1-002a: NFC: nfc_en polarity : active high pn544_hci_i2c 1-002a: NFC: invalid len byte shdlc: llc_shdlc_recv_frame: NULL Frame -> link is dead This patch fixes the occasional NFC initialization failure on Nexus 7 device. Signed-off-by: Dmitry Osipenko Signed-off-by: David S. Miller --- drivers/nfc/pn544/i2c.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nfc/pn544/i2c.c b/drivers/nfc/pn544/i2c.c index 720c89d6066e..4ac8cb262559 100644 --- a/drivers/nfc/pn544/i2c.c +++ b/drivers/nfc/pn544/i2c.c @@ -225,6 +225,7 @@ static void pn544_hci_i2c_platform_init(struct pn544_i2c_phy *phy) out: gpiod_set_value_cansleep(phy->gpiod_en, !phy->en_polarity); + usleep_range(10000, 15000); } static void pn544_hci_i2c_enable_mode(struct pn544_i2c_phy *phy, int run_mode) -- cgit v1.2.3 From 33c4acbe2f4e8f2866914b1fb90ce74fc7216c21 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 20:47:46 +0530 Subject: bridge: br_stp: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 6856a6d9282b..1f14b8455345 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -63,7 +63,8 @@ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { struct net_bridge_port *p; - list_for_each_entry_rcu(p, &br->port_list, list) { + list_for_each_entry_rcu(p, &br->port_list, list, + lockdep_is_held(&br->lock)) { if (p->port_no == port_no) return p; } -- cgit v1.2.3 From 840f8ad0aaf20044e2fb099095bbce27c02f58da Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Thu, 13 Feb 2020 13:31:23 -0800 Subject: ice: Don't reject odd values of usecs set by user Currently if a user sets an odd [tx|rx]-usecs value through ethtool, the request is denied because the hardware is set to have an ITR granularity of 2us. This caused poor customer experience. Fix this by aligning to a register allowed value, which results in rounding down. Also, print a once per ring container type message to be clear about our intentions. Also, change the ITR_TO_REG define to be the bitwise and of the ITR setting and the ICE_ITR_MASK. This makes the purpose of ITR_TO_REG more obvious. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 49 +++++++++++++++++++++------- drivers/net/ethernet/intel/ice/ice_txrx.h | 2 +- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index b002ab4e5838..a88763066681 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3489,21 +3489,13 @@ ice_set_rc_coalesce(enum ice_container_type c_type, struct ethtool_coalesce *ec, return -EINVAL; } - /* hardware only supports an ITR granularity of 2us */ - if (coalesce_usecs % 2 != 0) { - netdev_info(vsi->netdev, "Invalid value, %s-usecs must be even\n", - c_type_str); - return -EINVAL; - } - if (use_adaptive_coalesce) { rc->itr_setting |= ICE_ITR_DYNAMIC; } else { - /* store user facing value how it was set */ + /* save the user set usecs */ rc->itr_setting = coalesce_usecs; - /* set to static and convert to value HW understands */ - rc->target_itr = - ITR_TO_REG(ITR_REG_ALIGN(rc->itr_setting)); + /* device ITR granularity is in 2 usec increments */ + rc->target_itr = ITR_REG_ALIGN(rc->itr_setting); } return 0; @@ -3596,6 +3588,30 @@ ice_is_coalesce_param_invalid(struct net_device *netdev, return 0; } +/** + * ice_print_if_odd_usecs - print message if user tries to set odd [tx|rx]-usecs + * @netdev: netdev used for print + * @itr_setting: previous user setting + * @use_adaptive_coalesce: if adaptive coalesce is enabled or being enabled + * @coalesce_usecs: requested value of [tx|rx]-usecs + * @c_type_str: either "rx" or "tx" to match user set field of [tx|rx]-usecs + */ +static void +ice_print_if_odd_usecs(struct net_device *netdev, u16 itr_setting, + u32 use_adaptive_coalesce, u32 coalesce_usecs, + const char *c_type_str) +{ + if (use_adaptive_coalesce) + return; + + itr_setting = ITR_TO_REG(itr_setting); + + if (itr_setting != coalesce_usecs && (coalesce_usecs % 2)) + netdev_info(netdev, "User set %s-usecs to %d, device only supports even values. Rounding down and attempting to set %s-usecs to %d\n", + c_type_str, coalesce_usecs, c_type_str, + ITR_REG_ALIGN(coalesce_usecs)); +} + /** * __ice_set_coalesce - set ITR/INTRL values for the device * @netdev: pointer to the netdev associated with this query @@ -3616,8 +3632,19 @@ __ice_set_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, return -EINVAL; if (q_num < 0) { + struct ice_q_vector *q_vector = vsi->q_vectors[0]; int v_idx; + if (q_vector) { + ice_print_if_odd_usecs(netdev, q_vector->rx.itr_setting, + ec->use_adaptive_rx_coalesce, + ec->rx_coalesce_usecs, "rx"); + + ice_print_if_odd_usecs(netdev, q_vector->tx.itr_setting, + ec->use_adaptive_tx_coalesce, + ec->tx_coalesce_usecs, "tx"); + } + ice_for_each_q_vector(vsi, v_idx) { /* In some cases if DCB is configured the num_[rx|tx]q * can be less than vsi->num_q_vectors. This check diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index 14a1bf445889..7ee00a128663 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -222,7 +222,7 @@ enum ice_rx_dtype { #define ICE_ITR_GRAN_S 1 /* ITR granularity is always 2us */ #define ICE_ITR_GRAN_US BIT(ICE_ITR_GRAN_S) #define ICE_ITR_MASK 0x1FFE /* ITR register value alignment mask */ -#define ITR_REG_ALIGN(setting) __ALIGN_MASK(setting, ~ICE_ITR_MASK) +#define ITR_REG_ALIGN(setting) ((setting) & ICE_ITR_MASK) #define ICE_ITR_ADAPTIVE_MIN_INC 0x0002 #define ICE_ITR_ADAPTIVE_MIN_USECS 0x0002 -- cgit v1.2.3 From 8a55c08d3bbc9ffc9639f69f742e59ebd99f913b Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Thu, 13 Feb 2020 13:31:24 -0800 Subject: ice: Don't tell the OS that link is going down Remove code that tell the OS that link is going down when user change flow control via ethtool. When link is up it isn't certain that link goes down after 0x0605 aq command. If link doesn't go down, OS thinks that link is down, but physical link is up. To reset this state user have to take interface down and up. If link goes down after 0x0605 command, FW send information about that and after that driver tells the OS that the link goes down. So this code in ethtool is unnecessary. Signed-off-by: Michal Swiatkowski Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index a88763066681..77c412a7e7a4 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -2936,13 +2936,6 @@ ice_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) else return -EINVAL; - /* Tell the OS link is going down, the link will go back up when fw - * says it is ready asynchronously - */ - ice_print_link_msg(vsi, false); - netif_carrier_off(netdev); - netif_tx_stop_all_queues(netdev); - /* Set the FC mode and only restart AN if link is up */ status = ice_set_fc(pi, &aq_failures, link_up); -- cgit v1.2.3 From c54d209c78b8a3d0a75e710993833ebe1eb3273b Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Tue, 18 Feb 2020 13:22:06 -0800 Subject: ice: Wait for VF to be reset/ready before configuration The configuration/command below is failing when the VF in the xml file is already bound to the host iavf driver. pci_0000_af_0_0.xml:
> virsh attach-device domain_name pci_0000_af_0_0.xml error: Failed to attach device from pci_0000_af_0_0.xml error: Cannot set interface MAC/vlanid to 00:de:ad:00:11:01/0 for ifname ens1f1 vf 0: Device or resource busy This is failing because the VF has not been completely removed/reset after being unbound (via the virsh command above) from the host iavf driver and ice_set_vf_mac() checks if the VF is disabled before waiting for the reset to finish. Fix this by waiting for the VF remove/reset process to happen before checking if the VF is disabled. Also, since many functions for VF administration on the PF were more or less calling the same 3 functions (ice_wait_on_vf_reset(), ice_is_vf_disabled(), and ice_check_vf_init()) move these into the helper function ice_check_vf_ready_for_cfg(). Then call this function in any flow that attempts to configure/query a VF from the PF. Lastly, increase the maximum wait time in ice_wait_on_vf_reset() to 800ms, and modify/add the #define(s) that determine the wait time. This was done for robustness because in rare/stress cases VF removal can take a max of ~800ms and previously the wait was a max of ~300ms. Signed-off-by: Brett Creeley Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 134 +++++++++++++---------- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h | 3 +- 2 files changed, 76 insertions(+), 61 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 262714d5f54a..75c70d432c72 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -1873,6 +1873,48 @@ error_param: NULL, 0); } +/** + * ice_wait_on_vf_reset - poll to make sure a given VF is ready after reset + * @vf: The VF being resseting + * + * The max poll time is about ~800ms, which is about the maximum time it takes + * for a VF to be reset and/or a VF driver to be removed. + */ +static void ice_wait_on_vf_reset(struct ice_vf *vf) +{ + int i; + + for (i = 0; i < ICE_MAX_VF_RESET_TRIES; i++) { + if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) + break; + msleep(ICE_MAX_VF_RESET_SLEEP_MS); + } +} + +/** + * ice_check_vf_ready_for_cfg - check if VF is ready to be configured/queried + * @vf: VF to check if it's ready to be configured/queried + * + * The purpose of this function is to make sure the VF is not in reset, not + * disabled, and initialized so it can be configured and/or queried by a host + * administrator. + */ +static int ice_check_vf_ready_for_cfg(struct ice_vf *vf) +{ + struct ice_pf *pf; + + ice_wait_on_vf_reset(vf); + + if (ice_is_vf_disabled(vf)) + return -EINVAL; + + pf = vf->pf; + if (ice_check_vf_init(pf, vf)) + return -EBUSY; + + return 0; +} + /** * ice_set_vf_spoofchk * @netdev: network interface device structure @@ -1890,16 +1932,16 @@ int ice_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool ena) enum ice_status status; struct device *dev; struct ice_vf *vf; - int ret = 0; + int ret; dev = ice_pf_to_dev(pf); if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; vf = &pf->vf[vf_id]; - - if (ice_check_vf_init(pf, vf)) - return -EBUSY; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; vf_vsi = pf->vsi[vf->lan_vsi_idx]; if (!vf_vsi) { @@ -2696,7 +2738,7 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, struct ice_vsi *vsi; struct device *dev; struct ice_vf *vf; - int ret = 0; + int ret; dev = ice_pf_to_dev(pf); if (ice_validate_vf_id(pf, vf_id)) @@ -2714,13 +2756,15 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, vf = &pf->vf[vf_id]; vsi = pf->vsi[vf->lan_vsi_idx]; - if (ice_check_vf_init(pf, vf)) - return -EBUSY; + + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; if (le16_to_cpu(vsi->info.pvid) == vlanprio) { /* duplicate request, so just return success */ dev_dbg(dev, "Duplicate pvid %d request\n", vlanprio); - return ret; + return 0; } /* If PVID, then remove all filters on the old VLAN */ @@ -2731,7 +2775,7 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, if (vlan_id || qos) { ret = ice_vsi_manage_pvid(vsi, vlanprio, true); if (ret) - goto error_set_pvid; + return ret; } else { ice_vsi_manage_pvid(vsi, 0, false); vsi->info.pvid = 0; @@ -2744,7 +2788,7 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, /* add new VLAN filter for each MAC */ ret = ice_vsi_add_vlan(vsi, vlan_id); if (ret) - goto error_set_pvid; + return ret; } /* The Port VLAN needs to be saved across resets the same as the @@ -2752,8 +2796,7 @@ ice_set_vf_port_vlan(struct net_device *netdev, int vf_id, u16 vlan_id, u8 qos, */ vf->port_vlan_id = le16_to_cpu(vsi->info.pvid); -error_set_pvid: - return ret; + return 0; } /** @@ -3236,23 +3279,6 @@ ice_get_vf_cfg(struct net_device *netdev, int vf_id, struct ifla_vf_info *ivi) return 0; } -/** - * ice_wait_on_vf_reset - * @vf: The VF being resseting - * - * Poll to make sure a given VF is ready after reset - */ -static void ice_wait_on_vf_reset(struct ice_vf *vf) -{ - int i; - - for (i = 0; i < ICE_MAX_VF_RESET_WAIT; i++) { - if (test_bit(ICE_VF_STATE_INIT, vf->vf_states)) - break; - msleep(20); - } -} - /** * ice_set_vf_mac * @netdev: network interface device structure @@ -3265,29 +3291,21 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) { struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; - int ret = 0; + int ret; if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; - vf = &pf->vf[vf_id]; - /* Don't set MAC on disabled VF */ - if (ice_is_vf_disabled(vf)) - return -EINVAL; - - /* In case VF is in reset mode, wait until it is completed. Depending - * on factors like queue disabling routine, this could take ~250ms - */ - ice_wait_on_vf_reset(vf); - - if (ice_check_vf_init(pf, vf)) - return -EBUSY; - if (is_zero_ether_addr(mac) || is_multicast_ether_addr(mac)) { netdev_err(netdev, "%pM not a valid unicast address\n", mac); return -EINVAL; } + vf = &pf->vf[vf_id]; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; + /* copy MAC into dflt_lan_addr and trigger a VF reset. The reset * flow will use the updated dflt_lan_addr and add a MAC filter * using ice_add_mac. Also set pf_set_mac to indicate that the PF has @@ -3299,7 +3317,7 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) vf_id, mac); ice_vc_reset_vf(vf); - return ret; + return 0; } /** @@ -3314,22 +3332,15 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) { struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; + int ret; if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; vf = &pf->vf[vf_id]; - /* Don't set Trusted Mode on disabled VF */ - if (ice_is_vf_disabled(vf)) - return -EINVAL; - - /* In case VF is in reset mode, wait until it is completed. Depending - * on factors like queue disabling routine, this could take ~250ms - */ - ice_wait_on_vf_reset(vf); - - if (ice_check_vf_init(pf, vf)) - return -EBUSY; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; /* Check if already trusted */ if (trusted == vf->trusted) @@ -3355,13 +3366,15 @@ int ice_set_vf_link_state(struct net_device *netdev, int vf_id, int link_state) { struct ice_pf *pf = ice_netdev_to_pf(netdev); struct ice_vf *vf; + int ret; if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; vf = &pf->vf[vf_id]; - if (ice_check_vf_init(pf, vf)) - return -EBUSY; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; switch (link_state) { case IFLA_VF_LINK_STATE_AUTO: @@ -3397,14 +3410,15 @@ int ice_get_vf_stats(struct net_device *netdev, int vf_id, struct ice_eth_stats *stats; struct ice_vsi *vsi; struct ice_vf *vf; + int ret; if (ice_validate_vf_id(pf, vf_id)) return -EINVAL; vf = &pf->vf[vf_id]; - - if (ice_check_vf_init(pf, vf)) - return -EBUSY; + ret = ice_check_vf_ready_for_cfg(vf); + if (ret) + return ret; vsi = pf->vsi[vf->lan_vsi_idx]; if (!vsi) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index 4647d636ed36..ac67982751df 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -38,7 +38,8 @@ #define ICE_MAX_POLICY_INTR_PER_VF 33 #define ICE_MIN_INTR_PER_VF (ICE_MIN_QS_PER_VF + 1) #define ICE_DFLT_INTR_PER_VF (ICE_DFLT_QS_PER_VF + 1) -#define ICE_MAX_VF_RESET_WAIT 15 +#define ICE_MAX_VF_RESET_TRIES 40 +#define ICE_MAX_VF_RESET_SLEEP_MS 20 #define ice_for_each_vf(pf, i) \ for ((i) = 0; (i) < (pf)->num_alloc_vfs; (i)++) -- cgit v1.2.3 From 492e0d0d6f2eb4badfd2868addf9da0f651eba0e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Tue, 18 Feb 2020 09:25:52 -0800 Subject: bpf: Do not grab the bucket spinlock by default on htab batch ops Grabbing the spinlock for every bucket even if it's empty, was causing significant perfomance cost when traversing htab maps that have only a few entries. This patch addresses the issue by checking first the bucket_cnt, if the bucket has some entries then we go and grab the spinlock and proceed with the batching. Tested with a htab of size 50K and different value of populated entries. Before: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 2759655 2752033 BM_DumpHashMap/10 2933722 2930825 BM_DumpHashMap/200 3171680 3170265 BM_DumpHashMap/500 3639607 3635511 BM_DumpHashMap/1000 4369008 4364981 BM_DumpHashMap/5k 11171919 11134028 BM_DumpHashMap/20k 69150080 69033496 BM_DumpHashMap/39k 190501036 190226162 After: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 202707 200109 BM_DumpHashMap/10 213441 210569 BM_DumpHashMap/200 478641 472350 BM_DumpHashMap/500 980061 967102 BM_DumpHashMap/1000 1863835 1839575 BM_DumpHashMap/5k 8961836 8902540 BM_DumpHashMap/20k 69761497 69322756 BM_DumpHashMap/39k 187437830 186551111 Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200218172552.215077-1-brianvv@google.com --- kernel/bpf/hashtab.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2d182c4ee9d9..9194479a2fa7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1259,7 +1259,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; - unsigned long flags; + unsigned long flags = 0; + bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; @@ -1319,15 +1320,25 @@ again_nocopy: dst_val = values; b = &htab->buckets[batch]; head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + /* do not grab the lock unless need it (bucket_cnt > 0). */ + if (locked) + raw_spin_lock_irqsave(&b->lock, flags); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; + if (bucket_cnt && !locked) { + locked = true; + goto again_nocopy; + } + if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1337,6 +1348,9 @@ again_nocopy: if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1346,6 +1360,10 @@ again_nocopy: goto alloc; } + /* Next block is only safe to run if you have grabbed the lock */ + if (!locked) + goto next_batch; + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); @@ -1380,6 +1398,8 @@ again_nocopy: } raw_spin_unlock_irqrestore(&b->lock, flags); + locked = false; +next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ -- cgit v1.2.3 From b9aff38de2cb166476988020428985c5f7412ffc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 19 Feb 2020 15:47:57 -0800 Subject: bpf: Fix a potential deadlock with bpf_map_do_batch Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") added lookup_and_delete batch operation for hash table. The current implementation has bpf_lru_push_free() inside the bucket lock, which may cause a deadlock. syzbot reports: -> #2 (&htab->buckets[i].lock#2){....}: __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593 __bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220 [inline] __bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266 bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340 [inline] bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline] bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499 prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132 __htab_lru_percpu_map_update_elem+0x67e/0xa90 kernel/bpf/hashtab.c:1069 bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585 bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181 generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (&loc_l->lock){....}: check_prev_add kernel/locking/lockdep.c:2475 [inline] check_prevs_add kernel/locking/lockdep.c:2580 [inline] validate_chain kernel/locking/lockdep.c:2970 [inline] __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954 lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159 bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline] bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555 __htab_map_lookup_and_delete_batch+0x8d4/0x1540 kernel/bpf/hashtab.c:1374 htab_lru_map_lookup_and_delete_batch+0x34/0x40 kernel/bpf/hashtab.c:1491 bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348 __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456 __se_sys_bpf kernel/bpf/syscall.c:3355 [inline] __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe Possible unsafe locking scenario: CPU0 CPU2 ---- ---- lock(&htab->buckets[i].lock#2); lock(&l->lock); lock(&htab->buckets[i].lock#2); lock(&loc_l->lock); *** DEADLOCK *** To fix the issue, for htab_lru_map_lookup_and_delete_batch() in CPU0, let us do bpf_lru_push_free() out of the htab bucket lock. This can avoid the above deadlock scenario. Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Reported-by: syzbot+a38ff3d9356388f2fb83@syzkaller.appspotmail.com Reported-by: syzbot+122b5421d14e68f29cd1@syzkaller.appspotmail.com Suggested-by: Hillf Danton Suggested-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Brian Vazquez Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200219234757.3544014-1-yhs@fb.com --- kernel/bpf/hashtab.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 9194479a2fa7..a1468e3f5af2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -56,6 +56,7 @@ struct htab_elem { union { struct bpf_htab *htab; struct pcpu_freelist_node fnode; + struct htab_elem *batch_flink; }; }; }; @@ -126,6 +127,17 @@ free_elems: bpf_map_area_free(htab->elems); } +/* The LRU list has a lock (lru_lock). Each htab bucket has a lock + * (bucket_lock). If both locks need to be acquired together, the lock + * order is always lru_lock -> bucket_lock and this only happens in + * bpf_lru_list.c logic. For example, certain code path of + * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), + * will acquire lru_lock first followed by acquiring bucket_lock. + * + * In hashtab.c, to avoid deadlock, lock acquisition of + * bucket_lock followed by lru_lock is not allowed. In such cases, + * bucket_lock needs to be released first before acquiring lru_lock. + */ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, u32 hash) { @@ -1256,6 +1268,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void __user *ukeys = u64_to_user_ptr(attr->batch.keys); void *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; + struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; @@ -1388,10 +1401,18 @@ again_nocopy: } if (do_delete) { hlist_nulls_del_rcu(&l->hash_node); - if (is_lru_map) - bpf_lru_push_free(&htab->lru, &l->lru_node); - else + + /* bpf_lru_push_free() will acquire lru_lock, which + * may cause deadlock. See comments in function + * prealloc_lru_pop(). Let us do bpf_lru_push_free() + * after releasing the bucket lock. + */ + if (is_lru_map) { + l->batch_flink = node_to_free; + node_to_free = l; + } else { free_htab_elem(htab, l); + } } dst_key += key_size; dst_val += value_size; @@ -1399,6 +1420,13 @@ again_nocopy: raw_spin_unlock_irqrestore(&b->lock, flags); locked = false; + + while (node_to_free) { + l = node_to_free; + node_to_free = node_to_free->batch_flink; + bpf_lru_push_free(&htab->lru, &l->lru_node); + } + next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. -- cgit v1.2.3 From 88d5271c1efbd89713092b962070cf9af2fcd7c4 Mon Sep 17 00:00:00 2001 From: Tomas Paukrt Date: Wed, 22 Jan 2020 21:10:39 +0100 Subject: dt-bindings: mmc: omap-hsmmc: Fix SDIO interrupt SDIO interrupt must be specified correctly as IRQ_TYPE_LEVEL_LOW instead of GPIO_ACTIVE_LOW. Signed-off-by: Tomas Paukrt Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt b/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt index 19f5508a7569..4a9145ef15d6 100644 --- a/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt +++ b/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt @@ -124,7 +124,7 @@ not every application needs SDIO irq, e.g. MMC cards. pinctrl-1 = <&mmc1_idle>; pinctrl-2 = <&mmc1_sleep>; ... - interrupts-extended = <&intc 64 &gpio2 28 GPIO_ACTIVE_LOW>; + interrupts-extended = <&intc 64 &gpio2 28 IRQ_TYPE_LEVEL_LOW>; }; mmc1_idle : pinmux_cirq_pin { -- cgit v1.2.3 From 06f5201c6392f998a49ca9c9173e2930c8eb51d8 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Wed, 19 Feb 2020 09:40:22 +0530 Subject: net/tls: Fix to avoid gettig invalid tls record Current code doesn't check if tcp sequence number is starting from (/after) 1st record's start sequnce number. It only checks if seq number is before 1st record's end sequnce number. This problem will always be a possibility in re-transmit case. If a record which belongs to a requested seq number is already deleted, tls_get_record will start looking into list and as per the check it will look if seq number is before the end seq of 1st record, which will always be true and will return 1st record always, it should in fact return NULL. As part of the fix, start looking each record only if the sequence number lies in the list else return NULL. There is one more check added, driver look for the start marker record to handle tcp packets which are before the tls offload start sequence number, hence return 1st record if the record is tls start marker and seq number is before the 1st record's starting sequence number. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Rohit Maheshwari Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_device.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1ba5a92832bb..1c5574e2e058 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -593,7 +593,7 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; - struct tls_record_info *info; + struct tls_record_info *info, *last; info = context->retransmit_hint; if (!info || @@ -605,6 +605,24 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, struct tls_record_info, list); if (!info) return NULL; + /* send the start_marker record if seq number is before the + * tls offload start marker sequence number. This record is + * required to handle TCP packets which are before TLS offload + * started. + * And if it's not start marker, look if this seq number + * belongs to the list. + */ + if (likely(!tls_record_is_start_marker(info))) { + /* we have the first record, get the last record to see + * if this seq number belongs to the list. + */ + last = list_last_entry(&context->records_list, + struct tls_record_info, list); + + if (!between(seq, tls_record_start_seq(info), + last->end_seq)) + return NULL; + } record_sn = context->unacked_record_sn; } -- cgit v1.2.3 From 303d0403b8c25e994e4a6e45389e173cf8706fb5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 19 Feb 2020 14:16:32 -0500 Subject: udp: rehash on disconnect As of the below commit, udp sockets bound to a specific address can coexist with one bound to the any addr for the same port. The commit also phased out the use of socket hashing based only on port (hslot), in favor of always hashing on {addr, port} (hslot2). The change broke the following behavior with disconnect (AF_UNSPEC): server binds to 0.0.0.0:1337 server connects to 127.0.0.1:80 server disconnects client connects to 127.0.0.1:1337 client sends "hello" server reads "hello" // times out, packet did not find sk On connect the server acquires a specific source addr suitable for routing to its destination. On disconnect it reverts to the any addr. The connect call triggers a rehash to a different hslot2. On disconnect, add the same to return to the original hslot2. Skip this step if the socket is going to be unhashed completely. Fixes: 4cdeeee9252a ("net: udp: prefer listeners bound to an address") Reported-by: Pavel Roskin Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db76b9609299..08a41f1e1cd2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1857,8 +1857,12 @@ int __udp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; sock_rps_reset_rxhash(sk); sk->sk_bound_dev_if = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) { inet_reset_saddr(sk); + if (sk->sk_prot->rehash && + (sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + sk->sk_prot->rehash(sk); + } if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); -- cgit v1.2.3 From 3044d9891bdb98ebae5c9e6fb1bd7f007bf7cad6 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 14 Jan 2020 13:38:21 +0100 Subject: dt-bindings: memory-controller: Update example for Tegra124 EMC The example in the Tegra124 EMC device tree binding looks like an old version that doesn't contain all the required fields. Update it with a version from the current DTS files to fix the make dt_binding_check target. Reported-by: Rob Herring Signed-off-by: Thierry Reding [robh: also fix missing '#reset-cells'] Signed-off-by: Rob Herring --- .../memory-controllers/nvidia,tegra124-emc.yaml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml index dd1843489ad1..3e0a8a92d652 100644 --- a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml +++ b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml @@ -347,6 +347,7 @@ examples: interrupts = ; #iommu-cells = <1>; + #reset-cells = <1>; }; external-memory-controller@7001b000 { @@ -363,20 +364,23 @@ examples: timing-0 { clock-frequency = <12750000>; - nvidia,emc-zcal-cnt-long = <0x00000042>; - nvidia,emc-auto-cal-interval = <0x001fffff>; - nvidia,emc-ctt-term-ctrl = <0x00000802>; - nvidia,emc-cfg = <0x73240000>; - nvidia,emc-cfg-2 = <0x000008c5>; - nvidia,emc-sel-dpd-ctrl = <0x00040128>; - nvidia,emc-bgbias-ctl0 = <0x00000008>; nvidia,emc-auto-cal-config = <0xa1430000>; nvidia,emc-auto-cal-config2 = <0x00000000>; nvidia,emc-auto-cal-config3 = <0x00000000>; - nvidia,emc-mode-reset = <0x80001221>; + nvidia,emc-auto-cal-interval = <0x001fffff>; + nvidia,emc-bgbias-ctl0 = <0x00000008>; + nvidia,emc-cfg = <0x73240000>; + nvidia,emc-cfg-2 = <0x000008c5>; + nvidia,emc-ctt-term-ctrl = <0x00000802>; nvidia,emc-mode-1 = <0x80100003>; nvidia,emc-mode-2 = <0x80200008>; nvidia,emc-mode-4 = <0x00000000>; + nvidia,emc-mode-reset = <0x80001221>; + nvidia,emc-mrs-wait-cnt = <0x000e000e>; + nvidia,emc-sel-dpd-ctrl = <0x00040128>; + nvidia,emc-xm2dqspadctrl2 = <0x0130b118>; + nvidia,emc-zcal-cnt-long = <0x00000042>; + nvidia,emc-zcal-interval = <0x00000000>; nvidia,emc-configuration = < 0x00000000 /* EMC_RC */ -- cgit v1.2.3 From 867c1859590f3bbc13d9cfb46c90c8202769d0e5 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 14 Feb 2020 21:44:08 +0200 Subject: dt-bindings: net: mdio: remove compatible string from example Remove vendor specific compatible string from example, otherwise DT YAML schemas validation may trigger warnings specific to TI ti,davinci_mdio and not to the generic MDIO example. For example, the "bus_freq" is required for davinci_mdio, but not required for generic mdio example. As result following warning will be produced: mdio.example.dt.yaml: mdio@5c030000: 'bus_freq' is a required property Signed-off-by: Grygorii Strashko Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/net/mdio.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml index 5d08d2ffd4eb..50c3397a82bc 100644 --- a/Documentation/devicetree/bindings/net/mdio.yaml +++ b/Documentation/devicetree/bindings/net/mdio.yaml @@ -56,7 +56,6 @@ patternProperties: examples: - | davinci_mdio: mdio@5c030000 { - compatible = "ti,davinci_mdio"; reg = <0x5c030000 0x1000>; #address-cells = <1>; #size-cells = <0>; -- cgit v1.2.3 From 303d37b4b0522150c0c7701d84934bc13199bebc Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 10 Feb 2020 11:04:16 +0100 Subject: dt-bindings: media: csi: Add interconnects properties The Allwinner CSI controller is sitting beside the MBUS that is represented as an interconnect. Make sure that the interconnect properties are valid in the binding. Fixes: 7866d6903ce8 ("media: dt-bindings: media: sun4i-csi: Add compatible for CSI0 on R40") Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- .../devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml index 9af873b43acd..afde17d9dab1 100644 --- a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml +++ b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml @@ -51,6 +51,16 @@ properties: resets: maxItems: 1 + # FIXME: This should be made required eventually once every SoC will + # have the MBUS declared. + interconnects: + maxItems: 1 + + # FIXME: This should be made required eventually once every SoC will + # have the MBUS declared. + interconnect-names: + const: dma-mem + # See ./video-interfaces.txt for details port: type: object -- cgit v1.2.3 From 854bdbae9058bcf09b0add70b6047bc9ca776de2 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 10 Feb 2020 11:04:17 +0100 Subject: dt-bindings: media: csi: Fix clocks description Commit 1de243b07666 ("media: dt-bindings: media: sun4i-csi: Add compatible for CSI1 on A10/A20") introduced support for the CSI1 controller on A10 and A20 that unlike CSI0 doesn't have an ISP and therefore only have two clocks, the bus and module clocks. The clocks and clock-names properties have thus been modified to allow either two or tree clocks. However, the current list has the ISP clock at the second position, which means the bindings expects a list of either bus and isp, or bus, isp and mod. The initial intent of the patch was obviously to have bus and mod in the former case. Let's fix the binding so that it validates properly. Fixes: 1de243b07666 ("media: dt-bindings: media: sun4i-csi: Add compatible for CSI1 on A10/A20") Signed-off-by: Maxime Ripard Signed-off-by: Rob Herring --- .../bindings/media/allwinner,sun4i-a10-csi.yaml | 30 +++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml index afde17d9dab1..8453ee340b9f 100644 --- a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml +++ b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml @@ -33,20 +33,26 @@ properties: maxItems: 1 clocks: - minItems: 2 - maxItems: 3 - items: - - description: The CSI interface clock - - description: The CSI ISP clock - - description: The CSI DRAM clock + oneOf: + - items: + - description: The CSI interface clock + - description: The CSI DRAM clock + + - items: + - description: The CSI interface clock + - description: The CSI ISP clock + - description: The CSI DRAM clock clock-names: - minItems: 2 - maxItems: 3 - items: - - const: bus - - const: isp - - const: ram + oneOf: + - items: + - const: bus + - const: ram + + - items: + - const: bus + - const: isp + - const: ram resets: maxItems: 1 -- cgit v1.2.3 From badcd4546d52ae4318f2bcfda0e47a1394b60e38 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 19 Feb 2020 14:36:14 -0800 Subject: hwmon: (acpi_power_meter) Fix lockdep splat Damien Le Moal reports a lockdep splat with the acpi_power_meter, observed with Linux v5.5 and later. ====================================================== WARNING: possible circular locking dependency detected 5.6.0-rc2+ #629 Not tainted ------------------------------------------------------ python/1397 is trying to acquire lock: ffff888619080070 (&resource->lock){+.+.}, at: show_power+0x3c/0xa0 [acpi_power_meter] but task is already holding lock: ffff88881643f188 (kn->count#119){++++}, at: kernfs_seq_start+0x6a/0x160 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (kn->count#119){++++}: __kernfs_remove+0x626/0x7e0 kernfs_remove_by_name_ns+0x41/0x80 remove_attrs+0xcb/0x3c0 [acpi_power_meter] acpi_power_meter_notify+0x1f7/0x310 [acpi_power_meter] acpi_ev_notify_dispatch+0x198/0x1f3 acpi_os_execute_deferred+0x4d/0x70 process_one_work+0x7c8/0x1340 worker_thread+0x94/0xc70 kthread+0x2ed/0x3f0 ret_from_fork+0x24/0x30 -> #0 (&resource->lock){+.+.}: __lock_acquire+0x20be/0x49b0 lock_acquire+0x127/0x340 __mutex_lock+0x15b/0x1350 show_power+0x3c/0xa0 [acpi_power_meter] dev_attr_show+0x3f/0x80 sysfs_kf_seq_show+0x216/0x410 seq_read+0x407/0xf90 vfs_read+0x152/0x2c0 ksys_read+0xf3/0x1d0 do_syscall_64+0x95/0x1010 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(kn->count#119); lock(&resource->lock); lock(kn->count#119); lock(&resource->lock); *** DEADLOCK *** 4 locks held by python/1397: #0: ffff8890242d64e0 (&f->f_pos_lock){+.+.}, at: __fdget_pos+0x9b/0xb0 #1: ffff889040be74e0 (&p->lock){+.+.}, at: seq_read+0x6b/0xf90 #2: ffff8890448eb880 (&of->mutex){+.+.}, at: kernfs_seq_start+0x47/0x160 #3: ffff88881643f188 (kn->count#119){++++}, at: kernfs_seq_start+0x6a/0x160 stack backtrace: CPU: 10 PID: 1397 Comm: python Not tainted 5.6.0-rc2+ #629 Hardware name: Supermicro Super Server/X11DPL-i, BIOS 3.1 05/21/2019 Call Trace: dump_stack+0x97/0xe0 check_noncircular+0x32e/0x3e0 ? print_circular_bug.isra.0+0x1e0/0x1e0 ? unwind_next_frame+0xb9a/0x1890 ? entry_SYSCALL_64_after_hwframe+0x49/0xbe ? graph_lock+0x79/0x170 ? __lockdep_reset_lock+0x3c0/0x3c0 ? mark_lock+0xbc/0x1150 __lock_acquire+0x20be/0x49b0 ? mark_held_locks+0xe0/0xe0 ? stack_trace_save+0x91/0xc0 lock_acquire+0x127/0x340 ? show_power+0x3c/0xa0 [acpi_power_meter] ? device_remove_bin_file+0x10/0x10 ? device_remove_bin_file+0x10/0x10 __mutex_lock+0x15b/0x1350 ? show_power+0x3c/0xa0 [acpi_power_meter] ? show_power+0x3c/0xa0 [acpi_power_meter] ? mutex_lock_io_nested+0x11f0/0x11f0 ? lock_downgrade+0x6a0/0x6a0 ? kernfs_seq_start+0x47/0x160 ? lock_acquire+0x127/0x340 ? kernfs_seq_start+0x6a/0x160 ? device_remove_bin_file+0x10/0x10 ? show_power+0x3c/0xa0 [acpi_power_meter] show_power+0x3c/0xa0 [acpi_power_meter] dev_attr_show+0x3f/0x80 ? memset+0x20/0x40 sysfs_kf_seq_show+0x216/0x410 seq_read+0x407/0xf90 ? security_file_permission+0x16f/0x2c0 vfs_read+0x152/0x2c0 Problem is that reading an attribute takes the kernfs lock in the kernfs code, then resource->lock in the driver. During an ACPI notification, the opposite happens: The resource lock is taken first, followed by the kernfs lock when sysfs attributes are removed and re-created. Presumably this is now seen due to some locking related changes in kernfs after v5.4, but it was likely always a problem. Fix the problem by not blindly acquiring the lock in the notification function. It is only needed to protect the various update functions. However, those update functions are called anyway when sysfs attributes are read. This means that we can just stop calling those functions from the notifier, and the resource lock in the notifier function is no longer needed. That leaves two situations: First, METER_NOTIFY_CONFIG removes and re-allocates capability strings. While it did so under the resource lock, _displaying_ those strings was not protected, creating a race condition. To solve this problem, selectively protect both removal/creation and reporting of capability attributes with the resource lock. Second, removing and re-creating the attribute files is no longer protected by the resource lock. That doesn't matter since access to each individual attribute is protected by the kernfs lock. Userspace may get messed up if attributes disappear and reappear under its nose, but that is not different than today, and there is nothing we can do about it without major driver restructuring. Last but not least, when removing the driver, remove attribute functions first, then release capability strings. This avoids yet another race condition. Reported-by: Damien Le Moal Cc: Damien Le Moal Cc: stable@vger.kernel.org # v5.5+ Tested-by: Damien Le Moal Signed-off-by: Guenter Roeck --- drivers/hwmon/acpi_power_meter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c index 4cf25458f0b9..0db8ef4fd6e1 100644 --- a/drivers/hwmon/acpi_power_meter.c +++ b/drivers/hwmon/acpi_power_meter.c @@ -355,7 +355,9 @@ static ssize_t show_str(struct device *dev, struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_power_meter_resource *resource = acpi_dev->driver_data; acpi_string val; + int ret; + mutex_lock(&resource->lock); switch (attr->index) { case 0: val = resource->model_number; @@ -372,8 +374,9 @@ static ssize_t show_str(struct device *dev, val = ""; break; } - - return sprintf(buf, "%s\n", val); + ret = sprintf(buf, "%s\n", val); + mutex_unlock(&resource->lock); + return ret; } static ssize_t show_val(struct device *dev, @@ -817,11 +820,12 @@ static void acpi_power_meter_notify(struct acpi_device *device, u32 event) resource = acpi_driver_data(device); - mutex_lock(&resource->lock); switch (event) { case METER_NOTIFY_CONFIG: + mutex_lock(&resource->lock); free_capabilities(resource); res = read_capabilities(resource); + mutex_unlock(&resource->lock); if (res) break; @@ -830,15 +834,12 @@ static void acpi_power_meter_notify(struct acpi_device *device, u32 event) break; case METER_NOTIFY_TRIP: sysfs_notify(&device->dev.kobj, NULL, POWER_AVERAGE_NAME); - update_meter(resource); break; case METER_NOTIFY_CAP: sysfs_notify(&device->dev.kobj, NULL, POWER_CAP_NAME); - update_cap(resource); break; case METER_NOTIFY_INTERVAL: sysfs_notify(&device->dev.kobj, NULL, POWER_AVG_INTERVAL_NAME); - update_avg_interval(resource); break; case METER_NOTIFY_CAPPING: sysfs_notify(&device->dev.kobj, NULL, POWER_ALARM_NAME); @@ -848,7 +849,6 @@ static void acpi_power_meter_notify(struct acpi_device *device, u32 event) WARN(1, "Unexpected event %d\n", event); break; } - mutex_unlock(&resource->lock); acpi_bus_generate_netlink_event(ACPI_POWER_METER_CLASS, dev_name(&device->dev), event, 0); @@ -912,8 +912,8 @@ static int acpi_power_meter_remove(struct acpi_device *device) resource = acpi_driver_data(device); hwmon_device_unregister(resource->hwmon_dev); - free_capabilities(resource); remove_attrs(resource); + free_capabilities(resource); kfree(resource); return 0; -- cgit v1.2.3 From 35df4299a6487f323b0aca120ea3f485dfee2ae3 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 7 Feb 2020 09:29:11 -0500 Subject: ext4: fix a data race in EXT4_I(inode)->i_disksize EXT4_I(inode)->i_disksize could be accessed concurrently as noticed by KCSAN, BUG: KCSAN: data-race in ext4_write_end [ext4] / ext4_writepages [ext4] write to 0xffff91c6713b00f8 of 8 bytes by task 49268 on cpu 127: ext4_write_end+0x4e3/0x750 [ext4] ext4_update_i_disksize at fs/ext4/ext4.h:3032 (inlined by) ext4_update_inode_size at fs/ext4/ext4.h:3046 (inlined by) ext4_write_end at fs/ext4/inode.c:1287 generic_perform_write+0x208/0x2a0 ext4_buffered_write_iter+0x11f/0x210 [ext4] ext4_file_write_iter+0xce/0x9e0 [ext4] new_sync_write+0x29c/0x3b0 __vfs_write+0x92/0xa0 vfs_write+0x103/0x260 ksys_write+0x9d/0x130 __x64_sys_write+0x4c/0x60 do_syscall_64+0x91/0xb47 entry_SYSCALL_64_after_hwframe+0x49/0xbe read to 0xffff91c6713b00f8 of 8 bytes by task 24872 on cpu 37: ext4_writepages+0x10ac/0x1d00 [ext4] mpage_map_and_submit_extent at fs/ext4/inode.c:2468 (inlined by) ext4_writepages at fs/ext4/inode.c:2772 do_writepages+0x5e/0x130 __writeback_single_inode+0xeb/0xb20 writeback_sb_inodes+0x429/0x900 __writeback_inodes_wb+0xc4/0x150 wb_writeback+0x4bd/0x870 wb_workfn+0x6b4/0x960 process_one_work+0x54c/0xbe0 worker_thread+0x80/0x650 kthread+0x1e0/0x200 ret_from_fork+0x27/0x50 Reported by Kernel Concurrency Sanitizer on: CPU: 37 PID: 24872 Comm: kworker/u261:2 Tainted: G W O L 5.5.0-next-20200204+ #5 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 Workqueue: writeback wb_workfn (flush-7:0) Since only the read is operating as lockless (outside of the "i_data_sem"), load tearing could introduce a logic bug. Fix it by adding READ_ONCE() for the read and WRITE_ONCE() for the write. Signed-off-by: Qian Cai Link: https://lore.kernel.org/r/1581085751-31793-1-git-send-email-cai@lca.pw Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4441331d06cc..480badcf2783 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3032,7 +3032,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; + WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); up_write(&EXT4_I(inode)->i_data_sem); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e60aca791d3f..6e1d81ed44ad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2465,7 +2465,7 @@ update_disksize: * truncate are avoided by checking i_size under i_data_sem. */ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; - if (disksize > EXT4_I(inode)->i_disksize) { + if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; -- cgit v1.2.3 From 9424ef56e13a1f14c57ea161eed3ecfdc7b2770e Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Sat, 15 Feb 2020 03:02:06 -0500 Subject: ext4: add cond_resched() to __ext4_find_entry() We tested a soft lockup problem in linux 4.19 which could also be found in linux 5.x. When dir inode takes up a large number of blocks, and if the directory is growing when we are searching, it's possible the restart branch could be called many times, and the do while loop could hold cpu a long time. Here is the call trace in linux 4.19. [ 473.756186] Call trace: [ 473.756196] dump_backtrace+0x0/0x198 [ 473.756199] show_stack+0x24/0x30 [ 473.756205] dump_stack+0xa4/0xcc [ 473.756210] watchdog_timer_fn+0x300/0x3e8 [ 473.756215] __hrtimer_run_queues+0x114/0x358 [ 473.756217] hrtimer_interrupt+0x104/0x2d8 [ 473.756222] arch_timer_handler_virt+0x38/0x58 [ 473.756226] handle_percpu_devid_irq+0x90/0x248 [ 473.756231] generic_handle_irq+0x34/0x50 [ 473.756234] __handle_domain_irq+0x68/0xc0 [ 473.756236] gic_handle_irq+0x6c/0x150 [ 473.756238] el1_irq+0xb8/0x140 [ 473.756286] ext4_es_lookup_extent+0xdc/0x258 [ext4] [ 473.756310] ext4_map_blocks+0x64/0x5c0 [ext4] [ 473.756333] ext4_getblk+0x6c/0x1d0 [ext4] [ 473.756356] ext4_bread_batch+0x7c/0x1f8 [ext4] [ 473.756379] ext4_find_entry+0x124/0x3f8 [ext4] [ 473.756402] ext4_lookup+0x8c/0x258 [ext4] [ 473.756407] __lookup_hash+0x8c/0xe8 [ 473.756411] filename_create+0xa0/0x170 [ 473.756413] do_mkdirat+0x6c/0x140 [ 473.756415] __arm64_sys_mkdirat+0x28/0x38 [ 473.756419] el0_svc_common+0x78/0x130 [ 473.756421] el0_svc_handler+0x38/0x78 [ 473.756423] el0_svc+0x8/0xc [ 485.755156] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [tmp:5149] Add cond_resched() to avoid soft lockup and to provide a better system responding. Link: https://lore.kernel.org/r/20200215080206.13293-1-luoshijie1@huawei.com Signed-off-by: Shijie Luo Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org --- fs/ext4/namei.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ceff4b4b1877..b05ea72f38fd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1511,6 +1511,7 @@ restart: /* * We deal with the read-ahead logic here. */ + cond_resched(); if (ra_ptr >= ra_max) { /* Refill the readahead buffer */ ra_ptr = 0; -- cgit v1.2.3 From ce4a64e1f656138e2a1481049ea554720f86b43a Mon Sep 17 00:00:00 2001 From: Scott Branden Date: Wed, 19 Feb 2020 14:14:03 -0800 Subject: docs: arm64: fix trivial spelling enought to enough in memory.rst Fix trivial spelling error enought to enough in memory.rst. Cc: trivial@kernel.org Signed-off-by: Scott Branden Signed-off-by: Will Deacon --- Documentation/arm64/memory.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index 02e02175e6f5..cf03b3290800 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -129,7 +129,7 @@ this logic. As a single binary will need to support both 48-bit and 52-bit VA spaces, the VMEMMAP must be sized large enough for 52-bit VAs and -also must be sized large enought to accommodate a fixed PAGE_OFFSET. +also must be sized large enough to accommodate a fixed PAGE_OFFSET. Most code in the kernel should not need to consider the VA_BITS, for code that does need to know the VA size the variables are -- cgit v1.2.3 From dcde237319e626d1ec3c9d8b7613032f0fd4663a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 19 Feb 2020 12:31:56 +0000 Subject: mm: Avoid creating virtual address aliases in brk()/mmap()/mremap() Currently the arm64 kernel ignores the top address byte passed to brk(), mmap() and mremap(). When the user is not aware of the 56-bit address limit or relies on the kernel to return an error, untagging such pointers has the potential to create address aliases in user-space. Passing a tagged address to munmap(), madvise() is permitted since the tagged pointer is expected to be inside an existing mapping. The current behaviour breaks the existing glibc malloc() implementation which relies on brk() with an address beyond 56-bit to be rejected by the kernel. Remove untagging in the above functions by partially reverting commit ce18d171cb73 ("mm: untag user pointers in mmap/munmap/mremap/brk"). In addition, update the arm64 tagged-address-abi.rst document accordingly. Link: https://bugzilla.redhat.com/1797052 Fixes: ce18d171cb73 ("mm: untag user pointers in mmap/munmap/mremap/brk") Cc: # 5.4.x- Cc: Florian Weimer Reviewed-by: Andrew Morton Reported-by: Victor Stinner Acked-by: Will Deacon Acked-by: Andrey Konovalov Signed-off-by: Catalin Marinas Signed-off-by: Will Deacon --- Documentation/arm64/tagged-address-abi.rst | 11 +++++++++-- mm/mmap.c | 4 ---- mm/mremap.c | 1 - 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst index d4a85d535bf9..4a9d9c794ee5 100644 --- a/Documentation/arm64/tagged-address-abi.rst +++ b/Documentation/arm64/tagged-address-abi.rst @@ -44,8 +44,15 @@ The AArch64 Tagged Address ABI has two stages of relaxation depending how the user addresses are used by the kernel: 1. User addresses not accessed by the kernel but used for address space - management (e.g. ``mmap()``, ``mprotect()``, ``madvise()``). The use - of valid tagged pointers in this context is always allowed. + management (e.g. ``mprotect()``, ``madvise()``). The use of valid + tagged pointers in this context is allowed with the exception of + ``brk()``, ``mmap()`` and the ``new_address`` argument to + ``mremap()`` as these have the potential to alias with existing + user addresses. + + NOTE: This behaviour changed in v5.6 and so some earlier kernels may + incorrectly accept valid tagged pointers for the ``brk()``, + ``mmap()`` and ``mremap()`` system calls. 2. User addresses accessed by the kernel (e.g. ``write()``). This ABI relaxation is disabled by default and the application thread needs to diff --git a/mm/mmap.c b/mm/mmap.c index 6756b8bb0033..d681a20eb4ea 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -195,8 +195,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) bool downgraded = false; LIST_HEAD(uf); - brk = untagged_addr(brk); - if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -1557,8 +1555,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval; - addr = untagged_addr(addr); - if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); diff --git a/mm/mremap.c b/mm/mremap.c index 122938dcec15..af363063ea23 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -607,7 +607,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, LIST_HEAD(uf_unmap); addr = untagged_addr(addr); - new_addr = untagged_addr(new_addr); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; -- cgit v1.2.3 From 9038ec99ceb94fb8d93ade5e236b2928f0792c7c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:18 -0800 Subject: x86/xen: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. arch/x86/xen/enlighten_pv.c: In function ‘xen_write_msr_safe’: arch/x86/xen/enlighten_pv.c:904:12: warning: statement will never be executed [-Wswitch-unreachable] 904 | unsigned which; | ^~~~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20200220062318.69299-1-keescook@chromium.org Reviewed-by: Juergen Gross [boris: made @which an 'unsigned int'] Signed-off-by: Boris Ostrovsky --- arch/x86/xen/enlighten_pv.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index ae4a41ca19f6..3fbbc50bb032 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -896,14 +896,15 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; +#ifdef CONFIG_X86_64 + unsigned int which; + u64 base; +#endif ret = 0; switch (msr) { #ifdef CONFIG_X86_64 - unsigned which; - u64 base; - case MSR_FS_BASE: which = SEGBASE_FS; goto set; case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; -- cgit v1.2.3 From 161d179261f95ac56f61f94f89304e0620534230 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:04 -0800 Subject: net: core: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. net/core/skbuff.c: In function ‘skb_checksum_setup_ip’: net/core/skbuff.c:4809:7: warning: statement will never be executed [-Wswitch-unreachable] 4809 | int err; | ^~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1365a556152c..e1101a4f90a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4803,9 +4803,9 @@ static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, typeof(IPPROTO_IP) proto, unsigned int off) { - switch (proto) { - int err; + int err; + switch (proto) { case IPPROTO_TCP: err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), off + MAX_TCP_HDR_LEN); -- cgit v1.2.3 From 46d30cb1045c2ab1ada269702c8c84d6446baf81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:07 -0800 Subject: net: ip6_gre: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. net/ipv6/ip6_gre.c: In function ‘ip6gre_err’: net/ipv6/ip6_gre.c:440:32: warning: statement will never be executed [-Wswitch-unreachable] 440 | struct ipv6_tlv_tnl_enc_lim *tel; | ^~~ net/ipv6/ip6_tunnel.c: In function ‘ip6_tnl_err’: net/ipv6/ip6_tunnel.c:520:32: warning: statement will never be executed [-Wswitch-unreachable] 520 | struct ipv6_tlv_tnl_enc_lim *tel; | ^~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 8 +++++--- net/ipv6/ip6_tunnel.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 55bfc5149d0c..781ca8c07a0d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -437,8 +437,6 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return -ENOENT; switch (type) { - struct ipv6_tlv_tnl_enc_lim *tel; - __u32 teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -452,7 +450,10 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, break; } return 0; - case ICMPV6_PARAMPROB: + case ICMPV6_PARAMPROB: { + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 teli; + teli = 0; if (code == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); @@ -468,6 +469,7 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, t->parms.name); } return 0; + } case ICMPV6_PKT_TOOBIG: ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); return 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5d65436ad5ad..4703b09808d0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -517,8 +517,6 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, err = 0; switch (*type) { - struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu, teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -531,7 +529,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rel_msg = 1; } break; - case ICMPV6_PARAMPROB: + case ICMPV6_PARAMPROB: { + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 teli; + teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); @@ -548,7 +549,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, t->parms.name); } break; - case ICMPV6_PKT_TOOBIG: + } + case ICMPV6_PKT_TOOBIG: { + __u32 mtu; + ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; @@ -562,6 +566,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rel_msg = 1; } break; + } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); -- cgit v1.2.3 From 16a556eeb7ed2dc3709fe2c5be76accdfa4901ab Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:09 -0800 Subject: openvswitch: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. net/openvswitch/flow_netlink.c: In function ‘validate_set’: net/openvswitch/flow_netlink.c:2711:29: warning: statement will never be executed [-Wswitch-unreachable] 2711 | const struct ovs_key_ipv4 *ipv4_key; | ^~~~~~~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 7da4230627f5..288122eec7c8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2708,10 +2708,6 @@ static int validate_set(const struct nlattr *a, return -EINVAL; switch (key_type) { - const struct ovs_key_ipv4 *ipv4_key; - const struct ovs_key_ipv6 *ipv6_key; - int err; - case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_CT_MARK: @@ -2723,7 +2719,9 @@ static int validate_set(const struct nlattr *a, return -EINVAL; break; - case OVS_KEY_ATTR_TUNNEL: + case OVS_KEY_ATTR_TUNNEL: { + int err; + if (masked) return -EINVAL; /* Masked tunnel set not supported. */ @@ -2732,8 +2730,10 @@ static int validate_set(const struct nlattr *a, if (err) return err; break; + } + case OVS_KEY_ATTR_IPV4: { + const struct ovs_key_ipv4 *ipv4_key; - case OVS_KEY_ATTR_IPV4: if (eth_type != htons(ETH_P_IP)) return -EINVAL; @@ -2753,8 +2753,10 @@ static int validate_set(const struct nlattr *a, return -EINVAL; } break; + } + case OVS_KEY_ATTR_IPV6: { + const struct ovs_key_ipv6 *ipv6_key; - case OVS_KEY_ATTR_IPV6: if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; @@ -2781,7 +2783,7 @@ static int validate_set(const struct nlattr *a, return -EINVAL; break; - + } case OVS_KEY_ATTR_TCP: if ((eth_type != htons(ETH_P_IP) && eth_type != htons(ETH_P_IPV6)) || -- cgit v1.2.3 From 6f3846f0955308b6d1b219419da42b8de2c08845 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 20 Feb 2020 15:54:54 +0100 Subject: s390/qeth: vnicc Fix EOPNOTSUPP precedence When getting or setting VNICC parameters, the error code EOPNOTSUPP should have precedence over EBUSY. EBUSY is used because vnicc feature and bridgeport feature are mutually exclusive, which is a temporary condition. Whereas EOPNOTSUPP indicates that the HW does not support all or parts of the vnicc feature. This issue causes the vnicc sysfs params to show 'blocked by bridgeport' for HW that does not support VNICC at all. Fixes: caa1f0b10d18 ("s390/qeth: add VNICC enable/disable support") Signed-off-by: Alexandra Winter Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 692bd2623401..9972d96820f3 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1707,15 +1707,14 @@ int qeth_l2_vnicc_set_state(struct qeth_card *card, u32 vnicc, bool state) QETH_CARD_TEXT(card, 2, "vniccsch"); - /* do not change anything if BridgePort is enabled */ - if (qeth_bridgeport_is_in_use(card)) - return -EBUSY; - /* check if characteristic and enable/disable are supported */ if (!(card->options.vnicc.sup_chars & vnicc) || !(card->options.vnicc.set_char_sup & vnicc)) return -EOPNOTSUPP; + if (qeth_bridgeport_is_in_use(card)) + return -EBUSY; + /* set enable/disable command and store wanted characteristic */ if (state) { cmd = IPA_VNICC_ENABLE; @@ -1761,14 +1760,13 @@ int qeth_l2_vnicc_get_state(struct qeth_card *card, u32 vnicc, bool *state) QETH_CARD_TEXT(card, 2, "vniccgch"); - /* do not get anything if BridgePort is enabled */ - if (qeth_bridgeport_is_in_use(card)) - return -EBUSY; - /* check if characteristic is supported */ if (!(card->options.vnicc.sup_chars & vnicc)) return -EOPNOTSUPP; + if (qeth_bridgeport_is_in_use(card)) + return -EBUSY; + /* if card is ready, query current VNICC state */ if (qeth_card_hw_is_reachable(card)) rc = qeth_l2_vnicc_query_chars(card); @@ -1786,15 +1784,14 @@ int qeth_l2_vnicc_set_timeout(struct qeth_card *card, u32 timeout) QETH_CARD_TEXT(card, 2, "vniccsto"); - /* do not change anything if BridgePort is enabled */ - if (qeth_bridgeport_is_in_use(card)) - return -EBUSY; - /* check if characteristic and set_timeout are supported */ if (!(card->options.vnicc.sup_chars & QETH_VNICC_LEARNING) || !(card->options.vnicc.getset_timeout_sup & QETH_VNICC_LEARNING)) return -EOPNOTSUPP; + if (qeth_bridgeport_is_in_use(card)) + return -EBUSY; + /* do we need to do anything? */ if (card->options.vnicc.learning_timeout == timeout) return rc; @@ -1823,14 +1820,14 @@ int qeth_l2_vnicc_get_timeout(struct qeth_card *card, u32 *timeout) QETH_CARD_TEXT(card, 2, "vniccgto"); - /* do not get anything if BridgePort is enabled */ - if (qeth_bridgeport_is_in_use(card)) - return -EBUSY; - /* check if characteristic and get_timeout are supported */ if (!(card->options.vnicc.sup_chars & QETH_VNICC_LEARNING) || !(card->options.vnicc.getset_timeout_sup & QETH_VNICC_LEARNING)) return -EOPNOTSUPP; + + if (qeth_bridgeport_is_in_use(card)) + return -EBUSY; + /* if card is ready, get timeout. Otherwise, just return stored value */ *timeout = card->options.vnicc.learning_timeout; if (qeth_card_hw_is_reachable(card)) -- cgit v1.2.3 From 420579dba126c6111b5a3dea062f21a7e4e647c6 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 20 Feb 2020 15:54:55 +0100 Subject: s390/qeth: don't warn for napi with 0 budget Calling napi->poll() with 0 budget is a legitimate use by netpoll. Fixes: a1c3ed4c9ca0 ("qeth: NAPI support for l2 and l3 discipline") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 9639938581f5..2264c6619def 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5447,7 +5447,6 @@ static int qeth_extract_skbs(struct qeth_card *card, int budget, { int work_done = 0; - WARN_ON_ONCE(!budget); *done = false; while (budget) { -- cgit v1.2.3 From 54a61fbc020fd2e305680871c453abcf7fc0339b Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 20 Feb 2020 15:54:56 +0100 Subject: s390/qeth: fix off-by-one in RX copybreak check The RX copybreak is intended as the _max_ value where the frame's data should be copied. So for frame_len == copybreak, don't build an SG skb. Fixes: 4a71df50047f ("qeth: new qeth device driver") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 2264c6619def..5efcaa43615b 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -5344,7 +5344,7 @@ next_packet: } use_rx_sg = (card->options.cq == QETH_CQ_ENABLED) || - ((skb_len >= card->options.rx_sg_cb) && + (skb_len > card->options.rx_sg_cb && !atomic_read(&card->force_alloc_skb) && !IS_OSN(card)); -- cgit v1.2.3 From 8645e56a4ad6dcbf504872db7f14a2f67db88ef2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Feb 2020 18:30:26 +0100 Subject: xen: Enable interrupts when calling _cond_resched() xen_maybe_preempt_hcall() is called from the exception entry point xen_do_hypervisor_callback with interrupts disabled. _cond_resched() evades the might_sleep() check in cond_resched() which would have caught that and schedule_debug() unfortunately lacks a check for irqs_disabled(). Enable interrupts around the call and use cond_resched() to catch future issues. Fixes: fdfd811ddde3 ("x86/xen: allow privcmd hypercalls to be preempted") Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/878skypjrh.fsf@nanos.tec.linutronix.de Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/preempt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c index 8b9919c26095..456a164364a2 100644 --- a/drivers/xen/preempt.c +++ b/drivers/xen/preempt.c @@ -33,7 +33,9 @@ asmlinkage __visible void xen_maybe_preempt_hcall(void) * cpu. */ __this_cpu_write(xen_in_preemptible_hcall, false); - _cond_resched(); + local_irq_enable(); + cond_resched(); + local_irq_disable(); __this_cpu_write(xen_in_preemptible_hcall, true); } } -- cgit v1.2.3 From ac2fcfa9fd26db67d7000677c05629c34cc94564 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 19 Feb 2020 15:15:51 +0100 Subject: net: macb: Properly handle phylink on at91rm9200 at91ether_init was handling the phy mode and speed but since the switch to phylink, the NCFGR register got overwritten by macb_mac_config(). The issue is that the RM9200_RMII bit and the MACB_CLK_DIV32 field are cleared but never restored as they conflict with the PAE, GBE and PCSSEL bits. Add new capability to differentiate between EMAC and the other versions of the IP and use it to set and avoid clearing the relevant bits. Also, this fixes a NULL pointer dereference in macb_mac_link_up as the EMAC doesn't use any rings/bufffers/queues. Fixes: 7897b071ac3b ("net: macb: convert to phylink") Signed-off-by: Alexandre Belloni Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb.h | 1 + drivers/net/ethernet/cadence/macb_main.c | 60 +++++++++++++++++--------------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index dbf7070fcdba..a3f0f27fc79a 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -652,6 +652,7 @@ #define MACB_CAPS_GEM_HAS_PTP 0x00000040 #define MACB_CAPS_BD_RD_PREFETCH 0x00000080 #define MACB_CAPS_NEEDS_RSTONUBR 0x00000100 +#define MACB_CAPS_MACB_IS_EMAC 0x08000000 #define MACB_CAPS_FIFO_MODE 0x10000000 #define MACB_CAPS_GIGABIT_MODE_AVAILABLE 0x20000000 #define MACB_CAPS_SG_DISABLED 0x40000000 diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index def94e91883a..2c28da1737fe 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -572,8 +572,21 @@ static void macb_mac_config(struct phylink_config *config, unsigned int mode, old_ctrl = ctrl = macb_or_gem_readl(bp, NCFGR); /* Clear all the bits we might set later */ - ctrl &= ~(GEM_BIT(GBE) | MACB_BIT(SPD) | MACB_BIT(FD) | MACB_BIT(PAE) | - GEM_BIT(SGMIIEN) | GEM_BIT(PCSSEL)); + ctrl &= ~(MACB_BIT(SPD) | MACB_BIT(FD) | MACB_BIT(PAE)); + + if (bp->caps & MACB_CAPS_MACB_IS_EMAC) { + if (state->interface == PHY_INTERFACE_MODE_RMII) + ctrl |= MACB_BIT(RM9200_RMII); + } else { + ctrl &= ~(GEM_BIT(GBE) | GEM_BIT(SGMIIEN) | GEM_BIT(PCSSEL)); + + /* We do not support MLO_PAUSE_RX yet */ + if (state->pause & MLO_PAUSE_TX) + ctrl |= MACB_BIT(PAE); + + if (state->interface == PHY_INTERFACE_MODE_SGMII) + ctrl |= GEM_BIT(SGMIIEN) | GEM_BIT(PCSSEL); + } if (state->speed == SPEED_1000) ctrl |= GEM_BIT(GBE); @@ -583,13 +596,6 @@ static void macb_mac_config(struct phylink_config *config, unsigned int mode, if (state->duplex) ctrl |= MACB_BIT(FD); - /* We do not support MLO_PAUSE_RX yet */ - if (state->pause & MLO_PAUSE_TX) - ctrl |= MACB_BIT(PAE); - - if (state->interface == PHY_INTERFACE_MODE_SGMII) - ctrl |= GEM_BIT(SGMIIEN) | GEM_BIT(PCSSEL); - /* Apply the new configuration, if any */ if (old_ctrl ^ ctrl) macb_or_gem_writel(bp, NCFGR, ctrl); @@ -608,9 +614,10 @@ static void macb_mac_link_down(struct phylink_config *config, unsigned int mode, unsigned int q; u32 ctrl; - for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) - queue_writel(queue, IDR, - bp->rx_intr_mask | MACB_TX_INT_FLAGS | MACB_BIT(HRESP)); + if (!(bp->caps & MACB_CAPS_MACB_IS_EMAC)) + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) + queue_writel(queue, IDR, + bp->rx_intr_mask | MACB_TX_INT_FLAGS | MACB_BIT(HRESP)); /* Disable Rx and Tx */ ctrl = macb_readl(bp, NCR) & ~(MACB_BIT(RE) | MACB_BIT(TE)); @@ -627,17 +634,19 @@ static void macb_mac_link_up(struct phylink_config *config, unsigned int mode, struct macb_queue *queue; unsigned int q; - macb_set_tx_clk(bp->tx_clk, bp->speed, ndev); + if (!(bp->caps & MACB_CAPS_MACB_IS_EMAC)) { + macb_set_tx_clk(bp->tx_clk, bp->speed, ndev); - /* Initialize rings & buffers as clearing MACB_BIT(TE) in link down - * cleared the pipeline and control registers. - */ - bp->macbgem_ops.mog_init_rings(bp); - macb_init_buffers(bp); + /* Initialize rings & buffers as clearing MACB_BIT(TE) in link down + * cleared the pipeline and control registers. + */ + bp->macbgem_ops.mog_init_rings(bp); + macb_init_buffers(bp); - for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) - queue_writel(queue, IER, - bp->rx_intr_mask | MACB_TX_INT_FLAGS | MACB_BIT(HRESP)); + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) + queue_writel(queue, IER, + bp->rx_intr_mask | MACB_TX_INT_FLAGS | MACB_BIT(HRESP)); + } /* Enable Rx and Tx */ macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(RE) | MACB_BIT(TE)); @@ -4041,7 +4050,6 @@ static int at91ether_init(struct platform_device *pdev) struct net_device *dev = platform_get_drvdata(pdev); struct macb *bp = netdev_priv(dev); int err; - u32 reg; bp->queues[0].bp = bp; @@ -4055,11 +4063,7 @@ static int at91ether_init(struct platform_device *pdev) macb_writel(bp, NCR, 0); - reg = MACB_BF(CLK, MACB_CLK_DIV32) | MACB_BIT(BIG); - if (bp->phy_interface == PHY_INTERFACE_MODE_RMII) - reg |= MACB_BIT(RM9200_RMII); - - macb_writel(bp, NCFGR, reg); + macb_writel(bp, NCFGR, MACB_BF(CLK, MACB_CLK_DIV32) | MACB_BIT(BIG)); return 0; } @@ -4218,7 +4222,7 @@ static const struct macb_config sama5d4_config = { }; static const struct macb_config emac_config = { - .caps = MACB_CAPS_NEEDS_RSTONUBR, + .caps = MACB_CAPS_NEEDS_RSTONUBR | MACB_CAPS_MACB_IS_EMAC, .clk_init = at91ether_clk_init, .init = at91ether_init, }; -- cgit v1.2.3 From 98bda63e20daab95bdc084ce00459a4f622a0505 Mon Sep 17 00:00:00 2001 From: Roman Kiryanov Date: Wed, 19 Feb 2020 13:40:06 -0800 Subject: net: disable BRIDGE_NETFILTER by default The description says 'If unsure, say N.' but the module is built as M by default (once the dependencies are satisfied). When the module is selected (Y or M), it enables NETFILTER_FAMILY_BRIDGE and SKB_EXTENSIONS which alter kernel internal structures. We (Android Studio Emulator) currently do not use this module and think this it is more consistent to have it disabled by default as opposite to disabling it explicitly to prevent enabling NETFILTER_FAMILY_BRIDGE and SKB_EXTENSIONS. Signed-off-by: Roman Kiryanov Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/net/Kconfig b/net/Kconfig index b0937a700f01..2eeb0e55f7c9 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -189,7 +189,6 @@ config BRIDGE_NETFILTER depends on NETFILTER_ADVANCED select NETFILTER_FAMILY_BRIDGE select SKB_EXTENSIONS - default m ---help--- Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably -- cgit v1.2.3 From 68b759a75d6257759d1e37ff13f2d0659baf1112 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Feb 2020 14:59:42 -0800 Subject: ionic: fix fw_status read The fw_status field is only 8 bits, so fix the read. Also, we only want to look at the one status bit, to allow for future use of the other bits, and watch for a bad PCI read. Fixes: 97ca486592c0 ("ionic: add heartbeat check") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_dev.c | 11 +++++++---- drivers/net/ethernet/pensando/ionic/ionic_if.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_dev.c b/drivers/net/ethernet/pensando/ionic/ionic_dev.c index 87f82f36812f..46107de5e6c3 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_dev.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_dev.c @@ -103,7 +103,7 @@ int ionic_heartbeat_check(struct ionic *ionic) { struct ionic_dev *idev = &ionic->idev; unsigned long hb_time; - u32 fw_status; + u8 fw_status; u32 hb; /* wait a little more than one second before testing again */ @@ -111,9 +111,12 @@ int ionic_heartbeat_check(struct ionic *ionic) if (time_before(hb_time, (idev->last_hb_time + ionic->watchdog_period))) return 0; - /* firmware is useful only if fw_status is non-zero */ - fw_status = ioread32(&idev->dev_info_regs->fw_status); - if (!fw_status) + /* firmware is useful only if the running bit is set and + * fw_status != 0xff (bad PCI read) + */ + fw_status = ioread8(&idev->dev_info_regs->fw_status); + if (fw_status == 0xff || + !(fw_status & IONIC_FW_STS_F_RUNNING)) return -ENXIO; /* early FW has no heartbeat, else FW will return non-zero */ diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index ce07c2931a72..54547d53b0f2 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -2445,6 +2445,7 @@ union ionic_dev_info_regs { u8 version; u8 asic_type; u8 asic_rev; +#define IONIC_FW_STS_F_RUNNING 0x1 u8 fw_status; u32 fw_heartbeat; char fw_version[IONIC_DEVINFO_FWVERS_BUFLEN]; -- cgit v1.2.3 From 971617c3b761c876d686a2188220a33898c90e99 Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Wed, 19 Feb 2020 15:19:36 -0800 Subject: net: thunderx: workaround BGX TX Underflow issue While it is not yet understood why a TX underflow can easily occur for SGMII interfaces resulting in a TX wedge. It has been found that disabling/re-enabling the LMAC resolves the issue. Signed-off-by: Tim Harvey Reviewed-by: Robert Jones Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 62 +++++++++++++++++++++-- drivers/net/ethernet/cavium/thunder/thunder_bgx.h | 9 ++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 17a4110c2e49..8ff28ed04b7f 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -410,10 +410,19 @@ void bgx_lmac_rx_tx_enable(int node, int bgx_idx, int lmacid, bool enable) lmac = &bgx->lmac[lmacid]; cfg = bgx_reg_read(bgx, lmacid, BGX_CMRX_CFG); - if (enable) + if (enable) { cfg |= CMR_PKT_RX_EN | CMR_PKT_TX_EN; - else + + /* enable TX FIFO Underflow interrupt */ + bgx_reg_modify(bgx, lmacid, BGX_GMP_GMI_TXX_INT_ENA_W1S, + GMI_TXX_INT_UNDFLW); + } else { cfg &= ~(CMR_PKT_RX_EN | CMR_PKT_TX_EN); + + /* Disable TX FIFO Underflow interrupt */ + bgx_reg_modify(bgx, lmacid, BGX_GMP_GMI_TXX_INT_ENA_W1C, + GMI_TXX_INT_UNDFLW); + } bgx_reg_write(bgx, lmacid, BGX_CMRX_CFG, cfg); if (bgx->is_rgx) @@ -1535,6 +1544,48 @@ static int bgx_init_phy(struct bgx *bgx) return bgx_init_of_phy(bgx); } +static irqreturn_t bgx_intr_handler(int irq, void *data) +{ + struct bgx *bgx = (struct bgx *)data; + u64 status, val; + int lmac; + + for (lmac = 0; lmac < bgx->lmac_count; lmac++) { + status = bgx_reg_read(bgx, lmac, BGX_GMP_GMI_TXX_INT); + if (status & GMI_TXX_INT_UNDFLW) { + pci_err(bgx->pdev, "BGX%d lmac%d UNDFLW\n", + bgx->bgx_id, lmac); + val = bgx_reg_read(bgx, lmac, BGX_CMRX_CFG); + val &= ~CMR_EN; + bgx_reg_write(bgx, lmac, BGX_CMRX_CFG, val); + val |= CMR_EN; + bgx_reg_write(bgx, lmac, BGX_CMRX_CFG, val); + } + /* clear interrupts */ + bgx_reg_write(bgx, lmac, BGX_GMP_GMI_TXX_INT, status); + } + + return IRQ_HANDLED; +} + +static void bgx_register_intr(struct pci_dev *pdev) +{ + struct bgx *bgx = pci_get_drvdata(pdev); + int ret; + + ret = pci_alloc_irq_vectors(pdev, BGX_LMAC_VEC_OFFSET, + BGX_LMAC_VEC_OFFSET, PCI_IRQ_ALL_TYPES); + if (ret < 0) { + pci_err(pdev, "Req for #%d msix vectors failed\n", + BGX_LMAC_VEC_OFFSET); + return; + } + ret = pci_request_irq(pdev, GMPX_GMI_TX_INT, bgx_intr_handler, NULL, + bgx, "BGX%d", bgx->bgx_id); + if (ret) + pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); +} + static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { int err; @@ -1550,7 +1601,7 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, bgx); - err = pci_enable_device(pdev); + err = pcim_enable_device(pdev); if (err) { dev_err(dev, "Failed to enable PCI device\n"); pci_set_drvdata(pdev, NULL); @@ -1604,6 +1655,8 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) bgx_init_hw(bgx); + bgx_register_intr(pdev); + /* Enable all LMACs */ for (lmac = 0; lmac < bgx->lmac_count; lmac++) { err = bgx_lmac_enable(bgx, lmac); @@ -1620,6 +1673,7 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_enable: bgx_vnic[bgx->bgx_id] = NULL; + pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); err_release_regions: pci_release_regions(pdev); err_disable_device: @@ -1637,6 +1691,8 @@ static void bgx_remove(struct pci_dev *pdev) for (lmac = 0; lmac < bgx->lmac_count; lmac++) bgx_lmac_disable(bgx, lmac); + pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); + bgx_vnic[bgx->bgx_id] = NULL; pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h index 25888706bdcd..cdea49392185 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h @@ -180,6 +180,15 @@ #define BGX_GMP_GMI_TXX_BURST 0x38228 #define BGX_GMP_GMI_TXX_MIN_PKT 0x38240 #define BGX_GMP_GMI_TXX_SGMII_CTL 0x38300 +#define BGX_GMP_GMI_TXX_INT 0x38500 +#define BGX_GMP_GMI_TXX_INT_W1S 0x38508 +#define BGX_GMP_GMI_TXX_INT_ENA_W1C 0x38510 +#define BGX_GMP_GMI_TXX_INT_ENA_W1S 0x38518 +#define GMI_TXX_INT_PTP_LOST BIT_ULL(4) +#define GMI_TXX_INT_LATE_COL BIT_ULL(3) +#define GMI_TXX_INT_XSDEF BIT_ULL(2) +#define GMI_TXX_INT_XSCOL BIT_ULL(1) +#define GMI_TXX_INT_UNDFLW BIT_ULL(0) #define BGX_MSIX_VEC_0_29_ADDR 0x400000 /* +(0..29) << 4 */ #define BGX_MSIX_VEC_0_29_CTL 0x400008 -- cgit v1.2.3 From 3a20773beeeeadec41477a5ba872175b778ff752 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 20 Feb 2020 16:42:13 +0200 Subject: net: netlink: cap max groups which will be considered in netlink_bind() Since nl_groups is a u32 we can't bind more groups via ->bind (netlink_bind) call, but netlink has supported more groups via setsockopt() for a long time and thus nlk->ngroups could be over 32. Recently I added support for per-vlan notifications and increased the groups to 33 for NETLINK_ROUTE which exposed an old bug in the netlink_bind() code causing out-of-bounds access on archs where unsigned long is 32 bits via test_bit() on a local variable. Fix this by capping the maximum groups in netlink_bind() to BITS_PER_TYPE(u32), effectively capping them at 32 which is the minimum of allocated groups and the maximum groups which can be bound via netlink_bind(). CC: Christophe Leroy CC: Richard Guy Briggs Fixes: 4f520900522f ("netlink: have netlink per-protocol bind function return an error code.") Reported-by: Erhard F. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4e31721e7293..edf3e285e242 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1014,7 +1014,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->netlink_bind && groups) { int group; - for (group = 0; group < nlk->ngroups; group++) { + /* nl_groups is a u32, so cap the maximum groups we can bind */ + for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); @@ -1033,7 +1034,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { - netlink_undo_bind(nlk->ngroups, groups, sk); + netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } -- cgit v1.2.3 From 5567ae4a8d569d996d0d88d0eceb76205e4c7ce5 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 20 Feb 2020 17:26:34 -0500 Subject: bnxt_en: Improve device shutdown method. Especially when bnxt_shutdown() is called during kexec, we need to disable MSIX and disable Bus Master to completely quiesce the device. Make these 2 calls unconditionally in the shutdown method. Fixes: c20dc142dd7b ("bnxt_en: Disable bus master during PCI shutdown and driver unload.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 597e6fd5bfea..2ad007e5ee7f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11983,10 +11983,10 @@ static void bnxt_shutdown(struct pci_dev *pdev) dev_close(dev); bnxt_ulp_shutdown(bp); + bnxt_clear_int_mode(bp); + pci_disable_device(pdev); if (system_state == SYSTEM_POWER_OFF) { - bnxt_clear_int_mode(bp); - pci_disable_device(pdev); pci_wake_from_d3(pdev, bp->wol); pci_set_power_state(pdev, PCI_D3hot); } -- cgit v1.2.3 From 8743db4a9acfd51f805ac0c87bcaae92c42d1061 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Thu, 20 Feb 2020 17:26:35 -0500 Subject: bnxt_en: Issue PCIe FLR in kdump kernel to cleanup pending DMAs. If crashed kernel does not shutdown the NIC properly, PCIe FLR is required in the kdump kernel in order to initialize all the functions properly. Fixes: d629522e1d66 ("bnxt_en: Reduce memory usage when running in kdump kernel.") Signed-off-by: Vasundhara Volam Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2ad007e5ee7f..fd6e0e48cd51 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11786,6 +11786,14 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (version_printed++ == 0) pr_info("%s", version); + /* Clear any pending DMA transactions from crash kernel + * while loading driver in capture kernel. + */ + if (is_kdump_kernel()) { + pci_clear_master(pdev); + pcie_flr(pdev); + } + max_irqs = bnxt_get_max_irq(pdev); dev = alloc_etherdev_mq(sizeof(*bp), max_irqs); if (!dev) -- cgit v1.2.3 From 1d0c3924a92e69bfa91163bda83c12a994b4d106 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 15 Feb 2020 16:40:37 -0500 Subject: ext4: fix potential race between online resizing and write operations During an online resize an array of pointers to buffer heads gets replaced so it can get enlarged. If there is a racing block allocation or deallocation which uses the old array, and the old array has gotten reused this can lead to a GPF or some other random kernel memory getting modified. Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443 Link: https://lore.kernel.org/r/20200221053458.730016-2-tytso@mit.edu Reported-by: Suraj Jitindar Singh Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/balloc.c | 14 +++++++++++--- fs/ext4/ext4.h | 20 +++++++++++++++++++- fs/ext4/resize.c | 55 ++++++++++++++++++++++++++++++++++++++++++++----------- fs/ext4/super.c | 33 +++++++++++++++++++++++---------- 4 files changed, 97 insertions(+), 25 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 5f993a411251..8fd0b3cdab4c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -270,6 +270,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bh_p; if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," @@ -280,7 +281,14 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); - if (!sbi->s_group_desc[group_desc]) { + bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); + /* + * sbi_array_rcu_deref returns with rcu unlocked, this is ok since + * the pointer being dereferenced won't be dereferenced again. By + * looking at the usage in add_new_gdb() the value isn't modified, + * just the pointer, and so it remains valid. + */ + if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); @@ -288,10 +296,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, } desc = (struct ext4_group_desc *)( - (__u8 *)sbi->s_group_desc[group_desc]->b_data + + (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) - *bh = sbi->s_group_desc[group_desc]; + *bh = bh_p; return desc; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 480badcf2783..b51003f75568 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1400,7 +1400,7 @@ struct ext4_sb_info { loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ - struct buffer_head **s_group_desc; + struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned int s_mount_flags; @@ -1576,6 +1576,23 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +/* + * Returns: sbi->field[index] + * Used to access an array element from the following sbi fields which require + * rcu protection to avoid dereferencing an invalid pointer due to reassignment + * - s_group_desc + * - s_group_info + * - s_flex_group + */ +#define sbi_array_rcu_deref(sbi, field, index) \ +({ \ + typeof(*((sbi)->field)) _v; \ + rcu_read_lock(); \ + _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ + rcu_read_unlock(); \ + _v; \ +}) + /* * Simulate_fail codes */ @@ -2730,6 +2747,7 @@ extern int ext4_generic_delete_entry(handle_t *handle, extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ +extern void ext4_kvfree_array_rcu(void *to_free); extern int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input); extern int ext4_group_extend(struct super_block *sb, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 86a2500ed292..536cc9f38091 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -17,6 +17,33 @@ #include "ext4_jbd2.h" +struct ext4_rcu_ptr { + struct rcu_head rcu; + void *ptr; +}; + +static void ext4_rcu_ptr_callback(struct rcu_head *head) +{ + struct ext4_rcu_ptr *ptr; + + ptr = container_of(head, struct ext4_rcu_ptr, rcu); + kvfree(ptr->ptr); + kfree(ptr); +} + +void ext4_kvfree_array_rcu(void *to_free) +{ + struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + + if (ptr) { + ptr->ptr = to_free; + call_rcu(&ptr->rcu, ext4_rcu_ptr_callback); + return; + } + synchronize_rcu(); + kvfree(to_free); +} + int ext4_resize_begin(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -542,8 +569,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, brelse(gdb); goto out; } - memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, - gdb->b_size); + memcpy(gdb->b_data, sbi_array_rcu_deref(sbi, + s_group_desc, j)->b_data, gdb->b_size); set_buffer_uptodate(gdb); err = ext4_handle_dirty_metadata(handle, NULL, gdb); @@ -860,13 +887,15 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } brelse(dind); - o_group_desc = EXT4_SB(sb)->s_group_desc; + rcu_read_lock(); + o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + rcu_read_unlock(); n_group_desc[gdb_num] = gdb_bh; - EXT4_SB(sb)->s_group_desc = n_group_desc; + rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); EXT4_SB(sb)->s_gdb_count++; - kvfree(o_group_desc); + ext4_kvfree_array_rcu(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_super(handle, sb); @@ -909,9 +938,11 @@ static int add_new_gdb_meta_bg(struct super_block *sb, return err; } - o_group_desc = EXT4_SB(sb)->s_group_desc; + rcu_read_lock(); + o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + rcu_read_unlock(); n_group_desc[gdb_num] = gdb_bh; BUFFER_TRACE(gdb_bh, "get_write_access"); @@ -922,9 +953,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb, return err; } - EXT4_SB(sb)->s_group_desc = n_group_desc; + rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); EXT4_SB(sb)->s_gdb_count++; - kvfree(o_group_desc); + ext4_kvfree_array_rcu(o_group_desc); return err; } @@ -1188,7 +1219,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, * use non-sparse filesystems anymore. This is already checked above. */ if (gdb_off) { - gdb_bh = sbi->s_group_desc[gdb_num]; + gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, + gdb_num); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); @@ -1270,7 +1302,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, /* * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). */ - gdb_bh = sbi->s_group_desc[gdb_num]; + gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)(gdb_bh->b_data + gdb_off * EXT4_DESC_SIZE(sb)); @@ -1497,7 +1529,8 @@ exit_journal: for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; - gdb_bh = sbi->s_group_desc[gdb_num]; + gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, + gdb_num); if (old_gdb == gdb_bh->b_blocknr) continue; update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f464dff09774..e00bcc19099f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1014,6 +1014,7 @@ static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; + struct buffer_head **group_desc; int aborted = 0; int i, err; @@ -1046,9 +1047,12 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb)) ext4_commit_super(sb, 1); + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) - brelse(sbi->s_group_desc[i]); - kvfree(sbi->s_group_desc); + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -3634,7 +3638,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) { struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); char *orig_data = kstrdup(data, GFP_KERNEL); - struct buffer_head *bh; + struct buffer_head *bh, **group_desc; struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ext4_fsblk_t block; @@ -4290,9 +4294,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } } - sbi->s_group_desc = kvmalloc_array(db_count, - sizeof(struct buffer_head *), - GFP_KERNEL); + rcu_assign_pointer(sbi->s_group_desc, + kvmalloc_array(db_count, + sizeof(struct buffer_head *), + GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); ret = -ENOMEM; @@ -4308,14 +4313,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } for (i = 0; i < db_count; i++) { + struct buffer_head *bh; + block = descriptor_loc(sb, logical_sb_block, i); - sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); - if (!sbi->s_group_desc[i]) { + bh = sb_bread_unmovable(sb, block); + if (!bh) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); db_count = i; goto failed_mount2; } + rcu_read_lock(); + rcu_dereference(sbi->s_group_desc)[i] = bh; + rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { @@ -4717,9 +4727,12 @@ failed_mount3: if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < db_count; i++) - brelse(sbi->s_group_desc[i]); - kvfree(sbi->s_group_desc); + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); -- cgit v1.2.3 From df3da4ea5a0fc5d115c90d5aa6caa4dd433750a7 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 18 Feb 2020 19:08:50 -0800 Subject: ext4: fix potential race between s_group_info online resizing and access During an online resize an array of pointers to s_group_info gets replaced so it can get enlarged. If there is a concurrent access to the array in ext4_get_group_info() and this memory has been reused then this can lead to an invalid memory access. Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443 Link: https://lore.kernel.org/r/20200221053458.730016-3-tytso@mit.edu Signed-off-by: Suraj Jitindar Singh Signed-off-by: Theodore Ts'o Reviewed-by: Balbir Singh Cc: stable@kernel.org --- fs/ext4/ext4.h | 8 ++++---- fs/ext4/mballoc.c | 52 +++++++++++++++++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b51003f75568..b1ece5329738 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1462,7 +1462,7 @@ struct ext4_sb_info { #endif /* for buddy allocator */ - struct ext4_group_info ***s_group_info; + struct ext4_group_info ** __rcu *s_group_info; struct inode *s_buddy_cache; spinlock_t s_md_lock; unsigned short *s_mb_offsets; @@ -2994,13 +2994,13 @@ static inline struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info ***grp_info; + struct ext4_group_info **grp_info; long indexv, indexh; BUG_ON(group >= EXT4_SB(sb)->s_groups_count); - grp_info = EXT4_SB(sb)->s_group_info; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); - return grp_info[indexv][indexh]; + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; } /* diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f64838187559..1b46fb63692a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2356,7 +2356,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; - struct ext4_group_info ***new_groupinfo; + struct ext4_group_info ***old_groupinfo, ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); @@ -2369,13 +2369,16 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } - if (sbi->s_group_info) { - memcpy(new_groupinfo, sbi->s_group_info, + rcu_read_lock(); + old_groupinfo = rcu_dereference(sbi->s_group_info); + if (old_groupinfo) + memcpy(new_groupinfo, old_groupinfo, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); - kvfree(sbi->s_group_info); - } - sbi->s_group_info = new_groupinfo; + rcu_read_unlock(); + rcu_assign_pointer(sbi->s_group_info, new_groupinfo); sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); + if (old_groupinfo) + ext4_kvfree_array_rcu(old_groupinfo); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; @@ -2387,6 +2390,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, { int i; int metalen = 0; + int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); @@ -2405,12 +2409,12 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, "for a buddy group"); goto exit_meta_group_info; } - sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = - meta_group_info; + rcu_read_lock(); + rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; + rcu_read_unlock(); } - meta_group_info = - sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; + meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); @@ -2458,8 +2462,13 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { - kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); - sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; + struct ext4_group_info ***group_info; + + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); + kfree(group_info[idx]); + group_info[idx] = NULL; + rcu_read_unlock(); } exit_meta_group_info: return -ENOMEM; @@ -2472,6 +2481,7 @@ static int ext4_mb_init_backend(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); int err; struct ext4_group_desc *desc; + struct ext4_group_info ***group_info; struct kmem_cache *cachep; err = ext4_mb_alloc_groupinfo(sb, ngroups); @@ -2507,11 +2517,16 @@ err_freebuddy: while (i-- > 0) kmem_cache_free(cachep, ext4_get_group_info(sb, i)); i = sbi->s_group_info_size; + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); while (i-- > 0) - kfree(sbi->s_group_info[i]); + kfree(group_info[i]); + rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: - kvfree(sbi->s_group_info); + rcu_read_lock(); + kvfree(rcu_dereference(sbi->s_group_info)); + rcu_read_unlock(); return -ENOMEM; } @@ -2700,7 +2715,7 @@ int ext4_mb_release(struct super_block *sb) ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; - struct ext4_group_info *grinfo; + struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); @@ -2719,9 +2734,12 @@ int ext4_mb_release(struct super_block *sb) num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); + rcu_read_lock(); + group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) - kfree(sbi->s_group_info[i]); - kvfree(sbi->s_group_info); + kfree(group_info[i]); + kvfree(group_info); + rcu_read_unlock(); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); -- cgit v1.2.3 From fd1d98650ac0042d475155116e65fd17eb379542 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 8 Oct 2019 14:37:02 +0800 Subject: MAINTAINERS: csky: Add mailing list for csky Add mailing list and it's convenient for maintain C-SKY subsystem. Signed-off-by: Guo Ren --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index a0d86490c2c6..a4b5858b29bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3649,6 +3649,7 @@ F: sound/pci/oxygen/ C-SKY ARCHITECTURE M: Guo Ren +L: linux-csky@vger.kernel.org T: git https://github.com/c-sky/csky-linux.git S: Supported F: arch/csky/ -- cgit v1.2.3 From 2f78c73f78c39dabc5c44ad8dd61fd6ec65636d6 Mon Sep 17 00:00:00 2001 From: Mao Han Date: Fri, 11 Oct 2019 10:56:55 +0800 Subject: csky: Initial stack protector support This is a basic -fstack-protector support without per-task canary switching. The protector will report something like when stack corruption is detected: It's tested with strcpy local array overflow in sys_kill and get: stack-protector: Kernel stack is corrupted in: sys_kill+0x23c/0x23c TODO: - Support task switch for different cannary Signed-off-by: Mao Han Signed-off-by: Guo Ren --- arch/csky/Kconfig | 1 + arch/csky/include/asm/stackprotector.h | 29 +++++++++++++++++++++++++++++ arch/csky/kernel/process.c | 6 ++++++ 3 files changed, 36 insertions(+) create mode 100644 arch/csky/include/asm/stackprotector.h diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index da09c884cc30..ea7f3b890d03 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -49,6 +49,7 @@ config CSKY select HAVE_PERF_USER_STACK_DUMP select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS + select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select MAY_HAVE_SPARSE_IRQ select MODULES_USE_ELF_RELA if MODULES diff --git a/arch/csky/include/asm/stackprotector.h b/arch/csky/include/asm/stackprotector.h new file mode 100644 index 000000000000..d7cd4e51edd9 --- /dev/null +++ b/arch/csky/include/asm/stackprotector.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_STACKPROTECTOR_H +#define _ASM_STACKPROTECTOR_H 1 + +#include +#include + +extern unsigned long __stack_chk_guard; + +/* + * Initialize the stackprotector canary value. + * + * NOTE: this must only be called from functions that never return, + * and it must always be inlined. + */ +static __always_inline void boot_init_stack_canary(void) +{ + unsigned long canary; + + /* Try to get a semi random initial value. */ + get_random_bytes(&canary, sizeof(canary)); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; + + current->stack_canary = canary; + __stack_chk_guard = current->stack_canary; +} + +#endif /* __ASM_SH_STACKPROTECTOR_H */ diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index f320d9248a22..5349cd8c0f30 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -16,6 +16,12 @@ struct cpuinfo_csky cpu_data[NR_CPUS]; +#ifdef CONFIG_STACKPROTECTOR +#include +unsigned long __stack_chk_guard __read_mostly; +EXPORT_SYMBOL(__stack_chk_guard); +#endif + asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); -- cgit v1.2.3 From f525bb2c9e7cf1e3c43ab57704c9e1c836d30b34 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 27 Nov 2019 08:44:33 +0800 Subject: csky: Tightly-Coupled Memory or Sram support The implementation are not only used by TCM but also used by sram on SOC bus. It follow existed linux tcm software interface, so that old tcm application codes could be re-used directly. Software interface list in asm/tcm.h: - Variables/Const: __tcmdata, __tcmconst - Functions: __tcmfunc, __tcmlocalfunc - Malloc/Free: tcm_alloc, tcm_free In linux menuconfig: - Choose a TCM contain instrctions + data or separated in ITCM/DTCM. - Determine TCM_BASE (DTCM_BASE) in phyiscal address. - Determine size of TCM or ITCM(DTCM) in page counts. Here is hello tcm example from Documentation/arm/tcm.rst which could be directly used: /* Uninitialized data */ static u32 __tcmdata tcmvar; /* Initialized data */ static u32 __tcmdata tcmassigned = 0x2BADBABEU; /* Constant */ static const u32 __tcmconst tcmconst = 0xCAFEBABEU; static void __tcmlocalfunc tcm_to_tcm(void) { int i; for (i = 0; i < 100; i++) tcmvar ++; } static void __tcmfunc hello_tcm(void) { /* Some abstract code that runs in ITCM */ int i; for (i = 0; i < 100; i++) { tcmvar ++; } tcm_to_tcm(); } static void __init test_tcm(void) { u32 *tcmem; int i; hello_tcm(); printk("Hello TCM executed from ITCM RAM\n"); printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar); tcmvar = 0xDEADBEEFU; printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar); printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned); printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst); /* Allocate some TCM memory from the pool */ tcmem = tcm_alloc(20); if (tcmem) { printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem); tcmem[0] = 0xDEADBEEFU; tcmem[1] = 0x2BADBABEU; tcmem[2] = 0xCAFEBABEU; tcmem[3] = 0xDEADBEEFU; tcmem[4] = 0x2BADBABEU; for (i = 0; i < 5; i++) printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]); tcm_free(tcmem, 20); } } TODO: - Separate fixup mapping from highmem - Support abiv1 Signed-off-by: Guo Ren --- arch/csky/Kconfig | 35 +++++++++ arch/csky/include/asm/fixmap.h | 5 +- arch/csky/include/asm/memory.h | 22 ++++++ arch/csky/include/asm/tcm.h | 24 ++++++ arch/csky/kernel/vmlinux.lds.S | 49 ++++++++++++ arch/csky/mm/Makefile | 1 + arch/csky/mm/tcm.c | 169 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 arch/csky/include/asm/memory.h create mode 100644 arch/csky/include/asm/tcm.h create mode 100644 arch/csky/mm/tcm.c diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index ea7f3b890d03..fc92f08f0017 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -189,6 +189,41 @@ config CPU_PM_STOP bool "stop" endchoice +menuconfig HAVE_TCM + bool "Tightly-Coupled/Sram Memory" + depends on HIGHMEM + select GENERIC_ALLOCATOR + help + The implementation are not only used by TCM (Tightly-Coupled Meory) + but also used by sram on SOC bus. It follow existed linux tcm + software interface, so that old tcm application codes could be + re-used directly. + +if HAVE_TCM +config ITCM_RAM_BASE + hex "ITCM ram base" + default 0xffffffff + +config ITCM_NR_PAGES + int "Page count of ITCM size: NR*4KB" + range 1 256 + default 32 + +config HAVE_DTCM + bool "DTCM Support" + +config DTCM_RAM_BASE + hex "DTCM ram base" + depends on HAVE_DTCM + default 0xffffffff + +config DTCM_NR_PAGES + int "Page count of DTCM size: NR*4KB" + depends on HAVE_DTCM + range 1 256 + default 32 +endif + config CPU_HAS_VDSP bool "CPU has VDSP coprocessor" depends on CPU_HAS_FPU && CPU_HAS_FPUV2 diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h index 380ff0a307df..4350578faa1c 100644 --- a/arch/csky/include/asm/fixmap.h +++ b/arch/csky/include/asm/fixmap.h @@ -5,12 +5,16 @@ #define __ASM_CSKY_FIXMAP_H #include +#include #ifdef CONFIG_HIGHMEM #include #include #endif enum fixed_addresses { +#ifdef CONFIG_HAVE_TCM + FIX_TCM = TCM_NR_PAGES, +#endif #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, @@ -18,7 +22,6 @@ enum fixed_addresses { __end_of_fixed_addresses }; -#define FIXADDR_TOP 0xffffc000 #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) diff --git a/arch/csky/include/asm/memory.h b/arch/csky/include/asm/memory.h new file mode 100644 index 000000000000..cdffbd0a500b --- /dev/null +++ b/arch/csky/include/asm/memory.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_CSKY_MEMORY_H +#define __ASM_CSKY_MEMORY_H + +#include +#include +#include +#include + +#define FIXADDR_TOP _AC(0xffffc000, UL) + +#ifdef CONFIG_HAVE_TCM +#ifdef CONFIG_HAVE_DTCM +#define TCM_NR_PAGES (CONFIG_ITCM_NR_PAGES + CONFIG_DTCM_NR_PAGES) +#else +#define TCM_NR_PAGES (CONFIG_ITCM_NR_PAGES) +#endif +#define FIXADDR_TCM _AC(FIXADDR_TOP - (TCM_NR_PAGES * PAGE_SIZE), UL) +#endif + +#endif diff --git a/arch/csky/include/asm/tcm.h b/arch/csky/include/asm/tcm.h new file mode 100644 index 000000000000..2b135cefb73f --- /dev/null +++ b/arch/csky/include/asm/tcm.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_CSKY_TCM_H +#define __ASM_CSKY_TCM_H + +#ifndef CONFIG_HAVE_TCM +#error "You should not be including tcm.h unless you have a TCM!" +#endif + +#include + +/* Tag variables with this */ +#define __tcmdata __section(.tcm.data) +/* Tag constants with this */ +#define __tcmconst __section(.tcm.rodata) +/* Tag functions inside TCM called from outside TCM with this */ +#define __tcmfunc __section(.tcm.text) noinline +/* Tag function inside TCM called from inside TCM with this */ +#define __tcmlocalfunc __section(.tcm.text) + +void *tcm_alloc(size_t len); +void tcm_free(void *addr, size_t len); + +#endif diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S index 2ff37beaf2bf..f05b413df328 100644 --- a/arch/csky/kernel/vmlinux.lds.S +++ b/arch/csky/kernel/vmlinux.lds.S @@ -2,6 +2,7 @@ #include #include +#include OUTPUT_ARCH(csky) ENTRY(_start) @@ -53,6 +54,54 @@ SECTIONS RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) _edata = .; +#ifdef CONFIG_HAVE_TCM + .tcm_start : { + . = ALIGN(PAGE_SIZE); + __tcm_start = .; + } + + .text_data_tcm FIXADDR_TCM : AT(__tcm_start) + { + . = ALIGN(4); + __stcm_text_data = .; + *(.tcm.text) + *(.tcm.rodata) +#ifndef CONFIG_HAVE_DTCM + *(.tcm.data) +#endif + . = ALIGN(4); + __etcm_text_data = .; + } + + . = ADDR(.tcm_start) + SIZEOF(.tcm_start) + SIZEOF(.text_data_tcm); + +#ifdef CONFIG_HAVE_DTCM + #define ITCM_SIZE CONFIG_ITCM_NR_PAGES * PAGE_SIZE + + .dtcm_start : { + __dtcm_start = .; + } + + .data_tcm FIXADDR_TCM + ITCM_SIZE : AT(__dtcm_start) + { + . = ALIGN(4); + __stcm_data = .; + *(.tcm.data) + . = ALIGN(4); + __etcm_data = .; + } + + . = ADDR(.dtcm_start) + SIZEOF(.data_tcm); + + .tcm_end : AT(ADDR(.dtcm_start) + SIZEOF(.data_tcm)) { +#else + .tcm_end : AT(ADDR(.tcm_start) + SIZEOF(.text_data_tcm)) { +#endif + . = ALIGN(PAGE_SIZE); + __tcm_end = .; + } +#endif + EXCEPTION_TABLE(L1_CACHE_BYTES) BSS_SECTION(L1_CACHE_BYTES, PAGE_SIZE, L1_CACHE_BYTES) VBR_BASE diff --git a/arch/csky/mm/Makefile b/arch/csky/mm/Makefile index c94ef6481098..2c51918eb4fc 100644 --- a/arch/csky/mm/Makefile +++ b/arch/csky/mm/Makefile @@ -14,3 +14,4 @@ obj-y += syscache.o obj-y += tlb.o obj-y += asid.o obj-y += context.o +obj-$(CONFIG_HAVE_TCM) += tcm.o diff --git a/arch/csky/mm/tcm.c b/arch/csky/mm/tcm.c new file mode 100644 index 000000000000..ddeb36328819 --- /dev/null +++ b/arch/csky/mm/tcm.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#if (CONFIG_ITCM_RAM_BASE == 0xffffffff) +#error "You should define ITCM_RAM_BASE" +#endif + +#ifdef CONFIG_HAVE_DTCM +#if (CONFIG_DTCM_RAM_BASE == 0xffffffff) +#error "You should define DTCM_RAM_BASE" +#endif + +#if (CONFIG_DTCM_RAM_BASE == CONFIG_ITCM_RAM_BASE) +#error "You should define correct DTCM_RAM_BASE" +#endif +#endif + +extern char __tcm_start, __tcm_end, __dtcm_start; + +static struct gen_pool *tcm_pool; + +static void __init tcm_mapping_init(void) +{ + pte_t *tcm_pte; + unsigned long vaddr, paddr; + int i; + + paddr = CONFIG_ITCM_RAM_BASE; + + if (pfn_valid(PFN_DOWN(CONFIG_ITCM_RAM_BASE))) + goto panic; + +#ifndef CONFIG_HAVE_DTCM + for (i = 0; i < TCM_NR_PAGES; i++) { +#else + for (i = 0; i < CONFIG_ITCM_NR_PAGES; i++) { +#endif + vaddr = __fix_to_virt(FIX_TCM - i); + + tcm_pte = + pte_offset_kernel((pmd_t *)pgd_offset_k(vaddr), vaddr); + + set_pte(tcm_pte, pfn_pte(__phys_to_pfn(paddr), PAGE_KERNEL)); + + flush_tlb_one(vaddr); + + paddr = paddr + PAGE_SIZE; + } + +#ifdef CONFIG_HAVE_DTCM + if (pfn_valid(PFN_DOWN(CONFIG_DTCM_RAM_BASE))) + goto panic; + + paddr = CONFIG_DTCM_RAM_BASE; + + for (i = 0; i < CONFIG_DTCM_NR_PAGES; i++) { + vaddr = __fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES - i); + + tcm_pte = + pte_offset_kernel((pmd_t *) pgd_offset_k(vaddr), vaddr); + + set_pte(tcm_pte, pfn_pte(__phys_to_pfn(paddr), PAGE_KERNEL)); + + flush_tlb_one(vaddr); + + paddr = paddr + PAGE_SIZE; + } +#endif + +#ifndef CONFIG_HAVE_DTCM + memcpy((void *)__fix_to_virt(FIX_TCM), + &__tcm_start, &__tcm_end - &__tcm_start); + + pr_info("%s: mapping tcm va:0x%08lx to pa:0x%08x\n", + __func__, __fix_to_virt(FIX_TCM), CONFIG_ITCM_RAM_BASE); + + pr_info("%s: __tcm_start va:0x%08lx size:%d\n", + __func__, (unsigned long)&__tcm_start, &__tcm_end - &__tcm_start); +#else + memcpy((void *)__fix_to_virt(FIX_TCM), + &__tcm_start, &__dtcm_start - &__tcm_start); + + pr_info("%s: mapping itcm va:0x%08lx to pa:0x%08x\n", + __func__, __fix_to_virt(FIX_TCM), CONFIG_ITCM_RAM_BASE); + + pr_info("%s: __itcm_start va:0x%08lx size:%d\n", + __func__, (unsigned long)&__tcm_start, &__dtcm_start - &__tcm_start); + + memcpy((void *)__fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES), + &__dtcm_start, &__tcm_end - &__dtcm_start); + + pr_info("%s: mapping dtcm va:0x%08lx to pa:0x%08x\n", + __func__, __fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES), + CONFIG_DTCM_RAM_BASE); + + pr_info("%s: __dtcm_start va:0x%08lx size:%d\n", + __func__, (unsigned long)&__dtcm_start, &__tcm_end - &__dtcm_start); + +#endif + return; +panic: + panic("TCM init error"); +} + +void *tcm_alloc(size_t len) +{ + unsigned long vaddr; + + if (!tcm_pool) + return NULL; + + vaddr = gen_pool_alloc(tcm_pool, len); + if (!vaddr) + return NULL; + + return (void *) vaddr; +} +EXPORT_SYMBOL(tcm_alloc); + +void tcm_free(void *addr, size_t len) +{ + gen_pool_free(tcm_pool, (unsigned long) addr, len); +} +EXPORT_SYMBOL(tcm_free); + +static int __init tcm_setup_pool(void) +{ +#ifndef CONFIG_HAVE_DTCM + u32 pool_size = (u32) (TCM_NR_PAGES * PAGE_SIZE) + - (u32) (&__tcm_end - &__tcm_start); + + u32 tcm_pool_start = __fix_to_virt(FIX_TCM) + + (u32) (&__tcm_end - &__tcm_start); +#else + u32 pool_size = (u32) (CONFIG_DTCM_NR_PAGES * PAGE_SIZE) + - (u32) (&__tcm_end - &__dtcm_start); + + u32 tcm_pool_start = __fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES) + + (u32) (&__tcm_end - &__dtcm_start); +#endif + int ret; + + tcm_pool = gen_pool_create(2, -1); + + ret = gen_pool_add(tcm_pool, tcm_pool_start, pool_size, -1); + if (ret) { + pr_err("%s: gen_pool add failed!\n", __func__); + return ret; + } + + pr_info("%s: Added %d bytes @ 0x%08x to memory pool\n", + __func__, pool_size, tcm_pool_start); + + return 0; +} + +static int __init tcm_init(void) +{ + tcm_mapping_init(); + + tcm_setup_pool(); + + return 0; +} +arch_initcall(tcm_init); -- cgit v1.2.3 From f136008f31e91060d6b6e6e031cb2c827448e280 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 1 Dec 2019 22:34:19 +0800 Subject: csky: Separate fixaddr_init from highmem After fixaddr_init is separated from highmem, we could use tcm without highmem selected. (610 (abiv1) don't support highmem, but it could use tcm now.) Signed-off-by: Guo Ren --- arch/csky/Kconfig | 1 - arch/csky/include/asm/fixmap.h | 4 +++ arch/csky/include/asm/memory.h | 3 ++ arch/csky/include/asm/pgtable.h | 6 +--- arch/csky/kernel/setup.c | 2 ++ arch/csky/mm/highmem.c | 64 +++-------------------------------------- arch/csky/mm/init.c | 47 ++++++++++++++++++++++++++++++ 7 files changed, 61 insertions(+), 66 deletions(-) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index fc92f08f0017..00ac7bc5558a 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -191,7 +191,6 @@ endchoice menuconfig HAVE_TCM bool "Tightly-Coupled/Sram Memory" - depends on HIGHMEM select GENERIC_ALLOCATOR help The implementation are not only used by TCM (Tightly-Coupled Meory) diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h index 4350578faa1c..81f9477d5330 100644 --- a/arch/csky/include/asm/fixmap.h +++ b/arch/csky/include/asm/fixmap.h @@ -27,4 +27,8 @@ enum fixed_addresses { #include +extern void fixrange_init(unsigned long start, unsigned long end, + pgd_t *pgd_base); +extern void __init fixaddr_init(void); + #endif /* __ASM_CSKY_FIXMAP_H */ diff --git a/arch/csky/include/asm/memory.h b/arch/csky/include/asm/memory.h index cdffbd0a500b..a65c6759f537 100644 --- a/arch/csky/include/asm/memory.h +++ b/arch/csky/include/asm/memory.h @@ -9,6 +9,9 @@ #include #define FIXADDR_TOP _AC(0xffffc000, UL) +#define PKMAP_BASE _AC(0xff800000, UL) +#define VMALLOC_START _AC(0xc0008000, UL) +#define VMALLOC_END (PKMAP_BASE - (PAGE_SIZE * 2)) #ifdef CONFIG_HAVE_TCM #ifdef CONFIG_HAVE_DTCM diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 4b2a41e15f2e..9b7764cb7645 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -5,6 +5,7 @@ #define __ASM_CSKY_PGTABLE_H #include +#include #include #include #include @@ -16,11 +17,6 @@ #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) #define FIRST_USER_ADDRESS 0UL -#define PKMAP_BASE (0xff800000) - -#define VMALLOC_START (0xc0008000) -#define VMALLOC_END (PKMAP_BASE - 2*PAGE_SIZE) - /* * C-SKY is two-level paging structure: */ diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 52eaf31ba27f..4fc7b16b77d2 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -133,6 +133,8 @@ void __init setup_arch(char **cmdline_p) sparse_init(); + fixaddr_init(); + #ifdef CONFIG_HIGHMEM kmap_init(); #endif diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 3317b774f6dc..813129145f3d 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -117,85 +117,29 @@ struct page *kmap_atomic_to_page(void *ptr) return pte_page(*pte); } -static void __init fixrange_init(unsigned long start, unsigned long end, - pgd_t *pgd_base) +static void __init kmap_pages_init(void) { -#ifdef CONFIG_HIGHMEM - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i, j, k; unsigned long vaddr; - - vaddr = start; - i = __pgd_offset(vaddr); - j = __pud_offset(vaddr); - k = __pmd_offset(vaddr); - pgd = pgd_base + i; - - for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { - pud = (pud_t *)pgd; - for ( ; (j < PTRS_PER_PUD) && (vaddr != end); pud++, j++) { - pmd = (pmd_t *)pud; - for (; (k < PTRS_PER_PMD) && (vaddr != end); pmd++, k++) { - if (pmd_none(*pmd)) { - pte = (pte_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate %lu bytes align=%lx\n", - __func__, PAGE_SIZE, - PAGE_SIZE); - - set_pmd(pmd, __pmd(__pa(pte))); - BUG_ON(pte != pte_offset_kernel(pmd, 0)); - } - vaddr += PMD_SIZE; - } - k = 0; - } - j = 0; - } -#endif -} - -void __init fixaddr_kmap_pages_init(void) -{ - unsigned long vaddr; - pgd_t *pgd_base; -#ifdef CONFIG_HIGHMEM pgd_t *pgd; pmd_t *pmd; pud_t *pud; pte_t *pte; -#endif - pgd_base = swapper_pg_dir; - - /* - * Fixed mappings: - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, 0, pgd_base); - -#ifdef CONFIG_HIGHMEM - /* - * Permanent kmaps: - */ + vaddr = PKMAP_BASE; - fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir); pgd = swapper_pg_dir + __pgd_offset(vaddr); pud = (pud_t *)pgd; pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; -#endif } void __init kmap_init(void) { unsigned long vaddr; - fixaddr_kmap_pages_init(); + kmap_pages_init(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN); diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index d4c2292ea46b..322eb7bd7962 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -101,3 +101,50 @@ void __init pre_mmu_init(void) /* Setup page mask to 4k */ write_mmu_pagemask(0); } + +void __init fixrange_init(unsigned long start, unsigned long end, + pgd_t *pgd_base) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int i, j, k; + unsigned long vaddr; + + vaddr = start; + i = __pgd_offset(vaddr); + j = __pud_offset(vaddr); + k = __pmd_offset(vaddr); + pgd = pgd_base + i; + + for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { + pud = (pud_t *)pgd; + for ( ; (j < PTRS_PER_PUD) && (vaddr != end); pud++, j++) { + pmd = (pmd_t *)pud; + for (; (k < PTRS_PER_PMD) && (vaddr != end); pmd++, k++) { + if (pmd_none(*pmd)) { + pte = (pte_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + if (!pte) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, + PAGE_SIZE); + + set_pmd(pmd, __pmd(__pa(pte))); + BUG_ON(pte != pte_offset_kernel(pmd, 0)); + } + vaddr += PMD_SIZE; + } + k = 0; + } + j = 0; + } +} + +void __init fixaddr_init(void) +{ + unsigned long vaddr; + + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; + fixrange_init(vaddr, vaddr + PMD_SIZE, swapper_pg_dir); +} -- cgit v1.2.3 From 7f4a567332f035ab16b29010fbd04a0f10183c77 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 30 Dec 2019 15:53:37 +0800 Subject: csky/mm: Fixup export invalid_pte_table symbol There is no present bit in csky pmd hardware, so we need to prepare invalid_pte_table for empty pmd entry and the functions (pmd_none & pmd_present) in pgtable.h need invalid_pte_talbe to get result. If a module use these functions, we need export the symbol for it. Signed-off-by: Guo Ren Cc: Mo Qihui Cc: Zhange Jian --- arch/csky/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index 322eb7bd7962..dbb4b2dfe4b7 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -31,6 +31,7 @@ pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss; +EXPORT_SYMBOL(invalid_pte_table); unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -- cgit v1.2.3 From f8e17c17b81070f38062dce79ca7f4541851dadd Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 17 Dec 2019 11:12:55 +0800 Subject: csky: Set regs->usp to kernel sp, when the exception is from kernel In the past, we didn't care about kernel sp when saving pt_reg. But in some cases, we still need pt_reg->usp to represent the kernel stack before enter exception. For cmpxhg in atomic.S, we need save and restore usp for above. Signed-off-by: Guo Ren --- arch/csky/abiv1/inc/abi/entry.h | 19 ++++++++++++++----- arch/csky/abiv2/inc/abi/entry.h | 11 +++++++++++ arch/csky/kernel/atomic.S | 8 ++++++-- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/entry.h b/arch/csky/abiv1/inc/abi/entry.h index 7ab78bd0f3b1..f35a9f3315ee 100644 --- a/arch/csky/abiv1/inc/abi/entry.h +++ b/arch/csky/abiv1/inc/abi/entry.h @@ -16,14 +16,16 @@ #define LSAVE_A4 40 #define LSAVE_A5 44 +#define usp ss1 + .macro USPTOKSP - mtcr sp, ss1 + mtcr sp, usp mfcr sp, ss0 .endm .macro KSPTOUSP mtcr sp, ss0 - mfcr sp, ss1 + mfcr sp, usp .endm .macro SAVE_ALL epc_inc @@ -45,7 +47,13 @@ add lr, r13 stw lr, (sp, 8) + mov lr, sp + addi lr, 32 + addi lr, 32 + addi lr, 16 + bt 2f mfcr lr, ss1 +2: stw lr, (sp, 16) stw a0, (sp, 20) @@ -79,9 +87,10 @@ ldw a0, (sp, 12) mtcr a0, epsr btsti a0, 31 + bt 1f ldw a0, (sp, 16) mtcr a0, ss1 - +1: ldw a0, (sp, 24) ldw a1, (sp, 28) ldw a2, (sp, 32) @@ -102,9 +111,9 @@ addi sp, 32 addi sp, 8 - bt 1f + bt 2f KSPTOUSP -1: +2: rte .endm diff --git a/arch/csky/abiv2/inc/abi/entry.h b/arch/csky/abiv2/inc/abi/entry.h index 9897a16b45e5..94a7a58765df 100644 --- a/arch/csky/abiv2/inc/abi/entry.h +++ b/arch/csky/abiv2/inc/abi/entry.h @@ -31,7 +31,13 @@ mfcr lr, epsr stw lr, (sp, 12) + btsti lr, 31 + bf 1f + addi lr, sp, 152 + br 2f +1: mfcr lr, usp +2: stw lr, (sp, 16) stw a0, (sp, 20) @@ -64,8 +70,10 @@ mtcr a0, epc ldw a0, (sp, 12) mtcr a0, epsr + btsti a0, 31 ldw a0, (sp, 16) mtcr a0, usp + mtcr a0, ss0 #ifdef CONFIG_CPU_HAS_HILO ldw a0, (sp, 140) @@ -86,6 +94,9 @@ addi sp, 40 ldm r16-r30, (sp) addi sp, 72 + bf 1f + mfcr sp, ss0 +1: rte .endm diff --git a/arch/csky/kernel/atomic.S b/arch/csky/kernel/atomic.S index 5b84f11485ae..3821ef9b7567 100644 --- a/arch/csky/kernel/atomic.S +++ b/arch/csky/kernel/atomic.S @@ -17,10 +17,12 @@ ENTRY(csky_cmpxchg) mfcr a3, epc addi a3, TRAP0_SIZE - subi sp, 8 + subi sp, 16 stw a3, (sp, 0) mfcr a3, epsr stw a3, (sp, 4) + mfcr a3, usp + stw a3, (sp, 8) psrset ee #ifdef CONFIG_CPU_HAS_LDSTEX @@ -47,7 +49,9 @@ ENTRY(csky_cmpxchg) mtcr a3, epc ldw a3, (sp, 4) mtcr a3, epsr - addi sp, 8 + ldw a3, (sp, 8) + mtcr a3, usp + addi sp, 16 KSPTOUSP rte END(csky_cmpxchg) -- cgit v1.2.3 From c9492737b25ca32679ba3163609d938c9abfd508 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 7 Jan 2020 12:21:25 +0800 Subject: csky/smp: Fixup boot failed when CONFIG_SMP If we use a non-ipi-support interrupt controller, it will cause panic here. We should let cpu up and work with CONFIG_SMP, when we use a non-ipi intc. Signed-off-by: Guo Ren --- arch/csky/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c index b753d382e4ce..0bb0954d5570 100644 --- a/arch/csky/kernel/smp.c +++ b/arch/csky/kernel/smp.c @@ -120,7 +120,7 @@ void __init setup_smp_ipi(void) int rc; if (ipi_irq == 0) - panic("%s IRQ mapping failed\n", __func__); + return; rc = request_percpu_irq(ipi_irq, handle_ipi, "IPI Interrupt", &ipi_dummy_dev); -- cgit v1.2.3 From a736fa1ed772e3640e1bfaab36032c5a285d6a7b Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sat, 11 Jan 2020 13:44:32 +0800 Subject: csky/Kconfig: Add Kconfig.platforms to support some drivers Such as snps,dw-apb-ictl Signed-off-by: Guo Ren --- arch/csky/Kconfig | 2 ++ arch/csky/Kconfig.platforms | 9 +++++++++ 2 files changed, 11 insertions(+) create mode 100644 arch/csky/Kconfig.platforms diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 00ac7bc5558a..1c723cb11cc4 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -270,4 +270,6 @@ config HOTPLUG_CPU Say N if you want to disable CPU hotplug. endmenu +source "arch/csky/Kconfig.platforms" + source "kernel/Kconfig.hz" diff --git a/arch/csky/Kconfig.platforms b/arch/csky/Kconfig.platforms new file mode 100644 index 000000000000..639e17f4eacb --- /dev/null +++ b/arch/csky/Kconfig.platforms @@ -0,0 +1,9 @@ +menu "Platform drivers selection" + +config ARCH_CSKY_DW_APB_ICTL + bool "Select dw-apb interrupt controller" + select DW_APB_ICTL + default y + help + This enables support for snps dw-apb-ictl +endmenu -- cgit v1.2.3 From 761b4f694cb90b63ca2739ac8a8a176342636e5e Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 22 Jan 2020 11:15:14 +0800 Subject: csky: Support icache flush without specific instructions Some CPUs don't support icache specific instructions to flush icache lines in broadcast way. We use cpu control registers to flush local icache and use IPI to notify other cores. Signed-off-by: Guo Ren --- arch/csky/Kconfig | 4 ++++ arch/csky/include/asm/cache.h | 1 + arch/csky/mm/cachev1.c | 5 +++++ arch/csky/mm/cachev2.c | 29 +++++++++++++++++++++-------- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 1c723cb11cc4..111474b50f69 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -231,6 +231,10 @@ config CPU_HAS_FPU bool "CPU has FPU coprocessor" depends on CPU_CK807 || CPU_CK810 || CPU_CK860 +config CPU_HAS_ICACHE_INS + bool "CPU has Icache invalidate instructions" + depends on CPU_HAS_CACHEV2 + config CPU_HAS_TEE bool "CPU has Trusted Execution Environment" depends on CPU_CK810 diff --git a/arch/csky/include/asm/cache.h b/arch/csky/include/asm/cache.h index 1d5fc2f78fd7..4b5c09bf1d25 100644 --- a/arch/csky/include/asm/cache.h +++ b/arch/csky/include/asm/cache.h @@ -16,6 +16,7 @@ void dcache_wb_line(unsigned long start); void icache_inv_range(unsigned long start, unsigned long end); void icache_inv_all(void); +void local_icache_inv_all(void *priv); void dcache_wb_range(unsigned long start, unsigned long end); void dcache_wbinv_all(void); diff --git a/arch/csky/mm/cachev1.c b/arch/csky/mm/cachev1.c index 494ec912abff..5a5a9804a0e3 100644 --- a/arch/csky/mm/cachev1.c +++ b/arch/csky/mm/cachev1.c @@ -94,6 +94,11 @@ void icache_inv_all(void) cache_op_all(INS_CACHE|CACHE_INV, 0); } +void local_icache_inv_all(void *priv) +{ + cache_op_all(INS_CACHE|CACHE_INV, 0); +} + void dcache_wb_range(unsigned long start, unsigned long end) { cache_op_range(start, end, DATA_CACHE|CACHE_CLR, 0); diff --git a/arch/csky/mm/cachev2.c b/arch/csky/mm/cachev2.c index b61be6518e21..909fdd0f5995 100644 --- a/arch/csky/mm/cachev2.c +++ b/arch/csky/mm/cachev2.c @@ -3,15 +3,25 @@ #include #include +#include #include #include -inline void dcache_wb_line(unsigned long start) +#define INS_CACHE (1 << 0) +#define CACHE_INV (1 << 4) + +void local_icache_inv_all(void *priv) { - asm volatile("dcache.cval1 %0\n"::"r"(start):"memory"); + mtcr("cr17", INS_CACHE|CACHE_INV); sync_is(); } +void icache_inv_all(void) +{ + on_each_cpu(local_icache_inv_all, NULL, 1); +} + +#ifdef CONFIG_CPU_HAS_ICACHE_INS void icache_inv_range(unsigned long start, unsigned long end) { unsigned long i = start & ~(L1_CACHE_BYTES - 1); @@ -20,10 +30,16 @@ void icache_inv_range(unsigned long start, unsigned long end) asm volatile("icache.iva %0\n"::"r"(i):"memory"); sync_is(); } +#else +void icache_inv_range(unsigned long start, unsigned long end) +{ + icache_inv_all(); +} +#endif -void icache_inv_all(void) +inline void dcache_wb_line(unsigned long start) { - asm volatile("icache.ialls\n":::"memory"); + asm volatile("dcache.cval1 %0\n"::"r"(start):"memory"); sync_is(); } @@ -53,10 +69,7 @@ void cache_wbinv_range(unsigned long start, unsigned long end) asm volatile("dcache.cval1 %0\n"::"r"(i):"memory"); sync_is(); - i = start & ~(L1_CACHE_BYTES - 1); - for (; i < end; i += L1_CACHE_BYTES) - asm volatile("icache.iva %0\n"::"r"(i):"memory"); - sync_is(); + icache_inv_range(start, end); } EXPORT_SYMBOL(cache_wbinv_range); -- cgit v1.2.3 From a1176734132c630b50908c36563e05fb3599682c Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sat, 25 Jan 2020 00:37:09 +0800 Subject: csky: Remove unnecessary flush_icache_* implementation The abiv2 CPUs are all PIPT cache, so there is no need to implement flush_icache_page function. The function flush_icache_user_range hasn't been used, so just remove it. The function flush_cache_range is not necessary for PIPT cache when tlb mapping changed. Signed-off-by: Guo Ren --- arch/csky/abiv1/inc/abi/cacheflush.h | 3 --- arch/csky/abiv2/cacheflush.c | 23 ----------------------- arch/csky/abiv2/inc/abi/cacheflush.h | 13 ++----------- 3 files changed, 2 insertions(+), 37 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 79ef9e8c1afd..a73702704f38 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -49,9 +49,6 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, u #define flush_icache_page(vma, page) do {} while (0); #define flush_icache_range(start, end) cache_wbinv_range(start, end) -#define flush_icache_user_range(vma,page,addr,len) \ - flush_dcache_page(page) - #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c index 5bb887b275e1..f64b415f6fde 100644 --- a/arch/csky/abiv2/cacheflush.c +++ b/arch/csky/abiv2/cacheflush.c @@ -6,29 +6,6 @@ #include #include -void flush_icache_page(struct vm_area_struct *vma, struct page *page) -{ - unsigned long start; - - start = (unsigned long) kmap_atomic(page); - - cache_wbinv_range(start, start + PAGE_SIZE); - - kunmap_atomic((void *)start); -} - -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, - unsigned long vaddr, int len) -{ - unsigned long kaddr; - - kaddr = (unsigned long) kmap_atomic(page) + (vaddr & ~PAGE_MASK); - - cache_wbinv_range(kaddr, kaddr + len); - - kunmap_atomic((void *)kaddr); -} - void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index b8db5e0b2fe3..62a9031fffd8 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -13,25 +13,16 @@ #define flush_cache_all() do { } while (0) #define flush_cache_mm(mm) do { } while (0) #define flush_cache_dup_mm(mm) do { } while (0) - -#define flush_cache_range(vma, start, end) \ - do { \ - if (vma->vm_flags & VM_EXEC) \ - icache_inv_all(); \ - } while (0) - +#define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) +#define flush_icache_page(vma, page) do { } while (0) #define flush_icache_range(start, end) cache_wbinv_range(start, end) -void flush_icache_page(struct vm_area_struct *vma, struct page *page); -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, - unsigned long vaddr, int len); - #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) -- cgit v1.2.3 From d936a7e708dcf22344c4420e8b0e36f5d5f8c073 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 27 Jan 2020 01:20:36 +0800 Subject: csky: Enable defer flush_dcache_page for abiv2 cpus (807/810/860) Instead of flushing cache per update_mmu_cache() called, we use flush_dcache_page to reduce the frequency of flashing the cache. As abiv2 cpus are all PIPT for icache & dcache, we needn't handle dcache aliasing problem. But their icache can't snoop dcache, so we still need sync_icache_dcache in update_mmu_cache(). Signed-off-by: Guo Ren --- arch/csky/abiv2/cacheflush.c | 14 ++++++++------ arch/csky/abiv2/inc/abi/cacheflush.h | 12 ++++++++++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c index f64b415f6fde..ba469953a16e 100644 --- a/arch/csky/abiv2/cacheflush.c +++ b/arch/csky/abiv2/cacheflush.c @@ -9,20 +9,22 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { - unsigned long addr, pfn; + unsigned long addr; struct page *page; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) + page = pfn_to_page(pte_pfn(*pte)); + if (page == ZERO_PAGE(0)) return; - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) + if (test_and_set_bit(PG_dcache_clean, &page->flags)) return; addr = (unsigned long) kmap_atomic(page); - cache_wbinv_range(addr, addr + PAGE_SIZE); + dcache_wb_range(addr, addr + PAGE_SIZE); + + if (vma->vm_flags & VM_EXEC) + icache_inv_range(addr, addr + PAGE_SIZE); kunmap_atomic((void *) addr); } diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index 62a9031fffd8..acd7c6c55d61 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -15,8 +15,16 @@ #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) + +#define PG_dcache_clean PG_arch_1 + +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 +static inline void flush_dcache_page(struct page *page) +{ + if (test_bit(PG_dcache_clean, &page->flags)) + clear_bit(PG_dcache_clean, &page->flags); +} + #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_page(vma, page) do { } while (0) -- cgit v1.2.3 From cc1f6563a92ced0889775d0587316d725b6e1a68 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 27 Jan 2020 19:57:29 +0800 Subject: csky: Optimize abiv2 copy_to_user_page with VM_EXEC Only when vma is for VM_EXEC, we need sync dcache & icache. eg: - gdb ptrace modify user space instruction code area. Add VM_EXEC condition to reduce unnecessary cache flush. The abiv1 cpus' cache are all VIPT, so we still need to deal with dcache aliasing problem. But there is optimized way to use cache color, just like what's done in arch/csky/abiv1/inc/abi/page.h. Signed-off-by: Guo Ren --- arch/csky/abiv2/inc/abi/cacheflush.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index acd7c6c55d61..28b7c3233175 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -37,7 +37,9 @@ static inline void flush_dcache_page(struct page *page) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ - cache_wbinv_range((unsigned long)dst, (unsigned long)dst + len); \ + if (vma->vm_flags & VM_EXEC) \ + cache_wbinv_range((unsigned long)dst, \ + (unsigned long)dst + len); \ } while (0) #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ memcpy(dst, src, len) -- cgit v1.2.3 From 997153b9a75c08d545ad45e6f8ceb432435d2425 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Fri, 31 Jan 2020 20:33:10 +0800 Subject: csky: Add flush_icache_mm to defer flush icache all Some CPUs don't support icache.va instruction to maintain the whole smp cores' icache. Using icache.all + IPI casue a lot on performace and using defer mechanism could reduce the number of calling icache _flush_all functions. Signed-off-by: Guo Ren --- arch/csky/abiv1/inc/abi/cacheflush.h | 2 ++ arch/csky/abiv2/cacheflush.c | 55 ++++++++++++++++++++++++++++++++++++ arch/csky/abiv2/inc/abi/cacheflush.h | 14 +++++++-- arch/csky/include/asm/cacheflush.h | 1 + arch/csky/include/asm/mmu.h | 1 + arch/csky/include/asm/mmu_context.h | 2 ++ arch/csky/mm/syscache.c | 13 ++++----- 7 files changed, 77 insertions(+), 11 deletions(-) diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index a73702704f38..d3e04208d53c 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -48,6 +48,8 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, u #define flush_icache_page(vma, page) do {} while (0); #define flush_icache_range(start, end) cache_wbinv_range(start, end) +#define flush_icache_mm_range(mm, start, end) cache_wbinv_range(start, end) +#define flush_icache_deferred(mm) do {} while (0); #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ do { \ diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c index ba469953a16e..790f1ebfba44 100644 --- a/arch/csky/abiv2/cacheflush.c +++ b/arch/csky/abiv2/cacheflush.c @@ -28,3 +28,58 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, kunmap_atomic((void *) addr); } + +void flush_icache_deferred(struct mm_struct *mm) +{ + unsigned int cpu = smp_processor_id(); + cpumask_t *mask = &mm->context.icache_stale_mask; + + if (cpumask_test_cpu(cpu, mask)) { + cpumask_clear_cpu(cpu, mask); + /* + * Ensure the remote hart's writes are visible to this hart. + * This pairs with a barrier in flush_icache_mm. + */ + smp_mb(); + local_icache_inv_all(NULL); + } +} + +void flush_icache_mm_range(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + unsigned int cpu; + cpumask_t others, *mask; + + preempt_disable(); + +#ifdef CONFIG_CPU_HAS_ICACHE_INS + if (mm == current->mm) { + icache_inv_range(start, end); + preempt_enable(); + return; + } +#endif + + /* Mark every hart's icache as needing a flush for this MM. */ + mask = &mm->context.icache_stale_mask; + cpumask_setall(mask); + + /* Flush this hart's I$ now, and mark it as flushed. */ + cpu = smp_processor_id(); + cpumask_clear_cpu(cpu, mask); + local_icache_inv_all(NULL); + + /* + * Flush the I$ of other harts concurrently executing, and mark them as + * flushed. + */ + cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu)); + + if (mm != current->active_mm || !cpumask_empty(&others)) { + on_each_cpu_mask(&others, local_icache_inv_all, NULL, 1); + cpumask_clear(mask); + } + + preempt_enable(); +} diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index 28b7c3233175..a565e00c3f70 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -31,15 +31,23 @@ static inline void flush_dcache_page(struct page *page) #define flush_icache_range(start, end) cache_wbinv_range(start, end) +void flush_icache_mm_range(struct mm_struct *mm, + unsigned long start, unsigned long end); +void flush_icache_deferred(struct mm_struct *mm); + #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ - if (vma->vm_flags & VM_EXEC) \ - cache_wbinv_range((unsigned long)dst, \ - (unsigned long)dst + len); \ + if (vma->vm_flags & VM_EXEC) { \ + dcache_wb_range((unsigned long)dst, \ + (unsigned long)dst + len); \ + flush_icache_mm_range(current->mm, \ + (unsigned long)dst, \ + (unsigned long)dst + len); \ + } \ } while (0) #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ memcpy(dst, src, len) diff --git a/arch/csky/include/asm/cacheflush.h b/arch/csky/include/asm/cacheflush.h index a96da67261ae..f0b8f25429a2 100644 --- a/arch/csky/include/asm/cacheflush.h +++ b/arch/csky/include/asm/cacheflush.h @@ -4,6 +4,7 @@ #ifndef __ASM_CSKY_CACHEFLUSH_H #define __ASM_CSKY_CACHEFLUSH_H +#include #include #endif /* __ASM_CSKY_CACHEFLUSH_H */ diff --git a/arch/csky/include/asm/mmu.h b/arch/csky/include/asm/mmu.h index b382a14ea4ec..26fbb1d15df0 100644 --- a/arch/csky/include/asm/mmu.h +++ b/arch/csky/include/asm/mmu.h @@ -7,6 +7,7 @@ typedef struct { atomic64_t asid; void *vdso; + cpumask_t icache_stale_mask; } mm_context_t; #endif /* __ASM_CSKY_MMU_H */ diff --git a/arch/csky/include/asm/mmu_context.h b/arch/csky/include/asm/mmu_context.h index 0285b0ad18b6..abdf1f1cb6ec 100644 --- a/arch/csky/include/asm/mmu_context.h +++ b/arch/csky/include/asm/mmu_context.h @@ -43,5 +43,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, TLBMISS_HANDLER_SETUP_PGD(next->pgd); write_mmu_entryhi(next->context.asid.counter); + + flush_icache_deferred(next); } #endif /* __ASM_CSKY_MMU_CONTEXT_H */ diff --git a/arch/csky/mm/syscache.c b/arch/csky/mm/syscache.c index c4645e4e97f4..ffade2f9a4c8 100644 --- a/arch/csky/mm/syscache.c +++ b/arch/csky/mm/syscache.c @@ -3,7 +3,7 @@ #include #include -#include +#include #include SYSCALL_DEFINE3(cacheflush, @@ -13,17 +13,14 @@ SYSCALL_DEFINE3(cacheflush, { switch (cache) { case ICACHE: - icache_inv_range((unsigned long)addr, - (unsigned long)addr + bytes); - break; + case BCACHE: + flush_icache_mm_range(current->mm, + (unsigned long)addr, + (unsigned long)addr + bytes); case DCACHE: dcache_wb_range((unsigned long)addr, (unsigned long)addr + bytes); break; - case BCACHE: - cache_wbinv_range((unsigned long)addr, - (unsigned long)addr + bytes); - break; default: return -EINVAL; } -- cgit v1.2.3 From 359ae00d12589c31cf103894d0f32588d523ca83 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 2 Feb 2020 09:58:42 +0800 Subject: csky: Fixup ftrace modify panic During ftrace init, linux will replace all function prologues (call_mcout) with nops, but it need flush_dcache and invalidate_icache to make it work. So flush_cache functions couldn't be nested called by ftrace framework. Signed-off-by: Guo Ren --- arch/csky/mm/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/csky/mm/Makefile b/arch/csky/mm/Makefile index 2c51918eb4fc..6e7696e55f71 100644 --- a/arch/csky/mm/Makefile +++ b/arch/csky/mm/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only ifeq ($(CONFIG_CPU_HAS_CACHEV2),y) obj-y += cachev2.o +CFLAGS_REMOVE_cachev2.o = $(CC_FLAGS_FTRACE) else obj-y += cachev1.o +CFLAGS_REMOVE_cachev1.o = $(CC_FLAGS_FTRACE) endif obj-y += dma-mapping.o -- cgit v1.2.3 From 9025fd48a8aeddade845afa353d4bbab7f19dbf2 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 2 Feb 2020 10:58:38 +0800 Subject: csky: Remove unused cache implementation Only for coding convention, these codes are unnecessary for abiv2. Signed-off-by: Guo Ren --- arch/csky/mm/cachev2.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/arch/csky/mm/cachev2.c b/arch/csky/mm/cachev2.c index 909fdd0f5995..bc419f8039d3 100644 --- a/arch/csky/mm/cachev2.c +++ b/arch/csky/mm/cachev2.c @@ -52,23 +52,9 @@ void dcache_wb_range(unsigned long start, unsigned long end) sync_is(); } -void dcache_inv_range(unsigned long start, unsigned long end) -{ - unsigned long i = start & ~(L1_CACHE_BYTES - 1); - - for (; i < end; i += L1_CACHE_BYTES) - asm volatile("dcache.civa %0\n"::"r"(i):"memory"); - sync_is(); -} - void cache_wbinv_range(unsigned long start, unsigned long end) { - unsigned long i = start & ~(L1_CACHE_BYTES - 1); - - for (; i < end; i += L1_CACHE_BYTES) - asm volatile("dcache.cval1 %0\n"::"r"(i):"memory"); - sync_is(); - + dcache_wb_range(start, end); icache_inv_range(start, end); } EXPORT_SYMBOL(cache_wbinv_range); -- cgit v1.2.3 From 2305f60b76110cb3e8658a4ae85d1f7eb0c66a5b Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 2 Feb 2020 14:11:11 +0800 Subject: csky: Fixup compile warning for three unimplemented syscalls Implement fstat64, fstatat64, clone3 syscalls to fixup checksyscalls.sh compile warnings. Signed-off-by: Guo Ren --- arch/csky/include/uapi/asm/unistd.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h index 211c983c7282..ba4018929733 100644 --- a/arch/csky/include/uapi/asm/unistd.h +++ b/arch/csky/include/uapi/asm/unistd.h @@ -1,7 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. +#define __ARCH_WANT_STAT64 +#define __ARCH_WANT_NEW_STAT #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_TIME32_SYSCALLS #include -- cgit v1.2.3 From bebd26ab623616728d6e72b5c74a47bfff5287d8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 31 Jan 2020 17:52:30 -0800 Subject: arch/csky: fix some Kconfig typos Fix wording in help text for the CPU_HAS_LDSTEX symbol. Signed-off-by: Randy Dunlap Signed-off-by: Guo Ren Signed-off-by: Guo Ren --- arch/csky/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 111474b50f69..f1e73543feec 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -76,7 +76,7 @@ config CPU_HAS_TLBI config CPU_HAS_LDSTEX bool help - For SMP, CPU needs "ldex&stex" instrcutions to atomic operations. + For SMP, CPU needs "ldex&stex" instructions for atomic operations. config CPU_NEED_TLBSYNC bool -- cgit v1.2.3 From 4ec575b78521b4573e85aec451930cb040582455 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Jan 2020 20:22:40 +0100 Subject: csky: Cleanup old Kconfig options CONFIG_CLKSRC_OF is gone since commit bb0eb050a577 ("clocksource/drivers: Rename CLKSRC_OF to TIMER_OF"). The platform already selects TIMER_OF. CONFIG_HAVE_DMA_API_DEBUG is gone since commit 6e88628d03dd ("dma-debug: remove CONFIG_HAVE_DMA_API_DEBUG"). CONFIG_DEFAULT_DEADLINE is gone since commit f382fb0bcef4 ("block: remove legacy IO schedulers"). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Guo Ren --- arch/csky/Kconfig | 2 -- arch/csky/configs/defconfig | 1 - 2 files changed, 3 deletions(-) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index f1e73543feec..bf246b036dee 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -9,7 +9,6 @@ config CSKY select ARCH_USE_QUEUED_RWLOCKS if NR_CPUS>2 select COMMON_CLK select CLKSRC_MMIO - select CLKSRC_OF select CSKY_MPINTC if CPU_CK860 select CSKY_MP_TIMER if CPU_CK860 select CSKY_APB_INTC @@ -47,7 +46,6 @@ config CSKY select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select HAVE_DMA_API_DEBUG select HAVE_DMA_CONTIGUOUS select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/csky/configs/defconfig b/arch/csky/configs/defconfig index 7ef42895dfb0..3b540539dbe6 100644 --- a/arch/csky/configs/defconfig +++ b/arch/csky/configs/defconfig @@ -10,7 +10,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_DEFAULT_DEADLINE=y CONFIG_CPU_CK807=y CONFIG_CPU_HAS_FPU=y CONFIG_NET=y -- cgit v1.2.3 From d46869aaab7981f9153d3271228005a52ef18591 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 8 Oct 2019 14:25:13 +0800 Subject: csky: Add setup_initrd check code We should give some necessary check for initrd just like other architectures and it seems that setup_initrd() could be a common code for all architectures. Signed-off-by: Guo Ren --- arch/csky/kernel/setup.c | 3 --- arch/csky/mm/init.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index 4fc7b16b77d2..3821e55742f4 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -47,9 +47,6 @@ static void __init csky_memblock_init(void) signed long size; memblock_reserve(__pa(_stext), _end - _stext); -#ifdef CONFIG_BLK_DEV_INITRD - memblock_reserve(__pa(initrd_start), initrd_end - initrd_start); -#endif early_init_fdt_reserve_self(); early_init_fdt_scan_reserved_mem(); diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index dbb4b2dfe4b7..cb64d8647a78 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -36,6 +37,45 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); +#ifdef CONFIG_BLK_DEV_INITRD +static void __init setup_initrd(void) +{ + unsigned long size; + + if (initrd_start >= initrd_end) { + pr_err("initrd not found or empty"); + goto disable; + } + + if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { + pr_err("initrd extends beyond end of memory"); + goto disable; + } + + size = initrd_end - initrd_start; + + if (memblock_is_region_reserved(__pa(initrd_start), size)) { + pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region", + __pa(initrd_start), size); + goto disable; + } + + memblock_reserve(__pa(initrd_start), size); + + pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", + (void *)(initrd_start), size); + + initrd_below_start_ok = 1; + + return; + +disable: + initrd_start = initrd_end = 0; + + pr_err(" - disabling initrd\n"); +} +#endif + void __init mem_init(void) { #ifdef CONFIG_HIGHMEM @@ -47,6 +87,10 @@ void __init mem_init(void) #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); +#ifdef CONFIG_BLK_DEV_INITRD + setup_initrd(); +#endif + memblock_free_all(); #ifdef CONFIG_HIGHMEM -- cgit v1.2.3 From dc2efc0028dd5c4bf0f7324806c9a196958aaad3 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Sun, 2 Feb 2020 17:56:58 +0800 Subject: csky: Minimize defconfig to support buildroot config.fragment Some bsp (eg: buildroot) has defconfig.fragment design to add more configs into the defconfig in linux source code tree. For example, we could put different cpu configs into different defconfig.fragments, but they all use the same defconfig in Linux. Signed-off-by: Ma Jun Signed-off-by: Guo Ren --- arch/csky/configs/defconfig | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/csky/configs/defconfig b/arch/csky/configs/defconfig index 3b540539dbe6..af722e4dfb47 100644 --- a/arch/csky/configs/defconfig +++ b/arch/csky/configs/defconfig @@ -10,8 +10,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y -CONFIG_CPU_CK807=y -CONFIG_CPU_HAS_FPU=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -26,10 +24,7 @@ CONFIG_SERIAL_NONSTANDARD=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y -CONFIG_TTY_PRINTK=y # CONFIG_VGA_CONSOLE is not set -CONFIG_CSKY_MPTIMER=y -CONFIG_GX6605S_TIMER=y CONFIG_PM_DEVFREQ=y CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y CONFIG_DEVFREQ_GOV_PERFORMANCE=y @@ -55,6 +50,4 @@ CONFIG_CRAMFS=y CONFIG_ROMFS_FS=y CONFIG_NFS_FS=y CONFIG_PRINTK_TIME=y -CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y -- cgit v1.2.3 From 5b49c82dadfe0f3741778f57385f964ec1514863 Mon Sep 17 00:00:00 2001 From: MaJun Date: Mon, 27 Jan 2020 10:56:21 +0800 Subject: csky: Add PCI support Add the pci related code for csky arch to support basic pci virtual function, such as qemu virt-pci-9pfs. Signed-off-by: MaJun Signed-off-by: Guo Ren --- arch/csky/Kconfig | 5 +++++ arch/csky/include/asm/Kbuild | 1 - arch/csky/include/asm/pci.h | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 arch/csky/include/asm/pci.h diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index bf246b036dee..72b2999a889a 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -58,6 +58,11 @@ config CSKY select TIMER_OF select USB_ARCH_HAS_EHCI select USB_ARCH_HAS_OHCI + select GENERIC_PCI_IOMAP + select HAVE_PCI + select PCI_DOMAINS_GENERIC if PCI + select PCI_SYSCALL if PCI + select PCI_MSI if PCI config CPU_HAS_CACHEV2 bool diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index bc15a26c782f..4130e3eaa766 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -28,7 +28,6 @@ generic-y += local64.h generic-y += mm-arch-hooks.h generic-y += mmiowb.h generic-y += module.h -generic-y += pci.h generic-y += percpu.h generic-y += preempt.h generic-y += qrwlock.h diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h new file mode 100644 index 000000000000..ebc765b1f78b --- /dev/null +++ b/arch/csky/include/asm/pci.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ASM_CSKY_PCI_H +#define __ASM_CSKY_PCI_H + +#include +#include +#include + +#include + +#define PCIBIOS_MIN_IO 0 +#define PCIBIOS_MIN_MEM 0 + +/* C-SKY shim does not initialize PCI bus */ +#define pcibios_assign_all_busses() 1 + +extern int isa_dma_bridge_buggy; + +#ifdef CONFIG_PCI +static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) +{ + /* no legacy IRQ on csky */ + return -ENODEV; +} + +static inline int pci_proc_domain(struct pci_bus *bus) +{ + /* always show the domain in /proc */ + return 1; +} +#endif /* CONFIG_PCI */ + +#endif /* __ASM_CSKY_PCI_H */ -- cgit v1.2.3 From 0b9f386c4be6493d282aab0af6f9b70c62142777 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 12 Feb 2020 10:24:52 +0800 Subject: csky: Implement copy_thread_tls This is required for clone3 which passes the TLS value through a struct rather than a register. Cc: Amanieu d'Antras Signed-off-by: Guo Ren --- arch/csky/Kconfig | 1 + arch/csky/kernel/process.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 72b2999a889a..047427f71d83 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -36,6 +36,7 @@ config CSKY select GX6605S_TIMER if CPU_CK610 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_AUDITSYSCALL + select HAVE_COPY_THREAD_TLS select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 5349cd8c0f30..f7b231ca269a 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -40,10 +40,11 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return sw->r15; } -int copy_thread(unsigned long clone_flags, +int copy_thread_tls(unsigned long clone_flags, unsigned long usp, unsigned long kthread_arg, - struct task_struct *p) + struct task_struct *p, + unsigned long tls) { struct switch_stack *childstack; struct pt_regs *childregs = task_pt_regs(p); @@ -70,7 +71,7 @@ int copy_thread(unsigned long clone_flags, childregs->usp = usp; if (clone_flags & CLONE_SETTLS) task_thread_info(p)->tp_value = childregs->tls - = childregs->regs[0]; + = tls; childregs->a0 = 0; childstack->r15 = (unsigned long) ret_from_fork; -- cgit v1.2.3 From 4c5fd3b791a06021084b42d5610400f846d206b5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Feb 2020 17:28:21 -0800 Subject: zonefs: fix documentation typos etc. Fix typos, spellos, etc. in zonefs.txt. Signed-off-by: Randy Dunlap Cc: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Signed-off-by: Damien Le Moal --- Documentation/filesystems/zonefs.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/zonefs.txt b/Documentation/filesystems/zonefs.txt index 935bf22031ca..d54fa98ac158 100644 --- a/Documentation/filesystems/zonefs.txt +++ b/Documentation/filesystems/zonefs.txt @@ -134,7 +134,7 @@ Sequential zone files can only be written sequentially, starting from the file end, that is, write operations can only be append writes. Zonefs makes no attempt at accepting random writes and will fail any write request that has a start offset not corresponding to the end of the file, or to the end of the last -write issued and still in-flight (for asynchrnous I/O operations). +write issued and still in-flight (for asynchronous I/O operations). Since dirty page writeback by the page cache does not guarantee a sequential write pattern, zonefs prevents buffered writes and writeable shared mappings @@ -142,7 +142,7 @@ on sequential files. Only direct I/O writes are accepted for these files. zonefs relies on the sequential delivery of write I/O requests to the device implemented by the block layer elevator. An elevator implementing the sequential write feature for zoned block device (ELEVATOR_F_ZBD_SEQ_WRITE elevator feature) -must be used. This type of elevator (e.g. mq-deadline) is the set by default +must be used. This type of elevator (e.g. mq-deadline) is set by default for zoned block devices on device initialization. There are no restrictions on the type of I/O used for read operations in @@ -196,7 +196,7 @@ additional conditions that result in I/O errors. may still happen in the case of a partial failure of a very large direct I/O operation split into multiple BIOs/requests or asynchronous I/O operations. If one of the write request within the set of sequential write requests - issued to the device fails, all write requests after queued after it will + issued to the device fails, all write requests queued after it will become unaligned and fail. * Delayed write errors: similarly to regular block devices, if the device side @@ -207,7 +207,7 @@ additional conditions that result in I/O errors. causing all data to be dropped after the sector that caused the error. All I/O errors detected by zonefs are notified to the user with an error code -return for the system call that trigered or detected the error. The recovery +return for the system call that triggered or detected the error. The recovery actions taken by zonefs in response to I/O errors depend on the I/O type (read vs write) and on the reason for the error (bad sector, unaligned writes or zone condition change). @@ -222,7 +222,7 @@ condition change). * A zone condition change to read-only or offline also always triggers zonefs I/O error recovery. -Zonefs minimal I/O error recovery may change a file size and a file access +Zonefs minimal I/O error recovery may change a file size and file access permissions. * File size changes: @@ -237,7 +237,7 @@ permissions. A file size may also be reduced to reflect a delayed write error detected on fsync(): in this case, the amount of data effectively written in the zone may be less than originally indicated by the file inode size. After such I/O - error, zonefs always fixes a file inode size to reflect the amount of data + error, zonefs always fixes the file inode size to reflect the amount of data persistently stored in the file zone. * Access permission changes: @@ -281,11 +281,11 @@ Further notes: permissions to read-only applies to all files. The file system is remounted read-only. * Access permission and file size changes due to the device transitioning zones - to the offline condition are permanent. Remounting or reformating the device + to the offline condition are permanent. Remounting or reformatting the device with mkfs.zonefs (mkzonefs) will not change back offline zone files to a good state. * File access permission changes to read-only due to the device transitioning - zones to the read-only condition are permanent. Remounting or reformating + zones to the read-only condition are permanent. Remounting or reformatting the device will not re-enable file write access. * File access permission changes implied by the remount-ro, zone-ro and zone-offline mount options are temporary for zones in a good condition. @@ -301,13 +301,13 @@ Mount options zonefs define the "errors=" mount option to allow the user to specify zonefs behavior in response to I/O errors, inode size inconsistencies or zone -condition chages. The defined behaviors are as follow: +condition changes. The defined behaviors are as follow: * remount-ro (default) * zone-ro * zone-offline * repair -The I/O error actions defined for each behavior is detailed in the previous +The I/O error actions defined for each behavior are detailed in the previous section. Zonefs User Space Tools -- cgit v1.2.3 From 2546287c5fb363a0165933ae2181c92f03e701d0 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 21 Feb 2020 10:07:25 +0800 Subject: genirq/irqdomain: Make sure all irq domain flags are distinct This was noticed when printing debugfs for MSIs on my ARM64 server. The new dstate IRQD_MSI_NOMASK_QUIRK came out surprisingly while it should only be the x86 stuff for the time being... The new MSI quirk flag uses the same bit as IRQ_DOMAIN_NAME_ALLOCATED which is oddly defined as bit 6 for no good reason. Switch it to the non used bit 1. Fixes: 6f1a4891a592 ("x86/apic/msi: Plug non-maskable MSI affinity race") Signed-off-by: Zenghui Yu Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200221020725.2038-1-yuzenghui@huawei.com --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index b2d47571ab67..8d062e86d954 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -192,7 +192,7 @@ enum { IRQ_DOMAIN_FLAG_HIERARCHY = (1 << 0), /* Irq domain name was allocated in __irq_domain_add() */ - IRQ_DOMAIN_NAME_ALLOCATED = (1 << 6), + IRQ_DOMAIN_NAME_ALLOCATED = (1 << 1), /* Irq domain is an IPI domain with virq per cpu */ IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2), -- cgit v1.2.3 From 3b7830904e17202524bad1974505a9bfc718d31f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 20 Feb 2020 13:29:53 -0700 Subject: nvme-multipath: Fix memory leak with ana_log_buf kmemleak reports a memory leak with the ana_log_buf allocated by nvme_mpath_init(): unreferenced object 0xffff888120e94000 (size 8208): comm "nvme", pid 6884, jiffies 4295020435 (age 78786.312s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ 01 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000e2360188>] kmalloc_order+0x97/0xc0 [<0000000079b18dd4>] kmalloc_order_trace+0x24/0x100 [<00000000f50c0406>] __kmalloc+0x24c/0x2d0 [<00000000f31a10b9>] nvme_mpath_init+0x23c/0x2b0 [<000000005802589e>] nvme_init_identify+0x75f/0x1600 [<0000000058ef911b>] nvme_loop_configure_admin_queue+0x26d/0x280 [<00000000673774b9>] nvme_loop_create_ctrl+0x2a7/0x710 [<00000000f1c7a233>] nvmf_dev_write+0xc66/0x10b9 [<000000004199f8d0>] __vfs_write+0x50/0xa0 [<0000000065466fef>] vfs_write+0xf3/0x280 [<00000000b0db9a8b>] ksys_write+0xc6/0x160 [<0000000082156b91>] __x64_sys_write+0x43/0x50 [<00000000c34fbb6d>] do_syscall_64+0x77/0x2f0 [<00000000bbc574c9>] entry_SYSCALL_64_after_hwframe+0x49/0xbe nvme_mpath_init() is called by nvme_init_identify() which is called in multiple places (nvme_reset_work(), nvme_passthru_end(), etc). This means nvme_mpath_init() may be called multiple times before nvme_mpath_uninit() (which is only called on nvme_free_ctrl()). When nvme_mpath_init() is called multiple times, it overwrites the ana_log_buf pointer with a new allocation, thus leaking the previous allocation. To fix this, free ana_log_buf before allocating a new one. Fixes: 0d0b660f214dc490 ("nvme: add ANA support") Cc: Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 797c18337d96..a11900cf3a36 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -715,6 +715,7 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) } INIT_WORK(&ctrl->ana_work, nvme_ana_work); + kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); if (!ctrl->ana_log_buf) { error = -ENOMEM; -- cgit v1.2.3 From a5ae50dea9111db63d30d700766dd5509602f7ad Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Feb 2020 13:29:49 +0000 Subject: Btrfs: fix deadlock during fast fsync when logging prealloc extents beyond eof While logging the prealloc extents of an inode during a fast fsync we call btrfs_truncate_inode_items(), through btrfs_log_prealloc_extents(), while holding a read lock on a leaf of the inode's root (not the log root, the fs/subvol root), and then that function locks the file range in the inode's iotree. This can lead to a deadlock when: * the fsync is ranged * the file has prealloc extents beyond eof * writeback for a range different from the fsync range starts during the fsync * the size of the file is not sector size aligned Because when finishing an ordered extent we lock first a file range and then try to COW the fs/subvol tree to insert an extent item. The following diagram shows how the deadlock can happen. CPU 1 CPU 2 btrfs_sync_file() --> for range [0, 1MiB) --> inode has a size of 1MiB and has 1 prealloc extent beyond the i_size, starting at offset 4MiB flushes all delalloc for the range [0MiB, 1MiB) and waits for the respective ordered extents to complete --> before task at CPU 1 locks the inode, a write into file range [1MiB, 2MiB + 1KiB) is made --> i_size is updated to 2MiB + 1KiB --> writeback is started for that range, [1MiB, 2MiB + 4KiB) --> end offset rounded up to be sector size aligned btrfs_log_dentry_safe() btrfs_log_inode_parent() btrfs_log_inode() btrfs_log_changed_extents() btrfs_log_prealloc_extents() --> does a search on the inode's root --> holds a read lock on leaf X btrfs_finish_ordered_io() --> locks range [1MiB, 2MiB + 4KiB) --> end offset rounded up to be sector size aligned --> tries to cow leaf X, through insert_reserved_file_extent() --> already locked by the task at CPU 1 btrfs_truncate_inode_items() --> gets an i_size of 2MiB + 1KiB, which is not sector size aligned --> tries to lock file range [2MiB, (u64)-1) --> the start range is rounded down from 2MiB + 1K to 2MiB to be sector size aligned --> but the subrange [2MiB, 2MiB + 4KiB) is already locked by task at CPU 2 which is waiting to get a write lock on leaf X for which we are holding a read lock *** deadlock *** This results in a stack trace like the following, triggered by test case generic/561 from fstests: [ 2779.973608] INFO: task kworker/u8:6:247 blocked for more than 120 seconds. [ 2779.979536] Not tainted 5.6.0-rc2-btrfs-next-53 #1 [ 2779.984503] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 2779.990136] kworker/u8:6 D 0 247 2 0x80004000 [ 2779.990457] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] [ 2779.990466] Call Trace: [ 2779.990491] ? __schedule+0x384/0xa30 [ 2779.990521] schedule+0x33/0xe0 [ 2779.990616] btrfs_tree_read_lock+0x19e/0x2e0 [btrfs] [ 2779.990632] ? remove_wait_queue+0x60/0x60 [ 2779.990730] btrfs_read_lock_root_node+0x2f/0x40 [btrfs] [ 2779.990782] btrfs_search_slot+0x510/0x1000 [btrfs] [ 2779.990869] btrfs_lookup_file_extent+0x4a/0x70 [btrfs] [ 2779.990944] __btrfs_drop_extents+0x161/0x1060 [btrfs] [ 2779.990987] ? mark_held_locks+0x6d/0xc0 [ 2779.990994] ? __slab_alloc.isra.49+0x99/0x100 [ 2779.991060] ? insert_reserved_file_extent.constprop.19+0x64/0x300 [btrfs] [ 2779.991145] insert_reserved_file_extent.constprop.19+0x97/0x300 [btrfs] [ 2779.991222] ? start_transaction+0xdd/0x5c0 [btrfs] [ 2779.991291] btrfs_finish_ordered_io+0x4f4/0x840 [btrfs] [ 2779.991405] btrfs_work_helper+0xaa/0x720 [btrfs] [ 2779.991432] process_one_work+0x26d/0x6a0 [ 2779.991460] worker_thread+0x4f/0x3e0 [ 2779.991481] ? process_one_work+0x6a0/0x6a0 [ 2779.991489] kthread+0x103/0x140 [ 2779.991499] ? kthread_create_worker_on_cpu+0x70/0x70 [ 2779.991515] ret_from_fork+0x3a/0x50 (...) [ 2780.026211] INFO: task fsstress:17375 blocked for more than 120 seconds. [ 2780.027480] Not tainted 5.6.0-rc2-btrfs-next-53 #1 [ 2780.028482] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 2780.030035] fsstress D 0 17375 17373 0x00004000 [ 2780.030038] Call Trace: [ 2780.030044] ? __schedule+0x384/0xa30 [ 2780.030052] schedule+0x33/0xe0 [ 2780.030075] lock_extent_bits+0x20c/0x320 [btrfs] [ 2780.030094] ? btrfs_truncate_inode_items+0xf4/0x1150 [btrfs] [ 2780.030098] ? rcu_read_lock_sched_held+0x59/0xa0 [ 2780.030102] ? remove_wait_queue+0x60/0x60 [ 2780.030122] btrfs_truncate_inode_items+0x133/0x1150 [btrfs] [ 2780.030151] ? btrfs_set_path_blocking+0xb2/0x160 [btrfs] [ 2780.030165] ? btrfs_search_slot+0x379/0x1000 [btrfs] [ 2780.030195] btrfs_log_changed_extents.isra.8+0x841/0x93e [btrfs] [ 2780.030202] ? do_raw_spin_unlock+0x49/0xc0 [ 2780.030215] ? btrfs_get_num_csums+0x10/0x10 [btrfs] [ 2780.030239] btrfs_log_inode+0xf83/0x1124 [btrfs] [ 2780.030251] ? __mutex_unlock_slowpath+0x45/0x2a0 [ 2780.030275] btrfs_log_inode_parent+0x2a0/0xe40 [btrfs] [ 2780.030282] ? dget_parent+0xa1/0x370 [ 2780.030309] btrfs_log_dentry_safe+0x4a/0x70 [btrfs] [ 2780.030329] btrfs_sync_file+0x3f3/0x490 [btrfs] [ 2780.030339] do_fsync+0x38/0x60 [ 2780.030343] __x64_sys_fdatasync+0x13/0x20 [ 2780.030345] do_syscall_64+0x5c/0x280 [ 2780.030348] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 2780.030356] RIP: 0033:0x7f2d80f6d5f0 [ 2780.030361] Code: Bad RIP value. [ 2780.030362] RSP: 002b:00007ffdba3c8548 EFLAGS: 00000246 ORIG_RAX: 000000000000004b [ 2780.030364] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f2d80f6d5f0 [ 2780.030365] RDX: 00007ffdba3c84b0 RSI: 00007ffdba3c84b0 RDI: 0000000000000003 [ 2780.030367] RBP: 000000000000004a R08: 0000000000000001 R09: 00007ffdba3c855c [ 2780.030368] R10: 0000000000000078 R11: 0000000000000246 R12: 00000000000001f4 [ 2780.030369] R13: 0000000051eb851f R14: 00007ffdba3c85f0 R15: 0000557a49220d90 So fix this by making btrfs_truncate_inode_items() not lock the range in the inode's iotree when the target root is a log root, since it's not needed to lock the range for log roots as the protection from the inode's lock and log_mutex are all that's needed. Fixes: 28553fa992cb28 ("Btrfs: fix race between shrinking truncate and fiemap") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4f47ba652b31..1ccb3f8d528d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4103,8 +4103,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = READA_BACK; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); /* * We want to drop from the next block forward in case this new size is @@ -4368,11 +4369,10 @@ out: if (!ret && last_size > new_size) last_size = new_size; btrfs_ordered_update_i_size(inode, last_size, NULL); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, + (u64)-1, &cached_state); } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); - btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 7143b5ac5750f404ff3a594b34fdf3fc2f99f828 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 21 Feb 2020 16:42:16 +0100 Subject: io_uring: prevent sq_thread from spinning when it should stop This patch drops 'cur_mm' before calling cond_resched(), to prevent the sq_thread from spinning even when the user process is finished. Before this patch, if the user process ended without closing the io_uring fd, the sq_thread continues to spin until the 'sq_thread_idle' timeout ends. In the worst case where the 'sq_thread_idle' parameter is bigger than INT_MAX, the sq_thread will spin forever. Fixes: 6c271ce2f1d5 ("io_uring: add submission polling") Signed-off-by: Stefano Garzarella Signed-off-by: Jens Axboe --- fs/io_uring.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e249aa97ba3..b43467b3a8dc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5142,6 +5142,18 @@ static int io_sq_thread(void *data) * to enter the kernel to reap and flush events. */ if (!to_submit || ret == -EBUSY) { + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). Do this before + * adding ourselves to the waitqueue, as the unuse/drop + * may sleep. + */ + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + cur_mm = NULL; + } + /* * We're polling. If we're within the defined idle * period, then let us spin without work before going @@ -5156,18 +5168,6 @@ static int io_sq_thread(void *data) continue; } - /* - * Drop cur_mm before scheduling, we can't hold it for - * long periods (or over schedule()). Do this before - * adding ourselves to the waitqueue, as the unuse/drop - * may sleep. - */ - if (cur_mm) { - unuse_mm(cur_mm); - mmput(cur_mm); - cur_mm = NULL; - } - prepare_to_wait(&ctx->sqo_wait, &wait, TASK_INTERRUPTIBLE); -- cgit v1.2.3 From e61d2392253220477813b43412090dccdcac601f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 20 Feb 2020 06:29:48 -0800 Subject: hwmon: (w83627ehf) Fix crash seen with W83627DHG-P Loading the driver on a system with W83627DHG-P crashes as follows. w83627ehf: Found W83627DHG-P chip at 0x290 BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 0 PID: 604 Comm: sensors Not tainted 5.6.0-rc2-00055-gca7e1fd1026c #29 Hardware name: /D425KT, BIOS MWPNT10N.86A.0132.2013.0726.1534 07/26/2013 RIP: 0010:w83627ehf_read_string+0x27/0x70 [w83627ehf] Code: [... ] RSP: 0018:ffffb95980657df8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff96caaa7f5218 RCX: 0000000000000000 RDX: 0000000000000015 RSI: 0000000000000001 RDI: ffff96caa736ec08 RBP: 0000000000000000 R08: ffffb95980657e20 R09: 0000000000000001 R10: ffff96caaa635cc0 R11: 0000000000000000 R12: ffff96caa9f7cf00 R13: ffff96caa9ec3d00 R14: ffff96caa9ec3d28 R15: ffff96caa9ec3d40 FS: 00007fbc7c4e2740(0000) GS:ffff96caabc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000129d58000 CR4: 00000000000006f0 Call Trace: ? cp_new_stat+0x12d/0x160 hwmon_attr_show_string+0x37/0x70 [hwmon] dev_attr_show+0x14/0x50 sysfs_kf_seq_show+0xb5/0x1b0 seq_read+0xcf/0x460 vfs_read+0x9b/0x150 ksys_read+0x5f/0xe0 do_syscall_64+0x48/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ... Temperature labels are not always present. Adjust sysfs attribute visibility accordingly. Reported-by: Meelis Roos Suggested-by: Dr. David Alan Gilbert Reviewed-by: Dr. David Alan Gilbert Cc: Meelis Roos Cc: Dr. David Alan Gilbert Fixes: 266cd5835947 ("hwmon: (w83627ehf) convert to with_info interface") Signed-off-by: Guenter Roeck --- drivers/hwmon/w83627ehf.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c index 7ffadc2da57b..5a5120121e50 100644 --- a/drivers/hwmon/w83627ehf.c +++ b/drivers/hwmon/w83627ehf.c @@ -1346,8 +1346,13 @@ w83627ehf_is_visible(const void *drvdata, enum hwmon_sensor_types type, /* channel 0.., name 1.. */ if (!(data->have_temp & (1 << channel))) return 0; - if (attr == hwmon_temp_input || attr == hwmon_temp_label) + if (attr == hwmon_temp_input) return 0444; + if (attr == hwmon_temp_label) { + if (data->temp_label) + return 0444; + return 0; + } if (channel == 2 && data->temp3_val_only) return 0; if (attr == hwmon_temp_max) { -- cgit v1.2.3 From 63fb9623427fbb44e3782233b6e4714057b76ff2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Feb 2020 01:46:18 +0100 Subject: ACPI: PM: s2idle: Check fixed wakeup events in acpi_s2idle_wake() Commit fdde0ff8590b ("ACPI: PM: s2idle: Prevent spurious SCIs from waking up the system") overlooked the fact that fixed events can wake up the system too and broke RTC wakeup from suspend-to-idle as a result. Fix this issue by checking the fixed events in acpi_s2idle_wake() in addition to checking wakeup GPEs and break out of the suspend-to-idle loop if the status bits of any enabled fixed events are set then. Fixes: fdde0ff8590b ("ACPI: PM: s2idle: Prevent spurious SCIs from waking up the system") Reported-and-tested-by: Chris Wilson Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- drivers/acpi/acpica/evevent.c | 45 +++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/sleep.c | 7 +++++++ include/acpi/acpixf.h | 1 + 3 files changed, 53 insertions(+) diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c index 8c83d8c620dc..789d5e920aaf 100644 --- a/drivers/acpi/acpica/evevent.c +++ b/drivers/acpi/acpica/evevent.c @@ -265,4 +265,49 @@ static u32 acpi_ev_fixed_event_dispatch(u32 event) handler) (acpi_gbl_fixed_event_handlers[event].context)); } +/******************************************************************************* + * + * FUNCTION: acpi_any_fixed_event_status_set + * + * PARAMETERS: None + * + * RETURN: TRUE or FALSE + * + * DESCRIPTION: Checks the PM status register for active fixed events + * + ******************************************************************************/ + +u32 acpi_any_fixed_event_status_set(void) +{ + acpi_status status; + u32 in_status; + u32 in_enable; + u32 i; + + status = acpi_hw_register_read(ACPI_REGISTER_PM1_ENABLE, &in_enable); + if (ACPI_FAILURE(status)) { + return (FALSE); + } + + status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, &in_status); + if (ACPI_FAILURE(status)) { + return (FALSE); + } + + /* + * Check for all possible Fixed Events and dispatch those that are active + */ + for (i = 0; i < ACPI_NUM_FIXED_EVENTS; i++) { + + /* Both the status and enable bits must be on for this event */ + + if ((in_status & acpi_gbl_fixed_event_info[i].status_bit_mask) && + (in_enable & acpi_gbl_fixed_event_info[i].enable_bit_mask)) { + return (TRUE); + } + } + + return (FALSE); +} + #endif /* !ACPI_REDUCED_HARDWARE */ diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 152f7fc0b200..e5f95922bc21 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -1005,6 +1005,13 @@ static bool acpi_s2idle_wake(void) if (irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) return true; + /* + * If the status bit of any enabled fixed event is set, the + * wakeup is regarded as valid. + */ + if (acpi_any_fixed_event_status_set()) + return true; + /* * If there are no EC events to process and at least one of the * other enabled GPEs is active, the wakeup is regarded as a diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 5867777bb7d0..8e8be989c2a6 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -753,6 +753,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_any_gpe_status_set(void)) +ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_any_fixed_event_status_set(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_get_gpe_device(u32 gpe_index, -- cgit v1.2.3 From 595abbaff5db121428247a2e6ab368734472e101 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:50 -0800 Subject: y2038: remove ktime to/from timespec/timeval conversion A couple of helpers are now obsolete and can be removed, so drivers can no longer start using them and instead use y2038-safe interfaces. Link: http://lkml.kernel.org/r/20200110154232.4104492-2-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ktime.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index b2bb44f87f5a..d1fb05135665 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -66,33 +66,15 @@ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) */ #define ktime_sub_ns(kt, nsval) ((kt) - (nsval)) -/* convert a timespec to ktime_t format: */ -static inline ktime_t timespec_to_ktime(struct timespec ts) -{ - return ktime_set(ts.tv_sec, ts.tv_nsec); -} - /* convert a timespec64 to ktime_t format: */ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) { return ktime_set(ts.tv_sec, ts.tv_nsec); } -/* convert a timeval to ktime_t format: */ -static inline ktime_t timeval_to_ktime(struct timeval tv) -{ - return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); -} - -/* Map the ktime_t to timespec conversion to ns_to_timespec function */ -#define ktime_to_timespec(kt) ns_to_timespec((kt)) - /* Map the ktime_t to timespec conversion to ns_to_timespec function */ #define ktime_to_timespec64(kt) ns_to_timespec64((kt)) -/* Map the ktime_t to timeval conversion to ns_to_timeval function */ -#define ktime_to_timeval(kt) ns_to_timeval((kt)) - /* Convert ktime_t to nanoseconds */ static inline s64 ktime_to_ns(const ktime_t kt) { @@ -215,25 +197,6 @@ static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); -/** - * ktime_to_timespec_cond - convert a ktime_t variable to timespec - * format only if the variable contains data - * @kt: the ktime_t variable to convert - * @ts: the timespec variable to store the result in - * - * Return: %true if there was a successful conversion, %false if kt was 0. - */ -static inline __must_check bool ktime_to_timespec_cond(const ktime_t kt, - struct timespec *ts) -{ - if (kt) { - *ts = ktime_to_timespec(kt); - return true; - } else { - return false; - } -} - /** * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64 * format only if the variable contains data -- cgit v1.2.3 From 412c53a680a97cb1ae2c0ab60230e193bee86387 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:54 -0800 Subject: y2038: remove unused time32 interfaces No users remain, so kill these off before we grow new ones. Link: http://lkml.kernel.org/r/20200110154232.4104492-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 29 -------- include/linux/time32.h | 154 +----------------------------------------- include/linux/timekeeping32.h | 32 --------- include/linux/types.h | 5 -- kernel/compat.c | 64 ------------------ kernel/time/time.c | 43 ------------ 6 files changed, 1 insertion(+), 326 deletions(-) diff --git a/include/linux/compat.h b/include/linux/compat.h index 11083d84eb23..df2475be134a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -248,15 +248,6 @@ typedef struct compat_siginfo { } _sifields; } compat_siginfo_t; -/* - * These functions operate on 32- or 64-bit specs depending on - * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments. - */ -extern int compat_get_timespec(struct timespec *, const void __user *); -extern int compat_put_timespec(const struct timespec *, void __user *); -extern int compat_get_timeval(struct timeval *, const void __user *); -extern int compat_put_timeval(const struct timeval *, void __user *); - struct compat_iovec { compat_uptr_t iov_base; compat_size_t iov_len; @@ -416,26 +407,6 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginf int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); -static inline int old_timeval32_compare(struct old_timeval32 *lhs, - struct old_timeval32 *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_usec - rhs->tv_usec; -} - -static inline int old_timespec32_compare(struct old_timespec32 *lhs, - struct old_timespec32 *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_nsec - rhs->tv_nsec; -} - extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); /* diff --git a/include/linux/time32.h b/include/linux/time32.h index cad4c3186002..cf9320cd2d0b 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -12,8 +12,6 @@ #include #include -#define TIME_T_MAX (__kernel_old_time_t)((1UL << ((sizeof(__kernel_old_time_t) << 3) - 1)) - 1) - typedef s32 old_time32_t; struct old_timespec32 { @@ -73,162 +71,12 @@ struct __kernel_timex; int get_old_timex32(struct __kernel_timex *, const struct old_timex32 __user *); int put_old_timex32(struct old_timex32 __user *, const struct __kernel_timex *); -#if __BITS_PER_LONG == 64 - -/* timespec64 is defined as timespec here */ -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - return *(const struct timespec *)&ts64; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - return *(const struct timespec64 *)&ts; -} - -#else -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - struct timespec ret; - - ret.tv_sec = (time_t)ts64.tv_sec; - ret.tv_nsec = ts64.tv_nsec; - return ret; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - struct timespec64 ret; - - ret.tv_sec = ts.tv_sec; - ret.tv_nsec = ts.tv_nsec; - return ret; -} -#endif - -static inline int timespec_equal(const struct timespec *a, - const struct timespec *b) -{ - return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); -} - -/* - * lhs < rhs: return <0 - * lhs == rhs: return 0 - * lhs > rhs: return >0 - */ -static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_nsec - rhs->tv_nsec; -} - -/* - * Returns true if the timespec is norm, false if denorm: - */ -static inline bool timespec_valid(const struct timespec *ts) -{ - /* Dates before 1970 are bogus */ - if (ts->tv_sec < 0) - return false; - /* Can't have more nanoseconds then a second */ - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - -/** - * timespec_to_ns - Convert timespec to nanoseconds - * @ts: pointer to the timespec variable to be converted - * - * Returns the scalar nanosecond representation of the timespec - * parameter. - */ -static inline s64 timespec_to_ns(const struct timespec *ts) -{ - return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; -} - /** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -extern struct timespec ns_to_timespec(const s64 nsec); - -/** - * timespec_add_ns - Adds nanoseconds to a timespec - * @a: pointer to timespec to be incremented - * @ns: unsigned nanoseconds value to be added - * - * This must always be inlined because its used from the x86-64 vdso, - * which cannot call other kernel functions. - */ -static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) -{ - a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); - a->tv_nsec = ns; -} - -static inline unsigned long mktime(const unsigned int year, - const unsigned int mon, const unsigned int day, - const unsigned int hour, const unsigned int min, - const unsigned int sec) -{ - return mktime64(year, mon, day, hour, min, sec); -} - -static inline bool timeval_valid(const struct timeval *tv) -{ - /* Dates before 1970 are bogus */ - if (tv->tv_sec < 0) - return false; - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - - return true; -} - -/** - * timeval_to_ns - Convert timeval to nanoseconds - * @ts: pointer to the timeval variable to be converted - * - * Returns the scalar nanosecond representation of the timeval - * parameter. - */ -static inline s64 timeval_to_ns(const struct timeval *tv) -{ - return ((s64) tv->tv_sec * NSEC_PER_SEC) + - tv->tv_usec * NSEC_PER_USEC; -} - -/** - * ns_to_timeval - Convert nanoseconds to timeval + * ns_to_kernel_old_timeval - Convert nanoseconds to timeval * @nsec: the nanoseconds value to be converted * * Returns the timeval representation of the nsec parameter. */ -extern struct timeval ns_to_timeval(const s64 nsec); extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); -/* - * Old names for the 32-bit time_t interfaces, these will be removed - * when everything uses the new names. - */ -#define compat_time_t old_time32_t -#define compat_timeval old_timeval32 -#define compat_timespec old_timespec32 -#define compat_itimerspec old_itimerspec32 -#define ns_to_compat_timeval ns_to_old_timeval32 -#define get_compat_itimerspec64 get_old_itimerspec32 -#define put_compat_itimerspec64 put_old_itimerspec32 -#define compat_get_timespec64 get_old_timespec32 -#define compat_put_timespec64 put_old_timespec32 - #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index cc59cc9e0e84..266017fc9ee9 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -11,36 +11,4 @@ static inline unsigned long get_seconds(void) return ktime_get_real_seconds(); } -static inline void getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_real_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_raw_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void getboottime(struct timespec *ts) -{ - struct timespec64 ts64; - - getboottime64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - #endif diff --git a/include/linux/types.h b/include/linux/types.h index eb870ad42919..d3021c879179 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -65,11 +65,6 @@ typedef __kernel_ssize_t ssize_t; typedef __kernel_ptrdiff_t ptrdiff_t; #endif -#ifndef _TIME_T -#define _TIME_T -typedef __kernel_old_time_t time_t; -#endif - #ifndef _CLOCK_T #define _CLOCK_T typedef __kernel_clock_t clock_t; diff --git a/kernel/compat.c b/kernel/compat.c index 95005f849c68..843dd17e6078 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,70 +26,6 @@ #include -static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __get_user(tv->tv_sec, &ctv->tv_sec) || - __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __put_user(tv->tv_sec, &ctv->tv_sec) || - __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int compat_get_timeval(struct timeval *tv, const void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_get_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_get_timeval); - -int compat_put_timeval(const struct timeval *tv, void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_put_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_put_timeval); - -int compat_get_timespec(struct timespec *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec); - -int compat_put_timespec(const struct timespec *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec); - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); -- cgit v1.2.3 From c766d1472c70d25ad475cf56042af1652e792b23 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:57 -0800 Subject: y2038: hide timeval/timespec/itimerval/itimerspec types There are no in-kernel users remaining, but there may still be users that include linux/time.h instead of sys/time.h from user space, so leave the types available to user space while hiding them from kernel space. Only the __kernel_old_* versions of these types remain now. Link: http://lkml.kernel.org/r/20200110154232.4104492-4-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/posix_types.h | 2 ++ include/uapi/linux/time.h | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h index 2f9c80595ba7..b5f7594eee7a 100644 --- a/include/uapi/asm-generic/posix_types.h +++ b/include/uapi/asm-generic/posix_types.h @@ -87,7 +87,9 @@ typedef struct { typedef __kernel_long_t __kernel_off_t; typedef long long __kernel_loff_t; typedef __kernel_long_t __kernel_old_time_t; +#ifndef __KERNEL__ typedef __kernel_long_t __kernel_time_t; +#endif typedef long long __kernel_time64_t; typedef __kernel_long_t __kernel_clock_t; typedef int __kernel_timer_t; diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index a655aa28dc6e..4f4b6e48e01c 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -5,6 +5,7 @@ #include #include +#ifndef __KERNEL__ #ifndef _STRUCT_TIMESPEC #define _STRUCT_TIMESPEC struct timespec { @@ -18,6 +19,17 @@ struct timeval { __kernel_suseconds_t tv_usec; /* microseconds */ }; +struct itimerspec { + struct timespec it_interval;/* timer period */ + struct timespec it_value; /* timer expiration */ +}; + +struct itimerval { + struct timeval it_interval;/* timer interval */ + struct timeval it_value; /* current value */ +}; +#endif + struct timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ @@ -31,16 +43,6 @@ struct timezone { #define ITIMER_VIRTUAL 1 #define ITIMER_PROF 2 -struct itimerspec { - struct timespec it_interval; /* timer period */ - struct timespec it_value; /* timer expiration */ -}; - -struct itimerval { - struct timeval it_interval; /* timer interval */ - struct timeval it_value; /* current value */ -}; - /* * The IDs of the various system clocks (for POSIX.1b interval timers): */ -- cgit v1.2.3 From edf28f4061afe4c2d9eb1c3323d90e882c1d6800 Mon Sep 17 00:00:00 2001 From: Ioanna Alifieraki Date: Thu, 20 Feb 2020 20:04:00 -0800 Subject: Revert "ipc,sem: remove uneeded sem_undo_list lock usage in exit_sem()" This reverts commit a97955844807e327df11aa33869009d14d6b7de0. Commit a97955844807 ("ipc,sem: remove uneeded sem_undo_list lock usage in exit_sem()") removes a lock that is needed. This leads to a process looping infinitely in exit_sem() and can also lead to a crash. There is a reproducer available in [1] and with the commit reverted the issue does not reproduce anymore. Using the reproducer found in [1] is fairly easy to reach a point where one of the child processes is looping infinitely in exit_sem between for(;;) and if (semid == -1) block, while it's trying to free its last sem_undo structure which has already been freed by freeary(). Each sem_undo struct is on two lists: one per semaphore set (list_id) and one per process (list_proc). The list_id list tracks undos by semaphore set, and the list_proc by process. Undo structures are removed either by freeary() or by exit_sem(). The freeary function is invoked when the user invokes a syscall to remove a semaphore set. During this operation freeary() traverses the list_id associated with the semaphore set and removes the undo structures from both the list_id and list_proc lists. For this case, exit_sem() is called at process exit. Each process contains a struct sem_undo_list (referred to as "ulp") which contains the head for the list_proc list. When the process exits, exit_sem() traverses this list to remove each sem_undo struct. As in freeary(), whenever a sem_undo struct is removed from list_proc, it is also removed from the list_id list. Removing elements from list_id is safe for both exit_sem() and freeary() due to sem_lock(). Removing elements from list_proc is not safe; freeary() locks &un->ulp->lock when it performs list_del_rcu(&un->list_proc) but exit_sem() does not (locking was removed by commit a97955844807 ("ipc,sem: remove uneeded sem_undo_list lock usage in exit_sem()"). This can result in the following situation while executing the reproducer [1] : Consider a child process in exit_sem() and the parent in freeary() (because of semctl(sid[i], NSEM, IPC_RMID)). - The list_proc for the child contains the last two undo structs A and B (the rest have been removed either by exit_sem() or freeary()). - The semid for A is 1 and semid for B is 2. - exit_sem() removes A and at the same time freeary() removes B. - Since A and B have different semid sem_lock() will acquire different locks for each process and both can proceed. The bug is that they remove A and B from the same list_proc at the same time because only freeary() acquires the ulp lock. When exit_sem() removes A it makes ulp->list_proc.next to point at B and at the same time freeary() removes B setting B->semid=-1. At the next iteration of for(;;) loop exit_sem() will try to remove B. The only way to break from for(;;) is for (&un->list_proc == &ulp->list_proc) to be true which is not. Then exit_sem() will check if B->semid=-1 which is and will continue looping in for(;;) until the memory for B is reallocated and the value at B->semid is changed. At that point, exit_sem() will crash attempting to unlink B from the lists (this can be easily triggered by running the reproducer [1] a second time). To prove this scenario instrumentation was added to keep information about each sem_undo (un) struct that is removed per process and per semaphore set (sma). CPU0 CPU1 [caller holds sem_lock(sma for A)] ... freeary() exit_sem() ... ... ... sem_lock(sma for B) spin_lock(A->ulp->lock) ... list_del_rcu(un_A->list_proc) list_del_rcu(un_B->list_proc) Undo structures A and B have different semid and sem_lock() operations proceed. However they belong to the same list_proc list and they are removed at the same time. This results into ulp->list_proc.next pointing to the address of B which is already removed. After reverting commit a97955844807 ("ipc,sem: remove uneeded sem_undo_list lock usage in exit_sem()") the issue was no longer reproducible. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1694779 Link: http://lkml.kernel.org/r/20191211191318.11860-1-ioanna-maria.alifieraki@canonical.com Fixes: a97955844807 ("ipc,sem: remove uneeded sem_undo_list lock usage in exit_sem()") Signed-off-by: Ioanna Alifieraki Acked-by: Manfred Spraul Acked-by: Herton R. Krzesinski Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Cc: Joel Fernandes (Google) Cc: Davidlohr Bueso Cc: Jay Vosburgh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 4f4303f32077..3687b71151b3 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2384,11 +2384,9 @@ void exit_sem(struct task_struct *tsk) ipc_assert_locked_object(&sma->sem_perm); list_del(&un->list_id); - /* we are the last process using this ulp, acquiring ulp->lock - * isn't required. Besides that, we are also protected against - * IPC_RMID as we hold sma->sem_perm lock now - */ + spin_lock(&ulp->lock); list_del_rcu(&un->list_proc); + spin_unlock(&ulp->lock); /* perform adjustments registered in un */ for (i = 0; i < sma->sem_nsems; i++) { -- cgit v1.2.3 From 467d12f5c7842896d2de3ced74e4147ee29e97c8 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 20 Feb 2020 20:04:03 -0800 Subject: include/uapi/linux/swab.h: fix userspace breakage, use __BITS_PER_LONG for swap QEMU has a funny new build error message when I use the upstream kernel headers: CC block/file-posix.o In file included from /home/cborntra/REPOS/qemu/include/qemu/timer.h:4, from /home/cborntra/REPOS/qemu/include/qemu/timed-average.h:29, from /home/cborntra/REPOS/qemu/include/block/accounting.h:28, from /home/cborntra/REPOS/qemu/include/block/block_int.h:27, from /home/cborntra/REPOS/qemu/block/file-posix.c:30: /usr/include/linux/swab.h: In function `__swab': /home/cborntra/REPOS/qemu/include/qemu/bitops.h:20:34: error: "sizeof" is not defined, evaluates to 0 [-Werror=undef] 20 | #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) | ^~~~~~ /home/cborntra/REPOS/qemu/include/qemu/bitops.h:20:41: error: missing binary operator before token "(" 20 | #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) | ^ cc1: all warnings being treated as errors make: *** [/home/cborntra/REPOS/qemu/rules.mak:69: block/file-posix.o] Error 1 rm tests/qemu-iotests/socket_scm_helper.o This was triggered by commit d5767057c9a ("uapi: rename ext2_swab() to swab() and share globally in swab.h"). That patch is doing #include but it uses BITS_PER_LONG. The kernel file asm/bitsperlong.h provide only __BITS_PER_LONG. Let us use the __ variant in swap.h Link: http://lkml.kernel.org/r/20200213142147.17604-1-borntraeger@de.ibm.com Fixes: d5767057c9a ("uapi: rename ext2_swab() to swab() and share globally in swab.h") Signed-off-by: Christian Borntraeger Cc: Yury Norov Cc: Allison Randal Cc: Joe Perches Cc: Thomas Gleixner Cc: William Breathitt Gray Cc: Torsten Hilbrich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/swab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h index fa7f97da5b76..7272f85d6d6a 100644 --- a/include/uapi/linux/swab.h +++ b/include/uapi/linux/swab.h @@ -135,9 +135,9 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) static __always_inline unsigned long __swab(const unsigned long y) { -#if BITS_PER_LONG == 64 +#if __BITS_PER_LONG == 64 return __swab64(y); -#else /* BITS_PER_LONG == 32 */ +#else /* __BITS_PER_LONG == 32 */ return __swab32(y); #endif } -- cgit v1.2.3 From 9e69fa46275b63f670ced0f11095af1841c73fca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 20 Feb 2020 20:04:06 -0800 Subject: selftests/vm: add missed tests in run_vmtests The commits introducing 'mlock-random-test'[1], 'map_fiex_noreplace'[2], and 'thuge-gen'[3] have not added those in the 'run_vmtests' script and thus the 'run_tests' command of kselftests doesn't run those. This commit adds those in the script. 'gup_benchmark' and 'transhuge-stress' are also not included in the 'run_vmtests', but this commit does not add those because those are for performance measurement rather than pass/fail tests. [1] commit 26b4224d9961 ("selftests: expanding more mlock selftest") [2] commit 91cbacc34512 ("tools/testing/selftests/vm/map_fixed_noreplace.c: add test for MAP_FIXED_NOREPLACE") [3] commit fcc1f2d5dd34 ("selftests: add a test program for variable huge page sizes in mmap/shmget") Link: http://lkml.kernel.org/r/20200206085144.29126-1-sj38.park@gmail.com Signed-off-by: SeongJae Park Cc: Uladzislau Rezki (Sony) Cc: Masami Hiramatsu Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/run_vmtests | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index a692ea828317..f33714843198 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -112,6 +112,17 @@ echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" echo " hugetlb regression testing." +echo "---------------------------" +echo "running map_fixed_noreplace" +echo "---------------------------" +./map_fixed_noreplace +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "-------------------" echo "running userfaultfd" echo "-------------------" @@ -186,6 +197,17 @@ else echo "[PASS]" fi +echo "-------------------------" +echo "running mlock-random-test" +echo "-------------------------" +./mlock-random-test +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + echo "--------------------" echo "running mlock2-tests" echo "--------------------" @@ -197,6 +219,17 @@ else echo "[PASS]" fi +echo "-----------------" +echo "running thuge-gen" +echo "-----------------" +./thuge-gen +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + if [ $VADDR64 -ne 0 ]; then echo "-----------------------------" echo "running virtual_address_range" -- cgit v1.2.3 From ef0c08192ac09e29ddd676b3ca6c4a501f277f10 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 20 Feb 2020 20:04:09 -0800 Subject: get_maintainer: remove uses of P: for maintainer name Commit 1ca84ed6425f ("MAINTAINERS: Reclaim the P: tag for Maintainer Entry Profile") changed the use of the "P:" tag from "Person" to "Profile (ie: special subsystem coding styles and characteristics)" Change how get_maintainer.pl parses the "P:" tag to match. Link: http://lkml.kernel.org/r/ca53823fc5d25c0be32ad937d0207a0589c08643.camel@perches.com Signed-off-by: Joe Perches Acked-by: Dan Williams Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 34085d146fa2..a00156855354 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -1341,35 +1341,11 @@ sub add_categories { } } } elsif ($ptype eq "M") { - my ($name, $address) = parse_email($pvalue); - if ($name eq "") { - if ($i > 0) { - my $tv = $typevalue[$i - 1]; - if ($tv =~ m/^([A-Z]):\s*(.*)/) { - if ($1 eq "P") { - $name = $2; - $pvalue = format_email($name, $address, $email_usename); - } - } - } - } if ($email_maintainer) { my $role = get_maintainer_role($i); push_email_addresses($pvalue, $role); } } elsif ($ptype eq "R") { - my ($name, $address) = parse_email($pvalue); - if ($name eq "") { - if ($i > 0) { - my $tv = $typevalue[$i - 1]; - if ($tv =~ m/^([A-Z]):\s*(.*)/) { - if ($1 eq "P") { - $name = $2; - $pvalue = format_email($name, $address, $email_usename); - } - } - } - } if ($email_reviewer) { my $subsystem = get_subsystem_name($i); push_email_addresses($pvalue, "reviewer:$subsystem"); -- cgit v1.2.3 From 0ef82fcefb99300ede6f4d38a8100845b2dc8e30 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 20 Feb 2020 20:04:12 -0800 Subject: scripts/get_maintainer.pl: deprioritize old Fixes: addresses Recently, I found that get_maintainer was causing me to send emails to the old addresses for maintainers. Since I usually just trust the output of get_maintainer to know the right email address, I didn't even look carefully and fired off two patch series that went to the wrong place. Oops. The problem was introduced recently when trying to add signatures from Fixes. The problem was that these email addresses were added too early in the process of compiling our list of places to send. Things added to the list earlier are considered more canonical and when we later added maintainer entries we ended up deduplicating to the old address. Here are two examples using mainline commits (to make it easier to replicate) for the two maintainers that I messed up recently: $ git format-patch d8549bcd0529~..d8549bcd0529 $ ./scripts/get_maintainer.pl 0001-clk-Add-clk_hw*.patch | grep Boyd Stephen Boyd ... $ git format-patch 6d1238aa3395~..6d1238aa3395 $ ./scripts/get_maintainer.pl 0001-arm64-dts-qcom-qcs404*.patch | grep Andy Andy Gross Let's move the adding of addresses from Fixes: to the end since the email addresses from these are much more likely to be older. After this patch the above examples get the right addresses for the two examples. Link: http://lkml.kernel.org/r/20200127095001.1.I41fba9f33590bfd92cd01960161d8384268c6569@changeid Fixes: 2f5bd343694e ("scripts/get_maintainer.pl: add signatures from Fixes: lines in commit message") Signed-off-by: Douglas Anderson Acked-by: Joe Perches Cc: Stephen Boyd Cc: Bjorn Andersson Cc: Andy Gross Cc: Kees Cook Cc: Dan Carpenter Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index a00156855354..6cbcd1a3e113 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -932,10 +932,6 @@ sub get_maintainers { } } - foreach my $fix (@fixes) { - vcs_add_commit_signers($fix, "blamed_fixes"); - } - foreach my $email (@email_to, @list_to) { $email->[0] = deduplicate_email($email->[0]); } @@ -974,6 +970,10 @@ sub get_maintainers { } } + foreach my $fix (@fixes) { + vcs_add_commit_signers($fix, "blamed_fixes"); + } + my @to = (); if ($email || $email_list) { if ($email) { -- cgit v1.2.3 From fed98ef4d8b665316479dd35cbd92d3e2ff470a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Feb 2020 20:04:15 -0800 Subject: mm/swapfile.c: fix a comment in sys_swapon() claim_swapfile now always takes i_rwsem. Link: http://lkml.kernel.org/r/20200114161225.309792-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2c33ff456ed5..b2a2e45c9a36 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3157,7 +3157,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ + /* will take i_rwsem; */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; -- cgit v1.2.3 From 75866af62b439859d5146b7093ceb6b482852683 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 20 Feb 2020 20:04:18 -0800 Subject: mm/memcontrol.c: lost css_put in memcg_expand_shrinker_maps() for_each_mem_cgroup() increases css reference counter for memory cgroup and requires to use mem_cgroup_iter_break() if the walk is cancelled. Link: http://lkml.kernel.org/r/c98414fb-7e1f-da0f-867a-9340ec4bd30b@virtuozzo.com Fixes: 0a4465d34028 ("mm, memcg: assign memcg-aware shrinkers bitmap to memcg") Signed-off-by: Vasily Averin Acked-by: Kirill Tkhai Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6dc8712e39..d09776cd6e10 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -409,8 +409,10 @@ int memcg_expand_shrinker_maps(int new_id) if (mem_cgroup_is_root(memcg)) continue; ret = memcg_expand_one_shrinker_map(memcg, size, old_size); - if (ret) + if (ret) { + mem_cgroup_iter_break(NULL, memcg); goto unlock; + } } unlock: if (!ret) -- cgit v1.2.3 From c11d3fa0116a6bc832a9e387427caa16f8de5ef2 Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Thu, 20 Feb 2020 20:04:21 -0800 Subject: lib/string.c: update match_string() doc-strings with correct behavior There were a few attempts at changing behavior of the match_string() helpers (i.e. 'match_string()' & 'sysfs_match_string()'), to change & extend the behavior according to the doc-string. But the simplest approach is to just fix the doc-strings. The current behavior is fine as-is, and some bugs were introduced trying to fix it. As for extending the behavior, new helpers can always be introduced if needed. The match_string() helpers behave more like 'strncmp()' in the sense that they go up to n elements or until the first NULL element in the array of strings. This change updates the doc-strings with this info. Link: http://lkml.kernel.org/r/20200213072722.8249-1-alexandru.ardelean@analog.com Signed-off-by: Alexandru Ardelean Acked-by: Andy Shevchenko Cc: Kees Cook Cc: "Tobin C . Harding" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/string.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lib/string.c b/lib/string.c index f607b967d978..6012c385fb31 100644 --- a/lib/string.c +++ b/lib/string.c @@ -699,6 +699,14 @@ EXPORT_SYMBOL(sysfs_streq); * @n: number of strings in the array or -1 for NULL terminated arrays * @string: string to match with * + * This routine will look for a string in an array of strings up to the + * n-th element in the array or until the first NULL element. + * + * Historically the value of -1 for @n, was used to search in arrays that + * are NULL terminated. However, the function does not make a distinction + * when finishing the search: either @n elements have been compared OR + * the first NULL element was found. + * * Return: * index of a @string in the @array if matches, or %-EINVAL otherwise. */ @@ -727,6 +735,14 @@ EXPORT_SYMBOL(match_string); * * Returns index of @str in the @array or -EINVAL, just like match_string(). * Uses sysfs_streq instead of strcmp for matching. + * + * This routine will look for a string in an array of strings up to the + * n-th element in the array or until the first NULL element. + * + * Historically the value of -1 for @n, was used to search in arrays that + * are NULL terminated. However, the function does not make a distinction + * when finishing the search: either @n elements have been compared OR + * the first NULL element was found. */ int __sysfs_match_string(const char * const *array, size_t n, const char *str) { -- cgit v1.2.3 From 76073c646f5f4999d763f471df9e38a5a912d70d Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 20 Feb 2020 20:04:24 -0800 Subject: mm/vmscan.c: don't round up scan size for online memory cgroup Commit 68600f623d69 ("mm: don't miss the last page because of round-off error") makes the scan size round up to @denominator regardless of the memory cgroup's state, online or offline. This affects the overall reclaiming behavior: the corresponding LRU list is eligible for reclaiming only when its size logically right shifted by @sc->priority is bigger than zero in the former formula. For example, the inactive anonymous LRU list should have at least 0x4000 pages to be eligible for reclaiming when we have 60/12 for swappiness/priority and without taking scan/rotation ratio into account. After the roundup is applied, the inactive anonymous LRU list becomes eligible for reclaiming when its size is bigger than or equal to 0x1000 in the same condition. (0x4000 >> 12) * 60 / (60 + 140 + 1) = 1 ((0x1000 >> 12) * 60) + 200) / (60 + 140 + 1) = 1 aarch64 has 512MB huge page size when the base page size is 64KB. The memory cgroup that has a huge page is always eligible for reclaiming in that case. The reclaiming is likely to stop after the huge page is reclaimed, meaing the further iteration on @sc->priority and the silbing and child memory cgroups will be skipped. The overall behaviour has been changed. This fixes the issue by applying the roundup to offlined memory cgroups only, to give more preference to reclaim memory from offlined memory cgroup. It sounds reasonable as those memory is unlikedly to be used by anyone. The issue was found by starting up 8 VMs on a Ampere Mustang machine, which has 8 CPUs and 16 GB memory. Each VM is given with 2 vCPUs and 2GB memory. It took 264 seconds for all VMs to be completely up and 784MB swap is consumed after that. With this patch applied, it took 236 seconds and 60MB swap to do same thing. So there is 10% performance improvement for my case. Note that KSM is disable while THP is enabled in the testing. total used free shared buff/cache available Mem: 16196 10065 2049 16 4081 3749 Swap: 8175 784 7391 total used free shared buff/cache available Mem: 16196 11324 3656 24 1215 2936 Swap: 8175 60 8115 Link: http://lkml.kernel.org/r/20200211024514.8730-1-gshan@redhat.com Fixes: 68600f623d69 ("mm: don't miss the last page because of round-off error") Signed-off-by: Gavin Shan Acked-by: Roman Gushchin Cc: [4.20+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c05eb9efec07..876370565455 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2415,10 +2415,13 @@ out: /* * Scan types proportional to swappiness and * their relative recent reclaim efficiency. - * Make sure we don't miss the last page - * because of a round-off error. + * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. */ - scan = DIV64_U64_ROUND_UP(scan * fraction[file], + scan = mem_cgroup_online(memcg) ? + div64_u64(scan * fraction[file], denominator) : + DIV64_U64_ROUND_UP(scan * fraction[file], denominator); break; case SCAN_FILE: -- cgit v1.2.3 From 18e19f195cd888f65643a77a0c6aee8f5be6439a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 20 Feb 2020 20:04:27 -0800 Subject: mm/sparsemem: pfn_to_page is not valid yet on SPARSEMEM When we use SPARSEMEM instead of SPARSEMEM_VMEMMAP, pfn_to_page() doesn't work before sparse_init_one_section() is called. This leads to a crash when hotplug memory: BUG: unable to handle page fault for address: 0000000006400000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 3 PID: 221 Comm: kworker/u16:1 Tainted: G W 5.5.0-next-20200205+ #343 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 Workqueue: kacpi_hotplug acpi_hotplug_work_fn RIP: 0010:__memset+0x24/0x30 Code: cc cc cc cc cc cc 0f 1f 44 00 00 49 89 f9 48 89 d1 83 e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 f3 RSP: 0018:ffffb43ac0373c80 EFLAGS: 00010a87 RAX: ffffffffffffffff RBX: ffff8a1518800000 RCX: 0000000000050000 RDX: 0000000000000000 RSI: 00000000000000ff RDI: 0000000006400000 RBP: 0000000000140000 R08: 0000000000100000 R09: 0000000006400000 R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000 R13: 0000000000000028 R14: 0000000000000000 R15: ffff8a153ffd9280 FS: 0000000000000000(0000) GS:ffff8a153ab00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000006400000 CR3: 0000000136fca000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: sparse_add_section+0x1c9/0x26a __add_pages+0xbf/0x150 add_pages+0x12/0x60 add_memory_resource+0xc8/0x210 __add_memory+0x62/0xb0 acpi_memory_device_add+0x13f/0x300 acpi_bus_attach+0xf6/0x200 acpi_bus_scan+0x43/0x90 acpi_device_hotplug+0x275/0x3d0 acpi_hotplug_work_fn+0x1a/0x30 process_one_work+0x1a7/0x370 worker_thread+0x30/0x380 kthread+0x112/0x130 ret_from_fork+0x35/0x40 We should use memmap as it did. On x86 the impact is limited to x86_32 builds, or x86_64 configurations that override the default setting for SPARSEMEM_VMEMMAP. Other memory hotplug archs (arm64, ia64, and ppc) also default to SPARSEMEM_VMEMMAP=y. [dan.j.williams@intel.com: changelog update] {rppt@linux.ibm.com: changelog update] Link: http://lkml.kernel.org/r/20200219030454.4844-1-bhe@redhat.com Fixes: ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug") Signed-off-by: Wei Yang Signed-off-by: Baoquan He Acked-by: David Hildenbrand Reviewed-by: Baoquan He Reviewed-by: Dan Williams Acked-by: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index c184b69460b7..596b2a45b100 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -876,7 +876,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + page_init_poison(memmap, sizeof(struct page) * nr_pages); ms = __nr_to_section(section_nr); set_section_nid(section_nr, nid); -- cgit v1.2.3 From 305e519ce48e935702c32241f07d393c3c8fed3e Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 20 Feb 2020 20:04:30 -0800 Subject: lib/stackdepot.c: fix global out-of-bounds in stack_slabs Walter Wu has reported a potential case in which init_stack_slab() is called after stack_slabs[STACK_ALLOC_MAX_SLABS - 1] has already been initialized. In that case init_stack_slab() will overwrite stack_slabs[STACK_ALLOC_MAX_SLABS], which may result in a memory corruption. Link: http://lkml.kernel.org/r/20200218102950.260263-1-glider@google.com Fixes: cd11016e5f521 ("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB") Signed-off-by: Alexander Potapenko Reported-by: Walter Wu Cc: Dmitry Vyukov Cc: Matthias Brugger Cc: Thomas Gleixner Cc: Josh Poimboeuf Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/stackdepot.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index ed717dd08ff3..81c69c08d1d1 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -83,15 +83,19 @@ static bool init_stack_slab(void **prealloc) return true; if (stack_slabs[depot_index] == NULL) { stack_slabs[depot_index] = *prealloc; + *prealloc = NULL; } else { - stack_slabs[depot_index + 1] = *prealloc; + /* If this is the last depot slab, do not touch the next one. */ + if (depot_index + 1 < STACK_ALLOC_MAX_SLABS) { + stack_slabs[depot_index + 1] = *prealloc; + *prealloc = NULL; + } /* * This smp_store_release pairs with smp_load_acquire() from * |next_slab_inited| above and in stack_depot_save(). */ smp_store_release(&next_slab_inited, 1); } - *prealloc = NULL; return true; } -- cgit v1.2.3 From bb8d00ff51a0c5e58cebd2e698a778f4e3d34d48 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 20 Feb 2020 20:04:33 -0800 Subject: MAINTAINERS: use tabs for SAFESETID Use tabs for indentation instead of spaces for SAFESETID. All (!) other entries in MAINTAINERS use tabs (according to my simple grepping). Link: http://lkml.kernel.org/r/2bb2e52a-2694-816d-57b4-6cabfadd6c1a@infradead.org Signed-off-by: Randy Dunlap Cc: Micah Morton Cc: James Morris Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4beb8dc4c7eb..4e9bdb3c63b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14582,10 +14582,10 @@ F: drivers/media/pci/saa7146/ F: include/media/drv-intf/saa7146* SAFESETID SECURITY MODULE -M: Micah Morton -S: Supported -F: security/safesetid/ -F: Documentation/admin-guide/LSM/SafeSetID.rst +M: Micah Morton +S: Supported +F: security/safesetid/ +F: Documentation/admin-guide/LSM/SafeSetID.rst SAMSUNG AUDIO (ASoC) DRIVERS M: Krzysztof Kozlowski -- cgit v1.2.3 From 7c990728b99ed6fbe9c75fc202fce1172d9916da Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 18 Feb 2020 19:08:51 -0800 Subject: ext4: fix potential race between s_flex_groups online resizing and access During an online resize an array of s_flex_groups structures gets replaced so it can get enlarged. If there is a concurrent access to the array and this memory has been reused then this can lead to an invalid memory access. The s_flex_group array has been converted into an array of pointers rather than an array of structures. This is to ensure that the information contained in the structures cannot get out of sync during a resize due to an accessor updating the value in the old structure after it has been copied but before the array pointer is updated. Since the structures them- selves are no longer copied but only the pointers to them this case is mitigated. Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443 Link: https://lore.kernel.org/r/20200221053458.730016-4-tytso@mit.edu Signed-off-by: Suraj Jitindar Singh Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 23 +++++++++++------- fs/ext4/mballoc.c | 9 ++++--- fs/ext4/resize.c | 7 ++++-- fs/ext4/super.c | 72 ++++++++++++++++++++++++++++++++++++++----------------- 5 files changed, 76 insertions(+), 37 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b1ece5329738..614fefa7dc7a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1512,7 +1512,7 @@ struct ext4_sb_info { unsigned int s_extent_max_zeroout_kb; unsigned int s_log_groups_per_flex; - struct flex_groups *s_flex_groups; + struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; /* workqueue for reserved extent conversions (buffered io) */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c66e8f9451a2..f95ee99091e4 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -328,11 +328,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) percpu_counter_inc(&sbi->s_freeinodes_counter); if (sbi->s_log_groups_per_flex) { - ext4_group_t f = ext4_flex_group(sbi, block_group); + struct flex_groups *fg; - atomic_inc(&sbi->s_flex_groups[f].free_inodes); + fg = sbi_array_rcu_deref(sbi, s_flex_groups, + ext4_flex_group(sbi, block_group)); + atomic_inc(&fg->free_inodes); if (is_directory) - atomic_dec(&sbi->s_flex_groups[f].used_dirs); + atomic_dec(&fg->used_dirs); } BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); @@ -368,12 +370,13 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, int flex_size, struct orlov_stats *stats) { struct ext4_group_desc *desc; - struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; if (flex_size > 1) { - stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); - stats->used_dirs = atomic_read(&flex_group[g].used_dirs); + struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), + s_flex_groups, g); + stats->free_inodes = atomic_read(&fg->free_inodes); + stats->free_clusters = atomic64_read(&fg->free_clusters); + stats->used_dirs = atomic_read(&fg->used_dirs); return; } @@ -1054,7 +1057,8 @@ got: if (sbi->s_log_groups_per_flex) { ext4_group_t f = ext4_flex_group(sbi, group); - atomic_inc(&sbi->s_flex_groups[f].used_dirs); + atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, + f)->used_dirs); } } if (ext4_has_group_desc_csum(sb)) { @@ -1077,7 +1081,8 @@ got: if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); - atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); + atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_inodes); } inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1b46fb63692a..51a78eb65f3c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3038,7 +3038,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); atomic64_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + &sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4936,7 +4937,8 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); atomic64_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + &sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_clusters); } /* @@ -5093,7 +5095,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); atomic64_add(clusters_freed, - &sbi->s_flex_groups[flex_group].free_clusters); + &sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 536cc9f38091..a50b51270ea9 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1430,11 +1430,14 @@ static void ext4_update_super(struct super_block *sb, percpu_counter_read(&sbi->s_freeclusters_counter)); if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) { ext4_group_t flex_group; + struct flex_groups *fg; + flex_group = ext4_flex_group(sbi, group_data[0].group); + fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + &fg->free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, - &sbi->s_flex_groups[flex_group].free_inodes); + &fg->free_inodes); } /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e00bcc19099f..6b7e628b7903 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1015,6 +1015,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head **group_desc; + struct flex_groups **flex_groups; int aborted = 0; int i, err; @@ -1052,8 +1053,13 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); + flex_groups = rcu_dereference(sbi->s_flex_groups); + if (flex_groups) { + for (i = 0; i < sbi->s_flex_groups_allocated; i++) + kvfree(flex_groups[i]); + kvfree(flex_groups); + } rcu_read_unlock(); - kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -2384,8 +2390,8 @@ done: int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct flex_groups *new_groups; - int size; + struct flex_groups **old_groups, **new_groups; + int size, i; if (!sbi->s_log_groups_per_flex) return 0; @@ -2394,22 +2400,37 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) if (size <= sbi->s_flex_groups_allocated) return 0; - size = roundup_pow_of_two(size * sizeof(struct flex_groups)); - new_groups = kvzalloc(size, GFP_KERNEL); + new_groups = kvzalloc(roundup_pow_of_two(size * + sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { - ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", - size / (int) sizeof(struct flex_groups)); + ext4_msg(sb, KERN_ERR, + "not enough memory for %d flex group pointers", size); return -ENOMEM; } - - if (sbi->s_flex_groups) { - memcpy(new_groups, sbi->s_flex_groups, - (sbi->s_flex_groups_allocated * - sizeof(struct flex_groups))); - kvfree(sbi->s_flex_groups); + for (i = sbi->s_flex_groups_allocated; i < size; i++) { + new_groups[i] = kvzalloc(roundup_pow_of_two( + sizeof(struct flex_groups)), + GFP_KERNEL); + if (!new_groups[i]) { + for (i--; i >= sbi->s_flex_groups_allocated; i--) + kvfree(new_groups[i]); + kvfree(new_groups); + ext4_msg(sb, KERN_ERR, + "not enough memory for %d flex groups", size); + return -ENOMEM; + } } - sbi->s_flex_groups = new_groups; - sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); + rcu_read_lock(); + old_groups = rcu_dereference(sbi->s_flex_groups); + if (old_groups) + memcpy(new_groups, old_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups *))); + rcu_read_unlock(); + rcu_assign_pointer(sbi->s_flex_groups, new_groups); + sbi->s_flex_groups_allocated = size; + if (old_groups) + ext4_kvfree_array_rcu(old_groups); return 0; } @@ -2417,6 +2438,7 @@ static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; + struct flex_groups *fg; ext4_group_t flex_group; int i, err; @@ -2434,12 +2456,11 @@ static int ext4_fill_flex_info(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); - atomic_add(ext4_free_inodes_count(sb, gdp), - &sbi->s_flex_groups[flex_group].free_inodes); + fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); + atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); - atomic_add(ext4_used_dirs_count(sb, gdp), - &sbi->s_flex_groups[flex_group].used_dirs); + &fg->free_clusters); + atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; @@ -3641,6 +3662,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *bh, **group_desc; struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + struct flex_groups **flex_groups; ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; @@ -4692,8 +4714,14 @@ failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); - if (sbi->s_flex_groups) - kvfree(sbi->s_flex_groups); + rcu_read_lock(); + flex_groups = rcu_dereference(sbi->s_flex_groups); + if (flex_groups) { + for (i = 0; i < sbi->s_flex_groups_allocated; i++) + kvfree(flex_groups[i]); + kvfree(flex_groups); + } + rcu_read_unlock(); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); -- cgit v1.2.3 From bbd55937de8f2754adc5792b0f8e5ff7d9c0420e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 19 Feb 2020 10:30:46 -0800 Subject: ext4: rename s_journal_flag_rwsem to s_writepages_rwsem In preparation for making s_journal_flag_rwsem synchronize ext4_writepages() with changes to both the EXTENTS and JOURNAL_DATA flags (rather than just JOURNAL_DATA as it does currently), rename it to s_writepages_rwsem. Link: https://lore.kernel.org/r/20200219183047.47417-2-ebiggers@kernel.org Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 14 +++++++------- fs/ext4/super.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 614fefa7dc7a..4b986ad42b9d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1553,7 +1553,7 @@ struct ext4_sb_info { struct ratelimit_state s_msg_ratelimit_state; /* Barrier between changing inodes' journal flags and writepages ops. */ - struct percpu_rw_semaphore s_journal_flag_rwsem; + struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6e1d81ed44ad..fa0ff78dc033 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2628,7 +2628,7 @@ static int ext4_writepages(struct address_space *mapping, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - percpu_down_read(&sbi->s_journal_flag_rwsem); + percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); /* @@ -2849,7 +2849,7 @@ unplug: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_journal_flag_rwsem); + percpu_up_read(&sbi->s_writepages_rwsem); return ret; } @@ -2864,13 +2864,13 @@ static int ext4_dax_writepages(struct address_space *mapping, if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - percpu_down_read(&sbi->s_journal_flag_rwsem); + percpu_down_read(&sbi->s_writepages_rwsem); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); - percpu_up_read(&sbi->s_journal_flag_rwsem); + percpu_up_read(&sbi->s_writepages_rwsem); return ret; } @@ -5861,7 +5861,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } } - percpu_down_write(&sbi->s_journal_flag_rwsem); + percpu_down_write(&sbi->s_writepages_rwsem); jbd2_journal_lock_updates(journal); /* @@ -5878,7 +5878,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_journal_flag_rwsem); + percpu_up_write(&sbi->s_writepages_rwsem); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); @@ -5886,7 +5886,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); - percpu_up_write(&sbi->s_journal_flag_rwsem); + percpu_up_write(&sbi->s_writepages_rwsem); if (val) up_write(&EXT4_I(inode)->i_mmap_sem); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6b7e628b7903..6928fc229799 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1064,7 +1064,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_free_rwsem(&sbi->s_journal_flag_rwsem); + percpu_free_rwsem(&sbi->s_writepages_rwsem); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); @@ -4626,7 +4626,7 @@ no_journal: err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) - err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); + err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); @@ -4726,7 +4726,7 @@ failed_mount6: percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_free_rwsem(&sbi->s_journal_flag_rwsem); + percpu_free_rwsem(&sbi->s_writepages_rwsem); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); -- cgit v1.2.3 From cb85f4d23f794e24127f3e562cb3b54b0803f456 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 19 Feb 2020 10:30:47 -0800 Subject: ext4: fix race between writepages and enabling EXT4_EXTENTS_FL If EXT4_EXTENTS_FL is set on an inode while ext4_writepages() is running on it, the following warning in ext4_add_complete_io() can be hit: WARNING: CPU: 1 PID: 0 at fs/ext4/page-io.c:234 ext4_put_io_end_defer+0xf0/0x120 Here's a minimal reproducer (not 100% reliable) (root isn't required): while true; do sync done & while true; do rm -f file touch file chattr -e file echo X >> file chattr +e file done The problem is that in ext4_writepages(), ext4_should_dioread_nolock() (which only returns true on extent-based files) is checked once to set the number of reserved journal credits, and also again later to select the flags for ext4_map_blocks() and copy the reserved journal handle to ext4_io_end::handle. But if EXT4_EXTENTS_FL is being concurrently set, the first check can see dioread_nolock disabled while the later one can see it enabled, causing the reserved handle to unexpectedly be NULL. Since changing EXT4_EXTENTS_FL is uncommon, and there may be other races related to doing so as well, fix this by synchronizing changing EXT4_EXTENTS_FL with ext4_writepages() via the existing s_writepages_rwsem (previously called s_journal_flag_rwsem). This was originally reported by syzbot without a reproducer at https://syzkaller.appspot.com/bug?extid=2202a584a00fffd19fbf, but now that dioread_nolock is the default I also started seeing this when running syzkaller locally. Link: https://lore.kernel.org/r/20200219183047.47417-3-ebiggers@kernel.org Reported-by: syzbot+2202a584a00fffd19fbf@syzkaller.appspotmail.com Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io") Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org --- fs/ext4/ext4.h | 5 ++++- fs/ext4/migrate.c | 27 +++++++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4b986ad42b9d..61b37a052052 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1552,7 +1552,10 @@ struct ext4_sb_info { struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; - /* Barrier between changing inodes' journal flags and writepages ops. */ + /* + * Barrier between writepages ops and changing any inode's JOURNAL_DATA + * or EXTENTS flag. + */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; #ifdef CONFIG_EXT4_DEBUG diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 89725fa42573..fb6520f37135 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -407,6 +407,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode) int ext4_ext_migrate(struct inode *inode) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; int retval = 0, i; __le32 *i_data; @@ -431,6 +432,8 @@ int ext4_ext_migrate(struct inode *inode) */ return retval; + percpu_down_write(&sbi->s_writepages_rwsem); + /* * Worst case we can touch the allocation bitmaps, a bgd * block, and a block to link in the orphan list. We do need @@ -441,7 +444,7 @@ int ext4_ext_migrate(struct inode *inode) if (IS_ERR(handle)) { retval = PTR_ERR(handle); - return retval; + goto out_unlock; } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; @@ -452,7 +455,7 @@ int ext4_ext_migrate(struct inode *inode) if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); - return retval; + goto out_unlock; } i_size_write(tmp_inode, i_size_read(inode)); /* @@ -494,7 +497,7 @@ int ext4_ext_migrate(struct inode *inode) */ ext4_orphan_del(NULL, tmp_inode); retval = PTR_ERR(handle); - goto out; + goto out_tmp_inode; } ei = EXT4_I(inode); @@ -576,10 +579,11 @@ err_out: ext4_ext_tree_init(handle, tmp_inode); out_stop: ext4_journal_stop(handle); -out: +out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); - +out_unlock: + percpu_up_write(&sbi->s_writepages_rwsem); return retval; } @@ -589,7 +593,8 @@ out: int ext4_ind_migrate(struct inode *inode) { struct ext4_extent_header *eh; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_super_block *es = sbi->s_es; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; @@ -613,9 +618,13 @@ int ext4_ind_migrate(struct inode *inode) if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); + percpu_down_write(&sbi->s_writepages_rwsem); + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_unlock; + } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); @@ -650,5 +659,7 @@ int ext4_ind_migrate(struct inode *inode) errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); +out_unlock: + percpu_up_write(&sbi->s_writepages_rwsem); return ret; } -- cgit v1.2.3 From 8eedabfd66b68a4623beec0789eac54b8c9d0fb6 Mon Sep 17 00:00:00 2001 From: wangyan Date: Thu, 20 Feb 2020 21:46:14 +0800 Subject: jbd2: fix ocfs2 corrupt when clearing block group bits I found a NULL pointer dereference in ocfs2_block_group_clear_bits(). The running environment: kernel version: 4.19 A cluster with two nodes, 5 luns mounted on two nodes, and do some file operations like dd/fallocate/truncate/rm on every lun with storage network disconnection. The fallocate operation on dm-23-45 caused an null pointer dereference. The information of NULL pointer dereference as follows: [577992.878282] JBD2: Error -5 detected when updating journal superblock for dm-23-45. [577992.878290] Aborting journal on device dm-23-45. ... [577992.890778] JBD2: Error -5 detected when updating journal superblock for dm-24-46. [577992.890908] __journal_remove_journal_head: freeing b_committed_data [577992.890916] (fallocate,88392,52):ocfs2_extend_trans:474 ERROR: status = -30 [577992.890918] __journal_remove_journal_head: freeing b_committed_data [577992.890920] (fallocate,88392,52):ocfs2_rotate_tree_right:2500 ERROR: status = -30 [577992.890922] __journal_remove_journal_head: freeing b_committed_data [577992.890924] (fallocate,88392,52):ocfs2_do_insert_extent:4382 ERROR: status = -30 [577992.890928] (fallocate,88392,52):ocfs2_insert_extent:4842 ERROR: status = -30 [577992.890928] __journal_remove_journal_head: freeing b_committed_data [577992.890930] (fallocate,88392,52):ocfs2_add_clusters_in_btree:4947 ERROR: status = -30 [577992.890933] __journal_remove_journal_head: freeing b_committed_data [577992.890939] __journal_remove_journal_head: freeing b_committed_data [577992.890949] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020 [577992.890950] Mem abort info: [577992.890951] ESR = 0x96000004 [577992.890952] Exception class = DABT (current EL), IL = 32 bits [577992.890952] SET = 0, FnV = 0 [577992.890953] EA = 0, S1PTW = 0 [577992.890954] Data abort info: [577992.890955] ISV = 0, ISS = 0x00000004 [577992.890956] CM = 0, WnR = 0 [577992.890958] user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000f8da07a9 [577992.890960] [0000000000000020] pgd=0000000000000000 [577992.890964] Internal error: Oops: 96000004 [#1] SMP [577992.890965] Process fallocate (pid: 88392, stack limit = 0x00000000013db2fd) [577992.890968] CPU: 52 PID: 88392 Comm: fallocate Kdump: loaded Tainted: G W OE 4.19.36 #1 [577992.890969] Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 0.98 08/25/2019 [577992.890971] pstate: 60400009 (nZCv daif +PAN -UAO) [577992.891054] pc : _ocfs2_free_suballoc_bits+0x63c/0x968 [ocfs2] [577992.891082] lr : _ocfs2_free_suballoc_bits+0x618/0x968 [ocfs2] [577992.891084] sp : ffff0000c8e2b810 [577992.891085] x29: ffff0000c8e2b820 x28: 0000000000000000 [577992.891087] x27: 00000000000006f3 x26: ffffa07957b02e70 [577992.891089] x25: ffff807c59d50000 x24: 00000000000006f2 [577992.891091] x23: 0000000000000001 x22: ffff807bd39abc30 [577992.891093] x21: ffff0000811d9000 x20: ffffa07535d6a000 [577992.891097] x19: ffff000001681638 x18: ffffffffffffffff [577992.891098] x17: 0000000000000000 x16: ffff000080a03df0 [577992.891100] x15: ffff0000811d9708 x14: 203d207375746174 [577992.891101] x13: 73203a524f525245 x12: 20373439343a6565 [577992.891103] x11: 0000000000000038 x10: 0101010101010101 [577992.891106] x9 : ffffa07c68a85d70 x8 : 7f7f7f7f7f7f7f7f [577992.891109] x7 : 0000000000000000 x6 : 0000000000000080 [577992.891110] x5 : 0000000000000000 x4 : 0000000000000002 [577992.891112] x3 : ffff000001713390 x2 : 2ff90f88b1c22f00 [577992.891114] x1 : ffff807bd39abc30 x0 : 0000000000000000 [577992.891116] Call trace: [577992.891139] _ocfs2_free_suballoc_bits+0x63c/0x968 [ocfs2] [577992.891162] _ocfs2_free_clusters+0x100/0x290 [ocfs2] [577992.891185] ocfs2_free_clusters+0x50/0x68 [ocfs2] [577992.891206] ocfs2_add_clusters_in_btree+0x198/0x5e0 [ocfs2] [577992.891227] ocfs2_add_inode_data+0x94/0xc8 [ocfs2] [577992.891248] ocfs2_extend_allocation+0x1bc/0x7a8 [ocfs2] [577992.891269] ocfs2_allocate_extents+0x14c/0x338 [ocfs2] [577992.891290] __ocfs2_change_file_space+0x3f8/0x610 [ocfs2] [577992.891309] ocfs2_fallocate+0xe4/0x128 [ocfs2] [577992.891316] vfs_fallocate+0x11c/0x250 [577992.891317] ksys_fallocate+0x54/0x88 [577992.891319] __arm64_sys_fallocate+0x28/0x38 [577992.891323] el0_svc_common+0x78/0x130 [577992.891325] el0_svc_handler+0x38/0x78 [577992.891327] el0_svc+0x8/0xc My analysis process as follows: ocfs2_fallocate __ocfs2_change_file_space ocfs2_allocate_extents ocfs2_extend_allocation ocfs2_add_inode_data ocfs2_add_clusters_in_btree ocfs2_insert_extent ocfs2_do_insert_extent ocfs2_rotate_tree_right ocfs2_extend_rotate_transaction ocfs2_extend_trans jbd2_journal_restart jbd2__journal_restart /* handle->h_transaction is NULL, * is_handle_aborted(handle) is true */ handle->h_transaction = NULL; start_this_handle return -EROFS; ocfs2_free_clusters _ocfs2_free_clusters _ocfs2_free_suballoc_bits ocfs2_block_group_clear_bits ocfs2_journal_access_gd __ocfs2_journal_access jbd2_journal_get_undo_access /* I think jbd2_write_access_granted() will * return true, because do_get_write_access() * will return -EROFS. */ if (jbd2_write_access_granted(...)) return 0; do_get_write_access /* handle->h_transaction is NULL, it will * return -EROFS here, so do_get_write_access() * was not called. */ if (is_handle_aborted(handle)) return -EROFS; /* bh2jh(group_bh) is NULL, caused NULL pointer dereference */ undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; If handle->h_transaction == NULL, then jbd2_write_access_granted() does not really guarantee that journal_head will stay around, not even speaking of its b_committed_data. The bh2jh(group_bh) can be removed after ocfs2_journal_access_gd() and before call "bh2jh(group_bh)->b_committed_data". So, we should move is_handle_aborted() check from do_get_write_access() into jbd2_journal_get_undo_access() and jbd2_journal_get_write_access() before the call to jbd2_write_access_granted(). Link: https://lore.kernel.org/r/f72a623f-b3f1-381a-d91d-d22a1c83a336@huawei.com Signed-off-by: Yan Wang Signed-off-by: Theodore Ts'o Reviewed-by: Jun Piao Reviewed-by: Jan Kara Cc: stable@kernel.org --- fs/jbd2/transaction.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2dd848a743ed..d181948c0390 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -936,8 +936,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, char *frozen_buffer = NULL; unsigned long start_lock, time_lock; - if (is_handle_aborted(handle)) - return -EROFS; journal = transaction->t_journal; jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); @@ -1189,6 +1187,9 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int rc; + if (is_handle_aborted(handle)) + return -EROFS; + if (jbd2_write_access_granted(handle, bh, false)) return 0; @@ -1326,6 +1327,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; char *committed_data = NULL; + if (is_handle_aborted(handle)) + return -EROFS; + if (jbd2_write_access_granted(handle, bh, true)) return 0; -- cgit v1.2.3 From 9db176bceb5c5df4990486709da386edadc6bd1d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 21 Feb 2020 11:08:35 +0100 Subject: ext4: fix mount failure with quota configured as module When CONFIG_QFMT_V2 is configured as a module, the test in ext4_feature_set_ok() fails and so mount of filesystems with quota or project features fails. Fix the test to use IS_ENABLED macro which works properly even for modules. Link: https://lore.kernel.org/r/20200221100835.9332-1-jack@suse.cz Fixes: d65d87a07476 ("ext4: improve explanation of a mount failure caused by a misconfigured kernel") Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6928fc229799..ff1b764b0c0e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3034,7 +3034,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#if !defined(CONFIG_QUOTA) || !defined(CONFIG_QFMT_V2) +#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, -- cgit v1.2.3 From c7849be9cc2dd2754c48ddbaca27c2de6d80a95d Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Sat, 22 Feb 2020 14:46:05 +0800 Subject: io_uring: fix __io_iopoll_check deadlock in io_sq_thread Since commit a3a0e43fd770 ("io_uring: don't enter poll loop if we have CQEs pending"), if we already events pending, we won't enter poll loop. In case SETUP_IOPOLL and SETUP_SQPOLL are both enabled, if app has been terminated and don't reap pending events which are already in cq ring, and there are some reqs in poll_list, io_sq_thread will enter __io_iopoll_check(), and find pending events, then return, this loop will never have a chance to exit. I have seen this issue in fio stress tests, to fix this issue, let io_sq_thread call io_iopoll_getevents() with argument 'min' being zero, and remove __io_iopoll_check(). Fixes: a3a0e43fd770 ("io_uring: don't enter poll loop if we have CQEs pending") Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b43467b3a8dc..de650df9ac53 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1672,11 +1672,17 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) { int iters = 0, ret = 0; + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); do { int tmin = 0; @@ -1712,21 +1718,6 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); - return ret; -} - -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) -{ - int ret; - - /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); - ret = __io_iopoll_check(ctx, nr_events, min); mutex_unlock(&ctx->uring_lock); return ret; } @@ -5118,7 +5109,7 @@ static int io_sq_thread(void *data) */ mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->poll_list)) - __io_iopoll_check(ctx, &nr_events, 0); + io_iopoll_getevents(ctx, &nr_events, 0); else inflight = 0; mutex_unlock(&ctx->uring_lock); -- cgit v1.2.3 From 99db590b083fa2bc60adfcb5c839a62db4ef1d79 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 12 Feb 2020 11:10:58 +0100 Subject: csky: Replace by The C-Sky platform code is not a clock provider, and just needs to call of_clk_init(). Hence it can include instead of . Signed-off-by: Geert Uytterhoeven Signed-off-by: Guo Ren --- arch/csky/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/csky/kernel/time.c b/arch/csky/kernel/time.c index b5fc9447d93f..52379d866fe4 100644 --- a/arch/csky/kernel/time.c +++ b/arch/csky/kernel/time.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd. -#include #include +#include void __init time_init(void) { -- cgit v1.2.3 From f8788d86ab28f61f7b46eb6be375f8a726783636 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Feb 2020 16:17:42 -0800 Subject: Linux 5.6-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index aab38cb02b24..0914049d2929 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Kleptomaniac Octopus # *DOCUMENTATION* -- cgit v1.2.3