From ac0f7d42584125dab8039e60ab4ade48cc2db61c Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Apr 2020 12:41:50 +0200 Subject: Drivers: hv: copy from message page only what's needed Hyper-V Interrupt Message Page (SIMP) has 16 256-byte slots for messages. Each message comes with a header (16 bytes) which specifies the payload length (up to 240 bytes). vmbus_on_msg_dpc(), however, doesn't look at the real message length and copies the whole slot to a temporary buffer before passing it to message handlers. This is potentially dangerous as hypervisor doesn't have to clean the whole slot when putting a new message there and a message handler can get access to some data which belongs to a previous message. Note, this is not currently a problem because all message handlers are in-kernel but eventually we may e.g. get this exported to userspace. Note also, that this is not a performance critical path: messages (unlike events) represent rare events so it doesn't really matter (from performance point of view) if we copy too much. Fix the issue by taking into account the real message length. The temporary buffer allocated by vmbus_on_msg_dpc() remains fixed size for now. Also, check that the supplied payload length is valid (<= 240 bytes). Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200406104154.45010-2-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a68bce4d0ddb..183a5b07c3ad 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1060,6 +1060,12 @@ void vmbus_on_msg_dpc(unsigned long data) goto msg_handled; } + if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) { + WARN_ONCE(1, "payload size is too large (%d)\n", + msg->header.payload_size); + goto msg_handled; + } + entry = &channel_message_table[hdr->msgtype]; if (!entry->message_handler) @@ -1071,7 +1077,8 @@ void vmbus_on_msg_dpc(unsigned long data) return; INIT_WORK(&ctx->work, vmbus_onmessage_work); - memcpy(&ctx->msg, msg, sizeof(*msg)); + memcpy(&ctx->msg, msg, sizeof(msg->header) + + msg->header.payload_size); /* * The host can generate a rescind message while we -- cgit v1.2.3 From a276463b7aeb6186e7e4315cccb032773fb31b5d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Apr 2020 12:41:51 +0200 Subject: Drivers: hv: allocate the exact needed memory for messages When we need to pass a buffer with Hyper-V message we don't need to always allocate 256 bytes for the message: the real message length is known from the header. Change 'struct onmessage_work_context' to make it possible to not over-allocate. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200406104154.45010-3-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 183a5b07c3ad..e356ea4752c0 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1019,7 +1019,10 @@ static struct bus_type hv_bus = { struct onmessage_work_context { struct work_struct work; - struct hv_message msg; + struct { + struct hv_message_header header; + u8 payload[]; + } msg; }; static void vmbus_onmessage_work(struct work_struct *work) @@ -1072,7 +1075,8 @@ void vmbus_on_msg_dpc(unsigned long data) goto msg_handled; if (entry->handler_type == VMHT_BLOCKING) { - ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); + ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size, + GFP_ATOMIC); if (ctx == NULL) return; @@ -1126,10 +1130,11 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) WARN_ON(!is_hvsock_channel(channel)); /* - * sizeof(*ctx) is small and the allocation should really not fail, + * Allocation size is small and the allocation should really not fail, * otherwise the state of the hv_sock connections ends up in limbo. */ - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); + ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind), + GFP_KERNEL | __GFP_NOFAIL); /* * So far, these are not really used by Linux. Just set them to the @@ -1139,7 +1144,7 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) ctx->msg.header.payload_size = sizeof(*rescind); /* These values are actually used by Linux. */ - rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload; rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; rescind->child_relid = channel->offermsg.child_relid; -- cgit v1.2.3 From 5cc415001bca8fe0e3f0ee6d58a953a314dd9751 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Apr 2020 12:41:52 +0200 Subject: Drivers: hv: avoid passing opaque pointer to vmbus_onmessage() vmbus_onmessage() doesn't need the header of the message, it only uses it to get to the payload, we can pass the pointer to the payload directly. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200406104154.45010-4-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 7 +------ drivers/hv/vmbus_drv.c | 3 ++- include/linux/hyperv.h | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 501c43c5851d..da93b5f1b143 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1363,13 +1363,8 @@ channel_message_table[CHANNELMSG_COUNT] = { * * This is invoked in the vmbus worker thread context. */ -void vmbus_onmessage(void *context) +void vmbus_onmessage(struct vmbus_channel_message_header *hdr) { - struct hv_message *msg = context; - struct vmbus_channel_message_header *hdr; - - hdr = (struct vmbus_channel_message_header *)msg->u.payload; - trace_vmbus_on_message(hdr); /* diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index e356ea4752c0..8f08e8273dd8 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1035,7 +1035,8 @@ static void vmbus_onmessage_work(struct work_struct *work) ctx = container_of(work, struct onmessage_work_context, work); - vmbus_onmessage(&ctx->msg); + vmbus_onmessage((struct vmbus_channel_message_header *) + &ctx->msg.payload); kfree(ctx); } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 692c89ccf5df..cbd24f4e68d1 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1017,7 +1017,7 @@ static inline void clear_low_latency_mode(struct vmbus_channel *c) c->low_latency = false; } -void vmbus_onmessage(void *context); +void vmbus_onmessage(struct vmbus_channel_message_header *hdr); int vmbus_request_offers(void); -- cgit v1.2.3 From b0a284dc65b401a508dc2c5ed7d465884220f607 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Apr 2020 12:43:15 +0200 Subject: Drivers: hv: make sure that 'struct vmbus_channel_message_header' compiles correctly Strictly speaking, compiler is free to use something different from 'u32' for 'enum vmbus_channel_message_type' (e.g. char) but it doesn't happen in real life, just add a BUILD_BUG_ON() guardian. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200406104316.45303-1-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 8f08e8273dd8..6e54adb1dd33 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1051,6 +1051,13 @@ void vmbus_on_msg_dpc(unsigned long data) struct onmessage_work_context *ctx; u32 message_type = msg->header.message_type; + /* + * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as + * it is being used in 'struct vmbus_channel_message_header' definition + * which is supposed to match hypervisor ABI. + */ + BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32)); + if (message_type == HVMSG_NONE) /* no msg */ return; -- cgit v1.2.3 From 52c7803f9bd4b1f0ac6e2e3e6051415198cc06bd Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 6 Apr 2020 12:43:26 +0200 Subject: Drivers: hv: check VMBus messages lengths VMBus message handlers (channel_message_table) receive a pointer to 'struct vmbus_channel_message_header' and cast it to a structure of their choice, which is sometimes longer than the header. We, however, don't check that the message is long enough so in case hypervisor screws up we'll be accessing memory beyond what was allocated for temporary buffer. Previously, we used to always allocate and copy 256 bytes from message page to temporary buffer but this is hardly better: in case the message is shorter than we expect we'll be trying to consume garbage as some real data and no memory guarding technique will be able to identify an issue. Introduce 'min_payload_len' to 'struct vmbus_channel_message_table_entry' and check against it in vmbus_on_msg_dpc(). Note, we can't require the exact length as new hypervisor versions may add extra fields to messages, we only check that the message is not shorter than we expect. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200406104326.45361-1-vkuznets@redhat.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 54 ++++++++++++++++++++++++++--------------------- drivers/hv/hyperv_vmbus.h | 1 + drivers/hv/vmbus_drv.c | 6 ++++++ 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index da93b5f1b143..3a5700c8486a 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1332,30 +1332,36 @@ static void vmbus_onversion_response( /* Channel message dispatch table */ const struct vmbus_channel_message_table_entry channel_message_table[CHANNELMSG_COUNT] = { - { CHANNELMSG_INVALID, 0, NULL }, - { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer }, - { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind }, - { CHANNELMSG_REQUESTOFFERS, 0, NULL }, - { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered }, - { CHANNELMSG_OPENCHANNEL, 0, NULL }, - { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result }, - { CHANNELMSG_CLOSECHANNEL, 0, NULL }, - { CHANNELMSG_GPADL_HEADER, 0, NULL }, - { CHANNELMSG_GPADL_BODY, 0, NULL }, - { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created }, - { CHANNELMSG_GPADL_TEARDOWN, 0, NULL }, - { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown }, - { CHANNELMSG_RELID_RELEASED, 0, NULL }, - { CHANNELMSG_INITIATE_CONTACT, 0, NULL }, - { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response }, - { CHANNELMSG_UNLOAD, 0, NULL }, - { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response }, - { CHANNELMSG_18, 0, NULL }, - { CHANNELMSG_19, 0, NULL }, - { CHANNELMSG_20, 0, NULL }, - { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL }, - { CHANNELMSG_22, 0, NULL }, - { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL }, + { CHANNELMSG_INVALID, 0, NULL, 0}, + { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer, + sizeof(struct vmbus_channel_offer_channel)}, + { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind, + sizeof(struct vmbus_channel_rescind_offer) }, + { CHANNELMSG_REQUESTOFFERS, 0, NULL, 0}, + { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered, 0}, + { CHANNELMSG_OPENCHANNEL, 0, NULL, 0}, + { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result, + sizeof(struct vmbus_channel_open_result)}, + { CHANNELMSG_CLOSECHANNEL, 0, NULL, 0}, + { CHANNELMSG_GPADL_HEADER, 0, NULL, 0}, + { CHANNELMSG_GPADL_BODY, 0, NULL, 0}, + { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created, + sizeof(struct vmbus_channel_gpadl_created)}, + { CHANNELMSG_GPADL_TEARDOWN, 0, NULL, 0}, + { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown, + sizeof(struct vmbus_channel_gpadl_torndown) }, + { CHANNELMSG_RELID_RELEASED, 0, NULL, 0}, + { CHANNELMSG_INITIATE_CONTACT, 0, NULL, 0}, + { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response, + sizeof(struct vmbus_channel_version_response)}, + { CHANNELMSG_UNLOAD, 0, NULL, 0}, + { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response, 0}, + { CHANNELMSG_18, 0, NULL, 0}, + { CHANNELMSG_19, 0, NULL, 0}, + { CHANNELMSG_20, 0, NULL, 0}, + { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, + { CHANNELMSG_22, 0, NULL, 0}, + { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, }; /* diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 70b30e223a57..ab560ac9c040 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -317,6 +317,7 @@ struct vmbus_channel_message_table_entry { enum vmbus_channel_message_type message_type; enum vmbus_message_handler_type handler_type; void (*message_handler)(struct vmbus_channel_message_header *msg); + u32 min_payload_len; }; extern const struct vmbus_channel_message_table_entry diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 6e54adb1dd33..3a05e1bc359c 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1082,6 +1082,12 @@ void vmbus_on_msg_dpc(unsigned long data) if (!entry->message_handler) goto msg_handled; + if (msg->header.payload_size < entry->min_payload_len) { + WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", + hdr->msgtype, msg->header.payload_size); + goto msg_handled; + } + if (entry->handler_type == VMHT_BLOCKING) { ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size, GFP_ATOMIC); -- cgit v1.2.3 From 8a857c55420f29da4fc131adc22b12d474c48f4c Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:04 +0200 Subject: Drivers: hv: vmbus: Always handle the VMBus messages on CPU0 A Linux guest have to pick a "connect CPU" to communicate with the Hyper-V host. This CPU can not be taken offline because Hyper-V does not provide a way to change that CPU assignment. Current code sets the connect CPU to whatever CPU ends up running the function vmbus_negotiate_version(), and this will generate problems if that CPU is taken offine. Establish CPU0 as the connect CPU, and add logics to prevents the connect CPU from being taken offline. We could pick some other CPU, and we could pick that "other CPU" dynamically if there was a reason to do so at some point in the future. But for now, #defining the connect CPU to 0 is the most straightforward and least complex solution. While on this, add inline comments explaining "why" offer and rescind messages should not be handled by a same serialized work queue. Suggested-by: Dexuan Cui Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20200406001514.19876-2-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/connection.c | 20 +------------------- drivers/hv/hv.c | 7 +++++++ drivers/hv/hyperv_vmbus.h | 11 ++++++----- drivers/hv/vmbus_drv.c | 20 +++++++++++++++++--- 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 74e77de89b4f..f4bd306d2cef 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -69,7 +69,6 @@ MODULE_PARM_DESC(max_version, int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) { int ret = 0; - unsigned int cur_cpu; struct vmbus_channel_initiate_contact *msg; unsigned long flags; @@ -102,24 +101,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); - /* - * We want all channel messages to be delivered on CPU 0. - * This has been the behavior pre-win8. This is not - * perf issue and having all channel messages delivered on CPU 0 - * would be ok. - * For post win8 hosts, we support receiving channel messagges on - * all the CPUs. This is needed for kexec to work correctly where - * the CPU attempting to connect may not be CPU 0. - */ - if (version >= VERSION_WIN8_1) { - cur_cpu = get_cpu(); - msg->target_vcpu = hv_cpu_number_to_vp_number(cur_cpu); - vmbus_connection.connect_cpu = cur_cpu; - put_cpu(); - } else { - msg->target_vcpu = 0; - vmbus_connection.connect_cpu = 0; - } + msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); /* * Add to list before we send the request since we may diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6098e0cbdb4b..e2b331045464 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -249,6 +249,13 @@ int hv_synic_cleanup(unsigned int cpu) bool channel_found = false; unsigned long flags; + /* + * Hyper-V does not provide a way to change the connect CPU once + * it is set; we must prevent the connect CPU from going offline. + */ + if (cpu == VMBUS_CONNECT_CPU) + return -EBUSY; + /* * Search for channels which are bound to the CPU we're about to * cleanup. In case we find one and vmbus is still connected we need to diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index ab560ac9c040..000740d053a2 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -212,12 +212,13 @@ enum vmbus_connect_state { #define MAX_SIZE_CHANNEL_MESSAGE HV_MESSAGE_PAYLOAD_BYTE_COUNT -struct vmbus_connection { - /* - * CPU on which the initial host contact was made. - */ - int connect_cpu; +/* + * The CPU that Hyper-V will interrupt for VMBUS messages, such as + * CHANNELMSG_OFFERCHANNEL and CHANNELMSG_RESCIND_CHANNELOFFER. + */ +#define VMBUS_CONNECT_CPU 0 +struct vmbus_connection { u32 msg_conn_id; atomic_t offer_in_progress; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 3a05e1bc359c..495337ec68f1 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1109,14 +1109,28 @@ void vmbus_on_msg_dpc(unsigned long data) /* * If we are handling the rescind message; * schedule the work on the global work queue. + * + * The OFFER message and the RESCIND message should + * not be handled by the same serialized work queue, + * because the OFFER handler may call vmbus_open(), + * which tries to open the channel by sending an + * OPEN_CHANNEL message to the host and waits for + * the host's response; however, if the host has + * rescinded the channel before it receives the + * OPEN_CHANNEL message, the host just silently + * ignores the OPEN_CHANNEL message; as a result, + * the guest's OFFER handler hangs for ever, if we + * handle the RESCIND message in the same serialized + * work queue: the RESCIND handler can not start to + * run before the OFFER handler finishes. */ - schedule_work_on(vmbus_connection.connect_cpu, + schedule_work_on(VMBUS_CONNECT_CPU, &ctx->work); break; case CHANNELMSG_OFFERCHANNEL: atomic_inc(&vmbus_connection.offer_in_progress); - queue_work_on(vmbus_connection.connect_cpu, + queue_work_on(VMBUS_CONNECT_CPU, vmbus_connection.work_queue, &ctx->work); break; @@ -1164,7 +1178,7 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) INIT_WORK(&ctx->work, vmbus_onmessage_work); - queue_work_on(vmbus_connection.connect_cpu, + queue_work_on(VMBUS_CONNECT_CPU, vmbus_connection.work_queue, &ctx->work); } -- cgit v1.2.3 From b9fa1b8797dcb579a9642a502769e1a5c3adc0d2 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:05 +0200 Subject: Drivers: hv: vmbus: Don't bind the offer&rescind works to a specific CPU The offer and rescind works are currently scheduled on the so called "connect CPU". However, this is not really needed: we can synchronize the works by relying on the usage of the offer_in_progress counter and of the channel_mutex mutex. This synchronization is already in place. So, remove this unnecessary "bind to the connect CPU" constraint and update the inline comments accordingly. Suggested-by: Dexuan Cui Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-3-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 21 ++++++++++++++++----- drivers/hv/vmbus_drv.c | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 3a5700c8486a..0ced32ef2715 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1028,11 +1028,22 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * offer comes in first and then the rescind. * Since we process these events in work elements, * and with preemption, we may end up processing - * the events out of order. Given that we handle these - * work elements on the same CPU, this is possible only - * in the case of preemption. In any case wait here - * until the offer processing has moved beyond the - * point where the channel is discoverable. + * the events out of order. We rely on the synchronization + * provided by offer_in_progress and by channel_mutex for + * ordering these events: + * + * { Initially: offer_in_progress = 1 } + * + * CPU1 CPU2 + * + * [vmbus_process_offer()] [vmbus_onoffer_rescind()] + * + * LOCK channel_mutex WAIT_ON offer_in_progress == 0 + * DECREMENT offer_in_progress LOCK channel_mutex + * INSERT chn_list SEARCH chn_list + * UNLOCK channel_mutex UNLOCK channel_mutex + * + * Forbids: CPU2's SEARCH from *not* seeing CPU1's INSERT */ while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 495337ec68f1..1f3e69d18d35 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1101,8 +1101,9 @@ void vmbus_on_msg_dpc(unsigned long data) /* * The host can generate a rescind message while we * may still be handling the original offer. We deal with - * this condition by ensuring the processing is done on the - * same CPU. + * this condition by relying on the synchronization provided + * by offer_in_progress and by channel_mutex. See also the + * inline comments in vmbus_onoffer_rescind(). */ switch (hdr->msgtype) { case CHANNELMSG_RESCIND_CHANNELOFFER: @@ -1124,16 +1125,34 @@ void vmbus_on_msg_dpc(unsigned long data) * work queue: the RESCIND handler can not start to * run before the OFFER handler finishes. */ - schedule_work_on(VMBUS_CONNECT_CPU, - &ctx->work); + schedule_work(&ctx->work); break; case CHANNELMSG_OFFERCHANNEL: + /* + * The host sends the offer message of a given channel + * before sending the rescind message of the same + * channel. These messages are sent to the guest's + * connect CPU; the guest then starts processing them + * in the tasklet handler on this CPU: + * + * VMBUS_CONNECT_CPU + * + * [vmbus_on_msg_dpc()] + * atomic_inc() // CHANNELMSG_OFFERCHANNEL + * queue_work() + * ... + * [vmbus_on_msg_dpc()] + * schedule_work() // CHANNELMSG_RESCIND_CHANNELOFFER + * + * We rely on the memory-ordering properties of the + * queue_work() and schedule_work() primitives, which + * guarantee that the atomic increment will be visible + * to the CPUs which will execute the offer & rescind + * works by the time these works will start execution. + */ atomic_inc(&vmbus_connection.offer_in_progress); - queue_work_on(VMBUS_CONNECT_CPU, - vmbus_connection.work_queue, - &ctx->work); - break; + fallthrough; default: queue_work(vmbus_connection.work_queue, &ctx->work); @@ -1178,9 +1197,7 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) INIT_WORK(&ctx->work, vmbus_onmessage_work); - queue_work_on(VMBUS_CONNECT_CPU, - vmbus_connection.work_queue, - &ctx->work); + queue_work(vmbus_connection.work_queue, &ctx->work); } #endif /* CONFIG_PM_SLEEP */ -- cgit v1.2.3 From 8b6a877c060ed6b86878fe66c7c6493a6054cf23 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:06 +0200 Subject: Drivers: hv: vmbus: Replace the per-CPU channel lists with a global array of channels When Hyper-V sends an interrupt to the guest, the guest has to figure out which channel the interrupt is associated with. Hyper-V sets a bit in a memory page that is shared with the guest, indicating a particular "relid" that the interrupt is associated with. The current Linux code then uses a set of per-CPU linked lists to map a given "relid" to a pointer to a channel structure. This design introduces a synchronization problem if the CPU that Hyper-V will interrupt for a certain channel is changed. If the interrupt comes on the "old CPU" and the channel was already moved to the per-CPU list of the "new CPU", then the relid -> channel mapping will fail and the interrupt is dropped. Similarly, if the interrupt comes on the new CPU but the channel was not moved to the per-CPU list of the new CPU, then the mapping will fail and the interrupt is dropped. Relids are integers ranging from 0 to 2047. The mapping from relids to channel structures can be done by setting up an array with 2048 entries, each entry being a pointer to a channel structure (hence total size ~16K bytes, which is not a problem). The array is global, so there are no per-CPU linked lists to update. The array can be searched and updated by loading from/storing to the array at the specified index. With no per-CPU data structures, the above mentioned synchronization problem is avoided and the relid2channel() function gets simpler. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-4-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 179 +++++++++++++++++++++++++++++----------------- drivers/hv/connection.c | 38 +++------- drivers/hv/hv.c | 2 - drivers/hv/hyperv_vmbus.h | 14 ++-- drivers/hv/vmbus_drv.c | 48 ++++++++----- include/linux/hyperv.h | 5 -- 6 files changed, 160 insertions(+), 126 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 0ced32ef2715..bbd0eee4362f 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -319,7 +319,6 @@ static struct vmbus_channel *alloc_channel(void) init_completion(&channel->rescind_event); INIT_LIST_HEAD(&channel->sc_list); - INIT_LIST_HEAD(&channel->percpu_list); tasklet_init(&channel->callback_event, vmbus_on_event, (unsigned long)channel); @@ -340,23 +339,49 @@ static void free_channel(struct vmbus_channel *channel) kobject_put(&channel->kobj); } -static void percpu_channel_enq(void *arg) +void vmbus_channel_map_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - - list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + /* + * The mapping of the channel's relid is visible from the CPUs that + * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will + * execute: + * + * (a) In the "normal (i.e., not resuming from hibernation)" path, + * the full barrier in smp_store_mb() guarantees that the store + * is propagated to all CPUs before the add_channel_work work + * is queued. In turn, add_channel_work is queued before the + * channel's ring buffer is allocated/initialized and the + * OPENCHANNEL message for the channel is sent in vmbus_open(). + * Hyper-V won't start sending the interrupts for the channel + * before the OPENCHANNEL message is acked. The memory barrier + * in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures + * that vmbus_chan_sched() must find the channel's relid in + * recv_int_page before retrieving the channel pointer from the + * array of channels. + * + * (b) In the "resuming from hibernation" path, the smp_store_mb() + * guarantees that the store is propagated to all CPUs before + * the VMBus connection is marked as ready for the resume event + * (cf. check_ready_for_resume_event()). The interrupt handler + * of the VMBus driver and vmbus_chan_sched() can not run before + * vmbus_bus_resume() has completed execution (cf. resume_noirq). + */ + smp_store_mb( + vmbus_connection.channels[channel->offermsg.child_relid], + channel); } -static void percpu_channel_deq(void *arg) +void vmbus_channel_unmap_relid(struct vmbus_channel *channel) { - struct vmbus_channel *channel = arg; - - list_del_rcu(&channel->percpu_list); + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) + return; + WRITE_ONCE( + vmbus_connection.channels[channel->offermsg.child_relid], + NULL); } - static void vmbus_release_relid(u32 relid) { struct vmbus_channel_relid_released msg; @@ -376,17 +401,25 @@ void hv_process_channel_removal(struct vmbus_channel *channel) struct vmbus_channel *primary_channel; unsigned long flags; - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); + lockdep_assert_held(&vmbus_connection.channel_mutex); BUG_ON(!channel->rescind); - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, - percpu_channel_deq, channel, true); - } else { - percpu_channel_deq(channel); - put_cpu(); - } + /* + * hv_process_channel_removal() could find INVALID_RELID only for + * hv_sock channels. See the inline comments in vmbus_onoffer(). + */ + WARN_ON(channel->offermsg.child_relid == INVALID_RELID && + !is_hvsock_channel(channel)); + + /* + * Upon suspend, an in-use hv_sock channel is removed from the array of + * channels and the relid is invalidated. After hibernation, when the + * user-space appplication destroys the channel, it's unnecessary and + * unsafe to remove the channel from the array of channels. See also + * the inline comments before the call of vmbus_release_relid() below. + */ + if (channel->offermsg.child_relid != INVALID_RELID) + vmbus_channel_unmap_relid(channel); if (channel->primary_channel == NULL) { list_del(&channel->listentry); @@ -447,16 +480,6 @@ static void vmbus_add_channel_work(struct work_struct *work) init_vp_index(newchannel, dev_type); - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_enq, - newchannel, true); - } else { - percpu_channel_enq(newchannel); - put_cpu(); - } - /* * This state is used to indicate a successful open * so that when we do close the channel normally, we @@ -523,17 +546,10 @@ err_deq_chan: spin_unlock_irqrestore(&primary_channel->lock, flags); } - mutex_unlock(&vmbus_connection.channel_mutex); + /* vmbus_process_offer() has mapped the channel. */ + vmbus_channel_unmap_relid(newchannel); - if (newchannel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(newchannel->target_cpu, - percpu_channel_deq, - newchannel, true); - } else { - percpu_channel_deq(newchannel); - put_cpu(); - } + mutex_unlock(&vmbus_connection.channel_mutex); vmbus_release_relid(newchannel->offermsg.child_relid); @@ -599,6 +615,8 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) spin_unlock_irqrestore(&channel->lock, flags); } + vmbus_channel_map_relid(newchannel); + mutex_unlock(&vmbus_connection.channel_mutex); /* @@ -940,8 +958,6 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { - atomic_dec(&vmbus_connection.offer_in_progress); - /* * We're resuming from hibernation: all the sub-channel and * hv_sock channels we had before the hibernation should have @@ -949,36 +965,65 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) * primary channel that we had before the hibernation. */ + /* + * { Initially: channel relid = INVALID_RELID, + * channels[valid_relid] = NULL } + * + * CPU1 CPU2 + * + * [vmbus_onoffer()] [vmbus_device_release()] + * + * LOCK channel_mutex LOCK channel_mutex + * STORE channel relid = valid_relid LOAD r1 = channel relid + * MAP_RELID channel if (r1 != INVALID_RELID) + * UNLOCK channel_mutex UNMAP_RELID channel + * UNLOCK channel_mutex + * + * Forbids: r1 == valid_relid && + * channels[valid_relid] == channel + * + * Note. r1 can be INVALID_RELID only for an hv_sock channel. + * None of the hv_sock channels which were present before the + * suspend are re-offered upon the resume. See the WARN_ON() + * in hv_process_channel_removal(). + */ + mutex_lock(&vmbus_connection.channel_mutex); + + atomic_dec(&vmbus_connection.offer_in_progress); + WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); /* Fix up the relid. */ oldchannel->offermsg.child_relid = offer->child_relid; offer_sz = sizeof(*offer); - if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { - check_ready_for_resume_event(); - return; + if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) { + /* + * This is not an error, since the host can also change + * the other field(s) of the offer, e.g. on WS RS5 + * (Build 17763), the offer->connection_id of the + * Mellanox VF vmbus device can change when the host + * reoffers the device upon resume. + */ + pr_debug("vmbus offer changed: relid=%d\n", + offer->child_relid); + + print_hex_dump_debug("Old vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + &oldchannel->offermsg, offer_sz, + false); + print_hex_dump_debug("New vmbus offer: ", + DUMP_PREFIX_OFFSET, 16, 4, + offer, offer_sz, false); + + /* Fix up the old channel. */ + vmbus_setup_channel_state(oldchannel, offer); } - /* - * This is not an error, since the host can also change the - * other field(s) of the offer, e.g. on WS RS5 (Build 17763), - * the offer->connection_id of the Mellanox VF vmbus device - * can change when the host reoffers the device upon resume. - */ - pr_debug("vmbus offer changed: relid=%d\n", - offer->child_relid); - - print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, &oldchannel->offermsg, offer_sz, - false); - print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, - 16, 4, offer, offer_sz, false); - - /* Fix up the old channel. */ - vmbus_setup_channel_state(oldchannel, offer); - + /* Add the channel back to the array of channels. */ + vmbus_channel_map_relid(oldchannel); check_ready_for_resume_event(); + mutex_unlock(&vmbus_connection.channel_mutex); return; } @@ -1036,14 +1081,14 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr) * * CPU1 CPU2 * - * [vmbus_process_offer()] [vmbus_onoffer_rescind()] + * [vmbus_onoffer()] [vmbus_onoffer_rescind()] * * LOCK channel_mutex WAIT_ON offer_in_progress == 0 * DECREMENT offer_in_progress LOCK channel_mutex - * INSERT chn_list SEARCH chn_list + * STORE channels[] LOAD channels[] * UNLOCK channel_mutex UNLOCK channel_mutex * - * Forbids: CPU2's SEARCH from *not* seeing CPU1's INSERT + * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE */ while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index f4bd306d2cef..11170d9a2e1a 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -248,6 +248,14 @@ int vmbus_connect(void) pr_info("Vmbus version:%d.%d\n", version >> 16, version & 0xFFFF); + vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS, + sizeof(struct vmbus_channel *), + GFP_KERNEL); + if (vmbus_connection.channels == NULL) { + ret = -ENOMEM; + goto cleanup; + } + kfree(msginfo); return 0; @@ -295,33 +303,9 @@ void vmbus_disconnect(void) */ struct vmbus_channel *relid2channel(u32 relid) { - struct vmbus_channel *channel; - struct vmbus_channel *found_channel = NULL; - struct list_head *cur, *tmp; - struct vmbus_channel *cur_sc; - - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); - - list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { - if (channel->offermsg.child_relid == relid) { - found_channel = channel; - break; - } else if (!list_empty(&channel->sc_list)) { - /* - * Deal with sub-channels. - */ - list_for_each_safe(cur, tmp, &channel->sc_list) { - cur_sc = list_entry(cur, struct vmbus_channel, - sc_list); - if (cur_sc->offermsg.child_relid == relid) { - found_channel = cur_sc; - break; - } - } - } - } - - return found_channel; + if (WARN_ON(relid >= MAX_CHANNEL_RELIDS)) + return NULL; + return READ_ONCE(vmbus_connection.channels[relid]); } /* diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index e2b331045464..17bf1f229152 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -117,8 +117,6 @@ int hv_synic_alloc(void) pr_err("Unable to allocate post msg page\n"); goto err; } - - INIT_LIST_HEAD(&hv_cpu->chan_list); } return 0; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 000740d053a2..3edf993b0fd9 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -132,12 +132,6 @@ struct hv_per_cpu_context { * basis. */ struct tasklet_struct msg_dpc; - - /* - * To optimize the mapping of relid to channel, maintain - * per-cpu list of the channels based on their CPU affinity. - */ - struct list_head chan_list; }; struct hv_context { @@ -202,6 +196,8 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, /* TODO: Need to make this configurable */ #define MAX_NUM_CHANNELS_SUPPORTED 256 +#define MAX_CHANNEL_RELIDS \ + max(MAX_NUM_CHANNELS_SUPPORTED, HV_EVENT_FLAGS_COUNT) enum vmbus_connect_state { DISCONNECTED, @@ -251,6 +247,9 @@ struct vmbus_connection { struct list_head chn_list; struct mutex channel_mutex; + /* Array of channels */ + struct vmbus_channel **channels; + /* * An offer message is handled first on the work_queue, and then * is further handled on handle_primary_chan_wq or @@ -338,6 +337,9 @@ int vmbus_add_channel_kobj(struct hv_device *device_obj, void vmbus_remove_channel_attr_group(struct vmbus_channel *channel); +void vmbus_channel_map_relid(struct vmbus_channel *channel); +void vmbus_channel_unmap_relid(struct vmbus_channel *channel); + struct vmbus_channel *relid2channel(u32 relid); void vmbus_free_channels(void); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 1f3e69d18d35..21a01bca7fcd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1252,33 +1252,39 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (relid == 0) continue; + /* + * Pairs with the kfree_rcu() in vmbus_chan_release(). + * Guarantees that the channel data structure doesn't + * get freed while the channel pointer below is being + * dereferenced. + */ rcu_read_lock(); /* Find channel based on relid */ - list_for_each_entry_rcu(channel, &hv_cpu->chan_list, percpu_list) { - if (channel->offermsg.child_relid != relid) - continue; + channel = relid2channel(relid); + if (channel == NULL) + goto sched_unlock_rcu; - if (channel->rescind) - continue; + if (channel->rescind) + goto sched_unlock_rcu; - trace_vmbus_chan_sched(channel); + trace_vmbus_chan_sched(channel); - ++channel->interrupts; + ++channel->interrupts; - switch (channel->callback_mode) { - case HV_CALL_ISR: - vmbus_channel_isr(channel); - break; + switch (channel->callback_mode) { + case HV_CALL_ISR: + vmbus_channel_isr(channel); + break; - case HV_CALL_BATCHED: - hv_begin_read(&channel->inbound); - /* fallthrough */ - case HV_CALL_DIRECT: - tasklet_schedule(&channel->callback_event); - } + case HV_CALL_BATCHED: + hv_begin_read(&channel->inbound); + fallthrough; + case HV_CALL_DIRECT: + tasklet_schedule(&channel->callback_event); } +sched_unlock_rcu: rcu_read_unlock(); } } @@ -2264,9 +2270,12 @@ static int vmbus_bus_suspend(struct device *dev) list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { /* - * Invalidate the field. Upon resume, vmbus_onoffer() will fix - * up the field, and the other fields (if necessary). + * Remove the channel from the array of channels and invalidate + * the channel's relid. Upon resume, vmbus_onoffer() will fix + * up the relid (and other fields, if necessary) and add the + * channel back to the array. */ + vmbus_channel_unmap_relid(channel); channel->offermsg.child_relid = INVALID_RELID; if (is_hvsock_channel(channel)) { @@ -2502,6 +2511,7 @@ static void __exit vmbus_exit(void) hv_debug_rm_all_dir(); vmbus_free_channels(); + kfree(vmbus_connection.channels); if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { kmsg_dump_unregister(&hv_kmsg_dumper); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index cbd24f4e68d1..f5b3f008c55a 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -854,11 +854,6 @@ struct vmbus_channel { * Support per-channel state for use by vmbus drivers. */ void *per_channel_state; - /* - * To support per-cpu lookup mapping of relid to channel, - * link up channels based on their CPU affinity. - */ - struct list_head percpu_list; /* * Defer freeing channel until after all cpu's have -- cgit v1.2.3 From ac5047671758ad4be9f93898247b3a8b6dfde4c7 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:07 +0200 Subject: hv_netvsc: Disable NAPI before closing the VMBus channel vmbus_chan_sched() might call the netvsc driver callback function that ends up scheduling NAPI work. This "work" can access the channel ring buffer, so we must ensure that any such work is completed and that the ring buffer is no longer being accessed before freeing the ring buffer data structure in the channel closure path. To this end, disable NAPI before calling vmbus_close() in netvsc_device_remove(). Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Acked-by: Stephen Hemminger Cc: "David S. Miller" Cc: Link: https://lore.kernel.org/r/20200406001514.19876-5-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel.c | 6 ++++++ drivers/net/hyperv/netvsc.c | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 23f358cb7f49..256ee90c7446 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -609,6 +609,12 @@ void vmbus_reset_channel_cb(struct vmbus_channel *channel) * the former is accessing channel->inbound.ring_buffer, the latter * could be freeing the ring_buffer pages, so here we must stop it * first. + * + * vmbus_chan_sched() might call the netvsc driver callback function + * that ends up scheduling NAPI work that accesses the ring buffer. + * At this point, we have to ensure that any such work is completed + * and that the channel ring buffer is no longer being accessed, cf. + * the calls to napi_disable() in netvsc_device_remove(). */ tasklet_disable(&channel->callback_event); diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index ca68aa1df801..41f5cf0bb997 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -636,9 +636,12 @@ void netvsc_device_remove(struct hv_device *device) RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); - /* And disassociate NAPI context from device */ - for (i = 0; i < net_device->num_chn; i++) + /* Disable NAPI and disassociate its context from the device. */ + for (i = 0; i < net_device->num_chn; i++) { + /* See also vmbus_reset_channel_cb(). */ + napi_disable(&net_device->chan_table[i].napi); netif_napi_del(&net_device->chan_table[i].napi); + } /* * At this point, no one should be accessing net_device -- cgit v1.2.3 From 238d2ed8f7d1b1ca0c13334bb8197a42654af946 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:08 +0200 Subject: hv_utils: Always execute the fcopy and vss callbacks in a tasklet The fcopy and vss callback functions could be running in a tasklet at the same time they are called in hv_poll_channel(). Current code serializes the invocations of these functions, and their accesses to the channel ring buffer, by sending an IPI to the CPU that is allowed to access the ring buffer, cf. hv_poll_channel(). This IPI mechanism becomes infeasible if we allow changing the CPU that a channel will interrupt. Instead modify the callback wrappers to always execute the fcopy and vss callbacks in a tasklet, thus mirroring the solution for the kvp callback functions adopted since commit a3ade8cc474d8 ("HV: properly delay KVP packets when negotiation is in progress"). This will ensure that the callback function can't run on two CPUs at the same time. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-6-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/hv_fcopy.c | 2 +- drivers/hv/hv_snapshot.c | 2 +- drivers/hv/hyperv_vmbus.h | 7 +------ 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index bb9ba3f7c794..5040d7e0cd9e 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -71,7 +71,7 @@ static void fcopy_poll_wrapper(void *channel) { /* Transaction is finished, reset the state here to avoid races. */ fcopy_transaction.state = HVUTIL_READY; - hv_fcopy_onchannelcallback(channel); + tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); } static void fcopy_timeout_func(struct work_struct *dummy) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 1c75b38f0d6d..783779e4cc1a 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -80,7 +80,7 @@ static void vss_poll_wrapper(void *channel) { /* Transaction is finished, reset the state here to avoid races. */ vss_transaction.state = HVUTIL_READY; - hv_vss_onchannelcallback(channel); + tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); } /* diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 3edf993b0fd9..5e5cebe5d048 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -378,12 +378,7 @@ static inline void hv_poll_channel(struct vmbus_channel *channel, { if (!channel) return; - - if (in_interrupt() && (channel->target_cpu == smp_processor_id())) { - cb(channel); - return; - } - smp_call_function_single(channel->target_cpu, cb, channel, true); + cb(channel); } enum hvutil_device_state { -- cgit v1.2.3 From 9403b66e6161130ebae7e55a97491c84c1ad6f9f Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:09 +0200 Subject: Drivers: hv: vmbus: Use a spin lock for synchronizing channel scheduling vs. channel removal Since vmbus_chan_sched() dereferences the ring buffer pointer, we have to make sure that the ring buffer data structures don't get freed while such dereferencing is happening. Current code does this by sending an IPI to the CPU that is allowed to access that ring buffer from interrupt level, cf., vmbus_reset_channel_cb(). But with the new functionality to allow changing the CPU that a channel will interrupt, we can't be sure what CPU will be running the vmbus_chan_sched() function for a particular channel, so the current IPI mechanism is infeasible. Instead synchronize vmbus_chan_sched() and vmbus_reset_channel_cb() by using the (newly introduced) per-channel spin lock "sched_lock". Move the test for onchannel_callback being NULL before the "switch" control statement in vmbus_chan_sched(), in order to not access the ring buffer if the vmbus_reset_channel_cb() has been completed on the channel. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-7-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel.c | 24 +++++++----------------- drivers/hv/channel_mgmt.c | 1 + drivers/hv/vmbus_drv.c | 30 +++++++++++++++++------------- include/linux/hyperv.h | 6 ++++++ 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 256ee90c7446..132e476f87b2 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -594,15 +594,10 @@ post_msg_err: } EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); -static void reset_channel_cb(void *arg) -{ - struct vmbus_channel *channel = arg; - - channel->onchannel_callback = NULL; -} - void vmbus_reset_channel_cb(struct vmbus_channel *channel) { + unsigned long flags; + /* * vmbus_on_event(), running in the per-channel tasklet, can race * with vmbus_close_internal() in the case of SMP guest, e.g., when @@ -618,17 +613,12 @@ void vmbus_reset_channel_cb(struct vmbus_channel *channel) */ tasklet_disable(&channel->callback_event); - channel->sc_creation_callback = NULL; + /* See the inline comments in vmbus_chan_sched(). */ + spin_lock_irqsave(&channel->sched_lock, flags); + channel->onchannel_callback = NULL; + spin_unlock_irqrestore(&channel->sched_lock, flags); - /* Stop the callback asap */ - if (channel->target_cpu != get_cpu()) { - put_cpu(); - smp_call_function_single(channel->target_cpu, reset_channel_cb, - channel, true); - } else { - reset_channel_cb(channel); - put_cpu(); - } + channel->sc_creation_callback = NULL; /* Re-enable tasklet for use on re-open */ tasklet_enable(&channel->callback_event); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index bbd0eee4362f..2e730f84654b 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -315,6 +315,7 @@ static struct vmbus_channel *alloc_channel(void) if (!channel) return NULL; + spin_lock_init(&channel->sched_lock); spin_lock_init(&channel->lock); init_completion(&channel->rescind_event); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 21a01bca7fcd..0f7dfa507a40 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1201,18 +1201,6 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) } #endif /* CONFIG_PM_SLEEP */ -/* - * Direct callback for channels using other deferred processing - */ -static void vmbus_channel_isr(struct vmbus_channel *channel) -{ - void (*callback_fn)(void *); - - callback_fn = READ_ONCE(channel->onchannel_callback); - if (likely(callback_fn != NULL)) - (*callback_fn)(channel->channel_callback_context); -} - /* * Schedule all channels with events pending */ @@ -1243,6 +1231,7 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) return; for_each_set_bit(relid, recv_int_page, maxbits) { + void (*callback_fn)(void *context); struct vmbus_channel *channel; if (!sync_test_and_clear_bit(relid, recv_int_page)) @@ -1268,13 +1257,26 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (channel->rescind) goto sched_unlock_rcu; + /* + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). + */ + spin_lock(&channel->sched_lock); + + callback_fn = channel->onchannel_callback; + if (unlikely(callback_fn == NULL)) + goto sched_unlock; + trace_vmbus_chan_sched(channel); ++channel->interrupts; switch (channel->callback_mode) { case HV_CALL_ISR: - vmbus_channel_isr(channel); + (*callback_fn)(channel->channel_callback_context); break; case HV_CALL_BATCHED: @@ -1284,6 +1286,8 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) tasklet_schedule(&channel->callback_event); } +sched_unlock: + spin_unlock(&channel->sched_lock); sched_unlock_rcu: rcu_read_unlock(); } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index f5b3f008c55a..62b2c11d0875 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -771,6 +771,12 @@ struct vmbus_channel { void (*onchannel_callback)(void *context); void *channel_callback_context; + /* + * Synchronize channel scheduling and channel removal; see the inline + * comments in vmbus_chan_sched() and vmbus_reset_channel_cb(). + */ + spinlock_t sched_lock; + /* * A channel can be marked for one of three modes of reading: * BATCHED - callback called from taslket and should read -- cgit v1.2.3 From 240ad77cb50d9f0a961fcb0f21e67939cf7a9c04 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:10 +0200 Subject: PCI: hv: Prepare hv_compose_msi_msg() for the VMBus-channel-interrupt-to-vCPU reassignment functionality The current implementation of hv_compose_msi_msg() is incompatible with the new functionality that allows changing the vCPU a VMBus channel will interrupt: if this function always calls hv_pci_onchannelcallback() in the polling loop, the interrupt going to a different CPU could cause hv_pci_onchannelcallback() to be running simultaneously in a tasklet, which will break. The current code also has a problem in that it is not synchronized with vmbus_reset_channel_cb(): hv_compose_msi_msg() could be accessing the ring buffer via the call of hv_pci_onchannelcallback() well after the time that vmbus_reset_channel_cb() has finished. Fix these issues as follows. Disable the channel tasklet before entering the polling loop in hv_compose_msi_msg() and re-enable it when done. This will prevent hv_pci_onchannelcallback() from running in a tasklet on a different CPU. Moreover, poll by always calling hv_pci_onchannelcallback(), but check the channel callback function for NULL and invoke the callback within a sched_lock critical section. This will prevent hv_compose_msi_msg() from accessing the ring buffer after vmbus_reset_channel_cb() has acquired the sched_lock spinlock. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Cc: Lorenzo Pieralisi Cc: Andrew Murray Cc: Bjorn Helgaas Cc: Link: https://lore.kernel.org/r/20200406001514.19876-8-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 44 +++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e15022ff63e3..222ff5639ebe 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1356,11 +1356,11 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { struct irq_cfg *cfg = irqd_cfg(data); struct hv_pcibus_device *hbus; + struct vmbus_channel *channel; struct hv_pci_dev *hpdev; struct pci_bus *pbus; struct pci_dev *pdev; struct cpumask *dest; - unsigned long flags; struct compose_comp_ctxt comp; struct tran_int_desc *int_desc; struct { @@ -1378,6 +1378,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); + channel = hbus->hdev->channel; hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); if (!hpdev) goto return_null_message; @@ -1435,43 +1436,52 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) goto free_int_desc; } + /* + * Prevents hv_pci_onchannelcallback() from running concurrently + * in the tasklet. + */ + tasklet_disable(&channel->callback_event); + /* * Since this function is called with IRQ locks held, can't * do normal wait for completion; instead poll. */ while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { + unsigned long flags; + /* 0xFFFF means an invalid PCI VENDOR ID. */ if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { dev_err_once(&hbus->hdev->device, "the device has gone\n"); - goto free_int_desc; + goto enable_tasklet; } /* - * When the higher level interrupt code calls us with - * interrupt disabled, we must poll the channel by calling - * the channel callback directly when channel->target_cpu is - * the current CPU. When the higher level interrupt code - * calls us with interrupt enabled, let's add the - * local_irq_save()/restore() to avoid race: - * hv_pci_onchannelcallback() can also run in tasklet. + * Make sure that the ring buffer data structure doesn't get + * freed while we dereference the ring buffer pointer. Test + * for the channel's onchannel_callback being NULL within a + * sched_lock critical section. See also the inline comments + * in vmbus_reset_channel_cb(). */ - local_irq_save(flags); - - if (hbus->hdev->channel->target_cpu == smp_processor_id()) - hv_pci_onchannelcallback(hbus); - - local_irq_restore(flags); + spin_lock_irqsave(&channel->sched_lock, flags); + if (unlikely(channel->onchannel_callback == NULL)) { + spin_unlock_irqrestore(&channel->sched_lock, flags); + goto enable_tasklet; + } + hv_pci_onchannelcallback(hbus); + spin_unlock_irqrestore(&channel->sched_lock, flags); if (hpdev->state == hv_pcichild_ejecting) { dev_err_once(&hbus->hdev->device, "the device is being ejected\n"); - goto free_int_desc; + goto enable_tasklet; } udelay(100); } + tasklet_enable(&channel->callback_event); + if (comp.comp_pkt.completion_status < 0) { dev_err(&hbus->hdev->device, "Request for interrupt failed: 0x%x", @@ -1495,6 +1505,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) put_pcichild(hpdev); return; +enable_tasklet: + tasklet_enable(&channel->callback_event); free_int_desc: kfree(int_desc); drop_reference: -- cgit v1.2.3 From 8ef4c4abbbcdcd9d4bc0fd9454df03e6dac24b73 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:11 +0200 Subject: Drivers: hv: vmbus: Remove the unused HV_LOCALIZED channel affinity logic The logic is unused since commit 509879bdb30b8 ("Drivers: hv: Introduce a policy for controlling channel affinity"). This logic assumes that a channel target_cpu doesn't change during the lifetime of a channel, but this assumption is incompatible with the new functionality that allows changing the vCPU a channel will interrupt. Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-9-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 105 +++++++++++----------------------------------- include/linux/hyperv.h | 27 ------------ 2 files changed, 25 insertions(+), 107 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 2e730f84654b..a19dc28fdf77 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -433,14 +433,6 @@ void hv_process_channel_removal(struct vmbus_channel *channel) spin_unlock_irqrestore(&primary_channel->lock, flags); } - /* - * We need to free the bit for init_vp_index() to work in the case - * of sub-channel, when we reload drivers like hv_netvsc. - */ - if (channel->affinity_policy == HV_LOCALIZED) - cpumask_clear_cpu(channel->target_cpu, - &primary_channel->alloced_cpus_in_node); - /* * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and * the relid is invalidated; after hibernation, when the user-space app @@ -662,20 +654,21 @@ static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); /* * Starting with Win8, we can statically distribute the incoming * channel interrupt load by binding a channel to VCPU. - * We distribute the interrupt loads to one or more NUMA nodes based on - * the channel's affinity_policy. * * For pre-win8 hosts or non-performance critical channels we assign the * first CPU in the first NUMA node. + * + * Starting with win8, performance critical channels will be distributed + * evenly among all the available NUMA nodes. Once the node is assigned, + * we will assign the CPU based on a simple round robin scheme. */ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) { - u32 cur_cpu; bool perf_chn = vmbus_devs[dev_type].perf_device; - struct vmbus_channel *primary = channel->primary_channel; - int next_node; cpumask_var_t available_mask; struct cpumask *alloced_mask; + u32 target_cpu; + int numa_node; if ((vmbus_proto_version == VERSION_WS2008) || (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) || @@ -693,31 +686,27 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) return; } - spin_lock(&bind_channel_to_cpu_lock); - /* - * Based on the channel affinity policy, we will assign the NUMA - * nodes. + * Serializes the accesses to the global variable next_numa_node_id. + * See also the header comment of the spin lock declaration. */ + spin_lock(&bind_channel_to_cpu_lock); - if ((channel->affinity_policy == HV_BALANCED) || (!primary)) { - while (true) { - next_node = next_numa_node_id++; - if (next_node == nr_node_ids) { - next_node = next_numa_node_id = 0; - continue; - } - if (cpumask_empty(cpumask_of_node(next_node))) - continue; - break; + while (true) { + numa_node = next_numa_node_id++; + if (numa_node == nr_node_ids) { + next_numa_node_id = 0; + continue; } - channel->numa_node = next_node; - primary = channel; + if (cpumask_empty(cpumask_of_node(numa_node))) + continue; + break; } - alloced_mask = &hv_context.hv_numa_map[primary->numa_node]; + channel->numa_node = numa_node; + alloced_mask = &hv_context.hv_numa_map[numa_node]; if (cpumask_weight(alloced_mask) == - cpumask_weight(cpumask_of_node(primary->numa_node))) { + cpumask_weight(cpumask_of_node(numa_node))) { /* * We have cycled through all the CPUs in the node; * reset the alloced map. @@ -725,57 +714,13 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) cpumask_clear(alloced_mask); } - cpumask_xor(available_mask, alloced_mask, - cpumask_of_node(primary->numa_node)); + cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node)); - cur_cpu = -1; - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * Normally Hyper-V host doesn't create more subchannels - * than there are VCPUs on the node but it is possible when not - * all present VCPUs on the node are initialized by guest. - * Clear the alloced_cpus_in_node to start over. - */ - if (cpumask_equal(&primary->alloced_cpus_in_node, - cpumask_of_node(primary->numa_node))) - cpumask_clear(&primary->alloced_cpus_in_node); - } - - while (true) { - cur_cpu = cpumask_next(cur_cpu, available_mask); - if (cur_cpu >= nr_cpu_ids) { - cur_cpu = -1; - cpumask_copy(available_mask, - cpumask_of_node(primary->numa_node)); - continue; - } - - if (primary->affinity_policy == HV_LOCALIZED) { - /* - * NOTE: in the case of sub-channel, we clear the - * sub-channel related bit(s) in - * primary->alloced_cpus_in_node in - * hv_process_channel_removal(), so when we - * reload drivers like hv_netvsc in SMP guest, here - * we're able to re-allocate - * bit from primary->alloced_cpus_in_node. - */ - if (!cpumask_test_cpu(cur_cpu, - &primary->alloced_cpus_in_node)) { - cpumask_set_cpu(cur_cpu, - &primary->alloced_cpus_in_node); - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } else { - cpumask_set_cpu(cur_cpu, alloced_mask); - break; - } - } + target_cpu = cpumask_first(available_mask); + cpumask_set_cpu(target_cpu, alloced_mask); - channel->target_cpu = cur_cpu; - channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); spin_unlock(&bind_channel_to_cpu_lock); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 62b2c11d0875..247356dbd742 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -689,11 +689,6 @@ union hv_connection_id { } u; }; -enum hv_numa_policy { - HV_BALANCED = 0, - HV_LOCALIZED, -}; - enum vmbus_device_type { HV_IDE = 0, HV_SCSI, @@ -808,10 +803,6 @@ struct vmbus_channel { u32 target_vp; /* The corresponding CPUID in the guest */ u32 target_cpu; - /* - * State to manage the CPU affiliation of channels. - */ - struct cpumask alloced_cpus_in_node; int numa_node; /* * Support for sub-channels. For high performance devices, @@ -898,18 +889,6 @@ struct vmbus_channel { */ bool low_latency; - /* - * NUMA distribution policy: - * We support two policies: - * 1) Balanced: Here all performance critical channels are - * distributed evenly amongst all the NUMA nodes. - * This policy will be the default policy. - * 2) Localized: All channels of a given instance of a - * performance critical service will be assigned CPUs - * within a selected NUMA node. - */ - enum hv_numa_policy affinity_policy; - bool probe_done; /* @@ -965,12 +944,6 @@ static inline bool is_sub_channel(const struct vmbus_channel *c) return c->offermsg.offer.sub_channel_index != 0; } -static inline void set_channel_affinity_state(struct vmbus_channel *c, - enum hv_numa_policy policy) -{ - c->affinity_policy = policy; -} - static inline void set_channel_read_mode(struct vmbus_channel *c, enum hv_callback_mode mode) { -- cgit v1.2.3 From d570aec0f2154e1bfba14ffd0df164a185e363b5 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:12 +0200 Subject: Drivers: hv: vmbus: Synchronize init_vp_index() vs. CPU hotplug init_vp_index() may access the cpu_online_mask mask via its calls of cpumask_of_node(). Make sure to protect these accesses with a cpus_read_lock() critical section. Also, remove some (hardcoded) instances of CPU(0) from init_vp_index() and replace them with VMBUS_CONNECT_CPU. The connect CPU can not go offline, since Hyper-V does not provide a way to change it. Finally, order the accesses of target_cpu from init_vp_index() and hv_synic_cleanup() by relying on the channel_mutex; this is achieved by moving the call of init_vp_index() into vmbus_process_offer(). Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-10-parri.andrea@gmail.com Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 47 ++++++++++++++++++++++++++++++++++------------- drivers/hv/hv.c | 7 ++++--- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index a19dc28fdf77..2db3823f0e59 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -466,13 +467,8 @@ static void vmbus_add_channel_work(struct work_struct *work) container_of(work, struct vmbus_channel, add_channel_work); struct vmbus_channel *primary_channel = newchannel->primary_channel; unsigned long flags; - u16 dev_type; int ret; - dev_type = hv_get_dev_type(newchannel); - - init_vp_index(newchannel, dev_type); - /* * This state is used to indicate a successful open * so that when we do close the channel normally, we @@ -504,7 +500,7 @@ static void vmbus_add_channel_work(struct work_struct *work) if (!newchannel->device_obj) goto err_deq_chan; - newchannel->device_obj->device_id = dev_type; + newchannel->device_obj->device_id = hv_get_dev_type(newchannel); /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() @@ -560,6 +556,25 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) unsigned long flags; bool fnew = true; + /* + * Initialize the target_CPU before inserting the channel in + * the chn_list and sc_list lists, within the channel_mutex + * critical section: + * + * CPU1 CPU2 + * + * [vmbus_process_offer()] [hv_syninc_cleanup()] + * + * STORE target_cpu LOCK channel_mutex + * LOCK channel_mutex SEARCH chn_list + * INSERT chn_list LOAD target_cpu + * UNLOCK channel_mutex UNLOCK channel_mutex + * + * Forbids: CPU2's SEARCH from seeing CPU1's INSERT && + * CPU2's LOAD from *not* seing CPU1's STORE + */ + init_vp_index(newchannel, hv_get_dev_type(newchannel)); + mutex_lock(&vmbus_connection.channel_mutex); /* Remember the channels that should be cleaned up upon suspend. */ @@ -656,7 +671,7 @@ static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); * channel interrupt load by binding a channel to VCPU. * * For pre-win8 hosts or non-performance critical channels we assign the - * first CPU in the first NUMA node. + * VMBUS_CONNECT_CPU. * * Starting with win8, performance critical channels will be distributed * evenly among all the available NUMA nodes. Once the node is assigned, @@ -675,17 +690,22 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) !alloc_cpumask_var(&available_mask, GFP_KERNEL)) { /* * Prior to win8, all channel interrupts are - * delivered on cpu 0. + * delivered on VMBUS_CONNECT_CPU. * Also if the channel is not a performance critical - * channel, bind it to cpu 0. - * In case alloc_cpumask_var() fails, bind it to cpu 0. + * channel, bind it to VMBUS_CONNECT_CPU. + * In case alloc_cpumask_var() fails, bind it to + * VMBUS_CONNECT_CPU. */ - channel->numa_node = 0; - channel->target_cpu = 0; - channel->target_vp = hv_cpu_number_to_vp_number(0); + channel->numa_node = cpu_to_node(VMBUS_CONNECT_CPU); + channel->target_cpu = VMBUS_CONNECT_CPU; + channel->target_vp = + hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); return; } + /* No CPUs can come up or down during this. */ + cpus_read_lock(); + /* * Serializes the accesses to the global variable next_numa_node_id. * See also the header comment of the spin lock declaration. @@ -723,6 +743,7 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); spin_unlock(&bind_channel_to_cpu_lock); + cpus_read_unlock(); free_cpumask_var(available_mask); } diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 17bf1f229152..188b42b07f07 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -256,9 +256,10 @@ int hv_synic_cleanup(unsigned int cpu) /* * Search for channels which are bound to the CPU we're about to - * cleanup. In case we find one and vmbus is still connected we need to - * fail, this will effectively prevent CPU offlining. There is no way - * we can re-bind channels to different CPUs for now. + * cleanup. In case we find one and vmbus is still connected, we + * fail; this will effectively prevent CPU offlining. + * + * TODO: Re-bind the channels to different CPUs. */ mutex_lock(&vmbus_connection.channel_mutex); list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { -- cgit v1.2.3 From 7527810573436f00e582d3d5ef2eb3c027c98d7d Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:13 +0200 Subject: Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type VMBus version 4.1 and later support the CHANNELMSG_MODIFYCHANNEL(22) message type which can be used to request Hyper-V to change the vCPU that a channel will interrupt. Introduce the CHANNELMSG_MODIFYCHANNEL message type, and define the vmbus_send_modifychannel() function to send CHANNELMSG_MODIFYCHANNEL requests to the host via a hypercall. The function is then used to define a sysfs "store" operation, which allows to change the (v)CPU the channel will interrupt by using the sysfs interface. The feature can be used for load balancing or other purposes. One interesting catch here is that Hyper-V can *not* currently ACK CHANNELMSG_MODIFYCHANNEL messages with the promise that (after the ACK is sent) the channel won't send any more interrupts to the "old" CPU. The peculiarity of the CHANNELMSG_MODIFYCHANNEL messages is problematic if the user want to take a CPU offline, since we don't want to take a CPU offline (and, potentially, "lose" channel interrupts on such CPU) if the host is still processing a CHANNELMSG_MODIFYCHANNEL message associated to that CPU. It is worth mentioning, however, that we have been unable to observe the above mentioned "race": in all our tests, CHANNELMSG_MODIFYCHANNEL requests appeared *as if* they were processed synchronously by the host. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Link: https://lore.kernel.org/r/20200406001514.19876-11-parri.andrea@gmail.com Reviewed-by: Michael Kelley [ wei: fix conflict in channel_mgmt.c ] Signed-off-by: Wei Liu --- drivers/hv/channel.c | 28 ++++++++++++ drivers/hv/channel_mgmt.c | 2 +- drivers/hv/hv_trace.h | 19 ++++++++ drivers/hv/vmbus_drv.c | 108 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/hyperv.h | 10 ++++- 5 files changed, 163 insertions(+), 4 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 132e476f87b2..90070b337c10 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -289,6 +289,34 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, } EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); +/* + * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. + * + * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not + * ACK such messages. IOW we can't know when the host will stop interrupting + * the "old" vCPU and start interrupting the "new" vCPU for the given channel. + * + * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version + * VERSION_WIN10_V4_1. + */ +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) +{ + struct vmbus_channel_modifychannel conn_msg; + int ret; + + memset(&conn_msg, 0, sizeof(conn_msg)); + conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; + conn_msg.child_relid = child_relid; + conn_msg.target_vp = target_vp; + + ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); + + trace_vmbus_send_modifychannel(&conn_msg, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); + /* * create_gpadl_header - Creates a gpadl for the specified buffer */ diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 2db3823f0e59..ffd7fffa5f83 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -1383,7 +1383,7 @@ channel_message_table[CHANNELMSG_COUNT] = { { CHANNELMSG_19, 0, NULL, 0}, { CHANNELMSG_20, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, - { CHANNELMSG_22, 0, NULL, 0}, + { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, }; diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index e70783e33680..a43bc76c2d5d 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -296,6 +296,25 @@ TRACE_EVENT(vmbus_send_tl_connect_request, ) ); +TRACE_EVENT(vmbus_send_modifychannel, + TP_PROTO(const struct vmbus_channel_modifychannel *msg, + int ret), + TP_ARGS(msg, ret), + TP_STRUCT__entry( + __field(u32, child_relid) + __field(u32, target_vp) + __field(int, ret) + ), + TP_fast_assign( + __entry->child_relid = msg->child_relid; + __entry->target_vp = msg->target_vp; + __entry->ret = ret; + ), + TP_printk("binding child_relid 0x%x to target_vp 0x%x, ret %d", + __entry->child_relid, __entry->target_vp, __entry->ret + ) + ); + DECLARE_EVENT_CLASS(vmbus_channel, TP_PROTO(const struct vmbus_channel *channel), TP_ARGS(channel), diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 0f7dfa507a40..5d24b25fb5aa 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1606,8 +1606,24 @@ static ssize_t vmbus_chan_attr_show(struct kobject *kobj, return attribute->show(chan, buf); } +static ssize_t vmbus_chan_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, + size_t count) +{ + const struct vmbus_chan_attribute *attribute + = container_of(attr, struct vmbus_chan_attribute, attr); + struct vmbus_channel *chan + = container_of(kobj, struct vmbus_channel, kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(chan, buf, count); +} + static const struct sysfs_ops vmbus_chan_sysfs_ops = { .show = vmbus_chan_attr_show, + .store = vmbus_chan_attr_store, }; static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf) @@ -1678,11 +1694,99 @@ static ssize_t write_avail_show(struct vmbus_channel *channel, char *buf) } static VMBUS_CHAN_ATTR_RO(write_avail); -static ssize_t show_target_cpu(struct vmbus_channel *channel, char *buf) +static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) { return sprintf(buf, "%u\n", channel->target_cpu); } -static VMBUS_CHAN_ATTR(cpu, S_IRUGO, show_target_cpu, NULL); +static ssize_t target_cpu_store(struct vmbus_channel *channel, + const char *buf, size_t count) +{ + ssize_t ret = count; + u32 target_cpu; + + if (vmbus_proto_version < VERSION_WIN10_V4_1) + return -EIO; + + if (sscanf(buf, "%uu", &target_cpu) != 1) + return -EIO; + + /* Validate target_cpu for the cpumask_test_cpu() operation below. */ + if (target_cpu >= nr_cpumask_bits) + return -EINVAL; + + /* No CPUs should come up or down during this. */ + cpus_read_lock(); + + if (!cpumask_test_cpu(target_cpu, cpu_online_mask)) { + cpus_read_unlock(); + return -EINVAL; + } + + /* + * Synchronizes target_cpu_store() and channel closure: + * + * { Initially: state = CHANNEL_OPENED } + * + * CPU1 CPU2 + * + * [target_cpu_store()] [vmbus_disconnect_ring()] + * + * LOCK channel_mutex LOCK channel_mutex + * LOAD r1 = state LOAD r2 = state + * IF (r1 == CHANNEL_OPENED) IF (r2 == CHANNEL_OPENED) + * SEND MODIFYCHANNEL STORE state = CHANNEL_OPEN + * [...] SEND CLOSECHANNEL + * UNLOCK channel_mutex UNLOCK channel_mutex + * + * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes + * CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND + * + * Note. The host processes the channel messages "sequentially", in + * the order in which they are received on a per-partition basis. + */ + mutex_lock(&vmbus_connection.channel_mutex); + + /* + * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; + * avoid sending the message and fail here for such channels. + */ + if (channel->state != CHANNEL_OPENED_STATE) { + ret = -EIO; + goto cpu_store_unlock; + } + + if (channel->target_cpu == target_cpu) + goto cpu_store_unlock; + + if (vmbus_send_modifychannel(channel->offermsg.child_relid, + hv_cpu_number_to_vp_number(target_cpu))) { + ret = -EIO; + goto cpu_store_unlock; + } + + /* + * Warning. At this point, there is *no* guarantee that the host will + * have successfully processed the vmbus_send_modifychannel() request. + * See the header comment of vmbus_send_modifychannel() for more info. + * + * Lags in the processing of the above vmbus_send_modifychannel() can + * result in missed interrupts if the "old" target CPU is taken offline + * before Hyper-V starts sending interrupts to the "new" target CPU. + * But apart from this offlining scenario, the code tolerates such + * lags. It will function correctly even if a channel interrupt comes + * in on a CPU that is different from the channel target_cpu value. + */ + + channel->target_cpu = target_cpu; + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); + channel->numa_node = cpu_to_node(target_cpu); + +cpu_store_unlock: + mutex_unlock(&vmbus_connection.channel_mutex); + cpus_read_unlock(); + return ret; +} +static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); static ssize_t channel_pending_show(struct vmbus_channel *channel, char *buf) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 247356dbd742..b85d7580f2c1 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -425,7 +425,7 @@ enum vmbus_channel_message_type { CHANNELMSG_19 = 19, CHANNELMSG_20 = 20, CHANNELMSG_TL_CONNECT_REQUEST = 21, - CHANNELMSG_22 = 22, + CHANNELMSG_MODIFYCHANNEL = 22, CHANNELMSG_TL_CONNECT_RESULT = 23, CHANNELMSG_COUNT }; @@ -620,6 +620,13 @@ struct vmbus_channel_tl_connect_request { guid_t host_service_id; } __packed; +/* Modify Channel parameters, cf. vmbus_send_modifychannel() */ +struct vmbus_channel_modifychannel { + struct vmbus_channel_message_header header; + u32 child_relid; + u32 target_vp; +} __packed; + struct vmbus_channel_version_response { struct vmbus_channel_message_header header; u8 version_supported; @@ -1505,6 +1512,7 @@ extern __u32 vmbus_proto_version; int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, const guid_t *shv_host_servie_id); +int vmbus_send_modifychannel(u32 child_relid, u32 target_vp); void vmbus_set_event(struct vmbus_channel *channel); /* Get the start of the ring buffer. */ -- cgit v1.2.3 From 7769e18c201aa88eade5556faf9da7f2bc15bb8a Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Mon, 6 Apr 2020 02:15:14 +0200 Subject: scsi: storvsc: Re-init stor_chns when a channel interrupt is re-assigned For each storvsc_device, storvsc keeps track of the channel target CPUs associated to the device (alloced_cpus) and it uses this information to fill a "cache" (stor_chns) mapping CPU->channel according to a certain heuristic. Update the alloced_cpus mask and the stor_chns array when a channel of the storvsc device is re-assigned to a different CPU. Signed-off-by: Andrea Parri (Microsoft) Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: Link: https://lore.kernel.org/r/20200406001514.19876-12-parri.andrea@gmail.com Reviewed-by; Long Li Reviewed-by: Michael Kelley [ wei: fix a small issue reported by kbuild test robot ] Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 4 ++ drivers/scsi/storvsc_drv.c | 96 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/hyperv.h | 3 ++ 3 files changed, 95 insertions(+), 8 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 5d24b25fb5aa..28c009c7a9cd 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1777,6 +1777,10 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * in on a CPU that is different from the channel target_cpu value. */ + if (channel->change_target_cpu_callback) + (*channel->change_target_cpu_callback)(channel, + channel->target_cpu, target_cpu); + channel->target_cpu = target_cpu; channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); channel->numa_node = cpu_to_node(target_cpu); diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index fb41636519ee..072ed8728657 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -621,6 +621,64 @@ get_in_err: } +static void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old, + u32 new) +{ + struct storvsc_device *stor_device; + struct vmbus_channel *cur_chn; + bool old_is_alloced = false; + struct hv_device *device; + unsigned long flags; + int cpu; + + device = channel->primary_channel ? + channel->primary_channel->device_obj + : channel->device_obj; + stor_device = get_out_stor_device(device); + if (!stor_device) + return; + + /* See storvsc_do_io() -> get_og_chn(). */ + spin_lock_irqsave(&device->channel->lock, flags); + + /* + * Determines if the storvsc device has other channels assigned to + * the "old" CPU to update the alloced_cpus mask and the stor_chns + * array. + */ + if (device->channel != channel && device->channel->target_cpu == old) { + cur_chn = device->channel; + old_is_alloced = true; + goto old_is_alloced; + } + list_for_each_entry(cur_chn, &device->channel->sc_list, sc_list) { + if (cur_chn == channel) + continue; + if (cur_chn->target_cpu == old) { + old_is_alloced = true; + goto old_is_alloced; + } + } + +old_is_alloced: + if (old_is_alloced) + WRITE_ONCE(stor_device->stor_chns[old], cur_chn); + else + cpumask_clear_cpu(old, &stor_device->alloced_cpus); + + /* "Flush" the stor_chns array. */ + for_each_possible_cpu(cpu) { + if (stor_device->stor_chns[cpu] && !cpumask_test_cpu( + cpu, &stor_device->alloced_cpus)) + WRITE_ONCE(stor_device->stor_chns[cpu], NULL); + } + + WRITE_ONCE(stor_device->stor_chns[new], channel); + cpumask_set_cpu(new, &stor_device->alloced_cpus); + + spin_unlock_irqrestore(&device->channel->lock, flags); +} + static void handle_sc_creation(struct vmbus_channel *new_sc) { struct hv_device *device = new_sc->primary_channel->device_obj; @@ -648,6 +706,8 @@ static void handle_sc_creation(struct vmbus_channel *new_sc) return; } + new_sc->change_target_cpu_callback = storvsc_change_target_cpu; + /* Add the sub-channel to the array of available channels. */ stor_device->stor_chns[new_sc->target_cpu] = new_sc; cpumask_set_cpu(new_sc->target_cpu, &stor_device->alloced_cpus); @@ -876,6 +936,8 @@ static int storvsc_channel_init(struct hv_device *device, bool is_fc) if (stor_device->stor_chns == NULL) return -ENOMEM; + device->channel->change_target_cpu_callback = storvsc_change_target_cpu; + stor_device->stor_chns[device->channel->target_cpu] = device->channel; cpumask_set_cpu(device->channel->target_cpu, &stor_device->alloced_cpus); @@ -1248,8 +1310,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, const struct cpumask *node_mask; int num_channels, tgt_cpu; - if (stor_device->num_sc == 0) + if (stor_device->num_sc == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } /* * Our channel array is sparsley populated and we @@ -1258,7 +1322,6 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, * The strategy is simple: * I. Ensure NUMA locality * II. Distribute evenly (best effort) - * III. Mapping is persistent. */ node_mask = cpumask_of_node(cpu_to_node(q_num)); @@ -1268,8 +1331,10 @@ static struct vmbus_channel *get_og_chn(struct storvsc_device *stor_device, if (cpumask_test_cpu(tgt_cpu, node_mask)) num_channels++; } - if (num_channels == 0) + if (num_channels == 0) { + stor_device->stor_chns[q_num] = stor_device->device->channel; return stor_device->device->channel; + } hash_qnum = q_num; while (hash_qnum >= num_channels) @@ -1295,6 +1360,7 @@ static int storvsc_do_io(struct hv_device *device, struct storvsc_device *stor_device; struct vstor_packet *vstor_packet; struct vmbus_channel *outgoing_channel, *channel; + unsigned long flags; int ret = 0; const struct cpumask *node_mask; int tgt_cpu; @@ -1308,10 +1374,11 @@ static int storvsc_do_io(struct hv_device *device, request->device = device; /* - * Select an an appropriate channel to send the request out. + * Select an appropriate channel to send the request out. */ - if (stor_device->stor_chns[q_num] != NULL) { - outgoing_channel = stor_device->stor_chns[q_num]; + /* See storvsc_change_target_cpu(). */ + outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); + if (outgoing_channel != NULL) { if (outgoing_channel->target_cpu == q_num) { /* * Ideally, we want to pick a different channel if @@ -1324,7 +1391,10 @@ static int storvsc_do_io(struct hv_device *device, continue; if (tgt_cpu == q_num) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1350,7 +1420,10 @@ static int storvsc_do_io(struct hv_device *device, for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { if (cpumask_test_cpu(tgt_cpu, node_mask)) continue; - channel = stor_device->stor_chns[tgt_cpu]; + channel = READ_ONCE( + stor_device->stor_chns[tgt_cpu]); + if (channel == NULL) + continue; if (hv_get_avail_to_write_percent( &channel->outbound) > ring_avail_percent_lowater) { @@ -1360,7 +1433,14 @@ static int storvsc_do_io(struct hv_device *device, } } } else { + spin_lock_irqsave(&device->channel->lock, flags); + outgoing_channel = stor_device->stor_chns[q_num]; + if (outgoing_channel != NULL) { + spin_unlock_irqrestore(&device->channel->lock, flags); + goto found_channel; + } outgoing_channel = get_og_chn(stor_device, q_num); + spin_unlock_irqrestore(&device->channel->lock, flags); } found_channel: diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b85d7580f2c1..cd64ab7bb3b7 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -773,6 +773,9 @@ struct vmbus_channel { void (*onchannel_callback)(void *context); void *channel_callback_context; + void (*change_target_cpu_callback)(struct vmbus_channel *channel, + u32 old, u32 new); + /* * Synchronize channel scheduling and channel removal; see the inline * comments in vmbus_chan_sched() and vmbus_reset_channel_cb(). -- cgit v1.2.3 From 677b0ce5d66c5445440f3cedd3c20ff1665c0f46 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 14 Apr 2020 16:23:43 +0100 Subject: drivers: hv: remove redundant assignment to pointer primary_channel The pointer primary_channel is being assigned with a value that is never used. The assignment is redundant and can be removed. Move the definition of primary_channel to a narrower scope. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20200414152343.243166-1-colin.king@canonical.com [ wei: move primary_channel and update commit message ] Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index ffd7fffa5f83..fde806d6525b 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -400,7 +400,6 @@ static void vmbus_release_relid(u32 relid) void hv_process_channel_removal(struct vmbus_channel *channel) { - struct vmbus_channel *primary_channel; unsigned long flags; lockdep_assert_held(&vmbus_connection.channel_mutex); @@ -425,10 +424,8 @@ void hv_process_channel_removal(struct vmbus_channel *channel) if (channel->primary_channel == NULL) { list_del(&channel->listentry); - - primary_channel = channel; } else { - primary_channel = channel->primary_channel; + struct vmbus_channel *primary_channel = channel->primary_channel; spin_lock_irqsave(&primary_channel->lock, flags); list_del(&channel->sc_list); spin_unlock_irqrestore(&primary_channel->lock, flags); -- cgit v1.2.3 From 7357b1df744c2a3bcbe00cea0eef1509d004f488 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 22 Apr 2020 12:57:34 -0700 Subject: KVM: x86: hyperv: Remove duplicate definitions of Reference TSC Page The Hyper-V Reference TSC Page structure is defined twice. struct ms_hyperv_tsc_page has padding out to a full 4 Kbyte page size. But the padding is not needed because the declaration includes a union with HV_HYP_PAGE_SIZE. KVM uses the second definition, which is struct _HV_REFERENCE_TSC_PAGE, because it does not have the padding. Fix the duplication by removing the padding from ms_hyperv_tsc_page. Fix up the KVM code to use it. Remove the no longer used struct _HV_REFERENCE_TSC_PAGE. There is no functional change. Signed-off-by: Michael Kelley Acked-by: Paolo Bonzini Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20200422195737.10223-2-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/include/asm/hyperv-tlfs.h | 8 -------- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/hyperv.c | 4 ++-- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 29336574d0bc..0e4d76920957 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -303,7 +303,6 @@ struct ms_hyperv_tsc_page { u32 reserved1; volatile u64 tsc_scale; volatile s64 tsc_offset; - u64 reserved2[509]; } __packed; /* @@ -433,13 +432,6 @@ enum HV_GENERIC_SET_FORMAT { */ #define HV_CLOCK_HZ (NSEC_PER_SEC/100) -typedef struct _HV_REFERENCE_TSC_PAGE { - __u32 tsc_sequence; - __u32 res1; - __u64 tsc_scale; - __s64 tsc_offset; -} __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; - /* Define the number of synthetic interrupt sources. */ #define HV_SYNIC_SINT_COUNT (16) /* Define the expected SynIC version. */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 42a2d0d3984a..4698343b9a05 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -865,7 +865,7 @@ struct kvm_hv { u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; - HV_REFERENCE_TSC_PAGE tsc_ref; + struct ms_hyperv_tsc_page tsc_ref; struct idr conn_to_evt; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index bcefa9d4e57e..1f3c6fd3cdaa 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -900,7 +900,7 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, - HV_REFERENCE_TSC_PAGE *tsc_ref) + struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; @@ -941,7 +941,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); - BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); + BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; -- cgit v1.2.3 From a8a42d0284f18a64d540a604aecec9f961964e3b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 22 Apr 2020 12:57:35 -0700 Subject: x86/hyperv: Remove HV_PROCESSOR_POWER_STATE #defines The HV_PROCESSOR_POWER_STATE_C #defines date back to year 2010, but they are not in the TLFS v6.0 document and are not used anywhere in Linux. Remove them. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20200422195737.10223-3-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/include/asm/hyperv-tlfs.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 0e4d76920957..2dd1ceb2bcf8 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -390,11 +390,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 -#define HV_PROCESSOR_POWER_STATE_C0 0 -#define HV_PROCESSOR_POWER_STATE_C1 1 -#define HV_PROCESSOR_POWER_STATE_C2 2 -#define HV_PROCESSOR_POWER_STATE_C3 3 - #define HV_FLUSH_ALL_PROCESSORS BIT(0) #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -- cgit v1.2.3 From c55a844f46f958b4df804eccd734b86795e40359 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 22 Apr 2020 12:57:36 -0700 Subject: x86/hyperv: Split hyperv-tlfs.h into arch dependent and independent files In preparation for adding ARM64 support, split hyperv-tlfs.h into architecture dependent and architecture independent files, similar to what has been done with mshyperv.h. Move architecture independent definitions into include/asm-generic/hyperv-tlfs.h. The split will avoid duplicating significant lines of code in the ARM64 version of hyperv-tlfs.h. The split has no functional impact. Some of the common definitions have "X64" in the symbol name. Change these to remove the "X64" in the architecture independent version of hyperv-tlfs.h, but add aliases with the "X64" in the x86 version so that x86 code will continue to compile. A later patch set will change all the references and allow removal of the aliases. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20200422195737.10223-4-mikelley@microsoft.com Signed-off-by: Wei Liu --- MAINTAINERS | 1 + arch/x86/include/asm/hyperv-tlfs.h | 459 +++---------------------------------- include/asm-generic/hyperv-tlfs.h | 442 +++++++++++++++++++++++++++++++++++ 3 files changed, 479 insertions(+), 423 deletions(-) create mode 100644 include/asm-generic/hyperv-tlfs.h diff --git a/MAINTAINERS b/MAINTAINERS index b816a453b10e..bfb9f384b3f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7883,6 +7883,7 @@ F: drivers/pci/controller/pci-hyperv.c F: drivers/scsi/storvsc_drv.c F: drivers/uio/uio_hv_generic.c F: drivers/video/fbdev/hyperv_fb.c +F: include/asm-generic/hyperv-tlfs.h F: include/asm-generic/mshyperv.h F: include/clocksource/hyperv_timer.h F: include/linux/hyperv.h diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2dd1ceb2bcf8..4e91f6118d5d 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -11,17 +11,6 @@ #include #include - -/* - * While not explicitly listed in the TLFS, Hyper-V always runs with a page size - * of 4096. These definitions are used when communicating with Hyper-V using - * guest physical pages and guest physical page addresses, since the guest page - * size may not be 4096 on all architectures. - */ -#define HV_HYP_PAGE_SHIFT 12 -#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) -#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) - /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). @@ -39,78 +28,41 @@ #define HYPERV_CPUID_MAX 0x4000ffff /* - * Feature identification. EAX indicates which features are available - * to the partition based upon the current partition privileges. - * These are HYPERV_CPUID_FEATURES.EAX bits. + * Aliases for Group A features that have X64 in the name. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EAX bits. */ -/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ -#define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0) -/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ -#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) -/* - * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM - * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available - */ -#define HV_X64_MSR_SYNIC_AVAILABLE BIT(2) -/* - * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through - * HV_X64_MSR_STIMER3_COUNT) available - */ -#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) -/* - * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) - * are available - */ -#define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4) -/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ -#define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5) -/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ -#define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6) -/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ -#define HV_X64_MSR_RESET_AVAILABLE BIT(7) -/* - * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, - * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, - * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available - */ -#define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8) -/* Partition reference TSC MSR is available */ -#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) -/* Partition Guest IDLE MSR is available */ -#define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10) -/* - * There is a single feature flag that signifies if the partition has access - * to MSRs with local APIC and TSC frequencies. - */ -#define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) -/* AccessReenlightenmentControls privilege */ -#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) -/* AccessTscInvariantControls privilege */ -#define HV_X64_ACCESS_TSC_INVARIANT BIT(15) +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE \ + HV_MSR_VP_RUNTIME_AVAILABLE +#define HV_X64_MSR_SYNIC_AVAILABLE \ + HV_MSR_SYNIC_AVAILABLE +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE \ + HV_MSR_APIC_ACCESS_AVAILABLE +#define HV_X64_MSR_HYPERCALL_AVAILABLE \ + HV_MSR_HYPERCALL_AVAILABLE +#define HV_X64_MSR_VP_INDEX_AVAILABLE \ + HV_MSR_VP_INDEX_AVAILABLE +#define HV_X64_MSR_RESET_AVAILABLE \ + HV_MSR_RESET_AVAILABLE +#define HV_X64_MSR_GUEST_IDLE_AVAILABLE \ + HV_MSR_GUEST_IDLE_AVAILABLE +#define HV_X64_ACCESS_FREQUENCY_MSRS \ + HV_ACCESS_FREQUENCY_MSRS +#define HV_X64_ACCESS_REENLIGHTENMENT \ + HV_ACCESS_REENLIGHTENMENT +#define HV_X64_ACCESS_TSC_INVARIANT \ + HV_ACCESS_TSC_INVARIANT /* - * Feature identification: indicates which flags were specified at partition - * creation. The format is the same as the partition creation flag structure - * defined in section Partition Creation Flags. - * These are HYPERV_CPUID_FEATURES.EBX bits. + * Aliases for Group B features that have X64 in the name. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EBX bits. */ -#define HV_X64_CREATE_PARTITIONS BIT(0) -#define HV_X64_ACCESS_PARTITION_ID BIT(1) -#define HV_X64_ACCESS_MEMORY_POOL BIT(2) -#define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3) -#define HV_X64_POST_MESSAGES BIT(4) -#define HV_X64_SIGNAL_EVENTS BIT(5) -#define HV_X64_CREATE_PORT BIT(6) -#define HV_X64_CONNECT_PORT BIT(7) -#define HV_X64_ACCESS_STATS BIT(8) -#define HV_X64_DEBUGGING BIT(11) -#define HV_X64_CPU_POWER_MANAGEMENT BIT(12) +#define HV_X64_POST_MESSAGES HV_POST_MESSAGES +#define HV_X64_SIGNAL_EVENTS HV_SIGNAL_EVENTS /* - * Feature identification. EDX indicates which miscellaneous features - * are available to the partition. - * These are HYPERV_CPUID_FEATURES.EDX bits. + * Group D Features. The bit assignments are custom to each architecture. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EDX bits. */ /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ #define HV_X64_MWAIT_AVAILABLE BIT(0) @@ -187,7 +139,7 @@ * processor, except for virtual processors that are reported as sibling SMT * threads. */ -#define HV_X64_NO_NONARCH_CORESHARING BIT(18) +#define HV_X64_NO_NONARCH_CORESHARING BIT(18) /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ #define HV_X64_NESTED_DIRECT_FLUSH BIT(17) @@ -295,42 +247,6 @@ union hv_x64_msr_hypercall_contents { } __packed; }; -/* - * TSC page layout. - */ -struct ms_hyperv_tsc_page { - volatile u32 tsc_sequence; - u32 reserved1; - volatile u64 tsc_scale; - volatile s64 tsc_offset; -} __packed; - -/* - * The guest OS needs to register the guest ID with the hypervisor. - * The guest ID is a 64 bit entity and the structure of this ID is - * specified in the Hyper-V specification: - * - * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx - * - * While the current guideline does not specify how Linux guest ID(s) - * need to be generated, our plan is to publish the guidelines for - * Linux and other guest operating systems that currently are hosted - * on Hyper-V. The implementation here conforms to this yet - * unpublished guidelines. - * - * - * Bit(s) - * 63 - Indicates if the OS is Open Source or not; 1 is Open Source - * 62:56 - Os Type; Linux is 0x100 - * 55:48 - Distro specific identification - * 47:16 - Linux kernel version number - * 15:0 - Distro specific identification - * - * - */ - -#define HV_LINUX_VENDOR_ID 0x8100 - struct hv_reenlightenment_control { __u64 vector:8; __u64 reserved1:8; @@ -354,31 +270,12 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) -/* - * Crash notification (HV_X64_MSR_CRASH_CTL) flags. - */ -#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) -#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) #define HV_IPI_LOW_VECTOR 0x10 #define HV_IPI_HIGH_VECTOR 0xff -/* Declare the various hypercall operations. */ -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 -#define HVCALL_SEND_IPI 0x000b -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 -#define HVCALL_SEND_IPI_EX 0x0015 -#define HVCALL_POST_MESSAGE 0x005c -#define HVCALL_SIGNAL_EVENT 0x005d -#define HVCALL_RETARGET_INTERRUPT 0x007e -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 - #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ @@ -390,63 +287,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 -#define HV_FLUSH_ALL_PROCESSORS BIT(0) -#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) -#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) - -enum HV_GENERIC_SET_FORMAT { - HV_GENERIC_SET_SPARSE_4K, - HV_GENERIC_SET_ALL, -}; - -#define HV_PARTITION_ID_SELF ((u64)-1) - -#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) -#define HV_HYPERCALL_FAST_BIT BIT(16) -#define HV_HYPERCALL_VARHEAD_OFFSET 17 -#define HV_HYPERCALL_REP_COMP_OFFSET 32 -#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) -#define HV_HYPERCALL_REP_START_OFFSET 48 -#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) - -/* hypercall status code */ -#define HV_STATUS_SUCCESS 0 -#define HV_STATUS_INVALID_HYPERCALL_CODE 2 -#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 -#define HV_STATUS_INVALID_ALIGNMENT 4 -#define HV_STATUS_INVALID_PARAMETER 5 -#define HV_STATUS_INSUFFICIENT_MEMORY 11 -#define HV_STATUS_INVALID_PORT_ID 17 -#define HV_STATUS_INVALID_CONNECTION_ID 18 -#define HV_STATUS_INSUFFICIENT_BUFFERS 19 - -/* - * The Hyper-V TimeRefCount register and the TSC - * page provide a guest VM clock with 100ns tick rate - */ -#define HV_CLOCK_HZ (NSEC_PER_SEC/100) - -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) -/* Valid SynIC vectors are 16-255. */ -#define HV_SYNIC_FIRST_VALID_VECTOR (16) - -#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) -#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) -#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) -#define HV_SYNIC_SINT_MASKED (1ULL << 16) -#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) -#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) - -#define HV_SYNIC_STIMER_COUNT (4) - -/* Define synthetic interrupt controller message constants. */ -#define HV_MESSAGE_SIZE (256) -#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) -#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) /* Define hypervisor message types. */ enum hv_message_type { @@ -457,76 +297,25 @@ enum hv_message_type { HVMSG_GPA_INTERCEPT = 0x80000001, /* Timer notification messages. */ - HVMSG_TIMER_EXPIRED = 0x80000010, + HVMSG_TIMER_EXPIRED = 0x80000010, /* Error messages. */ HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, - HVMSG_UNSUPPORTED_FEATURE = 0x80000022, + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, /* Trace buffer complete messages. */ HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, /* Platform-specific processor intercept messages. */ - HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, + HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, HVMSG_X64_MSR_INTERCEPT = 0x80010001, - HVMSG_X64_CPUID_INTERCEPT = 0x80010002, + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, - HVMSG_X64_APIC_EOI = 0x80010004, - HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 + HVMSG_X64_APIC_EOI = 0x80010004, + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define synthetic interrupt controller message flags. */ -union hv_message_flags { - __u8 asu8; - struct { - __u8 msg_pending:1; - __u8 reserved:7; - } __packed; -}; - -/* Define port identifier type. */ -union hv_port_id { - __u32 asu32; - struct { - __u32 id:24; - __u32 reserved:8; - } __packed u; -}; - -/* Define synthetic interrupt controller message header. */ -struct hv_message_header { - __u32 message_type; - __u8 payload_size; - union hv_message_flags message_flags; - __u8 reserved[2]; - union { - __u64 sender; - union hv_port_id port; - }; -} __packed; - -/* Define synthetic interrupt controller message format. */ -struct hv_message { - struct hv_message_header header; - union { - __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; - } u; -} __packed; - -/* Define the synthetic interrupt message page layout. */ -struct hv_message_page { - struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; -} __packed; - -/* Define timer message payload structure. */ -struct hv_timer_message_payload { - __u32 timer_index; - __u32 reserved; - __u64 expiration_time; /* When the timer expired */ - __u64 delivery_time; /* When the message was delivered */ -} __packed; - struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; @@ -754,187 +543,11 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF -/* Define synthetic interrupt controller flag constants. */ -#define HV_EVENT_FLAGS_COUNT (256 * 8) -#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) - -/* - * Synthetic timer configuration. - */ -union hv_stimer_config { - u64 as_uint64; - struct { - u64 enable:1; - u64 periodic:1; - u64 lazy:1; - u64 auto_enable:1; - u64 apic_vector:8; - u64 direct_mode:1; - u64 reserved_z0:3; - u64 sintx:4; - u64 reserved_z1:44; - } __packed; -}; - - -/* Define the synthetic interrupt controller event flags format. */ -union hv_synic_event_flags { - unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; -}; - -/* Define SynIC control register. */ -union hv_synic_scontrol { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:63; - } __packed; -}; - -/* Define synthetic interrupt source. */ -union hv_synic_sint { - u64 as_uint64; - struct { - u64 vector:8; - u64 reserved1:8; - u64 masked:1; - u64 auto_eoi:1; - u64 polling:1; - u64 reserved2:45; - } __packed; -}; - -/* Define the format of the SIMP register */ -union hv_synic_simp { - u64 as_uint64; - struct { - u64 simp_enabled:1; - u64 preserved:11; - u64 base_simp_gpa:52; - } __packed; -}; - -/* Define the format of the SIEFP register */ -union hv_synic_siefp { - u64 as_uint64; - struct { - u64 siefp_enabled:1; - u64 preserved:11; - u64 base_siefp_gpa:52; - } __packed; -}; - -struct hv_vpset { - u64 format; - u64 valid_bank_mask; - u64 bank_contents[]; -} __packed; - -/* HvCallSendSyntheticClusterIpi hypercall */ -struct hv_send_ipi { - u32 vector; - u32 reserved; - u64 cpu_mask; -} __packed; - -/* HvCallSendSyntheticClusterIpiEx hypercall */ -struct hv_send_ipi_ex { - u32 vector; - u32 reserved; - struct hv_vpset vp_set; -} __packed; - -/* HvFlushGuestPhysicalAddressSpace hypercalls */ -struct hv_guest_mapping_flush { - u64 address_space; - u64 flags; -} __packed; - -/* - * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited - * by the bitwidth of "additional_pages" in union hv_gpa_page_range. - */ -#define HV_MAX_FLUSH_PAGES (2048) - -/* HvFlushGuestPhysicalAddressList hypercall */ -union hv_gpa_page_range { - u64 address_space; - struct { - u64 additional_pages:11; - u64 largepage:1; - u64 basepfn:52; - } page; -}; - -/* - * All input flush parameters should be in single page. The max flush - * count is equal with how many entries of union hv_gpa_page_range can - * be populated into the input parameter page. - */ -#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ - sizeof(union hv_gpa_page_range)) - -struct hv_guest_mapping_flush_list { - u64 address_space; - u64 flags; - union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; -}; - -/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ -struct hv_tlb_flush { - u64 address_space; - u64 flags; - u64 processor_mask; - u64 gva_list[]; -} __packed; - -/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ -struct hv_tlb_flush_ex { - u64 address_space; - u64 flags; - struct hv_vpset hv_vp_set; - u64 gva_list[]; -} __packed; - struct hv_partition_assist_pg { u32 tlb_lock_count; }; -union hv_msi_entry { - u64 as_uint64; - struct { - u32 address; - u32 data; - } __packed; -}; - -struct hv_interrupt_entry { - u32 source; /* 1 for MSI(-X) */ - u32 reserved1; - union hv_msi_entry msi_entry; -} __packed; -/* - * flags for hv_device_interrupt_target.flags - */ -#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 -#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 - -struct hv_device_interrupt_target { - u32 vector; - u32 flags; - union { - u64 vp_mask; - struct hv_vpset vp_set; - }; -} __packed; +#include -/* HvRetargetDeviceInterrupt hypercall */ -struct hv_retarget_device_interrupt { - u64 partition_id; /* use "self" */ - u64 device_id; - struct hv_interrupt_entry int_entry; - u64 reserved2; - struct hv_device_interrupt_target int_target; -} __packed __aligned(8); #endif diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h new file mode 100644 index 000000000000..1f92ef92eb56 --- /dev/null +++ b/include/asm-generic/hyperv-tlfs.h @@ -0,0 +1,442 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * This file contains definitions from Hyper-V Hypervisor Top-Level Functional + * Specification (TLFS): + * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs + */ + +#ifndef _ASM_GENERIC_HYPERV_TLFS_H +#define _ASM_GENERIC_HYPERV_TLFS_H + +#include +#include +#include + +/* + * While not explicitly listed in the TLFS, Hyper-V always runs with a page size + * of 4096. These definitions are used when communicating with Hyper-V using + * guest physical pages and guest physical page addresses, since the guest page + * size may not be 4096 on all architectures. + */ +#define HV_HYP_PAGE_SHIFT 12 +#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) +#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) + +/* + * Hyper-V provides two categories of flags relevant to guest VMs. The + * "Features" category indicates specific functionality that is available + * to guests on this particular instance of Hyper-V. The "Features" + * are presented in four groups, each of which is 32 bits. The group A + * and B definitions are common across architectures and are listed here. + * However, not all flags are relevant on all architectures. + * + * Groups C and D vary across architectures and are listed in the + * architecture specific portion of hyperv-tlfs.h. Some of these flags exist + * on multiple architectures, but the bit positions are different so they + * cannot appear in the generic portion of hyperv-tlfs.h. + * + * The "Enlightenments" category provides recommendations on whether to use + * specific enlightenments that are available. The Enlighenments are a single + * group of 32 bits, but they vary across architectures and are listed in + * the architecture specific portion of hyperv-tlfs.h. + */ + +/* + * Group A Features. + */ + +/* VP Runtime register available */ +#define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) +/* Partition Reference Counter available*/ +#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) +/* Basic SynIC register available */ +#define HV_MSR_SYNIC_AVAILABLE BIT(2) +/* Synthetic Timer registers available */ +#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) +/* Virtual APIC assist and VP assist page registers available */ +#define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) +/* Hypercall and Guest OS ID registers available*/ +#define HV_MSR_HYPERCALL_AVAILABLE BIT(5) +/* Access virtual processor index register available*/ +#define HV_MSR_VP_INDEX_AVAILABLE BIT(6) +/* Virtual system reset register available*/ +#define HV_MSR_RESET_AVAILABLE BIT(7) +/* Access statistics page registers available */ +#define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) +/* Partition reference TSC register is available */ +#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) +/* Partition Guest IDLE register is available */ +#define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) +/* Partition local APIC and TSC frequency registers available */ +#define HV_ACCESS_FREQUENCY_MSRS BIT(11) +/* AccessReenlightenmentControls privilege */ +#define HV_ACCESS_REENLIGHTENMENT BIT(13) +/* AccessTscInvariantControls privilege */ +#define HV_ACCESS_TSC_INVARIANT BIT(15) + +/* + * Group B features. + */ +#define HV_CREATE_PARTITIONS BIT(0) +#define HV_ACCESS_PARTITION_ID BIT(1) +#define HV_ACCESS_MEMORY_POOL BIT(2) +#define HV_ADJUST_MESSAGE_BUFFERS BIT(3) +#define HV_POST_MESSAGES BIT(4) +#define HV_SIGNAL_EVENTS BIT(5) +#define HV_CREATE_PORT BIT(6) +#define HV_CONNECT_PORT BIT(7) +#define HV_ACCESS_STATS BIT(8) +#define HV_DEBUGGING BIT(11) +#define HV_CPU_POWER_MANAGEMENT BIT(12) + + +/* + * TSC page layout. + */ +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; +} __packed; + +/* + * The guest OS needs to register the guest ID with the hypervisor. + * The guest ID is a 64 bit entity and the structure of this ID is + * specified in the Hyper-V specification: + * + * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx + * + * While the current guideline does not specify how Linux guest ID(s) + * need to be generated, our plan is to publish the guidelines for + * Linux and other guest operating systems that currently are hosted + * on Hyper-V. The implementation here conforms to this yet + * unpublished guidelines. + * + * + * Bit(s) + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source + * 62:56 - Os Type; Linux is 0x100 + * 55:48 - Distro specific identification + * 47:16 - Linux kernel version number + * 15:0 - Distro specific identification + * + * + */ + +#define HV_LINUX_VENDOR_ID 0x8100 + +/* + * Crash notification flags. + */ +#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) +#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) + +/* Declare the various hypercall operations. */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_SEND_IPI 0x000b +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 +#define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +#define HV_PARTITION_ID_SELF ((u64)-1) +#define HV_VP_INDEX_SELF ((u32)-2) + +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INVALID_PARAMETER 5 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_PORT_ID 17 +#define HV_STATUS_INVALID_CONNECTION_ID 18 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 + +/* + * The Hyper-V TimeRefCount register and the TSC + * page provide a guest VM clock with 100ns tick rate + */ +#define HV_CLOCK_HZ (NSEC_PER_SEC/100) + +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) +/* Valid SynIC vectors are 16-255. */ +#define HV_SYNIC_FIRST_VALID_VECTOR (16) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + +#define HV_SYNIC_STIMER_COUNT (4) + +/* Define synthetic interrupt controller message constants. */ +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) + +/* Define synthetic interrupt controller message flags. */ +union hv_message_flags { + __u8 asu8; + struct { + __u8 msg_pending:1; + __u8 reserved:7; + } __packed; +}; + +/* Define port identifier type. */ +union hv_port_id { + __u32 asu32; + struct { + __u32 id:24; + __u32 reserved:8; + } __packed u; +}; + +/* Define synthetic interrupt controller message header. */ +struct hv_message_header { + __u32 message_type; + __u8 payload_size; + union hv_message_flags message_flags; + __u8 reserved[2]; + union { + __u64 sender; + union hv_port_id port; + }; +} __packed; + +/* Define synthetic interrupt controller message format. */ +struct hv_message { + struct hv_message_header header; + union { + __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; + } u; +} __packed; + +/* Define the synthetic interrupt message page layout. */ +struct hv_message_page { + struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; +} __packed; + +/* Define timer message payload structure. */ +struct hv_timer_message_payload { + __u32 timer_index; + __u32 reserved; + __u64 expiration_time; /* When the timer expired */ + __u64 delivery_time; /* When the message was delivered */ +} __packed; + + +/* Define synthetic interrupt controller flag constants. */ +#define HV_EVENT_FLAGS_COUNT (256 * 8) +#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) + +/* + * Synthetic timer configuration. + */ +union hv_stimer_config { + u64 as_uint64; + struct { + u64 enable:1; + u64 periodic:1; + u64 lazy:1; + u64 auto_enable:1; + u64 apic_vector:8; + u64 direct_mode:1; + u64 reserved_z0:3; + u64 sintx:4; + u64 reserved_z1:44; + } __packed; +}; + + +/* Define the synthetic interrupt controller event flags format. */ +union hv_synic_event_flags { + unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; +}; + +/* Define SynIC control register. */ +union hv_synic_scontrol { + u64 as_uint64; + struct { + u64 enable:1; + u64 reserved:63; + } __packed; +}; + +/* Define synthetic interrupt source. */ +union hv_synic_sint { + u64 as_uint64; + struct { + u64 vector:8; + u64 reserved1:8; + u64 masked:1; + u64 auto_eoi:1; + u64 polling:1; + u64 reserved2:45; + } __packed; +}; + +/* Define the format of the SIMP register */ +union hv_synic_simp { + u64 as_uint64; + struct { + u64 simp_enabled:1; + u64 preserved:11; + u64 base_simp_gpa:52; + } __packed; +}; + +/* Define the format of the SIEFP register */ +union hv_synic_siefp { + u64 as_uint64; + struct { + u64 siefp_enabled:1; + u64 preserved:11; + u64 base_siefp_gpa:52; + } __packed; +}; + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; +} __packed; + +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +} __packed; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { + u32 vector; + u32 reserved; + struct hv_vpset vp_set; +} __packed; + +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +} __packed; + +/* + * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited + * by the bitwidth of "additional_pages" in union hv_gpa_page_range. + */ +#define HV_MAX_FLUSH_PAGES (2048) + +/* HvFlushGuestPhysicalAddressList hypercall */ +union hv_gpa_page_range { + u64 address_space; + struct { + u64 additional_pages:11; + u64 largepage:1; + u64 basepfn:52; + } page; +}; + +/* + * All input flush parameters should be in single page. The max flush + * count is equal with how many entries of union hv_gpa_page_range can + * be populated into the input parameter page. + */ +#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ + sizeof(union hv_gpa_page_range)) + +struct hv_guest_mapping_flush_list { + u64 address_space; + u64 flags; + union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; +}; + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_tlb_flush { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +} __packed; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_tlb_flush_ex { + u64 address_space; + u64 flags; + struct hv_vpset hv_vp_set; + u64 gva_list[]; +} __packed; + +/* HvRetargetDeviceInterrupt hypercall */ +union hv_msi_entry { + u64 as_uint64; + struct { + u32 address; + u32 data; + } __packed; +}; + +struct hv_interrupt_entry { + u32 source; /* 1 for MSI(-X) */ + u32 reserved1; + union hv_msi_entry msi_entry; +} __packed; + +/* + * flags for hv_device_interrupt_target.flags + */ +#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 +#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 + +struct hv_device_interrupt_target { + u32 vector; + u32 flags; + union { + u64 vp_mask; + struct hv_vpset vp_set; + }; +} __packed; + +struct hv_retarget_device_interrupt { + u64 partition_id; /* use "self" */ + u64 device_id; + struct hv_interrupt_entry int_entry; + u64 reserved2; + struct hv_device_interrupt_target int_target; +} __packed __aligned(8); + +#endif -- cgit v1.2.3 From 88b42da6e3dc9adbdc3d7e1a9293ddf3d84f1021 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 22 Apr 2020 12:57:37 -0700 Subject: asm-generic/hyperv: Add definitions for Get/SetVpRegister hypercalls Add definitions for GetVpRegister and SetVpRegister hypercalls, which are implemented for both x86 and ARM64. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20200422195737.10223-5-mikelley@microsoft.com Signed-off-by: Wei Liu --- include/asm-generic/hyperv-tlfs.h | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 1f92ef92eb56..262fae9526b1 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -141,6 +141,8 @@ struct ms_hyperv_tsc_page { #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d #define HVCALL_RETARGET_INTERRUPT 0x007e @@ -439,4 +441,53 @@ struct hv_retarget_device_interrupt { struct hv_device_interrupt_target int_target; } __packed __aligned(8); + +/* HvGetVpRegisters hypercall input with variable size reg name list*/ +struct hv_get_vp_registers_input { + struct { + u64 partitionid; + u32 vpindex; + u8 inputvtl; + u8 padding[3]; + } header; + struct input { + u32 name0; + u32 name1; + } element[]; +} __packed; + + +/* HvGetVpRegisters returns an array of these output elements */ +struct hv_get_vp_registers_output { + union { + struct { + u32 a; + u32 b; + u32 c; + u32 d; + } as32 __packed; + struct { + u64 low; + u64 high; + } as64 __packed; + }; +}; + +/* HvSetVpRegisters hypercall with variable size reg name/value list*/ +struct hv_set_vp_registers_input { + struct { + u64 partitionid; + u32 vpindex; + u8 inputvtl; + u8 padding[3]; + } header; + struct { + u32 name; + u32 padding1; + u64 padding2; + u64 valuelow; + u64 valuehigh; + } element[]; +} __packed; + #endif -- cgit v1.2.3 From 69f57058badded5c523871bbaaaf60b637bcf623 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Apr 2020 16:45:02 +0300 Subject: hyper-v: Use UUID API for exporting the GUID (part 2) This is a follow up to the commit 1d3c9c075462 ("hyper-v: Use UUID API for exporting the GUID") which starts the conversion. There is export_guid() function which exports guid_t to the u8 array. Use it instead of open coding variant. This allows to hide the uuid_t internals. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200423134505.78221-1-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- drivers/hv/hv_trace.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index a43bc76c2d5d..a888784552c8 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -44,10 +44,8 @@ TRACE_EVENT(vmbus_onoffer, __entry->monitorid = offer->monitorid; __entry->is_ddc_int = offer->is_dedicated_interrupt; __entry->connection_id = offer->connection_id; - memcpy(__entry->if_type, - &offer->offer.if_type.b, 16); - memcpy(__entry->if_instance, - &offer->offer.if_instance.b, 16); + export_guid(__entry->if_type, &offer->offer.if_type); + export_guid(__entry->if_instance, &offer->offer.if_instance); __entry->chn_flags = offer->offer.chn_flags; __entry->mmio_mb = offer->offer.mmio_megabytes; __entry->sub_idx = offer->offer.sub_channel_index; -- cgit v1.2.3 From 458c4475be9ae42e248963b2db732266d40408b7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Apr 2020 16:45:03 +0300 Subject: hyper-v: Supply GUID pointer to printf() like functions Drop dereference when printing the GUID with printf() like functions. This allows to hide the uuid_t internals. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200423134505.78221-2-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 28c009c7a9cd..c1b7f4c344df 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -201,7 +201,7 @@ static ssize_t class_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; return sprintf(buf, "{%pUl}\n", - hv_dev->channel->offermsg.offer.if_type.b); + &hv_dev->channel->offermsg.offer.if_type); } static DEVICE_ATTR_RO(class_id); @@ -213,7 +213,7 @@ static ssize_t device_id_show(struct device *dev, if (!hv_dev->channel) return -ENODEV; return sprintf(buf, "{%pUl}\n", - hv_dev->channel->offermsg.offer.if_instance.b); + &hv_dev->channel->offermsg.offer.if_instance); } static DEVICE_ATTR_RO(device_id); @@ -1991,7 +1991,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) int ret; dev_set_name(&child_device_obj->device, "%pUl", - child_device_obj->channel->offermsg.offer.if_instance.b); + &child_device_obj->channel->offermsg.offer.if_instance); child_device_obj->device.bus = &hv_bus; child_device_obj->device.parent = &hv_acpi_dev->dev; -- cgit v1.2.3 From 0027e3fd6d907c81097ab656de2521854f37b9a6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Apr 2020 16:45:04 +0300 Subject: hyper-v: Replace open-coded variant of %*phN specifier printf() like functions in the kernel have extensions, such as %*phN to dump small pieces of memory as hex values. Replace print_alias_name() with the direct use of %*phN. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200423134505.78221-3-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- drivers/hv/vmbus_drv.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index c1b7f4c344df..48e62028213b 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -117,14 +117,6 @@ static int vmbus_exists(void) return 0; } -#define VMBUS_ALIAS_LEN ((sizeof((struct hv_vmbus_device_id *)0)->guid) * 2) -static void print_alias_name(struct hv_device *hv_dev, char *alias_name) -{ - int i; - for (i = 0; i < VMBUS_ALIAS_LEN; i += 2) - sprintf(&alias_name[i], "%02x", hv_dev->dev_type.b[i/2]); -} - static u8 channel_monitor_group(const struct vmbus_channel *channel) { return (u8)channel->offermsg.monitorid / 32; @@ -221,10 +213,8 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *dev_attr, char *buf) { struct hv_device *hv_dev = device_to_hv_device(dev); - char alias_name[VMBUS_ALIAS_LEN + 1]; - print_alias_name(hv_dev, alias_name); - return sprintf(buf, "vmbus:%s\n", alias_name); + return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); } static DEVICE_ATTR_RO(modalias); @@ -693,12 +683,9 @@ __ATTRIBUTE_GROUPS(vmbus_dev); static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env) { struct hv_device *dev = device_to_hv_device(device); - int ret; - char alias_name[VMBUS_ALIAS_LEN + 1]; + const char *format = "MODALIAS=vmbus:%*phN"; - print_alias_name(dev, alias_name); - ret = add_uevent_var(env, "MODALIAS=vmbus:%s", alias_name); - return ret; + return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type); } static const struct hv_vmbus_device_id * -- cgit v1.2.3 From b7d18c57c94a38a81c1dbcfee0d63153ffaf1856 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Apr 2020 16:45:05 +0300 Subject: hyper-v: Switch to use UUID types directly uuid_le is an alias for guid_t and is going to be removed in the future. Replace it with original type. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20200423134505.78221-4-andriy.shevchenko@linux.intel.com Signed-off-by: Wei Liu --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..f36b1cf5ffe3 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -434,7 +434,7 @@ struct virtio_device_id { * For Hyper-V devices we use the device guid as the id. */ struct hv_vmbus_device_id { - uuid_le guid; + guid_t guid; kernel_ulong_t driver_data; /* Data private to the driver */ }; -- cgit v1.2.3 From 723c425f2947dd0ecdde270cb8463b22b3e59603 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 6 May 2020 16:08:05 +0000 Subject: Driver: hv: vmbus: drop a no long applicable comment None of the things mentioned in the comment is initialized in hv_init. They've been moved elsewhere. Signed-off-by: Wei Liu Link: https://lore.kernel.org/r/20200506160806.118965-1-wei.liu@kernel.org Reviewed-by: Michael Kelley --- drivers/hv/vmbus_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 48e62028213b..c2a4a7c0b99a 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1404,7 +1404,6 @@ static int vmbus_bus_init(void) { int ret; - /* Hypervisor initialization...setup hypercall page..etc */ ret = hv_init(); if (ret != 0) { pr_err("Unable to initialize the hypervisor - 0x%x\n", ret); -- cgit v1.2.3 From db5871e85533f06ffe1795c1cb9b9153860d1e4f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:53:23 -0500 Subject: vmbus: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20200507185323.GA14416@embeddedor Signed-off-by: Wei Liu --- include/linux/hyperv.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index cd64ab7bb3b7..d783847d8cb4 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -117,7 +117,7 @@ struct hv_ring_buffer { * Ring data starts here + RingDataStartOffset * !!! DO NOT place any fields below this !!! */ - u8 buffer[0]; + u8 buffer[]; } __packed; struct hv_ring_buffer_info { @@ -313,7 +313,7 @@ struct vmadd_remove_transfer_page_set { struct gpa_range { u32 byte_count; u32 byte_offset; - u64 pfn_array[0]; + u64 pfn_array[]; }; /* @@ -563,7 +563,7 @@ struct vmbus_channel_gpadl_header { u32 gpadl; u16 range_buflen; u16 rangecount; - struct gpa_range range[0]; + struct gpa_range range[]; } __packed; /* This is the followup packet that contains more PFNs. */ @@ -571,7 +571,7 @@ struct vmbus_channel_gpadl_body { struct vmbus_channel_message_header header; u32 msgnumber; u32 gpadl; - u64 pfn[0]; + u64 pfn[]; } __packed; struct vmbus_channel_gpadl_created { @@ -679,7 +679,7 @@ struct vmbus_channel_msginfo { * The channel message that goes out on the "wire". * It will contain at minimum the VMBUS_CHANNEL_MESSAGE_HEADER header */ - unsigned char msg[0]; + unsigned char msg[]; }; struct vmbus_close_msg { -- cgit v1.2.3 From a949e86c0d7802c05b2ae726a84fae89ddb5be7d Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Fri, 22 May 2020 19:19:00 +0200 Subject: Drivers: hv: vmbus: Resolve race between init_vp_index() and CPU hotplug vmbus_process_offer() does two things (among others): 1) first, it sets the channel's target CPU with cpu_hotplug_lock; 2) it then adds the channel to the channel list(s) with channel_mutex. Since cpu_hotplug_lock is released before (2), the channel's target CPU (as designated in (1)) can be deemed "free" by hv_synic_cleanup() and go offline before the channel is added to the list. Fix the race condition by "extending" the cpu_hotplug_lock critical section to include (2) (and (1)), nesting the channel_mutex critical section within the cpu_hotplug_lock critical section as done elsewhere (hv_synic_cleanup(), target_cpu_store()) in the hyperv drivers code. Move even further by extending the channel_mutex critical section to include (1) (and (2)): this change allows to remove (the now redundant) bind_channel_to_cpu_lock, and generally simplifies the handling of the target CPUs (that are now always modified with channel_mutex held). Fixes: d570aec0f2154e ("Drivers: hv: vmbus: Synchronize init_vp_index() vs. CPU hotplug") Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200522171901.204127-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index fde806d6525b..89eaacf069a8 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -554,26 +554,34 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) bool fnew = true; /* - * Initialize the target_CPU before inserting the channel in - * the chn_list and sc_list lists, within the channel_mutex - * critical section: + * Synchronize vmbus_process_offer() and CPU hotplugging: * * CPU1 CPU2 * - * [vmbus_process_offer()] [hv_syninc_cleanup()] + * [vmbus_process_offer()] [Hot removal of the CPU] * - * STORE target_cpu LOCK channel_mutex - * LOCK channel_mutex SEARCH chn_list - * INSERT chn_list LOAD target_cpu - * UNLOCK channel_mutex UNLOCK channel_mutex + * CPU_READ_LOCK CPUS_WRITE_LOCK + * LOAD cpu_online_mask SEARCH chn_list + * STORE target_cpu LOAD target_cpu + * INSERT chn_list STORE cpu_online_mask + * CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK + * + * Forbids: CPU1's LOAD from *not* seing CPU2's STORE && + * CPU2's SEARCH from *not* seeing CPU1's INSERT * * Forbids: CPU2's SEARCH from seeing CPU1's INSERT && * CPU2's LOAD from *not* seing CPU1's STORE */ - init_vp_index(newchannel, hv_get_dev_type(newchannel)); + cpus_read_lock(); + /* + * Serializes the modifications of the chn_list list as well as + * the accesses to next_numa_node_id in init_vp_index(). + */ mutex_lock(&vmbus_connection.channel_mutex); + init_vp_index(newchannel, hv_get_dev_type(newchannel)); + /* Remember the channels that should be cleaned up upon suspend. */ if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) atomic_inc(&vmbus_connection.nr_chan_close_on_suspend); @@ -623,6 +631,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) vmbus_channel_map_relid(newchannel); mutex_unlock(&vmbus_connection.channel_mutex); + cpus_read_unlock(); /* * vmbus_process_offer() mustn't call channel->sc_creation_callback() @@ -655,13 +664,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) * We use this state to statically distribute the channel interrupt load. */ static int next_numa_node_id; -/* - * init_vp_index() accesses global variables like next_numa_node_id, and - * it can run concurrently for primary channels and sub-channels: see - * vmbus_process_offer(), so we need the lock to protect the global - * variables. - */ -static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); /* * Starting with Win8, we can statically distribute the incoming @@ -700,15 +702,6 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) return; } - /* No CPUs can come up or down during this. */ - cpus_read_lock(); - - /* - * Serializes the accesses to the global variable next_numa_node_id. - * See also the header comment of the spin lock declaration. - */ - spin_lock(&bind_channel_to_cpu_lock); - while (true) { numa_node = next_numa_node_id++; if (numa_node == nr_node_ids) { @@ -739,9 +732,6 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) channel->target_cpu = target_cpu; channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); - spin_unlock(&bind_channel_to_cpu_lock); - cpus_read_unlock(); - free_cpumask_var(available_mask); } -- cgit v1.2.3 From afaa33da08abd10be8978781d7c99a9e67d2bbff Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Fri, 22 May 2020 19:19:01 +0200 Subject: Drivers: hv: vmbus: Resolve more races involving init_vp_index() init_vp_index() uses the (per-node) hv_numa_map[] masks to record the CPUs allocated for channel interrupts at a given time, and distribute the performance-critical channels across the available CPUs: in part., the mask of "candidate" target CPUs in a given NUMA node, for a newly offered channel, is determined by XOR-ing the node's CPU mask and the node's hv_numa_map. This operation/mechanism assumes that no offline CPUs is set in the hv_numa_map mask, an assumption that does not hold since such mask is currently not updated when a channel is removed or assigned to a different CPU. To address the issues described above, this adds hooks in the channel removal path (hv_process_channel_removal()) and in target_cpu_store() in order to clear, resp. to update, the hv_numa_map[] masks as needed. This also adds a (missed) update of the masks in init_vp_index() (cf., e.g., the memory-allocation failure path in this function). Like in the case of init_vp_index(), such hooks require to determine if the given channel is performance critical. init_vp_index() does this by parsing the channel's offer, it can not rely on the device data structure (device_obj) to retrieve such information because the device data structure has not been allocated/linked with the channel by the time that init_vp_index() executes. A similar situation may hold in hv_is_alloced_cpu() (defined below); the adopted approach is to "cache" the device type of the channel, as computed by parsing the channel's offer, in the channel structure itself. Fixes: 7527810573436f ("Drivers: hv: vmbus: Introduce the CHANNELMSG_MODIFYCHANNEL message type") Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20200522171901.204127-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/hv/channel_mgmt.c | 22 ++++++++++++++++------ drivers/hv/hyperv_vmbus.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/hv/vmbus_drv.c | 19 +++++++++++++------ include/linux/hyperv.h | 7 +++++++ 4 files changed, 84 insertions(+), 12 deletions(-) diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 89eaacf069a8..417a95e5094d 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -24,9 +24,9 @@ #include "hyperv_vmbus.h" -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type); +static void init_vp_index(struct vmbus_channel *channel); -static const struct vmbus_device vmbus_devs[] = { +const struct vmbus_device vmbus_devs[] = { /* IDE */ { .dev_type = HV_IDE, HV_IDE_GUID, @@ -431,6 +431,13 @@ void hv_process_channel_removal(struct vmbus_channel *channel) spin_unlock_irqrestore(&primary_channel->lock, flags); } + /* + * If this is a "perf" channel, updates the hv_numa_map[] masks so that + * init_vp_index() can (re-)use the CPU. + */ + if (hv_is_perf_channel(channel)) + hv_clear_alloced_cpu(channel->target_cpu); + /* * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and * the relid is invalidated; after hibernation, when the user-space app @@ -497,7 +504,7 @@ static void vmbus_add_channel_work(struct work_struct *work) if (!newchannel->device_obj) goto err_deq_chan; - newchannel->device_obj->device_id = hv_get_dev_type(newchannel); + newchannel->device_obj->device_id = newchannel->device_id; /* * Add the new device to the bus. This will kick off device-driver * binding which eventually invokes the device driver's AddDevice() @@ -580,7 +587,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) */ mutex_lock(&vmbus_connection.channel_mutex); - init_vp_index(newchannel, hv_get_dev_type(newchannel)); + init_vp_index(newchannel); /* Remember the channels that should be cleaned up upon suspend. */ if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) @@ -676,9 +683,9 @@ static int next_numa_node_id; * evenly among all the available NUMA nodes. Once the node is assigned, * we will assign the CPU based on a simple round robin scheme. */ -static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) +static void init_vp_index(struct vmbus_channel *channel) { - bool perf_chn = vmbus_devs[dev_type].perf_device; + bool perf_chn = hv_is_perf_channel(channel); cpumask_var_t available_mask; struct cpumask *alloced_mask; u32 target_cpu; @@ -699,6 +706,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) channel->target_cpu = VMBUS_CONNECT_CPU; channel->target_vp = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); + if (perf_chn) + hv_set_alloced_cpu(VMBUS_CONNECT_CPU); return; } @@ -862,6 +871,7 @@ static void vmbus_setup_channel_state(struct vmbus_channel *channel, sizeof(struct vmbus_channel_offer_channel)); channel->monitor_grp = (u8)offer->monitorid / 32; channel->monitor_bit = (u8)offer->monitorid % 32; + channel->device_id = hv_get_dev_type(channel); } /* diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 5e5cebe5d048..40e2b9f91163 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -395,6 +395,54 @@ enum delay { MESSAGE_DELAY = 1, }; +extern const struct vmbus_device vmbus_devs[]; + +static inline bool hv_is_perf_channel(struct vmbus_channel *channel) +{ + return vmbus_devs[channel->device_id].perf_device; +} + +static inline bool hv_is_alloced_cpu(unsigned int cpu) +{ + struct vmbus_channel *channel, *sc; + + lockdep_assert_held(&vmbus_connection.channel_mutex); + /* + * List additions/deletions as well as updates of the target CPUs are + * protected by channel_mutex. + */ + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { + if (!hv_is_perf_channel(channel)) + continue; + if (channel->target_cpu == cpu) + return true; + list_for_each_entry(sc, &channel->sc_list, sc_list) { + if (sc->target_cpu == cpu) + return true; + } + } + return false; +} + +static inline void hv_set_alloced_cpu(unsigned int cpu) +{ + cpumask_set_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_clear_alloced_cpu(unsigned int cpu) +{ + if (hv_is_alloced_cpu(cpu)) + return; + cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); +} + +static inline void hv_update_alloced_cpus(unsigned int old_cpu, + unsigned int new_cpu) +{ + hv_set_alloced_cpu(new_cpu); + hv_clear_alloced_cpu(old_cpu); +} + #ifdef CONFIG_HYPERV_TESTING int hv_debug_add_dev_dir(struct hv_device *dev); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index c2a4a7c0b99a..47747755d2e1 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1687,8 +1687,8 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) static ssize_t target_cpu_store(struct vmbus_channel *channel, const char *buf, size_t count) { + u32 target_cpu, origin_cpu; ssize_t ret = count; - u32 target_cpu; if (vmbus_proto_version < VERSION_WIN10_V4_1) return -EIO; @@ -1741,7 +1741,8 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, goto cpu_store_unlock; } - if (channel->target_cpu == target_cpu) + origin_cpu = channel->target_cpu; + if (target_cpu == origin_cpu) goto cpu_store_unlock; if (vmbus_send_modifychannel(channel->offermsg.child_relid, @@ -1763,14 +1764,20 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, * in on a CPU that is different from the channel target_cpu value. */ - if (channel->change_target_cpu_callback) - (*channel->change_target_cpu_callback)(channel, - channel->target_cpu, target_cpu); - channel->target_cpu = target_cpu; channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); channel->numa_node = cpu_to_node(target_cpu); + /* See init_vp_index(). */ + if (hv_is_perf_channel(channel)) + hv_update_alloced_cpus(origin_cpu, target_cpu); + + /* Currently set only for storvsc channels. */ + if (channel->change_target_cpu_callback) { + (*channel->change_target_cpu_callback)(channel, + origin_cpu, target_cpu); + } + cpu_store_unlock: mutex_unlock(&vmbus_connection.channel_mutex); cpus_read_unlock(); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index d783847d8cb4..40df3103e890 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -901,6 +901,13 @@ struct vmbus_channel { bool probe_done; + /* + * Cache the device ID here for easy access; this is useful, in + * particular, in situations where the channel's device_obj has + * not been allocated/initialized yet. + */ + u16 device_id; + /* * We must offload the handling of the primary/sub channels * from the single-threaded vmbus_connection.work_queue to -- cgit v1.2.3