From 85455c795c07882091b15c0613f78d4567d9be36 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 13 Feb 2024 06:16:42 +0000 Subject: eventpoll: support busy poll per epoll instance Allow busy polling on a per-epoll context basis. The per-epoll context usec timeout value is preferred, but the pre-existing system-wide sysctl value is still supported if it is specified. busy_poll_usecs is a u32, but in a follow-up patch the ioctl provided to the user only allows setting a value from 0 to S32_MAX. Signed-off-by: Joe Damato Acked-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- fs/eventpoll.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3534d36a1474..401f865eced9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -227,6 +227,8 @@ struct eventpoll { #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; + /* busy poll timeout */ + u32 busy_poll_usecs; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -387,11 +389,41 @@ static inline int ep_events_available(struct eventpoll *ep) } #ifdef CONFIG_NET_RX_BUSY_POLL +/** + * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value + * from the epoll instance ep is preferred, but if it is not set fallback to + * the system-wide global via busy_loop_timeout. + * + * @start_time: The start time used to compute the remaining time until timeout. + * @ep: Pointer to the eventpoll context. + * + * Return: true if the timeout has expired, false otherwise. 
+ */ +static bool busy_loop_ep_timeout(unsigned long start_time, + struct eventpoll *ep) +{ + unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); + + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); + + return time_after(now, end_time); + } else { + return busy_loop_timeout(start_time); + } +} + +static bool ep_busy_loop_on(struct eventpoll *ep) +{ + return !!ep->busy_poll_usecs || net_busy_loop_on(); +} + static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; - return ep_events_available(ep) || busy_loop_timeout(start_time); + return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* @@ -404,7 +436,7 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { + if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, BUSY_POLL_BUDGET); if (ep_events_available(ep)) @@ -425,12 +457,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { - struct eventpoll *ep; + struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; - if (!net_busy_loop_on()) + if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); @@ -442,7 +474,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) return; napi_id = READ_ONCE(sk->sk_napi_id); - ep = epi->ep; /* Non-NAPI IDs can be rejected * or @@ -2058,6 +2089,9 @@ static int do_epoll_create(int flags) error = PTR_ERR(file); goto out_free_fd; } +#ifdef CONFIG_NET_RX_BUSY_POLL + ep->busy_poll_usecs = 0; +#endif ep->file = file; fd_install(fd, file); return fd; -- cgit v1.2.3 From c6aa2a7778d8e3ba7c6f84c8095f0b89f0617830 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 13 Feb 2024 06:16:43 +0000 Subject: 
eventpoll: Add per-epoll busy poll packet budget When using epoll-based busy poll, the packet budget is hardcoded to BUSY_POLL_BUDGET (8). Users may desire larger busy poll budgets, which can potentially increase throughput when busy polling under high network load. Other busy poll methods allow setting the busy poll budget via SO_BUSY_POLL_BUDGET, but epoll-based busy polling uses a hardcoded value. Fix this edge case by adding support for a per-epoll context busy poll packet budget. If not specified, the default value (BUSY_POLL_BUDGET) is used. Signed-off-by: Joe Damato Acked-by: Stanislav Fomichev Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- fs/eventpoll.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 401f865eced9..ed83ae33dd45 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -229,6 +229,8 @@ struct eventpoll { unsigned int napi_id; /* busy poll timeout */ u32 busy_poll_usecs; + /* busy poll packet budget */ + u16 busy_poll_budget; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -435,10 +437,14 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); + u16 budget = READ_ONCE(ep->busy_poll_budget); + + if (!budget) + budget = BUSY_POLL_BUDGET; if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, nonblock ? 
NULL : ep_busy_loop_end, ep, false, - BUSY_POLL_BUDGET); + budget); if (ep_events_available(ep)) return true; /* @@ -2091,6 +2097,7 @@ static int do_epoll_create(int flags) } #ifdef CONFIG_NET_RX_BUSY_POLL ep->busy_poll_usecs = 0; + ep->busy_poll_budget = 0; #endif ep->file = file; fd_install(fd, file); -- cgit v1.2.3 From de57a251082211b68e8c01e0e8210a23c022ac57 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 13 Feb 2024 06:16:44 +0000 Subject: eventpoll: Add per-epoll prefer busy poll option When using epoll-based busy poll, the prefer_busy_poll option is hardcoded to false. Users may want to enable prefer_busy_poll to be used in conjunction with gro_flush_timeout and defer_hard_irqs_count to keep device IRQs masked. Other busy poll methods allow enabling or disabling prefer busy poll via SO_PREFER_BUSY_POLL, but epoll-based busy polling uses a hardcoded value. Fix this edge case by adding support for a per-epoll context prefer_busy_poll option. The default is false, as it was hardcoded before this change. Signed-off-by: Joe Damato Acked-by: Stanislav Fomichev Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- fs/eventpoll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ed83ae33dd45..1b8d01af0c2c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -231,6 +231,7 @@ struct eventpoll { u32 busy_poll_usecs; /* busy poll packet budget */ u16 busy_poll_budget; + bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -438,13 +439,14 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); + bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (!budget) budget = BUSY_POLL_BUDGET; if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { - napi_busy_loop(napi_id, nonblock ? 
NULL : ep_busy_loop_end, ep, false, - budget); + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, + ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* @@ -2098,6 +2100,7 @@ static int do_epoll_create(int flags) #ifdef CONFIG_NET_RX_BUSY_POLL ep->busy_poll_usecs = 0; ep->busy_poll_budget = 0; + ep->prefer_busy_poll = false; #endif ep->file = file; fd_install(fd, file); -- cgit v1.2.3 From 18e2bf0edf4dd88d9656ec92395aa47392e85b61 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 13 Feb 2024 06:16:45 +0000 Subject: eventpoll: Add epoll ioctl for epoll_params Add an ioctl for getting and setting epoll_params. User programs can use this ioctl to get and set the busy poll usec time, packet budget, and prefer busy poll params for a specific epoll context. Parameters are limited: - busy_poll_usecs is limited to <= S32_MAX - busy_poll_budget is limited to <= NAPI_POLL_WEIGHT by unprivileged users (!capable(CAP_NET_ADMIN)) - prefer_busy_poll must be 0 or 1 - __pad must be 0 Signed-off-by: Joe Damato Acked-by: Stanislav Fomichev Reviewed-by: Jiri Slaby Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- fs/eventpoll.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'fs/eventpoll.c') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1b8d01af0c2c..df2ed3af486e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -37,6 +37,7 @@ #include #include #include +#include #include /* @@ -494,6 +495,49 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) ep->napi_id = napi_id; } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct eventpoll *ep = file->private_data; + void __user *uarg = (void __user *)arg; + struct epoll_params epoll_params; + + switch (cmd) { + case EPIOCSPARAMS: + if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) + return -EFAULT; + + /* pad byte must be zero */ + if (epoll_params.__pad) + return -EINVAL; + + if (epoll_params.busy_poll_usecs > S32_MAX) + return -EINVAL; + + if (epoll_params.prefer_busy_poll > 1) + return -EINVAL; + + if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && + !capable(CAP_NET_ADMIN)) + return -EPERM; + + WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); + WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); + WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); + return 0; + case EPIOCGPARAMS: + memset(&epoll_params, 0, sizeof(epoll_params)); + epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); + epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); + epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); + if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) + return -EFAULT; + return 0; + default: + return -ENOIOCTLCMD; + } +} + #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) @@ -505,6 +549,12 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } +static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return 
-EOPNOTSUPP; +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ /* @@ -864,6 +914,27 @@ static void ep_clear_and_put(struct eventpoll *ep) ep_free(ep); } +static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + + if (!is_file_epoll(file)) + return -EINVAL; + + switch (cmd) { + case EPIOCSPARAMS: + case EPIOCGPARAMS: + ret = ep_eventpoll_bp_ioctl(file, cmd, arg); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; @@ -970,6 +1041,8 @@ static const struct file_operations eventpoll_fops = { .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, + .unlocked_ioctl = ep_eventpoll_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* -- cgit v1.2.3