/* * fs/eventfd.c * * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> * */ #include <linux/file.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/anon_inodes.h> #include <linux/eventfd.h> #include <linux/syscalls.h> struct eventfd_ctx { wait_queue_head_t wqh; /* * Every time that a write(2) is performed on an eventfd, the * value of the __u64 being written is added to "count" and a * wakeup is performed on "wqh". A read(2) will return the "count" * value to userspace, and will reset "count" to zero. The kernel * size eventfd_signal() also, adds to the "count" counter and * issue a wakeup. */ __u64 count; unsigned int flags; }; /* * Adds "n" to the eventfd counter "count". Returns "n" in case of * success, or a value lower then "n" in case of coutner overflow. * This function is supposed to be called by the kernel in paths * that do not allow sleeping. In this function we allow the counter * to reach the ULLONG_MAX value, and we signal this as overflow * condition by returining a POLLERR to poll(2). */ int eventfd_signal(struct file *file, int n) { struct eventfd_ctx *ctx = file->private_data; unsigned long flags; if (n < 0) return -EINVAL; spin_lock_irqsave(&ctx->wqh.lock, flags); if (ULLONG_MAX - ctx->count < n) n = (int) (ULLONG_MAX - ctx->count); ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLIN); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; } static int eventfd_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } static unsigned int eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; unsigned int events = 0; unsigned long flags; poll_wait(file, &ctx->wqh, wait); spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->count > 0) events |= POLLIN; if (ctx->count == ULLONG_MAX) events |= POLLERR; if (ULLONG_MAX - 1 > ctx->count) events |= POLLOUT; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return events; } static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; if (ctx->count > 0) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { res = sizeof(ucnt); break; } if (signal_pending(current)) { res = -ERESTARTSYS; break; } spin_unlock_irq(&ctx->wqh.lock); schedule(); spin_lock_irq(&ctx->wqh.lock); } __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } if (likely(res > 0)) { ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; ctx->count -= ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) return -EFAULT; return res; } static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt; DECLARE_WAITQUEUE(wait, current); if (count < sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; if (ucnt == ULLONG_MAX) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; if (ULLONG_MAX - ctx->count > ucnt) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); if (ULLONG_MAX - ctx->count > ucnt) { res = sizeof(ucnt); break; } if (signal_pending(current)) { res = -ERESTARTSYS; break; } spin_unlock_irq(&ctx->wqh.lock); schedule(); spin_lock_irq(&ctx->wqh.lock); } __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } if (likely(res > 0)) { ctx->count += ucnt; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLIN); } spin_unlock_irq(&ctx->wqh.lock); return res; } static const struct file_operations eventfd_fops = { .release = eventfd_release, .poll = eventfd_poll, .read = eventfd_read, .write = eventfd_write, }; struct file *eventfd_fget(int fd) { struct file *file; file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &eventfd_fops) { fput(file); return ERR_PTR(-EINVAL); } return file; } SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { int fd; struct eventfd_ctx *ctx; /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; /* * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, flags & EFD_SHARED_FCNTL_FLAGS); if (fd < 0) kfree(ctx); return fd; } SYSCALL_DEFINE1(eventfd, unsigned int, count) { return sys_eventfd2(count, 0); }