summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorShailabh Nagar <nagar@watson.ibm.com>2006-07-14 00:24:40 -0700
committerLinus Torvalds <torvalds@g5.osdl.org>2006-07-14 21:53:56 -0700
commitc757249af152c59fd74b85e52e8c090acb33d9c0 (patch)
tree78495f661fe537bf5087b24e6577659de8725b5a /kernel
parentfb0ba6bd021248b6bdc58a7b1213a55a6776a38a (diff)
downloadlinux-c757249af152c59fd74b85e52e8c090acb33d9c0.tar.gz
linux-c757249af152c59fd74b85e52e8c090acb33d9c0.tar.bz2
linux-c757249af152c59fd74b85e52e8c090acb33d9c0.zip
[PATCH] per-task-delay-accounting: taskstats interface
Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC family), for getting statistics of tasks and thread groups during their lifetime and when they exit. The interface is intended for use by multiple accounting packages though it is being created in the context of delay accounting. This patch creates the interface without populating the fields of the data that is sent to the user in response to a command or upon the exit of a task. Each accounting package interested in using taskstats has to provide an additional patch to add its stats to the common structure. [akpm@osdl.org: cleanups, Kconfig fix] Signed-off-by: Shailabh Nagar <nagar@us.ibm.com> Signed-off-by: Balbir Singh <balbir@in.ibm.com> Cc: Jes Sorensen <jes@sgi.com> Cc: Peter Chubb <peterc@gelato.unsw.edu.au> Cc: Erich Focht <efocht@ess.nec.de> Cc: Levent Serinol <lserinol@gmail.com> Cc: Jay Lan <jlan@engr.sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/taskstats.c336
3 files changed, 344 insertions, 0 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 87bb34cc8938..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index 3c2cf91defa7..9852ed8c2988 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,7 @@
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cpuset.h>
#include <linux/syscalls.h>
@@ -844,6 +845,7 @@ static void exit_notify(struct task_struct *tsk)
fastcall NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
+ struct taskstats *tidstats, *tgidstats;
int group_dead;
profile_task_exit(tsk);
@@ -882,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code)
current->comm, current->pid,
preempt_count());
+ taskstats_exit_alloc(&tidstats, &tgidstats);
+
acct_update_integrals(tsk);
if (tsk->mm) {
update_hiwater_rss(tsk->mm);
@@ -901,7 +905,10 @@ fastcall NORET_TYPE void do_exit(long code)
#endif
if (unlikely(tsk->audit_context))
audit_free(tsk);
+ taskstats_exit_send(tsk, tidstats, tgidstats);
+ taskstats_exit_free(tidstats, tgidstats);
delayacct_tsk_exit(tsk);
+
exit_mm(tsk);
if (group_dead)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..82ec9137d908
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,336 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+static DEFINE_MUTEX(taskstats_exit_mutex);
+
+static struct genl_family family = {
+ .id = GENL_ID_GENERATE,
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .maxattr = TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+ [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+};
+
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+ void **replyp, size_t size)
+{
+ struct sk_buff *skb;
+ void *reply;
+
+ /*
+ * If new attributes are added, please revisit this allocation
+ */
+ skb = nlmsg_new(size);
+ if (!skb)
+ return -ENOMEM;
+
+ if (!info) {
+ int seq = get_cpu_var(taskstats_seqnum)++;
+ put_cpu_var(taskstats_seqnum);
+
+ reply = genlmsg_put(skb, 0, seq,
+ family.id, 0, 0,
+ cmd, family.version);
+ } else
+ reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+ family.id, 0, 0,
+ cmd, family.version);
+ if (reply == NULL) {
+ nlmsg_free(skb);
+ return -EINVAL;
+ }
+
+ *skbp = skb;
+ *replyp = reply;
+ return 0;
+}
+
+static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ void *reply;
+ int rc;
+
+ reply = genlmsg_data(genlhdr);
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return rc;
+ }
+
+ if (event == TASKSTATS_MSG_MULTICAST)
+ return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
+ return genlmsg_unicast(skb, pid);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+ struct taskstats *stats)
+{
+ int rc;
+ struct task_struct *tsk = pidtsk;
+
+ if (!pidtsk) {
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ if (!tsk) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ } else
+ get_task_struct(tsk);
+
+ /*
+ * Each accounting subsystem adds calls to its functions to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * rc = per-task-foo(stats, tsk);
+ * if (rc)
+ * goto err;
+ */
+
+err:
+ put_task_struct(tsk);
+ return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+ struct taskstats *stats)
+{
+ int rc;
+ struct task_struct *tsk, *first;
+
+ first = tgidtsk;
+ read_lock(&tasklist_lock);
+ if (!first) {
+ first = find_task_by_pid(tgid);
+ if (!first) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ }
+ tsk = first;
+ do {
+ /*
+ * Each accounting subsystem adds calls its functions to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * rc = per-task-foo(stats, tsk);
+ * if (rc)
+ * break;
+ */
+
+ } while_each_thread(first, tsk);
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Accounting subsytems can also add calls here if they don't
+ * wish to aggregate statistics for per-tgid stats
+ */
+
+ return rc;
+}
+
+static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc = 0;
+ struct sk_buff *rep_skb;
+ struct taskstats stats;
+ void *reply;
+ size_t size;
+ struct nlattr *na;
+
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ memset(&stats, 0, sizeof(stats));
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+ if (rc < 0)
+ return rc;
+
+ if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+ u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+ rc = fill_pid(pid, NULL, &stats);
+ if (rc < 0)
+ goto err;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ stats);
+ } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+ u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+ rc = fill_tgid(tgid, NULL, &stats);
+ if (rc < 0)
+ goto err;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ stats);
+ } else {
+ rc = -EINVAL;
+ goto err;
+ }
+
+ nla_nest_end(rep_skb, na);
+
+ return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+
+nla_put_failure:
+ return genlmsg_cancel(rep_skb, reply);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+ struct taskstats *tgidstats)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ void *reply;
+ size_t size;
+ int is_thread_group;
+ struct nlattr *na;
+
+ if (!family_registered || !tidstats)
+ return;
+
+ mutex_lock(&taskstats_exit_mutex);
+
+ is_thread_group = !thread_group_empty(tsk);
+ rc = 0;
+
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ if (is_thread_group)
+ size = 2 * size; /* PID + STATS + TGID + STATS */
+
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+ if (rc < 0)
+ goto ret;
+
+ rc = fill_pid(tsk->pid, tsk, tidstats);
+ if (rc < 0)
+ goto err_skb;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ *tidstats);
+ nla_nest_end(rep_skb, na);
+
+ if (!is_thread_group || !tgidstats) {
+ send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+ goto ret;
+ }
+
+ rc = fill_tgid(tsk->pid, tsk, tgidstats);
+ /*
+ * If fill_tgid() failed then one probable reason could be that the
+ * thread group leader has exited. fill_tgid() will fail, send out
+ * the pid statistics collected earlier.
+ */
+ if (rc < 0) {
+ send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+ goto ret;
+ }
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ *tgidstats);
+ nla_nest_end(rep_skb, na);
+
+ send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+ goto ret;
+
+nla_put_failure:
+ genlmsg_cancel(rep_skb, reply);
+ goto ret;
+err_skb:
+ nlmsg_free(rep_skb);
+ret:
+ mutex_unlock(&taskstats_exit_mutex);
+ return;
+}
+
+static struct genl_ops taskstats_ops = {
+ .cmd = TASKSTATS_CMD_GET,
+ .doit = taskstats_send_stats,
+ .policy = taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+ taskstats_cache = kmem_cache_create("taskstats_cache",
+ sizeof(struct taskstats),
+ 0, SLAB_PANIC, NULL, NULL);
+}
+
+static int __init taskstats_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&family);
+ if (rc)
+ return rc;
+
+ rc = genl_register_ops(&family, &taskstats_ops);
+ if (rc < 0)
+ goto err;
+
+ family_registered = 1;
+ return 0;
+err:
+ genl_unregister_family(&family);
+ return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);