summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c117
-rw-r--r--kernel/audit.c205
-rw-r--r--kernel/audit.h61
-rw-r--r--kernel/auditfilter.c899
-rw-r--r--kernel/auditsc.c649
-rw-r--r--kernel/compat.c30
-rw-r--r--kernel/cpuset.c16
-rw-r--r--kernel/exit.c22
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/futex.c8
-rw-r--r--kernel/hrtimer.c25
-rw-r--r--kernel/intermodule.c184
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/migration.c4
-rw-r--r--kernel/irq/proc.c3
-rw-r--r--kernel/irq/spurious.c12
-rw-r--r--kernel/kexec.c6
-rw-r--r--kernel/ksysfs.c19
-rw-r--r--kernel/kthread.c61
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/posix-cpu-timers.c48
-rw-r--r--kernel/power/Kconfig9
-rw-r--r--kernel/power/disk.c2
-rw-r--r--kernel/power/main.c8
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/snapshot.c148
-rw-r--r--kernel/power/swsusp.c20
-rw-r--r--kernel/printk.c80
-rw-r--r--kernel/rcupdate.c13
-rw-r--r--kernel/sched.c18
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/stop_machine.c17
-rw-r--r--kernel/sys.c80
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c26
-rw-r--r--kernel/timer.c32
-rw-r--r--kernel/user.c4
-rw-r--r--kernel/workqueue.c34
41 files changed, 2113 insertions, 773 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..f6ef00f4f90f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..368c4f03fe0e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
/*
* This structure is used so that all the data protected by lock
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
spin_unlock(&acct_globals.lock);
/* May block */
- if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+ if (vfs_statfs(file->f_dentry, &sbuf))
return res;
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
@@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file)
if (old_acct) {
mnt_unpin(old_acct->f_vfsmnt);
spin_unlock(&acct_globals.lock);
- do_acct_process(0, old_acct);
+ do_acct_process(old_acct);
filp_close(old_acct, NULL);
spin_lock(&acct_globals.lock);
}
@@ -419,16 +419,15 @@ static u32 encode_float(u64 value)
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
{
+ struct pacct_struct *pacct = &current->signal->pacct;
acct_t ac;
mm_segment_t fs;
- unsigned long vsize;
unsigned long flim;
u64 elapsed;
u64 run_time;
struct timespec uptime;
- unsigned long jiffies;
/*
* First check to see if there is enough free_space to continue
@@ -469,12 +468,6 @@ static void do_acct_process(long exitcode, struct file *file)
#endif
do_div(elapsed, AHZ);
ac.ac_btime = xtime.tv_sec - elapsed;
- jiffies = cputime_to_jiffies(cputime_add(current->utime,
- current->signal->utime));
- ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
- jiffies = cputime_to_jiffies(cputime_add(current->stime,
- current->signal->stime));
- ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
/* we really need to bite the bullet and change layout */
ac.ac_uid = current->uid;
ac.ac_gid = current->gid;
@@ -496,37 +489,18 @@ static void do_acct_process(long exitcode, struct file *file)
old_encode_dev(tty_devnum(current->signal->tty)) : 0;
read_unlock(&tasklist_lock);
- ac.ac_flag = 0;
- if (current->flags & PF_FORKNOEXEC)
- ac.ac_flag |= AFORK;
- if (current->flags & PF_SUPERPRIV)
- ac.ac_flag |= ASU;
- if (current->flags & PF_DUMPCORE)
- ac.ac_flag |= ACORE;
- if (current->flags & PF_SIGNALED)
- ac.ac_flag |= AXSIG;
-
- vsize = 0;
- if (current->mm) {
- struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
- vma = current->mm->mmap;
- while (vma) {
- vsize += vma->vm_end - vma->vm_start;
- vma = vma->vm_next;
- }
- up_read(&current->mm->mmap_sem);
- }
- vsize = vsize / 1024;
- ac.ac_mem = encode_comp_t(vsize);
+ spin_lock(&current->sighand->siglock);
+ ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+ ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+ ac.ac_flag = pacct->ac_flag;
+ ac.ac_mem = encode_comp_t(pacct->ac_mem);
+ ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+ ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+ ac.ac_exitcode = pacct->ac_exitcode;
+ spin_unlock(&current->sighand->siglock);
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
- ac.ac_minflt = encode_comp_t(current->signal->min_flt +
- current->min_flt);
- ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
- current->maj_flt);
ac.ac_swaps = encode_comp_t(0);
- ac.ac_exitcode = exitcode;
/*
* Kernel segment override to datasegment and write it
@@ -546,12 +520,63 @@ static void do_acct_process(long exitcode, struct file *file)
}
/**
+ * acct_init_pacct - initialize a new pacct_struct
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+ memset(pacct, 0, sizeof(struct pacct_struct));
+ pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+ struct pacct_struct *pacct = &current->signal->pacct;
+ unsigned long vsize = 0;
+
+ if (group_dead && current->mm) {
+ struct vm_area_struct *vma;
+ down_read(&current->mm->mmap_sem);
+ vma = current->mm->mmap;
+ while (vma) {
+ vsize += vma->vm_end - vma->vm_start;
+ vma = vma->vm_next;
+ }
+ up_read(&current->mm->mmap_sem);
+ }
+
+ spin_lock_irq(&current->sighand->siglock);
+ if (group_dead)
+ pacct->ac_mem = vsize / 1024;
+ if (thread_group_leader(current)) {
+ pacct->ac_exitcode = exitcode;
+ if (current->flags & PF_FORKNOEXEC)
+ pacct->ac_flag |= AFORK;
+ }
+ if (current->flags & PF_SUPERPRIV)
+ pacct->ac_flag |= ASU;
+ if (current->flags & PF_DUMPCORE)
+ pacct->ac_flag |= ACORE;
+ if (current->flags & PF_SIGNALED)
+ pacct->ac_flag |= AXSIG;
+ pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+ pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+ pacct->ac_minflt += current->min_flt;
+ pacct->ac_majflt += current->maj_flt;
+ spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
* acct_process - now just a wrapper around do_acct_process
* @exitcode: task exit code
*
* handles process accounting for an exiting task
*/
-void acct_process(long exitcode)
+void acct_process()
{
struct file *file = NULL;
@@ -570,7 +595,7 @@ void acct_process(long exitcode)
get_file(file);
spin_unlock(&acct_globals.lock);
- do_acct_process(exitcode, file);
+ do_acct_process(file);
fput(file);
}
@@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk)
*/
void acct_clear_integrals(struct task_struct *tsk)
{
- if (tsk) {
- tsk->acct_stimexpd = 0;
- tsk->acct_rss_mem1 = 0;
- tsk->acct_vm_mem1 = 0;
- }
+ tsk->acct_stimexpd = 0;
+ tsk->acct_rss_mem1 = 0;
+ tsk->acct_vm_mem1 = 0;
}
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..7dfac7031bd7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/selinux.h>
+#include <linux/inotify.h>
#include "audit.h"
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
uid_t audit_sig_uid = -1;
pid_t audit_sig_pid = -1;
+u32 audit_sig_sid = 0;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
/* The audit_freelist is a list of pre-allocated audit buffers (if more
* than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
* being placed on the freelist). */
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
-/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneously in
- * auditsc.c */
-DEFINE_MUTEX(audit_netlink_mutex);
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
"audit_rate_limit=%d old=%d by auid=%u",
limit, old, loginuid);
audit_rate_limit = limit;
- return old;
+ return 0;
}
static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
"audit_backlog_limit=%d old=%d by auid=%u",
limit, old, loginuid);
audit_backlog_limit = limit;
- return old;
+ return 0;
}
static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
"audit_enabled=%d old=%d by auid=%u",
state, old, loginuid);
audit_enabled = state;
- return old;
+ return 0;
}
static int audit_set_failure(int state, uid_t loginuid, u32 sid)
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
"audit_failure=%d old=%d by auid=%u",
state, old, loginuid);
audit_failure = state;
- return old;
+ return 0;
}
static int kauditd_thread(void *dummy)
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy)
remove_wait_queue(&kauditd_wait, &wait);
}
}
+}
+
+int audit_send_list(void *_dest)
+{
+ struct audit_netlink_list *dest = _dest;
+ int pid = dest->pid;
+ struct sk_buff *skb;
+
+ /* wait for parent to finish and send an ACK */
+ mutex_lock(&audit_cmd_mutex);
+ mutex_unlock(&audit_cmd_mutex);
+
+ while ((skb = __skb_dequeue(&dest->q)) != NULL)
+ netlink_unicast(audit_sock, skb, pid, 0);
+
+ kfree(dest);
+
return 0;
}
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+ int multi, void *payload, int size)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ int len = NLMSG_SPACE(size);
+ void *data;
+ int flags = multi ? NLM_F_MULTI : 0;
+ int t = done ? NLMSG_DONE : type;
+
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ nlh = NLMSG_PUT(skb, pid, seq, t, size);
+ nlh->nlmsg_flags = flags;
+ data = NLMSG_DATA(nlh);
+ memcpy(data, payload, size);
+ return skb;
+
+nlmsg_failure: /* Used by NLMSG_PUT */
+ if (skb)
+ kfree_skb(skb);
+ return NULL;
+}
+
/**
* audit_send_reply - send an audit reply message via netlink
* @pid: process id to send reply to
@@ -383,29 +432,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
void *payload, int size)
{
struct sk_buff *skb;
- struct nlmsghdr *nlh;
- int len = NLMSG_SPACE(size);
- void *data;
- int flags = multi ? NLM_F_MULTI : 0;
- int t = done ? NLMSG_DONE : type;
-
- skb = alloc_skb(len, GFP_KERNEL);
+ skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
if (!skb)
return;
-
- nlh = NLMSG_PUT(skb, pid, seq, t, size);
- nlh->nlmsg_flags = flags;
- data = NLMSG_DATA(nlh);
- memcpy(data, payload, size);
-
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(audit_sock, skb, pid, 0);
return;
-
-nlmsg_failure: /* Used by NLMSG_PUT */
- if (skb)
- kfree_skb(skb);
}
/*
@@ -451,7 +484,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
uid_t loginuid; /* loginuid of sender */
- struct audit_sig_info sig_data;
+ struct audit_sig_info *sig_data;
+ char *ctx;
+ u32 len;
err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
if (err)
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_PID) {
int old = audit_pid;
if (sid) {
- char *ctx = NULL;
- u32 len;
- int rc;
- if ((rc = selinux_ctxid_to_string(
+ if ((err = selinux_ctxid_to_string(
sid, &ctx, &len)))
- return rc;
+ return err;
else
audit_log(NULL, GFP_KERNEL,
AUDIT_CONFIG_CHANGE,
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_pid = status_get->pid;
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
- audit_set_rate_limit(status_get->rate_limit,
+ err = audit_set_rate_limit(status_get->rate_limit,
loginuid, sid);
if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
- audit_set_backlog_limit(status_get->backlog_limit,
+ err = audit_set_backlog_limit(status_get->backlog_limit,
loginuid, sid);
break;
case AUDIT_USER:
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
"user pid=%d uid=%u auid=%u",
pid, uid, loginuid);
if (sid) {
- char *ctx = NULL;
- u32 len;
if (selinux_ctxid_to_string(
sid, &ctx, &len)) {
audit_log_format(ab,
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid, sid);
break;
case AUDIT_SIGNAL_INFO:
- sig_data.uid = audit_sig_uid;
- sig_data.pid = audit_sig_pid;
+ err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+ if (err)
+ return err;
+ sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ if (!sig_data) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ sig_data->uid = audit_sig_uid;
+ sig_data->pid = audit_sig_pid;
+ memcpy(sig_data->ctx, ctx, len);
+ kfree(ctx);
audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, &sig_data, sizeof(sig_data));
+ 0, 0, sig_data, sizeof(*sig_data) + len);
+ kfree(sig_data);
break;
default:
err = -EINVAL;
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
struct sk_buff *skb;
unsigned int qlen;
- mutex_lock(&audit_netlink_mutex);
+ mutex_lock(&audit_cmd_mutex);
for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
skb = skb_dequeue(&sk->sk_receive_queue);
audit_receive_skb(skb);
kfree_skb(skb);
}
- mutex_unlock(&audit_netlink_mutex);
+ mutex_unlock(&audit_cmd_mutex);
}
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+ .handle_event = audit_handle_ievent,
+ .destroy_watch = audit_free_parent,
+};
+#endif
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
+#ifdef CONFIG_AUDITSYSCALL
+ int i;
+#endif
+
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -661,6 +712,16 @@ static int __init audit_init(void)
selinux_audit_set_callback(&selinux_audit_rule_update);
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+ audit_ih = inotify_init(&audit_inotify_ops);
+ if (IS_ERR(audit_ih))
+ audit_panic("cannot initialize inotify handle");
+
+ for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+ INIT_LIST_HEAD(&audit_inode_hash[i]);
+#endif
+
return 0;
}
__initcall(audit_init);
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
kfree_skb(ab->skb);
spin_lock_irqsave(&audit_freelist_lock, flags);
- if (++audit_freelist_count > AUDIT_MAXFREE)
+ if (audit_freelist_count > AUDIT_MAXFREE)
kfree(ab);
- else
+ else {
+ audit_freelist_count++;
list_add(&ab->list, &audit_freelist);
+ }
spin_unlock_irqrestore(&audit_freelist_lock, flags);
}
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
skb_put(skb, len << 1); /* new string is twice the old string */
}
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
+ const char *string)
+{
+ int avail, new_len;
+ unsigned char *ptr;
+ struct sk_buff *skb;
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ new_len = slen + 3; /* enclosing quotes + null terminator */
+ if (new_len > avail) {
+ avail = audit_expand(ab, new_len);
+ if (!avail)
+ return;
+ }
+ ptr = skb->tail;
+ *ptr++ = '"';
+ memcpy(ptr, string, slen);
+ ptr += slen;
+ *ptr++ = '"';
+ *ptr = 0;
+ skb_put(skb, slen + 2); /* don't include null terminator */
+}
+
/**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
+ * @len: length of string (not including trailing null)
* @string: string to be logged
*
* This code will escape a string that is passed to it if the string
* contains a control character, unprintable character, double quote mark,
* or a space. Unescaped strings will start and end with a double quote mark.
* Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
*/
-void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
+ const char *string)
{
const unsigned char *p = string;
while (*p) {
if (*p == '"' || *p < 0x21 || *p > 0x7f) {
- audit_log_hex(ab, string, strlen(string));
- return;
+ audit_log_hex(ab, string, len);
+ return string + len + 1;
}
p++;
}
- audit_log_format(ab, "\"%s\"", string);
+ audit_log_n_string(ab, len, string);
+ return p + 1;
+}
+
+/**
+ * audit_log_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_untrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+ return audit_log_n_untrustedstring(ab, strlen(string), string);
}
/* This is a helper-function to print the escaped d_path */
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..8323e4132a33 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/audit.h>
+#include <linux/skbuff.h>
/* 0 = no checking
1 = put_count checking
@@ -53,6 +53,18 @@ enum audit_state {
};
/* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+ atomic_t count; /* reference count */
+ char *path; /* insertion path */
+ dev_t dev; /* associated superblock device */
+ unsigned long ino; /* associated inode number */
+ struct audit_parent *parent; /* associated parent */
+ struct list_head wlist; /* entry in parent->watches list */
+ struct list_head rules; /* associated rules */
+};
+
struct audit_field {
u32 type;
u32 val;
@@ -70,6 +82,9 @@ struct audit_krule {
u32 buflen; /* for data alloc on list rules */
u32 field_count;
struct audit_field *fields;
+ struct audit_field *inode_f; /* quick access to an inode field */
+ struct audit_watch *watch; /* associated watch */
+ struct list_head rlist; /* entry in audit_watch.rules list */
};
struct audit_entry {
@@ -78,15 +93,53 @@ struct audit_entry {
struct audit_krule rule;
};
-
extern int audit_pid;
-extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+#define AUDIT_INODE_BUCKETS 32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+ return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+ int *dirlen);
+extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
+ int done, int multi,
+ void *payload, int size);
extern void audit_send_reply(int pid, int seq, int type,
int done, int multi,
void *payload, int size);
extern void audit_log_lost(const char *message);
extern void audit_panic(const char *message);
-extern struct mutex audit_netlink_mutex;
+struct audit_netlink_list {
+ int pid;
+ struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct inotify_watch;
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+ const char *, struct inode *);
extern int selinux_audit_rule_update(void);
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void __audit_signal_info(int sig, struct task_struct *t);
+static inline void audit_signal_info(int sig, struct task_struct *t)
+{
+ if (unlikely(audit_pid && t->tgid == audit_pid))
+ __audit_signal_info(sig, t);
+}
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+ struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
+#else
+#define audit_signal_info(s,t)
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
+#endif
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..4c99d2c586ed 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
#include <linux/selinux.h>
#include "audit.h"
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * Synchronizes writes and blocking reads of audit's filterlist
+ * data. RCU is used to traverse the filterlist and access
+ * contents of structs audit_entry, audit_watch and opaque
+ * selinux rules during filtering. If modified, these structures
+ * must be copied and replace their counterparts in the filterlist.
+ * An audit_parent struct is not accessed during filtering, so may
+ * be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * event. Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * audit_remove_watch(). Additionally, an audit_watch may exist
+ * temporarily to assist in searching existing filter data. Each
+ * audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+ struct list_head ilist; /* entry in inotify registration list */
+ struct list_head watches; /* associated watches */
+ struct inotify_watch wdata; /* inotify watch data */
+ unsigned flags; /* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID 0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[0]),
LIST_HEAD_INIT(audit_filter_list[1]),
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
#endif
};
+static DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+ struct audit_parent *parent;
+
+ parent = container_of(i_watch, struct audit_parent, wdata);
+ WARN_ON(!list_empty(&parent->watches));
+ kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+ atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+ if (atomic_dec_and_test(&watch->count)) {
+ WARN_ON(watch->parent);
+ WARN_ON(!list_empty(&watch->rules));
+ kfree(watch->path);
+ kfree(watch);
+ }
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+ list_del(&watch->wlist);
+ put_inotify_watch(&watch->parent->wdata);
+ watch->parent = NULL;
+ audit_put_watch(watch); /* match initial get */
+}
+
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
+
+ /* some rules don't have associated watches */
+ if (e->rule.watch)
+ audit_put_watch(e->rule.watch);
if (e->rule.fields)
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
@@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
audit_free_rule(e);
}
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+ struct audit_parent *parent;
+ s32 wd;
+
+ parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+ if (unlikely(!parent))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&parent->watches);
+ parent->flags = 0;
+
+ inotify_init_watch(&parent->wdata);
+ /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+ get_inotify_watch(&parent->wdata);
+ wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
+ AUDIT_IN_WATCH);
+ if (wd < 0) {
+ audit_free_parent(&parent->wdata);
+ return ERR_PTR(wd);
+ }
+
+ return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+ struct audit_watch *watch;
+
+ watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+ if (unlikely(!watch))
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&watch->rules);
+ atomic_set(&watch->count, 1);
+ watch->path = path;
+ watch->dev = (dev_t)-1;
+ watch->ino = (unsigned long)-1;
+
+ return watch;
+}
+
/* Initialize an audit filterlist entry. */
static inline struct audit_entry *audit_init_entry(u32 field_count)
{
@@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
return str;
}
+/* Translate an inode field to kernel representation. */
+static inline int audit_to_inode(struct audit_krule *krule,
+ struct audit_field *f)
+{
+ if (krule->listnr != AUDIT_FILTER_EXIT ||
+ krule->watch || krule->inode_f)
+ return -EINVAL;
+
+ krule->inode_f = f;
+ return 0;
+}
+
+/* Translate a watch string to kernel representation. */
+static int audit_to_watch(struct audit_krule *krule, char *path, int len,
+ u32 op)
+{
+ struct audit_watch *watch;
+
+ if (!audit_ih)
+ return -EOPNOTSUPP;
+
+ if (path[0] != '/' || path[len-1] == '/' ||
+ krule->listnr != AUDIT_FILTER_EXIT ||
+ op & ~AUDIT_EQUAL ||
+ krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
+ return -EINVAL;
+
+ watch = audit_init_watch(path);
+ if (unlikely(IS_ERR(watch)))
+ return PTR_ERR(watch);
+
+ audit_get_watch(watch);
+ krule->watch = watch;
+
+ return 0;
+}
+
/* Common user-space to kernel rule translation. */
static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
{
@@ -128,8 +299,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
#endif
;
}
- if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
- rule->action != AUDIT_ALWAYS)
+ if (unlikely(rule->action == AUDIT_POSSIBLE)) {
+ printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+ goto exit_err;
+ }
+ if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
goto exit_err;
if (rule->field_count > AUDIT_MAX_FIELDS)
goto exit_err;
@@ -158,6 +332,7 @@ exit_err:
static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
{
struct audit_entry *entry;
+ struct audit_field *f;
int err = 0;
int i;
@@ -172,14 +347,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
f->val = rule->values[i];
- if (f->type & AUDIT_UNUSED_BITS ||
- f->type == AUDIT_SE_USER ||
- f->type == AUDIT_SE_ROLE ||
- f->type == AUDIT_SE_TYPE ||
- f->type == AUDIT_SE_SEN ||
- f->type == AUDIT_SE_CLR) {
- err = -EINVAL;
+ err = -EINVAL;
+ switch(f->type) {
+ default:
goto exit_free;
+ case AUDIT_PID:
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_LOGINUID:
+ case AUDIT_PERS:
+ case AUDIT_ARCH:
+ case AUDIT_MSGTYPE:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ break;
+ case AUDIT_INODE:
+ err = audit_to_inode(&entry->rule, f);
+ if (err)
+ goto exit_free;
+ break;
}
entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -196,6 +394,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
}
}
+ f = entry->rule.inode_f;
+ if (f) {
+ switch(f->op) {
+ case AUDIT_NOT_EQUAL:
+ entry->rule.inode_f = NULL;
+ case AUDIT_EQUAL:
+ break;
+ default:
+ goto exit_free;
+ }
+ }
+
exit_nofree:
return entry;
@@ -210,6 +420,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
{
int err = 0;
struct audit_entry *entry;
+ struct audit_field *f;
void *bufp;
size_t remain = datasz - sizeof(struct audit_rule_data);
int i;
@@ -235,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->se_str = NULL;
f->se_rule = NULL;
switch(f->type) {
+ case AUDIT_PID:
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_LOGINUID:
+ case AUDIT_PERS:
+ case AUDIT_ARCH:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ break;
case AUDIT_SE_USER:
case AUDIT_SE_ROLE:
case AUDIT_SE_TYPE:
@@ -260,6 +494,37 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
} else
f->se_str = str;
break;
+ case AUDIT_WATCH:
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ entry->rule.buflen += f->val;
+
+ err = audit_to_watch(&entry->rule, str, f->val, f->op);
+ if (err) {
+ kfree(str);
+ goto exit_free;
+ }
+ break;
+ case AUDIT_INODE:
+ err = audit_to_inode(&entry->rule, f);
+ if (err)
+ goto exit_free;
+ break;
+ default:
+ goto exit_free;
+ }
+ }
+
+ f = entry->rule.inode_f;
+ if (f) {
+ switch(f->op) {
+ case AUDIT_NOT_EQUAL:
+ entry->rule.inode_f = NULL;
+ case AUDIT_EQUAL:
+ break;
+ default:
+ goto exit_free;
}
}
@@ -291,7 +556,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
rule = kmalloc(sizeof(*rule), GFP_KERNEL);
if (unlikely(!rule))
- return ERR_PTR(-ENOMEM);
+ return NULL;
memset(rule, 0, sizeof(*rule));
rule->flags = krule->flags | krule->listnr;
@@ -322,7 +587,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
if (unlikely(!data))
- return ERR_PTR(-ENOMEM);
+ return NULL;
memset(data, 0, sizeof(*data));
data->flags = krule->flags | krule->listnr;
@@ -343,6 +608,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, f->se_str);
break;
+ case AUDIT_WATCH:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, krule->watch->path);
+ break;
default:
data->values[i] = f->val;
}
@@ -378,6 +647,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
return 1;
break;
+ case AUDIT_WATCH:
+ if (strcmp(a->watch->path, b->watch->path))
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -391,6 +664,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
return 0;
}
+/* Make a copy of an audit watch.  The copy gets its own duplicate of the
+ * path string and its own reference on the parent's inotify watch; dev and
+ * ino are carried over.  The copy's rules list starts out empty and its
+ * wlist linkage is left undefined.  Returns the new watch or an ERR_PTR. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+	struct audit_watch *copy;
+	char *path_copy = kstrdup(old->path, GFP_KERNEL);
+
+	if (unlikely(!path_copy))
+		return ERR_PTR(-ENOMEM);
+
+	copy = audit_init_watch(path_copy);
+	if (unlikely(IS_ERR(copy))) {
+		/* no watch was created, so the string is still ours to free */
+		kfree(path_copy);
+		return copy;
+	}
+
+	copy->dev = old->dev;
+	copy->ino = old->ino;
+	get_inotify_watch(&old->parent->wdata);
+	copy->parent = old->parent;
+
+	return copy;
+}
+
+
/* Duplicate selinux field information. The se_rule is opaque, so must be
* re-initialized. */
static inline int audit_dupe_selinux_field(struct audit_field *df,
@@ -422,8 +721,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
/* Duplicate an audit rule. This will be a deep copy with the exception
* of the watch - that pointer is carried over. The selinux specific fields
* will be updated in the copy. The point is to be able to replace the old
- * rule with the new rule in the filterlist, then free the old rule. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
+ * rule with the new rule in the filterlist, then free the old rule.
+ * The rlist element is undefined; list manipulations are handled apart from
+ * the initial copy. */
+static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+ struct audit_watch *watch)
{
u32 fcount = old->field_count;
struct audit_entry *entry;
@@ -442,6 +744,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
new->mask[i] = old->mask[i];
new->buflen = old->buflen;
+ new->inode_f = old->inode_f;
+ new->watch = NULL;
new->field_count = old->field_count;
memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
@@ -463,68 +767,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
}
}
+ if (watch) {
+ audit_get_watch(watch);
+ new->watch = watch;
+ }
+
return entry;
}
-/* Add rule to given filterlist if not a duplicate. Protected by
- * audit_netlink_mutex. */
+/* Update inode info in audit rules based on filesystem event.
+ *
+ * Looks on @parent for the watch whose path ends in @dname and replaces it
+ * with a duplicate carrying the new @dev / @ino.  Every rule attached to the
+ * old watch is copied, rehashed by the new inode number and swapped in; the
+ * old rule is handed to call_rcu() for freeing.  An event applies to at most
+ * one watch, so processing stops after the first match.
+ *
+ * @invalidating is non-zero when the rules are being invalidated; in that
+ * case the inode filters are run first so the current syscall's record is
+ * not lost.  Takes and releases audit_filter_mutex. */
+static void audit_update_watch(struct audit_parent *parent,
+			       const char *dname, dev_t dev,
+			       unsigned long ino, unsigned invalidating)
+{
+	struct audit_watch *owatch, *nwatch, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *oentry, *nentry;
+	struct audit_buffer *ab;
+
+	mutex_lock(&audit_filter_mutex);
+	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+		if (audit_compare_dname_path(dname, owatch->path, NULL))
+			continue;
+
+		/* If the update involves invalidating rules, do the inode-based
+		 * filtering now, so we don't omit records.  The current task
+		 * may have no audit context; audit_filter_inodes() and
+		 * audit_set_auditable() both dereference it, so skip the
+		 * filtering in that case. */
+		if (invalidating && current->audit_context &&
+		    audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
+			audit_set_auditable(current->audit_context);
+
+		nwatch = audit_dupe_watch(owatch);
+		if (unlikely(IS_ERR(nwatch))) {
+			mutex_unlock(&audit_filter_mutex);
+			audit_panic("error updating watch, skipping");
+			return;
+		}
+		nwatch->dev = dev;
+		nwatch->ino = ino;
+
+		list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+			oentry = container_of(r, struct audit_entry, rule);
+			list_del(&oentry->rule.rlist);
+			list_del_rcu(&oentry->list);
+
+			/* a rule that cannot be copied is dropped, not kept */
+			nentry = audit_dupe_rule(&oentry->rule, nwatch);
+			if (unlikely(IS_ERR(nentry)))
+				audit_panic("error updating watch, removing");
+			else {
+				int h = audit_hash_ino((u32)ino);
+				list_add(&nentry->rule.rlist, &nwatch->rules);
+				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+			}
+
+			call_rcu(&oentry->rcu, audit_free_rule_rcu);
+		}
+
+		ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+		audit_log_format(ab, "audit updated rules specifying watch=");
+		audit_log_untrustedstring(ab, owatch->path);
+		audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
+		audit_log_end(ab);
+
+		audit_remove_watch(owatch);
+		goto add_watch_to_parent; /* event applies to a single watch */
+	}
+	mutex_unlock(&audit_filter_mutex);
+	return;
+
+add_watch_to_parent:
+	list_add(&nwatch->wlist, &parent->watches);
+	mutex_unlock(&audit_filter_mutex);
+	return;
+}
+
+
+/* Tear down every watch hanging off a parent that is going away, together
+ * with all rules attached to those watches.  The parent is flagged
+ * AUDIT_PARENT_INVALID before anything is removed.  Each removed rule is
+ * logged as a config change and handed to call_rcu() for freeing.  Takes and
+ * releases audit_filter_mutex. */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+	struct audit_watch *watch, *watch_tmp;
+	struct audit_krule *krule, *krule_tmp;
+
+	mutex_lock(&audit_filter_mutex);
+	parent->flags |= AUDIT_PARENT_INVALID;
+	list_for_each_entry_safe(watch, watch_tmp, &parent->watches, wlist) {
+		list_for_each_entry_safe(krule, krule_tmp, &watch->rules, rlist) {
+			struct audit_entry *entry =
+				container_of(krule, struct audit_entry, rule);
+
+			list_del(&krule->rlist);
+			list_del_rcu(&entry->list);
+			call_rcu(&entry->rcu, audit_free_rule_rcu);
+
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				  "audit implicitly removed rule from list=%d\n",
+				  AUDIT_FILTER_EXIT);
+		}
+		audit_remove_watch(watch);
+	}
+	mutex_unlock(&audit_filter_mutex);
+}
+
+
+/* Drain in_list, removing the inotify watch of every parent found on it.
+ * Generates an IN_IGNORED event for each one. */
+static void audit_inotify_unregister(struct list_head *in_list)
+{
+	struct audit_parent *parent, *next;
+
+	list_for_each_entry_safe(parent, next, in_list, ilist) {
+		list_del(&parent->ilist);
+		inotify_rm_watch(audit_ih, &parent->wdata);
+		/* balances the reference taken when audit_del_rule() queued
+		 * this parent for un-registration */
+		put_inotify_watch(&parent->wdata);
+	}
+}
+
+
+/* Look up an installed rule equal to the given one.  Returns the matching
+ * entry, or NULL when there is none.
+ * Caller must hold audit_filter_mutex to prevent stale rule data. */
+static struct audit_entry *audit_find_rule(struct audit_entry *entry,
+					   struct list_head *list)
+{
+	struct audit_entry *cur;
+	int bucket;
+
+	if (!entry->rule.watch) {
+		/* ordinary rule: only the caller's list needs searching */
+		list_for_each_entry(cur, list, list)
+			if (!audit_compare_rule(&entry->rule, &cur->rule))
+				return cur;
+		return NULL;
+	}
+
+	/* we don't know the inode number, so must walk entire hash */
+	for (bucket = 0; bucket < AUDIT_INODE_BUCKETS; bucket++) {
+		list_for_each_entry(cur, &audit_inode_hash[bucket], list)
+			if (!audit_compare_rule(&entry->rule, &cur->rule))
+				return cur;
+	}
+
+	return NULL;
+}
+
+
+/* Look up the path information needed to add a watch.
+ *
+ * On success *ndp holds the lookup of path's parent directory and *ndw the
+ * lookup of path itself, or NULL if that second lookup failed.  Both are
+ * allocated here and are released with audit_put_nd().  Returns 0 on
+ * success, -ENOMEM, or the error from the parent lookup; the output
+ * pointers are not written on failure. */
+static int audit_get_nd(char *path, struct nameidata **ndp,
+			struct nameidata **ndw)
+{
+	struct nameidata *parent_nd, *watch_nd;
+	int rc;
+
+	parent_nd = kmalloc(sizeof(*parent_nd), GFP_KERNEL);
+	if (unlikely(!parent_nd))
+		return -ENOMEM;
+
+	watch_nd = kmalloc(sizeof(*watch_nd), GFP_KERNEL);
+	if (unlikely(!watch_nd)) {
+		rc = -ENOMEM;
+		goto free_parent;
+	}
+
+	rc = path_lookup(path, LOOKUP_PARENT, parent_nd);
+	if (rc)
+		goto free_watch;
+
+	/* a failed lookup of the path itself is not treated as an error */
+	if (path_lookup(path, 0, watch_nd)) {
+		kfree(watch_nd);
+		watch_nd = NULL;
+	}
+
+	*ndp = parent_nd;
+	*ndw = watch_nd;
+	return 0;
+
+free_watch:
+	kfree(watch_nd);
+free_parent:
+	kfree(parent_nd);
+	return rc;
+}
+
+
+/* Release the lookups obtained from audit_get_nd().  Either pointer may be
+ * NULL, in which case it is skipped. */
+static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
+{
+	struct nameidata *held[] = { ndp, ndw };
+	unsigned int i;
+
+	/* parent first, then the watched object, as before */
+	for (i = 0; i < sizeof(held) / sizeof(held[0]); i++) {
+		if (!held[i])
+			continue;
+		path_release(held[i]);
+		kfree(held[i]);
+	}
+}
+
+
+/* Associate the given rule with an existing parent inotify_watch.
+ * Caller must hold audit_filter_mutex.
+ *
+ * If the parent already has a watch for the same path, the rule's own
+ * (temporary) watch has two references put and krule->watch is repointed at
+ * the existing watch, so callers must not keep using a pointer to the old
+ * krule->watch afterwards.  Otherwise the rule's watch is linked onto the
+ * parent, and a reference is taken on the parent's inotify watch.  Either
+ * way the rule ends up on the chosen watch's rules list. */
+static void audit_add_to_parent(struct audit_krule *krule,
+ struct audit_parent *parent)
+{
+ struct audit_watch *w, *watch = krule->watch;
+ int watch_found = 0;
+
+ list_for_each_entry(w, &parent->watches, wlist) {
+ if (strcmp(watch->path, w->path))
+ continue;
+
+ watch_found = 1;
+
+ /* put krule's and initial refs to temporary watch */
+ audit_put_watch(watch);
+ audit_put_watch(watch);
+
+ audit_get_watch(w); /* the rule's reference on the shared watch */
+ krule->watch = watch = w;
+ break;
+ }
+
+ if (!watch_found) {
+ get_inotify_watch(&parent->wdata); /* held on behalf of watch->parent */
+ watch->parent = parent;
+
+ list_add(&watch->wlist, &parent->watches);
+ }
+ list_add(&krule->rlist, &watch->rules);
+}
+
+/* Find a matching watch entry, or add this one.
+ * Caller must hold audit_filter_mutex.
+ *
+ * ndp is the lookup of the parent directory and ndw the lookup of the
+ * watched object itself (NULL when unavailable; the watch's dev/ino are then
+ * left as they were).  The mutex is released around the inotify calls and is
+ * held again on every return path.  On success audit_add_to_parent() may
+ * have replaced krule->watch.  Returns 0 on success, -ENOENT if the parent
+ * was marked invalid in the meantime, or the error from
+ * audit_init_parent(). */
+static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
+ struct nameidata *ndw)
+{
+ struct audit_watch *watch = krule->watch;
+ struct inotify_watch *i_watch;
+ struct audit_parent *parent;
+ int ret = 0;
+
+ /* update watch filter fields */
+ if (ndw) {
+ watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
+ watch->ino = ndw->dentry->d_inode->i_ino;
+ }
+
+ /* The audit_filter_mutex must not be held during inotify calls because
+ * we hold it during inotify event callback processing. If an existing
+ * inotify watch is found, inotify_find_watch() grabs a reference before
+ * returning.
+ */
+ mutex_unlock(&audit_filter_mutex);
+
+ if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
+ parent = audit_init_parent(ndp); /* no inotify watch on this directory yet */
+ if (IS_ERR(parent)) {
+ /* caller expects mutex locked */
+ mutex_lock(&audit_filter_mutex);
+ return PTR_ERR(parent);
+ }
+ } else
+ parent = container_of(i_watch, struct audit_parent, wdata);
+
+ mutex_lock(&audit_filter_mutex);
+
+ /* parent was moved before we took audit_filter_mutex */
+ if (parent->flags & AUDIT_PARENT_INVALID)
+ ret = -ENOENT;
+ else
+ audit_add_to_parent(krule, parent);
+
+ /* match get in audit_init_parent or inotify_find_watch */
+ put_inotify_watch(&parent->wdata);
+ return ret;
+}
+
+/* Add rule to given filterlist if not a duplicate.
+ *
+ * Rules with an inode field or a watch are filed in audit_inode_hash rather
+ * than in the list passed by the caller.  Path lookups for a watch are done
+ * before audit_filter_mutex is taken for the insertion.  Returns 0 on
+ * success, -EEXIST if an identical rule is already installed, or the error
+ * from audit_get_nd()/audit_add_watch(); on failure the reference on the
+ * rule's temporary watch is dropped. */
static inline int audit_add_rule(struct audit_entry *entry,
- struct list_head *list)
+ struct list_head *list)
{
struct audit_entry *e;
+ struct audit_field *inode_f = entry->rule.inode_f;
+ struct audit_watch *watch = entry->rule.watch;
+ struct nameidata *ndp, *ndw;
+ int h, err, putnd_needed = 0;
+
+ if (inode_f) {
+ h = audit_hash_ino(inode_f->val);
+ list = &audit_inode_hash[h];
+ }
- /* Do not use the _rcu iterator here, since this is the only
- * addition routine. */
- list_for_each_entry(e, list, list) {
- if (!audit_compare_rule(&entry->rule, &e->rule))
- return -EEXIST;
+ mutex_lock(&audit_filter_mutex);
+ e = audit_find_rule(entry, list);
+ mutex_unlock(&audit_filter_mutex);
+ if (e) {
+ err = -EEXIST;
+ goto error;
+ }
+
+ /* Avoid calling path_lookup under audit_filter_mutex. */
+ if (watch) {
+ err = audit_get_nd(watch->path, &ndp, &ndw);
+ if (err)
+ goto error;
+ putnd_needed = 1;
+ }
+
+ mutex_lock(&audit_filter_mutex);
+ if (watch) {
+ /* audit_filter_mutex is dropped and re-taken during this call */
+ err = audit_add_watch(&entry->rule, ndp, ndw);
+ if (err) {
+ mutex_unlock(&audit_filter_mutex);
+ goto error;
+ }
+ /* audit_add_watch() may have swapped the rule's temporary watch
+ * for one already on the parent and put the temporary watch's
+ * references, so re-read the pointer instead of using the one
+ * cached at function entry. */
+ watch = entry->rule.watch;
+ h = audit_hash_ino((u32)watch->ino);
+ list = &audit_inode_hash[h];
}
if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
list_add_rcu(&entry->list, list);
+ entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
} else {
list_add_tail_rcu(&entry->list, list);
}
+ mutex_unlock(&audit_filter_mutex);
- return 0;
+ if (putnd_needed)
+ audit_put_nd(ndp, ndw);
+
+ return 0;
+
+error:
+ if (putnd_needed)
+ audit_put_nd(ndp, ndw);
+ if (watch)
+ audit_put_watch(watch); /* tmp watch, matches initial get */
+ return err;
}
-/* Remove an existing rule from filterlist. Protected by
- * audit_netlink_mutex. */
+/* Remove an existing rule from filterlist.
+ *
+ * entry describes the rule to remove; the matching installed rule is looked
+ * up under audit_filter_mutex and handed to call_rcu() for freeing.  If that
+ * leaves its watch without rules the watch is removed too, and a parent left
+ * without watches is unregistered from inotify once the mutex has been
+ * dropped.  The reference on entry's own temporary watch is put on every
+ * path.  Returns 0 on success or -ENOENT when no matching rule exists. */
static inline int audit_del_rule(struct audit_entry *entry,
struct list_head *list)
{
struct audit_entry *e;
+ struct audit_field *inode_f = entry->rule.inode_f;
+ struct audit_watch *watch, *tmp_watch = entry->rule.watch;
+ LIST_HEAD(inotify_list); /* parents to unregister after unlocking */
+ int h, ret = 0;
+
+ if (inode_f) {
+ h = audit_hash_ino(inode_f->val);
+ list = &audit_inode_hash[h];
+ }
- /* Do not use the _rcu iterator here, since this is the only
- * deletion routine. */
- list_for_each_entry(e, list, list) {
- if (!audit_compare_rule(&entry->rule, &e->rule)) {
- list_del_rcu(&e->list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
- return 0;
+ mutex_lock(&audit_filter_mutex);
+ e = audit_find_rule(entry, list);
+ if (!e) {
+ mutex_unlock(&audit_filter_mutex);
+ ret = -ENOENT;
+ goto out;
+ }
+
+ watch = e->rule.watch; /* the installed rule's watch, not entry's temporary one */
+ if (watch) {
+ struct audit_parent *parent = watch->parent;
+
+ list_del(&e->rule.rlist);
+
+ if (list_empty(&watch->rules)) {
+ audit_remove_watch(watch);
+
+ if (list_empty(&parent->watches)) {
+ /* Put parent on the inotify un-registration
+ * list. Grab a reference before releasing
+ * audit_filter_mutex, to be released in
+ * audit_inotify_unregister(). */
+ list_add(&parent->ilist, &inotify_list);
+ get_inotify_watch(&parent->wdata);
+ }
}
}
- return -ENOENT; /* No matching rule */
+
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
+
+ mutex_unlock(&audit_filter_mutex);
+
+ if (!list_empty(&inotify_list))
+ audit_inotify_unregister(&inotify_list);
+
+out:
+ if (tmp_watch)
+ audit_put_watch(tmp_watch); /* match initial get */
+
+ return ret;
}
/* List rules using struct audit_rule. Exists for backward
* compatibility with userspace. */
-static int audit_list(void *_dest)
+static void audit_list(int pid, int seq, struct sk_buff_head *q)
{
- int pid, seq;
- int *dest = _dest;
+ struct sk_buff *skb;
struct audit_entry *entry;
int i;
- pid = dest[0];
- seq = dest[1];
- kfree(dest);
-
- mutex_lock(&audit_netlink_mutex);
-
- /* The *_rcu iterators not needed here because we are
- always called with audit_netlink_mutex held. */
+ /* This is a blocking read, so use audit_filter_mutex instead of rcu
+ * iterator to sync with list writers. */
for (i=0; i<AUDIT_NR_FILTERS; i++) {
list_for_each_entry(entry, &audit_filter_list[i], list) {
struct audit_rule *rule;
@@ -532,33 +1177,41 @@ static int audit_list(void *_dest)
rule = audit_krule_to_rule(&entry->rule);
if (unlikely(!rule))
break;
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
rule, sizeof(*rule));
+ if (skb)
+ skb_queue_tail(q, skb);
kfree(rule);
}
}
- audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
-
- mutex_unlock(&audit_netlink_mutex);
- return 0;
+ for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
+ list_for_each_entry(entry, &audit_inode_hash[i], list) {
+ struct audit_rule *rule;
+
+ rule = audit_krule_to_rule(&entry->rule);
+ if (unlikely(!rule))
+ break;
+ skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
+ rule, sizeof(*rule));
+ if (skb)
+ skb_queue_tail(q, skb);
+ kfree(rule);
+ }
+ }
+ skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+ if (skb)
+ skb_queue_tail(q, skb);
}
/* List rules using struct audit_rule_data. */
-static int audit_list_rules(void *_dest)
+static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
{
- int pid, seq;
- int *dest = _dest;
+ struct sk_buff *skb;
struct audit_entry *e;
int i;
- pid = dest[0];
- seq = dest[1];
- kfree(dest);
-
- mutex_lock(&audit_netlink_mutex);
-
- /* The *_rcu iterators not needed here because we are
- always called with audit_netlink_mutex held. */
+ /* This is a blocking read, so use audit_filter_mutex instead of rcu
+ * iterator to sync with list writers. */
for (i=0; i<AUDIT_NR_FILTERS; i++) {
list_for_each_entry(e, &audit_filter_list[i], list) {
struct audit_rule_data *data;
@@ -566,15 +1219,30 @@ static int audit_list_rules(void *_dest)
data = audit_krule_to_data(&e->rule);
if (unlikely(!data))
break;
- audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
- data, sizeof(*data));
+ skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+ data, sizeof(*data) + data->buflen);
+ if (skb)
+ skb_queue_tail(q, skb);
kfree(data);
}
}
- audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
+ list_for_each_entry(e, &audit_inode_hash[i], list) {
+ struct audit_rule_data *data;
- mutex_unlock(&audit_netlink_mutex);
- return 0;
+ data = audit_krule_to_data(&e->rule);
+ if (unlikely(!data))
+ break;
+ skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
+ data, sizeof(*data) + data->buflen);
+ if (skb)
+ skb_queue_tail(q, skb);
+ kfree(data);
+ }
+ }
+ skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+ if (skb)
+ skb_queue_tail(q, skb);
}
/**
@@ -592,7 +1260,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
size_t datasz, uid_t loginuid, u32 sid)
{
struct task_struct *tsk;
- int *dest;
+ struct audit_netlink_list *dest;
int err = 0;
struct audit_entry *entry;
@@ -605,18 +1273,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
* happen if we're actually running in the context of auditctl
* trying to _send_ the stuff */
- dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+ dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest[0] = pid;
- dest[1] = seq;
+ dest->pid = pid;
+ skb_queue_head_init(&dest->q);
+ mutex_lock(&audit_filter_mutex);
if (type == AUDIT_LIST)
- tsk = kthread_run(audit_list, dest, "audit_list");
+ audit_list(pid, seq, &dest->q);
else
- tsk = kthread_run(audit_list_rules, dest,
- "audit_list_rules");
+ audit_list_rules(pid, seq, &dest->q);
+ mutex_unlock(&audit_filter_mutex);
+
+ tsk = kthread_run(audit_send_list, dest, "audit_send_list");
if (IS_ERR(tsk)) {
+ skb_queue_purge(&dest->q);
kfree(dest);
err = PTR_ERR(tsk);
}
@@ -632,6 +1304,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
err = audit_add_rule(entry,
&audit_filter_list[entry->rule.listnr]);
+
if (sid) {
char *ctx = NULL;
u32 len;
@@ -712,7 +1385,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
return 0;
}
+/* Compare given dentry name with last component in given path,
+ * return of 0 indicates a match.
+ *
+ * Trailing slashes on path are ignored.  When dirlen is non-NULL and the
+ * last component is located, *dirlen receives the length of path's
+ * directory part, i.e. the offset at which the last component starts.
+ * A NULL dname or path never matches. */
+int audit_compare_dname_path(const char *dname, const char *path,
+			     int *dirlen)
+{
+	int dlen, plen;
+	const char *p;
+
+	if (!dname || !path)
+		return 1;
+
+	dlen = strlen(dname);
+	plen = strlen(path);
+	if (plen < dlen)
+		return 1;
+
+	/* An empty path has no last character to start from; stepping back
+	 * from it would read before the buffer.  Only an empty dname can get
+	 * here, and it matches with an empty directory part. */
+	if (plen == 0) {
+		if (dirlen)
+			*dirlen = 0;
+		return 0;
+	}
+
+	/* disregard trailing slashes */
+	p = path + plen - 1;
+	while ((*p == '/') && (p > path))
+		p--;
+
+	/* find last path component */
+	p = p - dlen + 1;
+	if (p < path)
+		return 1;
+	else if (p > path) {
+		/* the component must start right after a separator */
+		if (*--p != '/')
+			return 1;
+		else
+			p++;
+	}
+
+	/* return length of path's directory component */
+	if (dirlen)
+		*dirlen = p - path;
+	return strncmp(p, dname, dlen);
+}
static int audit_filter_user_rules(struct netlink_skb_parms *cb,
struct audit_krule *rule,
@@ -744,7 +1453,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
}
switch (rule->action) {
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
}
return 1;
@@ -826,32 +1534,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
int selinux_audit_rule_update(void)
{
struct audit_entry *entry, *n, *nentry;
+ struct audit_watch *watch; /* watch of the rule being refreshed, NULL if none */
int i, err = 0;
- /* audit_netlink_mutex synchronizes the writers */
- mutex_lock(&audit_netlink_mutex);
+ /* audit_filter_mutex synchronizes the writers */
+ mutex_lock(&audit_filter_mutex);
for (i = 0; i < AUDIT_NR_FILTERS; i++) {
list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
if (!audit_rule_has_selinux(&entry->rule))
continue;
- nentry = audit_dupe_rule(&entry->rule);
+ watch = entry->rule.watch;
+ nentry = audit_dupe_rule(&entry->rule, watch);
if (unlikely(IS_ERR(nentry))) {
/* save the first error encountered for the
* return value */
if (!err)
err = PTR_ERR(nentry);
audit_panic("error updating selinux filters");
+ if (watch) /* no copy to take over: just unhook the old rule from its watch */
+ list_del(&entry->rule.rlist);
list_del_rcu(&entry->list);
} else {
+ if (watch) { /* the copy replaces the old rule on the watch's rules list */
+ list_add(&nentry->rule.rlist,
+ &watch->rules);
+ list_del(&entry->rule.rlist);
+ }
list_replace_rcu(&entry->list, &nentry->list);
}
call_rcu(&entry->rcu, audit_free_rule_rcu);
}
}
- mutex_unlock(&audit_netlink_mutex);
+ mutex_unlock(&audit_filter_mutex);
return err;
}
+
+/* React to an inotify event on a watched parent directory by updating or
+ * removing the audit watches and rules that hang off it.  Only the first
+ * matching class of event bits is acted on, in the order tested below. */
+void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
+			 u32 cookie, const char *dname, struct inode *inode)
+{
+	struct audit_parent *parent =
+		container_of(i_watch, struct audit_parent, wdata);
+
+	/* a child appeared: point its watch at the new inode */
+	if ((mask & (IN_CREATE|IN_MOVED_TO)) && inode) {
+		audit_update_watch(parent, dname, inode->i_sb->s_dev,
+				   inode->i_ino, 0);
+		return;
+	}
+
+	/* a child went away: invalidate the inode info of its watch */
+	if (mask & (IN_DELETE|IN_MOVED_FROM)) {
+		audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+		return;
+	}
+
+	/* inotify automatically removes the watch and sends IN_IGNORED */
+	if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) {
+		audit_remove_parent_watches(parent);
+		return;
+	}
+
+	/* inotify does not remove the watch, so remove it manually */
+	if (mask & IN_MOVE_SELF) {
+		audit_remove_parent_watches(parent);
+		inotify_remove_watch_locked(audit_ih, i_watch);
+		return;
+	}
+
+	if (mask & IN_IGNORED)
+		put_inotify_watch(i_watch);
+}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c03a4ed1b27..9ebd96fda295 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -3,7 +3,7 @@
*
* Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
- * Copyright (C) 2005 IBM Corporation
+ * Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,9 @@
* this file -- see entry.S) is based on a GPL'd patch written by
* okir@suse.de and Copyright 2003 SuSE Linux AG.
*
+ * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
+ * 2006.
+ *
* The support of additional filter rules compares (>, <, >=, <=) was
* added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
*
@@ -49,6 +52,7 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/socket.h>
+#include <linux/mqueue.h>
#include <linux/audit.h>
#include <linux/personality.h>
#include <linux/time.h>
@@ -59,6 +63,8 @@
#include <linux/list.h>
#include <linux/tty.h>
#include <linux/selinux.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
#include "audit.h"
@@ -76,6 +82,9 @@ extern int audit_enabled;
* path_lookup. */
#define AUDIT_NAMES_RESERVED 7
+/* Indicates that audit should log the full pathname.  Used as a value of
+ * audit_names.name_len; parenthesized so the negative constant expands
+ * safely inside larger expressions. */
+#define AUDIT_NAME_FULL (-1)
+
/* When fs/namei.c:getname() is called, we store the pointer in name and
* we don't let putname() free it (instead we free all of the saved
* pointers at syscall exit time).
@@ -83,8 +92,9 @@ extern int audit_enabled;
* Further, in fs/namei.c:path_lookup() we store the inode and device. */
struct audit_names {
const char *name;
+ int name_len; /* number of name's characters to log */
+ unsigned name_put; /* call __putname() for this name */
unsigned long ino;
- unsigned long pino;
dev_t dev;
umode_t mode;
uid_t uid;
@@ -100,6 +110,33 @@ struct audit_aux_data {
#define AUDIT_AUX_IPCPERM 0
+struct audit_aux_data_mq_open { /* aux record logged for type AUDIT_MQ_OPEN */
+ struct audit_aux_data d; /* common header; must stay first, records are cast from struct audit_aux_data * */
+ int oflag; /* logged as oflag= */
+ mode_t mode; /* logged as mode= */
+ struct mq_attr attr; /* mq_flags, mq_maxmsg, mq_msgsize and mq_curmsgs are logged */
+};
+
+struct audit_aux_data_mq_sendrecv { /* aux record logged for type AUDIT_MQ_SENDRECV */
+ struct audit_aux_data d; /* common header; must stay first, records are cast from struct audit_aux_data * */
+ mqd_t mqdes; /* logged as mqdes= */
+ size_t msg_len; /* logged as msg_len= */
+ unsigned int msg_prio; /* logged as msg_prio= */
+ struct timespec abs_timeout; /* logged as abs_timeout_sec= / abs_timeout_nsec= */
+};
+
+struct audit_aux_data_mq_notify { /* aux record logged for type AUDIT_MQ_NOTIFY */
+ struct audit_aux_data d; /* common header; must stay first, records are cast from struct audit_aux_data * */
+ mqd_t mqdes; /* logged as mqdes= */
+ struct sigevent notification; /* only sigev_signo is logged */
+};
+
+struct audit_aux_data_mq_getsetattr { /* aux record logged for type AUDIT_MQ_GETSETATTR */
+ struct audit_aux_data d; /* common header; must stay first, records are cast from struct audit_aux_data * */
+ mqd_t mqdes; /* logged as mqdes= */
+ struct mq_attr mqstat; /* mq_flags, mq_maxmsg, mq_msgsize and mq_curmsgs are logged */
+};
+
struct audit_aux_data_ipcctl {
struct audit_aux_data d;
struct ipc_perm p;
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl {
u32 osid;
};
+struct audit_aux_data_execve { /* aux record logged for type AUDIT_EXECVE */
+ struct audit_aux_data d; /* common header; must stay first, records are cast from struct audit_aux_data * */
+ int argc; /* number of strings logged from mem as a0=, a1=, ... */
+ int envc; /* NOTE(review): not read by the logging code visible here */
+ char mem[0]; /* trailing string data, walked one string after another when logged; presumably NUL-separated - confirm at the fill site */
+};
+
struct audit_aux_data_socketcall {
struct audit_aux_data d;
int nargs;
@@ -148,7 +192,7 @@ struct audit_context {
struct audit_aux_data *aux;
/* Save things to print about task_struct */
- pid_t pid;
+ pid_t pid, ppid;
uid_t uid, euid, suid, fsuid;
gid_t gid, egid, sgid, fsgid;
unsigned long personality;
@@ -160,12 +204,13 @@ struct audit_context {
#endif
};
-
+/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise. */
static int audit_filter_rules(struct task_struct *tsk,
struct audit_krule *rule,
struct audit_context *ctx,
+ struct audit_names *name,
enum audit_state *state)
{
int i, j, need_sid = 1;
@@ -179,6 +224,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_PID:
result = audit_comparator(tsk->pid, f->op, f->val);
break;
+ case AUDIT_PPID:
+ if (ctx)
+ result = audit_comparator(ctx->ppid, f->op, f->val);
+ break;
case AUDIT_UID:
result = audit_comparator(tsk->uid, f->op, f->val);
break;
@@ -224,7 +273,10 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMAJOR:
- if (ctx) {
+ if (name)
+ result = audit_comparator(MAJOR(name->dev),
+ f->op, f->val);
+ else if (ctx) {
for (j = 0; j < ctx->name_count; j++) {
if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
++result;
@@ -234,7 +286,10 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMINOR:
- if (ctx) {
+ if (name)
+ result = audit_comparator(MINOR(name->dev),
+ f->op, f->val);
+ else if (ctx) {
for (j = 0; j < ctx->name_count; j++) {
if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
++result;
@@ -244,16 +299,22 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_INODE:
- if (ctx) {
+ if (name)
+ result = (name->ino == f->val);
+ else if (ctx) {
for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(ctx->names[j].ino, f->op, f->val) ||
- audit_comparator(ctx->names[j].pino, f->op, f->val)) {
+ if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
++result;
break;
}
}
}
break;
+ case AUDIT_WATCH:
+ if (name && rule->watch->ino != (unsigned long)-1)
+ result = (name->dev == rule->watch->dev &&
+ name->ino == rule->watch->ino);
+ break;
case AUDIT_LOGINUID:
result = 0;
if (ctx)
@@ -294,7 +355,6 @@ static int audit_filter_rules(struct task_struct *tsk,
}
switch (rule->action) {
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
- case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
}
return 1;
@@ -311,7 +371,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
- if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
rcu_read_unlock();
return state;
}
@@ -341,8 +401,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
int bit = AUDIT_BIT(ctx->major);
list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit
- && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+ if ((e->rule.mask[word] & bit) == bit &&
+ audit_filter_rules(tsk, &e->rule, ctx, NULL,
+ &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall exit time, this filter is called if any audit_names[] have been
+ * collected during syscall processing. We only check rules in sublists at hash
+ * buckets applicable to the inode numbers in audit_names[].
+ * Regarding audit_state, same rules apply as for audit_filter_syscall().
+ */
+enum audit_state audit_filter_inodes(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ int i;
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return AUDIT_DISABLED;
+
+ rcu_read_lock();
+ for (i = 0; i < ctx->name_count; i++) {
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+ struct audit_names *n = &ctx->names[i];
+ int h = audit_hash_ino((u32)n->ino);
+ struct list_head *list = &audit_inode_hash[h];
+
+ if (list_empty(list))
+ continue;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit &&
+ audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
rcu_read_unlock();
return state;
}
@@ -352,6 +451,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
return AUDIT_BUILD_CONTEXT;
}
+/* Flag the given audit context as auditable.  This is the same flag that
+ * audit_get_context() sets when a filter returns AUDIT_RECORD_CONTEXT.
+ * ctx must not be NULL. */
+void audit_set_auditable(struct audit_context *ctx)
+{
+	ctx->auditable = 1;
+}
+
static inline struct audit_context *audit_get_context(struct task_struct *tsk,
int return_valid,
int return_code)
@@ -365,12 +469,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
if (context->in_syscall && !context->auditable) {
enum audit_state state;
+
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
+ if (state == AUDIT_RECORD_CONTEXT) {
+ context->auditable = 1;
+ goto get_context;
+ }
+
+ state = audit_filter_inodes(tsk, context);
if (state == AUDIT_RECORD_CONTEXT)
context->auditable = 1;
+
}
+get_context:
context->pid = tsk->pid;
+ context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
context->uid = tsk->uid;
context->gid = tsk->gid;
context->euid = tsk->euid;
@@ -413,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context)
#endif
for (i = 0; i < context->name_count; i++) {
- if (context->names[i].name)
+ if (context->names[i].name && context->names[i].name_put)
__putname(context->names[i].name);
}
context->name_count = 0;
@@ -606,7 +720,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
tty = "(none)";
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " pid=%d auid=%u uid=%u gid=%u"
+ " ppid=%d pid=%d auid=%u uid=%u gid=%u"
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s",
context->argv[0],
@@ -614,6 +728,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->argv[2],
context->argv[3],
context->name_count,
+ context->ppid,
context->pid,
context->loginuid,
context->uid,
@@ -630,11 +745,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
continue; /* audit_panic has been called */
switch (aux->type) {
+ case AUDIT_MQ_OPEN: {
+ struct audit_aux_data_mq_open *axi = (void *)aux;
+ audit_log_format(ab,
+ "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+ "mq_msgsize=%ld mq_curmsgs=%ld",
+ axi->oflag, axi->mode, axi->attr.mq_flags,
+ axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
+ axi->attr.mq_curmsgs);
+ break; }
+
+ case AUDIT_MQ_SENDRECV: {
+ struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
+ audit_log_format(ab,
+ "mqdes=%d msg_len=%zd msg_prio=%u "
+ "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
+ axi->mqdes, axi->msg_len, axi->msg_prio,
+ axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
+ break; }
+
+ case AUDIT_MQ_NOTIFY: {
+ struct audit_aux_data_mq_notify *axi = (void *)aux;
+ audit_log_format(ab,
+ "mqdes=%d sigev_signo=%d",
+ axi->mqdes,
+ axi->notification.sigev_signo);
+ break; }
+
+ case AUDIT_MQ_GETSETATTR: {
+ struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
+ audit_log_format(ab,
+ "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
+ "mq_curmsgs=%ld ",
+ axi->mqdes,
+ axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
+ axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
+ break; }
+
case AUDIT_IPC: {
struct audit_aux_data_ipcctl *axi = (void *)aux;
audit_log_format(ab,
- " qbytes=%lx iuid=%u igid=%u mode=%x",
- axi->qbytes, axi->uid, axi->gid, axi->mode);
+ "ouid=%u ogid=%u mode=%x",
+ axi->uid, axi->gid, axi->mode);
if (axi->osid != 0) {
char *ctx = NULL;
u32 len;
@@ -652,19 +804,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
case AUDIT_IPC_SET_PERM: {
struct audit_aux_data_ipcctl *axi = (void *)aux;
audit_log_format(ab,
- " new qbytes=%lx new iuid=%u new igid=%u new mode=%x",
+ "qbytes=%lx ouid=%u ogid=%u mode=%x",
axi->qbytes, axi->uid, axi->gid, axi->mode);
- if (axi->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (selinux_ctxid_to_string(
- axi->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u",
- axi->osid);
- call_panic = 1;
- } else
- audit_log_format(ab, " obj=%s", ctx);
- kfree(ctx);
+ break; }
+
+ case AUDIT_EXECVE: {
+ struct audit_aux_data_execve *axi = (void *)aux;
+ int i;
+ const char *p;
+ for (i = 0, p = axi->mem; i < axi->argc; i++) {
+ audit_log_format(ab, "a%d=", i);
+ p = audit_log_untrustedstring(ab, p);
+ audit_log_format(ab, "\n");
}
break; }
@@ -700,8 +851,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
}
}
for (i = 0; i < context->name_count; i++) {
- unsigned long ino = context->names[i].ino;
- unsigned long pino = context->names[i].pino;
+ struct audit_names *n = &context->names[i];
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
if (!ab)
@@ -709,33 +859,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_format(ab, "item=%d", i);
- audit_log_format(ab, " name=");
- if (context->names[i].name)
- audit_log_untrustedstring(ab, context->names[i].name);
- else
- audit_log_format(ab, "(null)");
-
- if (pino != (unsigned long)-1)
- audit_log_format(ab, " parent=%lu", pino);
- if (ino != (unsigned long)-1)
- audit_log_format(ab, " inode=%lu", ino);
- if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1))
- audit_log_format(ab, " dev=%02x:%02x mode=%#o"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- MAJOR(context->names[i].dev),
- MINOR(context->names[i].dev),
- context->names[i].mode,
- context->names[i].uid,
- context->names[i].gid,
- MAJOR(context->names[i].rdev),
- MINOR(context->names[i].rdev));
- if (context->names[i].osid != 0) {
+ if (n->name) {
+ switch(n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", context->pwd,
+ context->pwdmnt);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name_len,
+ n->name);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#o"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ n->uid,
+ n->gid,
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
char *ctx = NULL;
u32 len;
if (selinux_ctxid_to_string(
- context->names[i].osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u",
- context->names[i].osid);
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
call_panic = 2;
} else
audit_log_format(ab, " obj=%s", ctx);
@@ -908,11 +1072,11 @@ void audit_syscall_exit(int valid, long return_code)
* Add a name to the list of audit names for this context.
* Called from fs/namei.c:getname().
*/
-void audit_getname(const char *name)
+void __audit_getname(const char *name)
{
struct audit_context *context = current->audit_context;
- if (!context || IS_ERR(name) || !name)
+ if (IS_ERR(name) || !name)
return;
if (!context->in_syscall) {
@@ -925,6 +1089,8 @@ void audit_getname(const char *name)
}
BUG_ON(context->name_count >= AUDIT_NAMES);
context->names[context->name_count].name = name;
+ context->names[context->name_count].name_len = AUDIT_NAME_FULL;
+ context->names[context->name_count].name_put = 1;
context->names[context->name_count].ino = (unsigned long)-1;
++context->name_count;
if (!context->pwd) {
@@ -991,11 +1157,10 @@ static void audit_inode_context(int idx, const struct inode *inode)
* audit_inode - store the inode and device from a lookup
* @name: name being audited
* @inode: inode being audited
- * @flags: lookup flags (as used in path_lookup())
*
* Called from fs/namei.c:path_lookup().
*/
-void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
+void __audit_inode(const char *name, const struct inode *inode)
{
int idx;
struct audit_context *context = current->audit_context;
@@ -1021,20 +1186,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
++context->ino_count;
#endif
}
+ context->names[idx].ino = inode->i_ino;
context->names[idx].dev = inode->i_sb->s_dev;
context->names[idx].mode = inode->i_mode;
context->names[idx].uid = inode->i_uid;
context->names[idx].gid = inode->i_gid;
context->names[idx].rdev = inode->i_rdev;
audit_inode_context(idx, inode);
- if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
- (strcmp(name, ".") != 0)) {
- context->names[idx].ino = (unsigned long)-1;
- context->names[idx].pino = inode->i_ino;
- } else {
- context->names[idx].ino = inode->i_ino;
- context->names[idx].pino = (unsigned long)-1;
- }
}
/**
@@ -1056,51 +1214,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
{
int idx;
struct audit_context *context = current->audit_context;
+ const char *found_name = NULL;
+ int dirlen = 0;
if (!context->in_syscall)
return;
/* determine matching parent */
- if (dname)
- for (idx = 0; idx < context->name_count; idx++)
- if (context->names[idx].pino == pino) {
- const char *n;
- const char *name = context->names[idx].name;
- int dlen = strlen(dname);
- int nlen = name ? strlen(name) : 0;
-
- if (nlen < dlen)
- continue;
-
- /* disregard trailing slashes */
- n = name + nlen - 1;
- while ((*n == '/') && (n > name))
- n--;
-
- /* find last path component */
- n = n - dlen + 1;
- if (n < name)
- continue;
- else if (n > name) {
- if (*--n != '/')
- continue;
- else
- n++;
- }
-
- if (strncmp(n, dname, dlen) == 0)
- goto update_context;
+ if (!dname)
+ goto update_context;
+ for (idx = 0; idx < context->name_count; idx++)
+ if (context->names[idx].ino == pino) {
+ const char *name = context->names[idx].name;
+
+ if (!name)
+ continue;
+
+ if (audit_compare_dname_path(dname, name, &dirlen) == 0) {
+ context->names[idx].name_len = dirlen;
+ found_name = name;
+ break;
}
+ }
- /* catch-all in case match not found */
+update_context:
idx = context->name_count++;
- context->names[idx].name = NULL;
- context->names[idx].pino = pino;
#if AUDIT_DEBUG
context->ino_count++;
#endif
+ /* Re-use the name belonging to the slot for a matching parent directory.
+ * All names for this context are relinquished in audit_free_names() */
+ context->names[idx].name = found_name;
+ context->names[idx].name_len = AUDIT_NAME_FULL;
+ context->names[idx].name_put = 0; /* don't call __putname() */
-update_context:
if (inode) {
context->names[idx].ino = inode->i_ino;
context->names[idx].dev = inode->i_sb->s_dev;
@@ -1109,7 +1256,8 @@ update_context:
context->names[idx].gid = inode->i_gid;
context->names[idx].rdev = inode->i_rdev;
audit_inode_context(idx, inode);
- }
+ } else
+ context->names[idx].ino = (unsigned long)-1;
}
/**
@@ -1142,18 +1290,23 @@ void auditsc_get_stamp(struct audit_context *ctx,
*/
int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
{
- if (task->audit_context) {
- struct audit_buffer *ab;
-
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
- if (ab) {
- audit_log_format(ab, "login pid=%d uid=%u "
- "old auid=%u new auid=%u",
- task->pid, task->uid,
- task->audit_context->loginuid, loginuid);
- audit_log_end(ab);
+ struct audit_context *context = task->audit_context;
+
+ if (context) {
+ /* Only log if audit is enabled */
+ if (context->in_syscall) {
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ if (ab) {
+ audit_log_format(ab, "login pid=%d uid=%u "
+ "old auid=%u new auid=%u",
+ task->pid, task->uid,
+ context->loginuid, loginuid);
+ audit_log_end(ab);
+ }
}
- task->audit_context->loginuid = loginuid;
+ context->loginuid = loginuid;
}
return 0;
}
@@ -1170,16 +1323,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
}
/**
- * audit_ipc_obj - record audit data for ipc object
- * @ipcp: ipc permissions
+ * __audit_mq_open - record audit data for a POSIX MQ open
+ * @oflag: open flag
+ * @mode: mode bits
+ * @u_attr: queue attributes
*
* Returns 0 for success or NULL context or < 0 on error.
*/
-int audit_ipc_obj(struct kern_ipc_perm *ipcp)
+int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
{
- struct audit_aux_data_ipcctl *ax;
+ struct audit_aux_data_mq_open *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_attr != NULL) {
+ if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->attr, 0, sizeof(ax->attr));
+
+ ax->oflag = oflag;
+ ax->mode = mode;
+
+ ax->d.type = AUDIT_MQ_OPEN;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time (user-space pointer; may be NULL)
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
+ const struct timespec __user *u_abs_timeout)
+{
+ struct audit_aux_data_mq_sendrecv *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_abs_timeout != NULL) {
+ if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+
+ ax->mqdes = mqdes;
+ ax->msg_len = msg_len;
+ ax->msg_prio = msg_prio;
+
+ ax->d.type = AUDIT_MQ_SENDRECV;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @u_msg_prio: Message priority (user-space pointer; may be NULL)
+ * @u_abs_timeout: Message timeout in absolute time (user-space pointer; may be NULL)
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
+ unsigned int __user *u_msg_prio,
+ const struct timespec __user *u_abs_timeout)
+{
+ struct audit_aux_data_mq_sendrecv *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_msg_prio != NULL) {
+ if (get_user(ax->msg_prio, u_msg_prio)) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ ax->msg_prio = 0;
+
+ if (u_abs_timeout != NULL) {
+ if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
+
+ ax->mqdes = mqdes;
+ ax->msg_len = msg_len;
+
+ ax->d.type = AUDIT_MQ_SENDRECV;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_mq_notify - record audit data for a POSIX MQ notify
+ * @mqdes: MQ descriptor
+ * @u_notification: Notification event
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+
+int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
+{
+ struct audit_aux_data_mq_notify *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (!audit_enabled)
+ return 0;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ if (u_notification != NULL) {
+ if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
+ kfree(ax);
+ return -EFAULT;
+ }
+ } else
+ memset(&ax->notification, 0, sizeof(ax->notification));
+
+ ax->mqdes = mqdes;
+
+ ax->d.type = AUDIT_MQ_NOTIFY;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
+ * @mqdes: MQ descriptor
+ * @mqstat: MQ attributes (mq_flags, mq_maxmsg, mq_msgsize, mq_curmsgs)
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
+{
+ struct audit_aux_data_mq_getsetattr *ax;
struct audit_context *context = current->audit_context;
+ if (!audit_enabled)
+ return 0;
+
if (likely(!context))
return 0;
@@ -1187,6 +1517,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
if (!ax)
return -ENOMEM;
+ ax->mqdes = mqdes;
+ ax->mqstat = *mqstat;
+
+ ax->d.type = AUDIT_MQ_GETSETATTR;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+/**
+ * __audit_ipc_obj - record audit data for ipc object
+ * @ipcp: ipc permissions
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
+{
+ struct audit_aux_data_ipcctl *ax;
+ struct audit_context *context = current->audit_context;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
ax->uid = ipcp->uid;
ax->gid = ipcp->gid;
ax->mode = ipcp->mode;
@@ -1204,17 +1558,15 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
* @uid: msgq user id
* @gid: msgq group id
* @mode: msgq mode (permissions)
+ * @ipcp: in-kernel IPC permissions
*
* Returns 0 for success or NULL context or < 0 on error.
*/
-int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
+int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
{
struct audit_aux_data_ipcctl *ax;
struct audit_context *context = current->audit_context;
- if (likely(!context))
- return 0;
-
ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
if (!ax)
return -ENOMEM;
@@ -1223,7 +1575,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
ax->uid = uid;
ax->gid = gid;
ax->mode = mode;
- selinux_get_ipc_sid(ipcp, &ax->osid);
ax->d.type = AUDIT_IPC_SET_PERM;
ax->d.next = context->aux;
@@ -1231,6 +1582,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
return 0;
}
+int audit_bprm(struct linux_binprm *bprm)
+{
+ struct audit_aux_data_execve *ax;
+ struct audit_context *context = current->audit_context;
+ unsigned long p, next;
+ void *to;
+
+ if (likely(!audit_enabled || !context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
+ GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->argc = bprm->argc;
+ ax->envc = bprm->envc;
+ for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
+ struct page *page = bprm->page[p / PAGE_SIZE];
+ void *kaddr = kmap(page);
+ next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
+ memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
+ to += next - p;
+ kunmap(page);
+ }
+
+ ax->d.type = AUDIT_EXECVE;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+
/**
* audit_socketcall - record audit data for sys_socketcall
* @nargs: number of args
@@ -1325,19 +1709,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
-void audit_signal_info(int sig, struct task_struct *t)
+void __audit_signal_info(int sig, struct task_struct *t)
{
extern pid_t audit_sig_pid;
extern uid_t audit_sig_uid;
-
- if (unlikely(audit_pid && t->tgid == audit_pid)) {
- if (sig == SIGTERM || sig == SIGHUP) {
- struct audit_context *ctx = current->audit_context;
- audit_sig_pid = current->pid;
- if (ctx)
- audit_sig_uid = ctx->loginuid;
- else
- audit_sig_uid = current->uid;
- }
+ extern u32 audit_sig_sid;
+
+ if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
+ struct task_struct *tsk = current;
+ struct audit_context *ctx = tsk->audit_context;
+ audit_sig_pid = tsk->pid;
+ if (ctx)
+ audit_sig_uid = ctx->loginuid;
+ else
+ audit_sig_uid = tsk->uid;
+ selinux_get_task_sid(tsk, &audit_sig_sid);
}
}
diff --git a/kernel/compat.c b/kernel/compat.c
index c1601a84f8d8..126dee9530aa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
#include <linux/unistd.h>
#include <linux/security.h>
#include <linux/timex.h>
+#include <linux/migrate.h>
#include <asm/uaccess.h>
@@ -729,17 +730,10 @@ void
sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
{
switch (_NSIG_WORDS) {
-#if defined (__COMPAT_ENDIAN_SWAP__)
- case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
- case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
- case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
- case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
-#else
case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
-#endif
}
}
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
return ret;
}
+
+#ifdef CONFIG_NUMA
+asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
+ compat_uptr_t __user *pages32,
+ const int __user *nodes,
+ int __user *status,
+ int flags)
+{
+ const void __user * __user *pages;
+ int i;
+
+ pages = compat_alloc_user_space(nr_pages * sizeof(void *));
+ for (i = 0; i < nr_pages; i++) {
+ compat_uptr_t p;
+
+ if (get_user(p, pages32 + i) ||
+ put_user(compat_ptr(p), pages + i))
+ return -EFAULT;
+ }
+ return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572b..b602f73fb38d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -41,6 +41,7 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
+#include <linux/security.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/spinlock.h>
@@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
return 0;
}
-static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+static int cpuset_get_sb(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data, struct vfsmount *mnt)
{
- return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+ return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
}
static struct file_system_type cpuset_fs_type = {
@@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
cpumask_t cpus;
nodemask_t from, to;
struct mm_struct *mm;
+ int retval;
if (sscanf(pidbuf, "%d", &pid) != 1)
return -EIO;
@@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
get_task_struct(tsk);
}
+ retval = security_task_setscheduler(tsk, 0, NULL);
+ if (retval) {
+ put_task_struct(tsk);
+ return retval;
+ }
+
mutex_lock(&callback_mutex);
task_lock(tsk);
diff --git a/kernel/exit.c b/kernel/exit.c
index e95b93282210..e76bd02e930e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -36,6 +36,7 @@
#include <linux/compat.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
+#include <linux/resource.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -45,8 +46,6 @@
extern void sem_exit (void);
extern struct task_struct *child_reaper;
-int getrusage(struct task_struct *, int, struct rusage __user *);
-
static void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p)
@@ -579,7 +578,7 @@ static void exit_mm(struct task_struct * tsk)
down_read(&mm->mmap_sem);
}
atomic_inc(&mm->mm_count);
- if (mm != tsk->active_mm) BUG();
+ BUG_ON(mm != tsk->active_mm);
/* more a memory barrier than a real lock */
task_lock(tsk);
tsk->mm = NULL;
@@ -881,14 +880,6 @@ fastcall NORET_TYPE void do_exit(long code)
tsk->flags |= PF_EXITING;
- /*
- * Make sure we don't try to process any timer firings
- * while we are already exiting.
- */
- tsk->it_virt_expires = cputime_zero;
- tsk->it_prof_expires = cputime_zero;
- tsk->it_sched_expires = 0;
-
if (unlikely(in_atomic()))
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
current->comm, current->pid,
@@ -903,11 +894,11 @@ fastcall NORET_TYPE void do_exit(long code)
if (group_dead) {
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk->signal);
- acct_process(code);
}
+ acct_collect(code, group_dead);
if (unlikely(tsk->robust_list))
exit_robust_list(tsk);
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
if (unlikely(tsk->compat_robust_list))
compat_exit_robust_list(tsk);
#endif
@@ -915,6 +906,8 @@ fastcall NORET_TYPE void do_exit(long code)
audit_free(tsk);
exit_mm(tsk);
+ if (group_dead)
+ acct_process();
exit_sem(tsk);
__exit_files(tsk);
__exit_fs(tsk);
@@ -1538,8 +1531,7 @@ check_continued:
if (options & __WNOTHREAD)
break;
tsk = next_thread(tsk);
- if (tsk->signal != current->signal)
- BUG();
+ BUG_ON(tsk->signal != current->signal);
} while (tsk != current);
read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index ac8100e3088a..dfd10cb370c3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm)
*/
void mmput(struct mm_struct *mm)
{
+ might_sleep();
+
if (atomic_dec_and_test(&mm->mm_users)) {
exit_aio(mm);
exit_mmap(mm);
@@ -623,6 +625,7 @@ out:
/*
* Allocate a new files structure and copy contents from the
* passed in files structure.
+ * errorp will be valid only when the returned files_struct is NULL.
*/
static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
@@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
int open_files, size, i, expand;
struct fdtable *old_fdt, *new_fdt;
+ *errorp = -ENOMEM;
newf = alloc_files();
if (!newf)
goto out;
@@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
* break this.
*/
tsk->files = NULL;
- error = -ENOMEM;
newf = dup_fd(oldf, &error);
if (!newf)
goto out;
@@ -871,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
tsk->it_prof_expires =
secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
}
+ acct_init_pacct(&sig->pacct);
return 0;
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057b..e1a380c77a5a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
(unsigned long)uaddr2, val2, val3);
}
-static struct super_block *
-futexfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int futexfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data,
+ struct vfsmount *mnt)
{
- return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
+ return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
}
static struct file_system_type futex_fs_type = {
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b7f0388bd71c..55601b3ce60e 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
/**
* ktime_get_ts - get the monotonic clock in timespec format
- *
* @ts: pointer to timespec variable
*
* The function calculates the monotonic clock from the realtime
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
# ifndef CONFIG_KTIME_SCALAR
/**
* ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
- *
* @kt: addend
* @nsec: the scalar nsec value to add
*
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
/**
* hrtimer_forward - forward the timer expiry
- *
* @timer: hrtimer to forward
* @now: forward past this time
* @interval: the interval to forward
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
if (base->first == &timer->node)
base->first = rb_next(&timer->node);
rb_erase(&timer->node, &base->active);
- timer->node.rb_parent = HRTIMER_INACTIVE;
+ rb_set_parent(&timer->node, &timer->node);
}
/*
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
/**
* hrtimer_start - (re)start an relative timer on the current CPU
- *
* @timer: the timer to be added
* @tim: expiry time
* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -456,17 +452,17 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
return ret;
}
+EXPORT_SYMBOL_GPL(hrtimer_start);
/**
* hrtimer_try_to_cancel - try to deactivate a timer
- *
* @timer: hrtimer to stop
*
* Returns:
* 0 when the timer was not active
* 1 when the timer was active
* -1 when the timer is currently excuting the callback function and
- * can not be stopped
+ * cannot be stopped
*/
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
@@ -484,10 +480,10 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
return ret;
}
+EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
/**
* hrtimer_cancel - cancel a timer and wait for the handler to finish.
- *
* @timer: the timer to be cancelled
*
* Returns:
@@ -504,10 +500,10 @@ int hrtimer_cancel(struct hrtimer *timer)
cpu_relax();
}
}
+EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
* hrtimer_get_remaining - get remaining time for the timer
- *
* @timer: the timer to read
*/
ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -522,6 +518,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
return rem;
}
+EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
#ifdef CONFIG_NO_IDLE_HZ
/**
@@ -560,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
/**
* hrtimer_init - initialize a timer to the given clock
- *
* @timer: the timer to be initialized
* @clock_id: the clock to be used
* @mode: timer mode abs/rel
@@ -572,18 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
memset(timer, 0, sizeof(struct hrtimer));
- bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
+ bases = __raw_get_cpu_var(hrtimer_bases);
if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
clock_id = CLOCK_MONOTONIC;
timer->base = &bases[clock_id];
- timer->node.rb_parent = HRTIMER_INACTIVE;
+ rb_set_parent(&timer->node, &timer->node);
}
+EXPORT_SYMBOL_GPL(hrtimer_init);
/**
* hrtimer_get_res - get the timer resolution for a clock
- *
* @which_clock: which clock to query
* @tp: pointer to timespec variable to store the resolution
*
@@ -594,11 +590,12 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
{
struct hrtimer_base *bases;
- bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
+ bases = __raw_get_cpu_var(hrtimer_bases);
*tp = ktime_to_timespec(bases[which_clock].resolution);
return 0;
}
+EXPORT_SYMBOL_GPL(hrtimer_get_res);
/*
* Expire the per base hrtimer-queue:
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
deleted file mode 100644
index 55b1e5b85db9..000000000000
--- a/kernel/intermodule.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Deprecated, do not use. Moved from module.c to here. --RR */
-
-/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
-#include <linux/module.h>
-#include <linux/kmod.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-
-/* inter_module functions are always available, even when the kernel is
- * compiled without modules. Consumers of inter_module_xxx routines
- * will always work, even when both are built into the kernel, this
- * approach removes lots of #ifdefs in mainline code.
- */
-
-static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
-static DEFINE_SPINLOCK(ime_lock);
-static int kmalloc_failed;
-
-struct inter_module_entry {
- struct list_head list;
- const char *im_name;
- struct module *owner;
- const void *userdata;
-};
-
-/**
- * inter_module_register - register a new set of inter module data.
- * @im_name: an arbitrary string to identify the data, must be unique
- * @owner: module that is registering the data, always use THIS_MODULE
- * @userdata: pointer to arbitrary userdata to be registered
- *
- * Description: Check that the im_name has not already been registered,
- * complain if it has. For new data, add it to the inter_module_entry
- * list.
- */
-void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
-{
- struct list_head *tmp;
- struct inter_module_entry *ime, *ime_new;
-
- if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
- /* Overloaded kernel, not fatal */
- printk(KERN_ERR
- "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
- im_name);
- kmalloc_failed = 1;
- return;
- }
- ime_new->im_name = im_name;
- ime_new->owner = owner;
- ime_new->userdata = userdata;
-
- spin_lock(&ime_lock);
- list_for_each(tmp, &ime_list) {
- ime = list_entry(tmp, struct inter_module_entry, list);
- if (strcmp(ime->im_name, im_name) == 0) {
- spin_unlock(&ime_lock);
- kfree(ime_new);
- /* Program logic error, fatal */
- printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
- BUG();
- }
- }
- list_add(&(ime_new->list), &ime_list);
- spin_unlock(&ime_lock);
-}
-
-/**
- * inter_module_unregister - unregister a set of inter module data.
- * @im_name: an arbitrary string to identify the data, must be unique
- *
- * Description: Check that the im_name has been registered, complain if
- * it has not. For existing data, remove it from the
- * inter_module_entry list.
- */
-void inter_module_unregister(const char *im_name)
-{
- struct list_head *tmp;
- struct inter_module_entry *ime;
-
- spin_lock(&ime_lock);
- list_for_each(tmp, &ime_list) {
- ime = list_entry(tmp, struct inter_module_entry, list);
- if (strcmp(ime->im_name, im_name) == 0) {
- list_del(&(ime->list));
- spin_unlock(&ime_lock);
- kfree(ime);
- return;
- }
- }
- spin_unlock(&ime_lock);
- if (kmalloc_failed) {
- printk(KERN_ERR
- "inter_module_unregister: no entry for '%s', "
- "probably caused by previous kmalloc failure\n",
- im_name);
- return;
- }
- else {
- /* Program logic error, fatal */
- printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
- BUG();
- }
-}
-
-/**
- * inter_module_get - return arbitrary userdata from another module.
- * @im_name: an arbitrary string to identify the data, must be unique
- *
- * Description: If the im_name has not been registered, return NULL.
- * Try to increment the use count on the owning module, if that fails
- * then return NULL. Otherwise return the userdata.
- */
-static const void *inter_module_get(const char *im_name)
-{
- struct list_head *tmp;
- struct inter_module_entry *ime;
- const void *result = NULL;
-
- spin_lock(&ime_lock);
- list_for_each(tmp, &ime_list) {
- ime = list_entry(tmp, struct inter_module_entry, list);
- if (strcmp(ime->im_name, im_name) == 0) {
- if (try_module_get(ime->owner))
- result = ime->userdata;
- break;
- }
- }
- spin_unlock(&ime_lock);
- return(result);
-}
-
-/**
- * inter_module_get_request - im get with automatic request_module.
- * @im_name: an arbitrary string to identify the data, must be unique
- * @modname: module that is expected to register im_name
- *
- * Description: If inter_module_get fails, do request_module then retry.
- */
-const void *inter_module_get_request(const char *im_name, const char *modname)
-{
- const void *result = inter_module_get(im_name);
- if (!result) {
- request_module("%s", modname);
- result = inter_module_get(im_name);
- }
- return(result);
-}
-
-/**
- * inter_module_put - release use of data from another module.
- * @im_name: an arbitrary string to identify the data, must be unique
- *
- * Description: If the im_name has not been registered, complain,
- * otherwise decrement the use count on the owning module.
- */
-void inter_module_put(const char *im_name)
-{
- struct list_head *tmp;
- struct inter_module_entry *ime;
-
- spin_lock(&ime_lock);
- list_for_each(tmp, &ime_list) {
- ime = list_entry(tmp, struct inter_module_entry, list);
- if (strcmp(ime->im_name, im_name) == 0) {
- if (ime->owner)
- module_put(ime->owner);
- spin_unlock(&ime_lock);
- return;
- }
- }
- spin_unlock(&ime_lock);
- printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
- BUG();
-}
-
-EXPORT_SYMBOL(inter_module_register);
-EXPORT_SYMBOL(inter_module_unregister);
-EXPORT_SYMBOL(inter_module_get_request);
-EXPORT_SYMBOL(inter_module_put);
-
-MODULE_LICENSE("GPL");
-
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 51df337b37db..0f6530117105 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
/*
* Have got an event to handle:
*/
-fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
+fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
struct irqaction *action)
{
- int ret, retval = 0, status = 0;
+ irqreturn_t ret, retval = IRQ_NONE;
+ unsigned int status = 0;
if (!(action->flags & SA_INTERRUPT))
local_irq_enable();
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 134f9f2e0e39..a12d00eb5e7c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -30,7 +30,7 @@ void move_native_irq(int irq)
desc->move_irq = 0;
- if (likely(cpus_empty(pending_irq_cpumask[irq])))
+ if (unlikely(cpus_empty(pending_irq_cpumask[irq])))
return;
if (!desc->handler->set_affinity)
@@ -49,7 +49,7 @@ void move_native_irq(int irq)
* cause some ioapics to mal-function.
* Being paranoid i guess!
*/
- if (unlikely(!cpus_empty(tmp))) {
+ if (likely(!cpus_empty(tmp))) {
if (likely(!(desc->status & IRQ_DISABLED)))
desc->handler->disable(irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5eef8ce0..afacd6f585fa 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
#ifdef CONFIG_GENERIC_PENDING_IRQ
void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
+ set_balance_irq_affinity(irq, mask_val);
+
/*
* Save these away for later use. Re-progam when the
* interrupt is pending
@@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
#else
void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
+ set_balance_irq_affinity(irq, mask_val);
irq_affinity[irq] = mask_val;
irq_desc[irq].handler->set_affinity(irq, mask_val);
}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7df9abd5ec86..b2fb3c18d06b 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,7 +11,7 @@
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
-static int irqfixup;
+static int irqfixup __read_mostly;
/*
* Recovery handler for misrouted interrupts.
@@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
struct pt_regs *regs)
{
- if (action_ret != IRQ_HANDLED) {
+ if (unlikely(action_ret != IRQ_HANDLED)) {
desc->irqs_unhandled++;
- if (action_ret != IRQ_NONE)
+ if (unlikely(action_ret != IRQ_NONE))
report_bad_irq(irq, desc, action_ret);
}
@@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
}
desc->irq_count++;
- if (desc->irq_count < 100000)
+ if (likely(desc->irq_count < 100000))
return;
desc->irq_count = 0;
- if (desc->irqs_unhandled > 99900) {
+ if (unlikely(desc->irqs_unhandled > 99900)) {
/*
* The interrupt is stuck
*/
@@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
desc->irqs_unhandled = 0;
}
-int noirqdebug;
+int noirqdebug __read_mostly;
int __init noirqdebug_setup(char *str)
{
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bf39d28e4c0e..58f0f382597c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image,
* kexec does not sync, or unmount filesystems so if you need
* that to happen you need to do that yourself.
*/
-struct kimage *kexec_image = NULL;
-static struct kimage *kexec_crash_image = NULL;
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
/*
* A home grown binary mutex.
* Nothing can wait so this mutex is safe to use
* in interrupt context :)
*/
-static int kexec_lock = 0;
+static int kexec_lock;
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
struct kexec_segment __user *segments,
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f119e098e67b..9e28478a17a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
#include <linux/sysfs.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/kexec.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s
KERNEL_ATTR_RW(uevent_helper);
#endif
+#ifdef CONFIG_KEXEC
+static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
+{
+ return sprintf(page, "%d\n", !!kexec_image);
+}
+KERNEL_ATTR_RO(kexec_loaded);
+
+static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
+{
+ return sprintf(page, "%d\n", !!kexec_crash_image);
+}
+KERNEL_ATTR_RO(kexec_crash_loaded);
+#endif /* CONFIG_KEXEC */
+
decl_subsys(kernel, NULL, NULL);
EXPORT_SYMBOL_GPL(kernel_subsys);
@@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = {
&uevent_seqnum_attr.attr,
&uevent_helper_attr.attr,
#endif
+#ifdef CONFIG_KEXEC
+ &kexec_loaded_attr.attr,
+ &kexec_crash_loaded_attr.attr,
+#endif
NULL
};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..24be714b04c7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
static DEFINE_MUTEX(kthread_stop_lock);
static struct kthread_stop_info kthread_stop_info;
+/**
+ * kthread_should_stop - should this kthread return now?
+ *
+ * When someone calls kthread_stop on your kthread, it will be woken
+ * and this will return true. You should then return, and your return
+ * value will be passed through to kthread_stop().
+ */
int kthread_should_stop(void)
{
return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
complete(&create->done);
}
+/**
+ * kthread_create - create a kthread.
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @namefmt: printf-style name for the thread.
+ *
+ * Description: This helper function creates and names a kernel
+ * thread. The thread will be stopped: use wake_up_process() to start
+ * it. See also kthread_run(), kthread_create_on_cpu().
+ *
+ * When woken, the thread will run @threadfn() with @data as its
+ * argument. @threadfn can either call do_exit() directly if it is a
+ * standalone thread for which no one will call kthread_stop(), or
+ * return when 'kthread_should_stop()' is true (which means
+ * kthread_stop() has been called). The return value should be zero
+ * or a negative error number; it will be passed to kthread_stop().
+ *
+ * Returns a task_struct or ERR_PTR(-ENOMEM).
+ */
struct task_struct *kthread_create(int (*threadfn)(void *data),
void *data,
const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create);
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @k: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create()).
+ */
void kthread_bind(struct task_struct *k, unsigned int cpu)
{
BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
}
EXPORT_SYMBOL(kthread_bind);
+/**
+ * kthread_stop - stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_stop() for @k to return true, wakes it, and
+ * waits for it to exit. Your threadfn() must not call do_exit()
+ * itself if you use this function! This can also be called after
+ * kthread_create() instead of calling wake_up_process(): the thread
+ * will exit without calling threadfn().
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+ * was never called.
+ */
int kthread_stop(struct task_struct *k)
{
return kthread_stop_sem(k, NULL);
}
EXPORT_SYMBOL(kthread_stop);
+/**
+ * kthread_stop_sem - stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ * @s: semaphore that @k waits on while idle.
+ *
+ * Does essentially the same thing as kthread_stop() above, but wakes
+ * @k by calling up(@s).
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+ * was never called.
+ */
int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
{
int ret;
@@ -210,5 +269,5 @@ static __init int helper_init(void)
return 0;
}
-core_initcall(helper_init);
+core_initcall(helper_init);
diff --git a/kernel/module.c b/kernel/module.c
index bbe04862e1b0..d75275de1c28 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1326,7 +1326,7 @@ int is_exported(const char *name, const struct module *mod)
if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
return 1;
else
- if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
+ if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
return 1;
else
return 0;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 520f6c59948d..d38d9ec3276c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
struct cpu_timer_list *next;
unsigned long i;
- if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
- return;
-
head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
p->cpu_timers : p->signal->cpu_timers);
head += CPUCLOCK_WHICH(timer->it_clock);
@@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk,
}
t = tsk;
do {
+ if (unlikely(t->flags & PF_EXITING))
+ continue;
+
ticks = cputime_add(cputime_add(t->utime, t->stime),
prof_left);
if (!cputime_eq(prof_expires, cputime_zero) &&
@@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk,
t->it_sched_expires > sched)) {
t->it_sched_expires = sched;
}
-
- do {
- t = next_thread(t);
- } while (unlikely(t->flags & PF_EXITING));
- } while (t != tsk);
+ } while ((t = next_thread(t)) != tsk);
}
}
@@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
#undef UNEXPIRED
- BUG_ON(tsk->exit_state);
-
/*
* Double-check with locks held.
*/
read_lock(&tasklist_lock);
- spin_lock(&tsk->sighand->siglock);
+ if (likely(tsk->signal != NULL)) {
+ spin_lock(&tsk->sighand->siglock);
- /*
- * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
- * all the timers that are firing, and put them on the firing list.
- */
- check_thread_timers(tsk, &firing);
- check_process_timers(tsk, &firing);
+ /*
+ * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+ * all the timers that are firing, and put them on the firing list.
+ */
+ check_thread_timers(tsk, &firing);
+ check_process_timers(tsk, &firing);
- /*
- * We must release these locks before taking any timer's lock.
- * There is a potential race with timer deletion here, as the
- * siglock now protects our private firing list. We have set
- * the firing flag in each timer, so that a deletion attempt
- * that gets the timer lock before we do will give it up and
- * spin until we've taken care of that timer below.
- */
- spin_unlock(&tsk->sighand->siglock);
+ /*
+ * We must release these locks before taking any timer's lock.
+ * There is a potential race with timer deletion here, as the
+ * siglock now protects our private firing list. We have set
+ * the firing flag in each timer, so that a deletion attempt
+ * that gets the timer lock before we do will give it up and
+ * spin until we've taken care of that timer below.
+ */
+ spin_unlock(&tsk->sighand->siglock);
+ }
read_unlock(&tasklist_lock);
/*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..fc311a4673a2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,15 @@ config PM_DEBUG
code. This is helpful when debugging and reporting various PM bugs,
like suspend support.
+config PM_TRACE
+ bool "Suspend/resume event tracing"
+ depends on PM && PM_DEBUG && X86_32
+ default y
+ ---help---
+ This enables some cheesy code to save the last PM event point in the
+ RTC across reboots, so that you can debug a machine that just hangs
+ during suspend (or more commonly, during resume).
+
config SOFTWARE_SUSPEND
bool "Software Suspend"
depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..e13e74067845 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -231,7 +231,7 @@ static int software_resume(void)
late_initcall(software_resume);
-static char * pm_disk_modes[] = {
+static const char * const pm_disk_modes[] = {
[PM_DISK_FIRMWARE] = "firmware",
[PM_DISK_PLATFORM] = "platform",
[PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a6d9ef46009e..6d295c776794 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,7 +15,7 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/pm.h>
-
+#include <linux/console.h>
#include "power.h"
@@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state)
goto Thaw;
}
+ suspend_console();
if ((error = device_suspend(PMSG_SUSPEND))) {
printk(KERN_ERR "Some devices failed to suspend\n");
goto Finish;
@@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state)
static void suspend_finish(suspend_state_t state)
{
device_resume();
+ resume_console();
thaw_processes();
enable_nonboot_cpus();
if (pm_ops && pm_ops->finish)
@@ -143,7 +145,7 @@ static void suspend_finish(suspend_state_t state)
-static char *pm_states[PM_SUSPEND_MAX] = {
+static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -260,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
{
suspend_state_t state = PM_SUSPEND_STANDBY;
- char ** s;
+ const char * const *s;
char *p;
int error;
int len;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f06f12f21767..57a792982fb9 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -55,7 +55,7 @@ struct snapshot_handle {
unsigned int page;
unsigned int page_offset;
unsigned int prev;
- struct pbe *pbe;
+ struct pbe *pbe, *last_pbe;
void *buffer;
unsigned int buf_offset;
};
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb13b78..24c96f354231 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -150,6 +150,10 @@ int restore_highmem(void)
}
return 0;
}
+#else
+static inline unsigned int count_highmem_pages(void) {return 0;}
+static inline int save_highmem(void) {return 0;}
+static inline int restore_highmem(void) {return 0;}
#endif
static int pfn_is_nosave(unsigned long pfn)
@@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
}
}
-/**
- * On resume it is necessary to trace and eventually free the unsafe
- * pages that have been allocated, because they are needed for I/O
- * (on x86-64 we likely will "eat" these pages once again while
- * creating the temporary page translation tables)
- */
-
-struct eaten_page {
- struct eaten_page *next;
- char padding[PAGE_SIZE - sizeof(void *)];
-};
-
-static struct eaten_page *eaten_pages = NULL;
-
-static void release_eaten_pages(void)
-{
- struct eaten_page *p, *q;
-
- p = eaten_pages;
- while (p) {
- q = p->next;
- /* We don't want swsusp_free() to free this page again */
- ClearPageNosave(virt_to_page(p));
- free_page((unsigned long)p);
- p = q;
- }
- eaten_pages = NULL;
-}
+static unsigned int unsafe_pages;
/**
* @safe_needed - on resume, for storing the PBE list and the image,
* we can only use memory pages that do not conflict with the pages
- * which had been used before suspend.
+ * used before suspend.
*
* The unsafe pages are marked with the PG_nosave_free flag
- *
- * Allocated but unusable (ie eaten) memory pages should be marked
- * so that swsusp_free() can release them
+ * and we count them using unsafe_pages
*/
static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
{
void *res;
+ res = (void *)get_zeroed_page(gfp_mask);
if (safe_needed)
- do {
+ while (res && PageNosaveFree(virt_to_page(res))) {
+ /* The page is unsafe, mark it for swsusp_free() */
+ SetPageNosave(virt_to_page(res));
+ unsafe_pages++;
res = (void *)get_zeroed_page(gfp_mask);
- if (res && PageNosaveFree(virt_to_page(res))) {
- /* This is for swsusp_free() */
- SetPageNosave(virt_to_page(res));
- ((struct eaten_page *)res)->next = eaten_pages;
- eaten_pages = res;
- }
- } while (res && PageNosaveFree(virt_to_page(res)));
- else
- res = (void *)get_zeroed_page(gfp_mask);
+ }
if (res) {
SetPageNosave(virt_to_page(res));
SetPageNosaveFree(virt_to_page(res));
@@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
* On each page we set up a list of struct_pbe elements.
*/
-struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed)
+static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
+ int safe_needed)
{
unsigned int num;
struct pbe *pblist, *pbe;
@@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist)
return -EFAULT;
}
+ unsafe_pages = 0;
+
return 0;
}
@@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
}
/**
- * create_image - use metadata contained in the PBE list
+ * prepare_image - use metadata contained in the PBE list
* pointed to by pagedir_nosave to mark the pages that will
* be overwritten in the process of restoring the system
- * memory state from the image and allocate memory for
- * the image avoiding these pages
+ * memory state from the image ("unsafe" pages) and allocate
+ * memory for the image
+ *
+ * The idea is to allocate the PBE list first and then
+ * allocate as many pages as are needed for the image data,
+ * but not to assign these pages to the PBEs initially.
+ * Instead, we just mark them as allocated and create a list
+ * of "safe" pages which will be used later
*/
-static int create_image(struct snapshot_handle *handle)
+struct safe_page {
+ struct safe_page *next;
+ char padding[PAGE_SIZE - sizeof(void *)];
+};
+
+static struct safe_page *safe_pages;
+
+static int prepare_image(struct snapshot_handle *handle)
{
int error = 0;
- struct pbe *p, *pblist;
+ unsigned int nr_pages = nr_copy_pages;
+ struct pbe *p, *pblist = NULL;
p = pagedir_nosave;
error = mark_unsafe_pages(p);
if (!error) {
- pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+ pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
if (pblist)
copy_page_backup_list(pblist, p);
free_pagedir(p, 0);
if (!pblist)
error = -ENOMEM;
}
- if (!error)
- error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+ safe_pages = NULL;
+ if (!error && nr_pages > unsafe_pages) {
+ nr_pages -= unsafe_pages;
+ while (nr_pages--) {
+ struct safe_page *ptr;
+
+ ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
+ if (!ptr) {
+ error = -ENOMEM;
+ break;
+ }
+ if (!PageNosaveFree(virt_to_page(ptr))) {
+ /* The page is "safe", add it to the list */
+ ptr->next = safe_pages;
+ safe_pages = ptr;
+ }
+ /* Mark the page as allocated */
+ SetPageNosave(virt_to_page(ptr));
+ SetPageNosaveFree(virt_to_page(ptr));
+ }
+ }
if (!error) {
- release_eaten_pages();
pagedir_nosave = pblist;
} else {
- pagedir_nosave = NULL;
handle->pbe = NULL;
- nr_copy_pages = 0;
- nr_meta_pages = 0;
+ swsusp_free();
}
return error;
}
+static void *get_buffer(struct snapshot_handle *handle)
+{
+ struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
+ struct page *page = virt_to_page(pbe->orig_address);
+
+ if (PageNosave(page) && PageNosaveFree(page)) {
+ /*
+ * We have allocated the "original" page frame and we can
+ * use it directly to store the read page
+ */
+ pbe->address = 0;
+ if (last && last->next)
+ last->next = NULL;
+ return (void *)pbe->orig_address;
+ }
+ /*
+ * The "original" page frame has not been allocated and we have to
+ * use a "safe" page frame to store the read page
+ */
+ pbe->address = (unsigned long)safe_pages;
+ safe_pages = safe_pages->next;
+ if (last)
+ last->next = pbe;
+ handle->last_pbe = pbe;
+ return (void *)pbe->address;
+}
+
/**
* snapshot_write_next - used for writing the system memory snapshot.
*
@@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
} else if (handle->prev <= nr_meta_pages) {
handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
if (!handle->pbe) {
- error = create_image(handle);
+ error = prepare_image(handle);
if (error)
return error;
handle->pbe = pagedir_nosave;
- handle->buffer = (void *)handle->pbe->address;
+ handle->last_pbe = NULL;
+ handle->buffer = get_buffer(handle);
}
} else {
handle->pbe = handle->pbe->next;
- handle->buffer = (void *)handle->pbe->address;
+ handle->buffer = get_buffer(handle);
}
handle->prev = handle->page;
}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..17f669c83012 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void);
int save_highmem(void);
int restore_highmem(void);
#else
-static int save_highmem(void) { return 0; }
-static int restore_highmem(void) { return 0; }
-static unsigned int count_highmem_pages(void) { return 0; }
+static inline int save_highmem(void) { return 0; }
+static inline int restore_highmem(void) { return 0; }
+static inline unsigned int count_highmem_pages(void) { return 0; }
#endif
/**
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
*/
#define SHRINK_BITE 10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+ if (tmp > SHRINK_BITE)
+ tmp = SHRINK_BITE;
+ return shrink_all_memory(tmp);
+}
int swsusp_shrink_memory(void)
{
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void)
PAGES_FOR_IO;
tmp = size;
for_each_zone (zone)
- if (!is_highmem(zone))
+ if (!is_highmem(zone) && populated_zone(zone)) {
tmp -= zone->free_pages;
+ tmp += zone->lowmem_reserve[ZONE_NORMAL];
+ }
if (tmp > 0) {
- tmp = shrink_all_memory(SHRINK_BITE);
+ tmp = __shrink_memory(tmp);
if (!tmp)
return -ENOMEM;
pages += tmp;
} else if (size > image_size / PAGE_SIZE) {
- tmp = shrink_all_memory(SHRINK_BITE);
+ tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
pages += tmp;
}
printk("\b%c", p[i++%4]);
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..95b7fe17f124 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,6 +24,7 @@
#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/interrupt.h> /* For in_interrupt() */
#include <linux/config.h>
#include <linux/delay.h>
@@ -67,6 +68,7 @@ EXPORT_SYMBOL(oops_in_progress);
* driver system.
*/
static DECLARE_MUTEX(console_sem);
+static DECLARE_MUTEX(secondary_console_sem);
struct console *console_drivers;
/*
* This is used for debugging the mess that is the VT code by
@@ -76,7 +78,7 @@ struct console *console_drivers;
* path in the console code where we end up in places I want
* locked without the console sempahore held
*/
-static int console_locked;
+static int console_locked, console_suspended;
/*
* logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
@@ -326,7 +328,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
struct console *con;
for (con = console_drivers; con; con = con->next) {
- if ((con->flags & CON_ENABLED) && con->write)
+ if ((con->flags & CON_ENABLED) && con->write &&
+ (cpu_online(smp_processor_id()) ||
+ (con->flags & CON_ANYTIME)))
con->write(con, &LOG_BUF(start), end - start);
}
}
@@ -436,6 +440,7 @@ static int printk_time = 1;
#else
static int printk_time = 0;
#endif
+module_param(printk_time, int, S_IRUGO | S_IWUSR);
static int __init printk_time_setup(char *str)
{
@@ -452,6 +457,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
return sched_clock();
}
+/* Check if we have any console registered that can be called early in boot. */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for (con = console_drivers; con; con = con->next)
+ if (con->flags & CON_ANYTIME)
+ return 1;
+
+ return 0;
+}
+
/**
* printk - print a kernel message
* @fmt: format string
@@ -565,27 +582,29 @@ asmlinkage int vprintk(const char *fmt, va_list args)
log_level_unknown = 1;
}
- if (!cpu_online(smp_processor_id())) {
+ if (!down_trylock(&console_sem)) {
/*
- * Some console drivers may assume that per-cpu resources have
- * been allocated. So don't allow them to be called by this
- * CPU until it is officially up. We shouldn't be calling into
- * random console drivers on a CPU which doesn't exist yet..
+ * We own the drivers. We can drop the spinlock and
+ * let release_console_sem() print the text, maybe ...
*/
+ console_locked = 1;
printk_cpu = UINT_MAX;
spin_unlock_irqrestore(&logbuf_lock, flags);
- goto out;
- }
- if (!down_trylock(&console_sem)) {
- console_locked = 1;
+
/*
- * We own the drivers. We can drop the spinlock and let
- * release_console_sem() print the text
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
*/
- printk_cpu = UINT_MAX;
- spin_unlock_irqrestore(&logbuf_lock, flags);
- console_may_schedule = 0;
- release_console_sem();
+ if (cpu_online(smp_processor_id()) || have_callable_console()) {
+ console_may_schedule = 0;
+ release_console_sem();
+ } else {
+ /* Release by hand to avoid flushing the buffer. */
+ console_locked = 0;
+ up(&console_sem);
+ }
} else {
/*
* Someone else owns the drivers. We drop the spinlock, which
@@ -595,7 +614,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printk_cpu = UINT_MAX;
spin_unlock_irqrestore(&logbuf_lock, flags);
}
-out:
+
preempt_enable();
return printed_len;
}
@@ -698,6 +717,23 @@ int __init add_preferred_console(char *name, int idx, char *options)
}
/**
+ * suspend_console - suspend the console subsystem
+ *
+ * This disables printk() while we go into suspend states
+ */
+void suspend_console(void)
+{
+ acquire_console_sem();
+ console_suspended = 1;
+}
+
+void resume_console(void)
+{
+ console_suspended = 0;
+ release_console_sem();
+}
+
+/**
* acquire_console_sem - lock the console system for exclusive use.
*
* Acquires a semaphore which guarantees that the caller has
@@ -708,6 +744,10 @@ int __init add_preferred_console(char *name, int idx, char *options)
void acquire_console_sem(void)
{
BUG_ON(in_interrupt());
+ if (console_suspended) {
+ down(&secondary_console_sem);
+ return;
+ }
down(&console_sem);
console_locked = 1;
console_may_schedule = 1;
@@ -750,6 +790,10 @@ void release_console_sem(void)
unsigned long _con_start, _log_end;
unsigned long wake_klogd = 0;
+ if (console_suspended) {
+ up(&secondary_console_sem);
+ return;
+ }
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
wake_klogd |= log_start - log_end;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2058f88c7bbb..20e9710fc21c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -612,14 +612,6 @@ void synchronize_rcu(void)
wait_for_completion(&rcu.completion);
}
-/*
- * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
- */
-void synchronize_kernel(void)
-{
- synchronize_rcu();
-}
-
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
@@ -627,7 +619,6 @@ module_param(qlowmark, int, 0);
module_param(rsinterval, int, 0);
#endif
EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */
-EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL(call_rcu);
+EXPORT_SYMBOL_GPL(call_rcu_bh);
EXPORT_SYMBOL_GPL(synchronize_rcu);
-EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..f06d059edef5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3886,6 +3886,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
!capable(CAP_SYS_NICE))
goto out_unlock;
+ retval = security_task_setscheduler(p, 0, NULL);
+ if (retval)
+ goto out_unlock;
+
cpus_allowed = cpuset_cpus_allowed(p);
cpus_and(new_mask, new_mask, cpus_allowed);
retval = set_cpus_allowed(p, new_mask);
@@ -3954,7 +3958,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
if (!p)
goto out_unlock;
- retval = 0;
+ retval = security_task_getscheduler(p);
+ if (retval)
+ goto out_unlock;
+
cpus_and(*mask, p->cpus_allowed, cpu_online_map);
out_unlock:
@@ -4046,6 +4053,9 @@ asmlinkage long sys_sched_yield(void)
static inline void __cond_resched(void)
{
+#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+ __might_sleep(__FILE__, __LINE__);
+#endif
/*
* The BKS might be reacquired before we have dropped
* PREEMPT_ACTIVE, which could trigger a second
@@ -4142,7 +4152,7 @@ EXPORT_SYMBOL(yield);
*/
void __sched io_schedule(void)
{
- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+ struct runqueue *rq = &__raw_get_cpu_var(runqueues);
atomic_inc(&rq->nr_iowait);
schedule();
@@ -4153,7 +4163,7 @@ EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
+ struct runqueue *rq = &__raw_get_cpu_var(runqueues);
long ret;
atomic_inc(&rq->nr_iowait);
@@ -4746,6 +4756,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ if (!cpu_rq(cpu)->migration_thread)
+ break;
/* Unbind it from offline cpu so it can run. Fall thru. */
kthread_bind(cpu_rq(cpu)->migration_thread,
any_online_cpu(cpu_online_map));
diff --git a/kernel/signal.c b/kernel/signal.c
index e5f8aea78ffe..1b3c921737e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,12 +23,12 @@
#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
-#include <linux/audit.h>
#include <linux/capability.h>
#include <asm/param.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
+#include "audit.h" /* audit_signal_info() */
/*
* SLAB caches for signal bits.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..9e2f1c6e73d7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb,
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ if (!per_cpu(ksoftirqd, hotcpu))
+ break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(ksoftirqd, hotcpu),
any_online_cpu(cpu_online_map));
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..b5c3b94e01ce 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
void touch_softlockup_watchdog(void)
{
- per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies;
+ __raw_get_cpu_var(touch_timestamp) = jiffies;
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
+ if (!per_cpu(watchdog_task, hotcpu))
+ break;
/* Unbind so it can run. Fall thru. */
kthread_bind(per_cpu(watchdog_task, hotcpu),
any_online_cpu(cpu_online_map));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..2c0aacc37c55 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/syscalls.h>
+#include <linux/kthread.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
#include <asm/uaccess.h>
@@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads;
static atomic_t stopmachine_thread_ack;
static DECLARE_MUTEX(stopmachine_mutex);
-static int stopmachine(void *cpu)
+static int stopmachine(void *unused)
{
int irqs_disabled = 0;
int prepared = 0;
- set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
-
/* Ack: we are alive */
smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
atomic_inc(&stopmachine_thread_ack);
@@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state)
static int stop_machine(void)
{
- int i, ret = 0;
+ int ret = 0;
+ unsigned int i;
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
/* One high-prio thread per cpu. We'll do this one. */
@@ -96,11 +96,16 @@ static int stop_machine(void)
stopmachine_state = STOPMACHINE_WAIT;
for_each_online_cpu(i) {
+ struct task_struct *tsk;
if (i == raw_smp_processor_id())
continue;
- ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
- if (ret < 0)
+ tsk = kthread_create(stopmachine, NULL, "stopmachine");
+ if (IS_ERR(tsk)) {
+ ret = PTR_ERR(tsk);
break;
+ }
+ kthread_bind(tsk, i);
+ wake_up_process(tsk);
stopmachine_num_threads++;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e7936f..2d5179c67cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -13,7 +13,6 @@
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
-#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/kernel.h>
@@ -57,6 +56,12 @@
#ifndef GET_FPEXC_CTL
# define GET_FPEXC_CTL(a,b) (-EINVAL)
#endif
+#ifndef GET_ENDIAN
+# define GET_ENDIAN(a,b) (-EINVAL)
+#endif
+#ifndef SET_ENDIAN
+# define SET_ENDIAN(a,b) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -132,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
unsigned long val, void *v)
{
int ret = NOTIFY_DONE;
- struct notifier_block *nb;
+ struct notifier_block *nb, *next_nb;
nb = rcu_dereference(*nl);
while (nb) {
+ next_nb = rcu_dereference(nb->next);
ret = nb->notifier_call(nb, val, v);
if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
break;
- nb = rcu_dereference(nb->next);
+ nb = next_nb;
}
return ret;
}
@@ -583,7 +589,7 @@ void emergency_restart(void)
}
EXPORT_SYMBOL_GPL(emergency_restart);
-void kernel_restart_prepare(char *cmd)
+static void kernel_restart_prepare(char *cmd)
{
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
@@ -617,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
*/
-void kernel_kexec(void)
+static void kernel_kexec(void)
{
#ifdef CONFIG_KEXEC
struct kimage *image;
@@ -631,7 +637,6 @@ void kernel_kexec(void)
machine_kexec(image);
#endif
}
-EXPORT_SYMBOL_GPL(kernel_kexec);
void kernel_shutdown_prepare(enum system_states state)
{
@@ -1860,23 +1865,20 @@ out:
* fields when reaping, so a sample either gets all the additions of a
* given child after it's reaped, or none so this sample is before reaping.
*
- * tasklist_lock locking optimisation:
- * If we are current and single threaded, we do not need to take the tasklist
- * lock or the siglock. No one else can take our signal_struct away,
- * no one else can reap the children to update signal->c* counters, and
- * no one else can race with the signal-> fields.
- * If we do not take the tasklist_lock, the signal-> fields could be read
- * out of order while another thread was just exiting. So we place a
- * read memory barrier when we avoid the lock. On the writer side,
- * write memory barrier is implied in __exit_signal as __exit_signal releases
- * the siglock spinlock after updating the signal-> fields.
- *
- * We don't really need the siglock when we access the non c* fields
- * of the signal_struct (for RUSAGE_SELF) even in multithreaded
- * case, since we take the tasklist lock for read and the non c* signal->
- * fields are updated only in __exit_signal, which is called with
- * tasklist_lock taken for write, hence these two threads cannot execute
- * concurrently.
+ * Locking:
+ * We need to take the siglock for CHILDREN, SELF and BOTH
+ * for the cases current multithreaded, non-current single threaded, and
+ * non-current multithreaded. Thread traversal is now safe with
+ * the siglock held.
+ * Strictly speaking, we do not need to take the siglock if we are current and
+ * single threaded, as no one else can take our signal_struct away, no one
+ * else can reap the children to update signal->c* counters, and no one else
+ * can race with the signal-> fields. If we do not take any lock, the
+ * signal-> fields could be read out of order while another thread was just
+ * exiting. So we should place a read memory barrier when we avoid the lock.
+ * On the writer side, write memory barrier is implied in __exit_signal
+ * as __exit_signal releases the siglock spinlock after updating the signal->
+ * fields. But we don't do this yet to keep things simple.
*
*/
@@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
cputime_t utime, stime;
- int need_lock = 0;
memset((char *) r, 0, sizeof *r);
utime = stime = cputime_zero;
- if (p != current || !thread_group_empty(p))
- need_lock = 1;
-
- if (need_lock) {
- read_lock(&tasklist_lock);
- if (unlikely(!p->signal)) {
- read_unlock(&tasklist_lock);
- return;
- }
- } else
- /* See locking comments above */
- smp_rmb();
+ rcu_read_lock();
+ if (!lock_task_sighand(p, &flags)) {
+ rcu_read_unlock();
+ return;
+ }
switch (who) {
case RUSAGE_BOTH:
case RUSAGE_CHILDREN:
- spin_lock_irqsave(&p->sighand->siglock, flags);
utime = p->signal->cutime;
stime = p->signal->cstime;
r->ru_nvcsw = p->signal->cnvcsw;
r->ru_nivcsw = p->signal->cnivcsw;
r->ru_minflt = p->signal->cmin_flt;
r->ru_majflt = p->signal->cmaj_flt;
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
if (who == RUSAGE_CHILDREN)
break;
@@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
BUG();
}
- if (need_lock)
- read_unlock(&tasklist_lock);
+ unlock_task_sighand(p, &flags);
+ rcu_read_unlock();
+
cputime_to_timeval(utime, &r->ru_utime);
cputime_to_timeval(stime, &r->ru_stime);
}
@@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
return -EFAULT;
return 0;
}
+ case PR_GET_ENDIAN:
+ error = GET_ENDIAN(current, arg2);
+ break;
+ case PR_SET_ENDIAN:
+ error = SET_ENDIAN(current, arg2);
+ break;
+
default:
error = -EINVAL;
break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..6991bece67e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
cond_syscall(sys_inotify_add_watch);
cond_syscall(sys_inotify_rm_watch);
cond_syscall(sys_migrate_pages);
+cond_syscall(sys_move_pages);
cond_syscall(sys_chown16);
cond_syscall(sys_fchown16);
cond_syscall(sys_getegid16);
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
cond_syscall(sys_remap_file_pages);
+cond_syscall(compat_sys_move_pages);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726faeeff..2c0e65819448 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
extern int C_A_D;
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
+extern int sysctl_panic_on_oom;
extern int max_threads;
extern int sysrq_enabled;
extern int core_uses_pid;
@@ -142,7 +143,6 @@ static struct ctl_table_header root_table_header =
static ctl_table kern_table[];
static ctl_table vm_table[];
-static ctl_table proc_table[];
static ctl_table fs_table[];
static ctl_table debug_table[];
static ctl_table dev_table[];
@@ -150,7 +150,7 @@ extern ctl_table random_table[];
#ifdef CONFIG_UNIX98_PTYS
extern ctl_table pty_table[];
#endif
-#ifdef CONFIG_INOTIFY
+#ifdef CONFIG_INOTIFY_USER
extern ctl_table inotify_table[];
#endif
@@ -202,12 +202,6 @@ static ctl_table root_table[] = {
},
#endif
{
- .ctl_name = CTL_PROC,
- .procname = "proc",
- .mode = 0555,
- .child = proc_table,
- },
- {
.ctl_name = CTL_FS,
.procname = "fs",
.mode = 0555,
@@ -398,7 +392,7 @@ static ctl_table kern_table[] = {
.strategy = &sysctl_string,
},
#endif
-#ifdef CONFIG_HOTPLUG
+#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
{
.ctl_name = KERN_HOTPLUG,
.procname = "hotplug",
@@ -702,6 +696,14 @@ static ctl_table vm_table[] = {
.proc_handler = &proc_dointvec,
},
{
+ .ctl_name = VM_PANIC_ON_OOM,
+ .procname = "panic_on_oom",
+ .data = &sysctl_panic_on_oom,
+ .maxlen = sizeof(sysctl_panic_on_oom),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
.ctl_name = VM_OVERCOMMIT_RATIO,
.procname = "overcommit_ratio",
.data = &sysctl_overcommit_ratio,
@@ -918,10 +920,6 @@ static ctl_table vm_table[] = {
{ .ctl_name = 0 }
};
-static ctl_table proc_table[] = {
- { .ctl_name = 0 }
-};
-
static ctl_table fs_table[] = {
{
.ctl_name = FS_NRINODE,
@@ -1028,7 +1026,7 @@ static ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = &proc_doulongvec_minmax,
},
-#ifdef CONFIG_INOTIFY
+#ifdef CONFIG_INOTIFY_USER
{
.ctl_name = FS_INOTIFY,
.procname = "inotify",
diff --git a/kernel/timer.c b/kernel/timer.c
index 9e49deed468c..eb97371b87d8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
void fastcall init_timer(struct timer_list *timer)
{
timer->entry.next = NULL;
- timer->base = per_cpu(tvec_bases, raw_smp_processor_id());
+ timer->base = __raw_get_cpu_var(tvec_bases);
}
EXPORT_SYMBOL(init_timer);
@@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync);
static int cascade(tvec_base_t *base, tvec_t *tv, int index)
{
/* cascade all the timers from tv up one level */
- struct list_head *head, *curr;
+ struct timer_list *timer, *tmp;
+ struct list_head tv_list;
+
+ list_replace_init(tv->vec + index, &tv_list);
- head = tv->vec + index;
- curr = head->next;
/*
- * We are removing _all_ timers from the list, so we don't have to
- * detach them individually, just clear the list afterwards.
+ * We are removing _all_ timers from the list, so we
+ * don't have to detach them individually.
*/
- while (curr != head) {
- struct timer_list *tmp;
-
- tmp = list_entry(curr, struct timer_list, entry);
- BUG_ON(tmp->base != base);
- curr = curr->next;
- internal_add_timer(base, tmp);
+ list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
+ BUG_ON(timer->base != base);
+ internal_add_timer(base, timer);
}
- INIT_LIST_HEAD(head);
return index;
}
@@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base)
spin_lock_irq(&base->lock);
while (time_after_eq(jiffies, base->timer_jiffies)) {
- struct list_head work_list = LIST_HEAD_INIT(work_list);
+ struct list_head work_list;
struct list_head *head = &work_list;
int index = base->timer_jiffies & TVR_MASK;
-
+
/*
* Cascade timers:
*/
@@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base)
(!cascade(base, &base->tv3, INDEX(1))) &&
!cascade(base, &base->tv4, INDEX(2)))
cascade(base, &base->tv5, INDEX(3));
- ++base->timer_jiffies;
- list_splice_init(base->tv1.vec + index, &work_list);
+ ++base->timer_jiffies;
+ list_replace_init(base->tv1.vec + index, &work_list);
while (!list_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
diff --git a/kernel/user.c b/kernel/user.c
index 2116642f42c6..6408c0424291 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid)
atomic_set(&new->processes, 0);
atomic_set(&new->files, 0);
atomic_set(&new->sigpending, 0);
-#ifdef CONFIG_INOTIFY
+#ifdef CONFIG_INOTIFY_USER
atomic_set(&new->inotify_watches, 0);
atomic_set(&new->inotify_devs, 0);
#endif
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid)
new->mq_bytes = 0;
new->locked_shm = 0;
- if (alloc_uid_keyring(new) < 0) {
+ if (alloc_uid_keyring(new, current) < 0) {
kmem_cache_free(uid_cachep, new);
return NULL;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 880fb415a8f6..565cf7a1febd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu,
return ret;
}
-int schedule_on_each_cpu(void (*func) (void *info), void *info)
+/**
+ * schedule_on_each_cpu - call a function on each online CPU from keventd
+ * @func: the function to call
+ * @info: a pointer to pass to func()
+ *
+ * Returns zero on success.
+ * Returns -ve errno on failure.
+ *
+ * Appears to be racy against CPU hotplug.
+ *
+ * schedule_on_each_cpu() is very slow.
+ */
+int schedule_on_each_cpu(void (*func)(void *info), void *info)
{
int cpu;
- struct work_struct *work;
+ struct work_struct *works;
- work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
-
- if (!work)
+ works = alloc_percpu(struct work_struct);
+ if (!works)
return -ENOMEM;
+
for_each_online_cpu(cpu) {
- INIT_WORK(work + cpu, func, info);
+ INIT_WORK(per_cpu_ptr(works, cpu), func, info);
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
- work + cpu);
+ per_cpu_ptr(works, cpu));
}
flush_workqueue(keventd_wq);
- kfree(work);
+ free_percpu(works);
return 0;
}
@@ -531,11 +543,11 @@ int current_is_keventd(void)
static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
{
struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
- LIST_HEAD(list);
+ struct list_head list;
struct work_struct *work;
spin_lock_irq(&cwq->lock);
- list_splice_init(&cwq->worklist, &list);
+ list_replace_init(&cwq->worklist, &list);
while (!list_empty(&list)) {
printk("Taking work for %s\n", wq->name);
@@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
case CPU_UP_CANCELED:
list_for_each_entry(wq, &workqueues, list) {
+ if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
+ continue;
/* Unbind so it can run. */
kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
any_online_cpu(cpu_online_map));