Discussion:
[PATCH] audit: Use struct net not pid_t to remember the network namespce to reply in
Eric W. Biederman
2014-02-28 18:49:05 UTC
Permalink
While reading through 3.14-rc1 I found a pretty siginficant mishandling
of network namespaces in the recent audit changes.

In struct audit_netlink_list and audit_reply add a reference to the
network namespace of the caller and remove the userspace pid of the
caller. This cleanly remembers the callers network namespace, and
removes a huge class of races and nasty failure modes that can occur
when attempting to relook up the callers network namespace from a pid_t
(including the caller's network namespace changing, pid wraparound, and
the pid simply not being present).

Signed-off-by: "Eric W. Biederman" <***@xmission.com>
---
kernel/audit.c | 10 ++++++----
kernel/audit.h | 2 +-
kernel/auditfilter.c | 3 ++-
3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index 34c5a2310fbf..1e5756f16f6f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {

struct audit_reply {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff *skb;
};

@@ -500,7 +500,7 @@ int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
- struct net *net = get_net_ns_by_pid(dest->pid);
+ struct net *net = dest->net;
struct audit_net *aunet = net_generic(net, audit_net_id);

/* wait for parent to finish and send an ACK */
@@ -510,6 +510,7 @@ int audit_send_list(void *_dest)
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(aunet->nlsk, skb, dest->portid, 0);

+ put_net(net);
kfree(dest);

return 0;
@@ -543,7 +544,7 @@ out_kfree_skb:
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = get_net_ns_by_pid(reply->pid);
+ struct net *net = reply->net;
struct audit_net *aunet = net_generic(net, audit_net_id);

mutex_lock(&audit_cmd_mutex);
@@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg)
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
@@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;

+ reply->net = get_net(current->nsproxy->net_ns);
reply->portid = portid;
- reply->pid = task_pid_vnr(current);
reply->skb = skb;

tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
diff --git a/kernel/audit.h b/kernel/audit.h
index 57cc64d67718..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -247,7 +247,7 @@ extern void audit_panic(const char *message);

struct audit_netlink_list {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff_head q;
};

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 14a78cca384e..a5e3d73d73e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
#include "audit.h"

/*
@@ -1083,8 +1084,8 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
+ dest->net = get_net(current->nsproxy->net_ns);
dest->portid = portid;
- dest->pid = task_pid_vnr(current);
skb_queue_head_init(&dest->q);

mutex_lock(&audit_filter_mutex);
--
1.7.5.4
Richard Guy Briggs
2014-03-01 01:11:42 UTC
Permalink
Post by Eric W. Biederman
While reading through 3.14-rc1 I found a pretty siginficant mishandling
of network namespaces in the recent audit changes.
In struct audit_netlink_list and audit_reply add a reference to the
network namespace of the caller and remove the userspace pid of the
caller. This cleanly remembers the callers network namespace, and
removes a huge class of races and nasty failure modes that can occur
when attempting to relook up the callers network namespace from a pid_t
(including the caller's network namespace changing, pid wraparound, and
the pid simply not being present).
Ok, so I see that avoiding pid_t in struct audit_reply and struct
audit_netlink_list is necessary. Why not switch to struct pid?

How does this patch solve a caller's network namespace changing?
Post by Eric W. Biederman
---
kernel/audit.c | 10 ++++++----
kernel/audit.h | 2 +-
kernel/auditfilter.c | 3 ++-
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 34c5a2310fbf..1e5756f16f6f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
struct audit_reply {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff *skb;
};
@@ -500,7 +500,7 @@ int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
- struct net *net = get_net_ns_by_pid(dest->pid);
+ struct net *net = dest->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
@@ -510,6 +510,7 @@ int audit_send_list(void *_dest)
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = get_net_ns_by_pid(reply->pid);
+ struct net *net = reply->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
@@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg)
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
@@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;
+ reply->net = get_net(current->nsproxy->net_ns);
reply->portid = portid;
- reply->pid = task_pid_vnr(current);
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
diff --git a/kernel/audit.h b/kernel/audit.h
index 57cc64d67718..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -247,7 +247,7 @@ extern void audit_panic(const char *message);
struct audit_netlink_list {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff_head q;
};
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 14a78cca384e..a5e3d73d73e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
#include "audit.h"
/*
@@ -1083,8 +1084,8 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
+ dest->net = get_net(current->nsproxy->net_ns);
dest->portid = portid;
- dest->pid = task_pid_vnr(current);
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
--
1.7.5.4
- RGB

--
Richard Guy Briggs <***@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
Eric W. Biederman
2014-03-01 04:34:52 UTC
Permalink
Post by Richard Guy Briggs
Post by Eric W. Biederman
While reading through 3.14-rc1 I found a pretty siginficant mishandling
of network namespaces in the recent audit changes.
In struct audit_netlink_list and audit_reply add a reference to the
network namespace of the caller and remove the userspace pid of the
caller. This cleanly remembers the callers network namespace, and
removes a huge class of races and nasty failure modes that can occur
when attempting to relook up the callers network namespace from a pid_t
(including the caller's network namespace changing, pid wraparound, and
the pid simply not being present).
Ok, so I see that avoiding pid_t in struct audit_reply and struct
audit_netlink_list is necessary. Why not switch to struct pid?
How does this patch solve a caller's network namespace changing?
This solves the callers network namespace changing or the caller going
away entirely (a much more serious concern) because we capture the
network namespace at the time of the request when the caller is in the
kernel. I would have simply captured the socket we want to reply on but
there did not appear to be a good way to do that.

Reading through it again capturing current->nsproxy->net_ns is striclty
wrong. We should be capturing sock_net(NETLINK_CB(skb).sk). The
network namespace of the requesting socket. That handles even weird
cases of passing file descriptors between processes in different network
namespaces. (An incremental patch to change to code to selct the
network namespace of the requesting socket to follow in a moment).

Still what my patch implements today at least means we won't oops the
kernel if the audit process exits early, and causes get_net_ns_by_pid
to return NULL.


This whole code path is so crazy because what we really should be doing
is sending the packets in nonblocking mode and just dropping packets
if the receiving socket does not have enough socket buvffers.

Eric
Eric W. Biederman
2014-03-01 04:36:55 UTC
Permalink
In perverse cases of file descriptor passing the current network
namespace of a process and the network namespace of a socket used by
that socket may differ. Therefore use the network namespace of the
appropiate socket to ensure replies always go to the appropiate
socket.

Signed-off-by: "Eric W. Biederman" <***@xmission.com>
---

This is an incremental change on top of my previous patch to guarantee
that replies always happen in the appropriate network namespace.

include/linux/audit.h | 3 ++-
kernel/audit.c | 21 ++++++++++-----------
kernel/auditfilter.c | 7 +++++--
3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index aa865a9a4c4f..ec1464df4c60 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -43,6 +43,7 @@ struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
+struct sk_buff;

struct audit_krule {
int vers_ops;
@@ -463,7 +464,7 @@ extern int audit_filter_user(int type);
extern int audit_filter_type(int type);
extern int audit_rule_change(int type, __u32 portid, int seq,
void *data, size_t datasz);
-extern int audit_list_rules_send(__u32 portid, int seq);
+extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern u32 audit_enabled;
#else /* CONFIG_AUDIT */
diff --git a/kernel/audit.c b/kernel/audit.c
index 1e5756f16f6f..32086bff5564 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg)
* Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-static void audit_send_reply(__u32 portid, int seq, int type, int done,
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;

- reply->net = get_net(current->nsproxy->net_ns);
+ reply->net = get_net(net);
reply->portid = portid;
reply->skb = skb;

@@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb)

seq = nlmsg_hdr(skb)->nlmsg_seq;

- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &af, sizeof(af));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));

return 0;
}
@@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.backlog = skb_queue_len(&audit_skb_queue);
s.version = AUDIT_VERSION_LATEST;
s.backlog_wait_time = audit_backlog_wait_time;
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_SET: {
@@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq, data, nlmsg_len(nlh));
break;
case AUDIT_LIST_RULES:
- err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
+ err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
@@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.log_passwd = tsk->signal->audit_tty_log_passwd;
spin_unlock(&tsk->sighand->siglock);

- audit_send_reply(NETLINK_CB(skb).portid, seq,
- AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a5e3d73d73e4..e8d1c7c515d7 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"

/*
@@ -1069,8 +1070,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
* @portid: target portid for netlink audit messages
* @seq: netlink audit message sequence (serial) number
*/
-int audit_list_rules_send(__u32 portid, int seq)
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
int err = 0;
@@ -1084,7 +1087,7 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(current->nsproxy->net_ns);
+ dest->net = get_net(net);
dest->portid = portid;
skb_queue_head_init(&dest->q);
--
1.7.5.4
Eric W. Biederman
2014-03-01 04:50:19 UTC
Permalink
Modify audit_send_reply to directly use a non-blocking send and
to return an error on failure (if anyone cares).

Modify audit_list_rules_send to use audit_send_reply and give up
if we can not send a packet.

Merge audit_list_rules into iaudit_list_rules_send as the code
is now sufficiently simple to not justify to callers.

Kill audit_send_list, audit_send_reply_thread because using
a separate thread for replies is not needed when sending
packets syncrhonously.

Signed-off-by: "Eric W. Biederman" <***@xmission.com>
---

I haven't properly tested and made certain this doesn't break userspace,
but this is much simpler than what audit is currently doing.

I really don't understand why we are using kernel threads to allow us to
exceed the receiving sockets configured buffer limits. That just seems
insane. If that is really what we want we should be able to force
the receiving buffer limits up in audit_send_reply.

include/linux/audit.h | 2 +-
kernel/audit.c | 75 ++++++++-----------------------------------------
kernel/audit.h | 14 ++-------
kernel/auditfilter.c | 64 ++++++++++--------------------------------
4 files changed, 31 insertions(+), 124 deletions(-)

diff --git a/include/linux/audit.h b/include/linux/audit.h
index ec1464df4c60..cd2f5112822a 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -464,7 +464,7 @@ extern int audit_filter_user(int type);
extern int audit_filter_type(int type);
extern int audit_rule_change(int type, __u32 portid, int seq,
void *data, size_t datasz);
-extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
+extern void audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern u32 audit_enabled;
#else /* CONFIG_AUDIT */
diff --git a/kernel/audit.c b/kernel/audit.c
index 32086bff5564..201808fc86aa 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -180,12 +180,6 @@ struct audit_buffer {
gfp_t gfp_mask;
};

-struct audit_reply {
- __u32 portid;
- struct net *net;
- struct sk_buff *skb;
-};
-
static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
{
if (ab) {
@@ -496,26 +490,6 @@ static int kauditd_thread(void *dummy)
return 0;
}

-int audit_send_list(void *_dest)
-{
- struct audit_netlink_list *dest = _dest;
- struct sk_buff *skb;
- struct net *net = dest->net;
- struct audit_net *aunet = net_generic(net, audit_net_id);
-
- /* wait for parent to finish and send an ACK */
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
-
- while ((skb = __skb_dequeue(&dest->q)) != NULL)
- netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
-
- put_net(net);
- kfree(dest);
-
- return 0;
-}
-
struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done,
int multi, const void *payload, int size)
{
@@ -541,25 +515,9 @@ out_kfree_skb:
return NULL;
}

-static int audit_send_reply_thread(void *arg)
-{
- struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = reply->net;
- struct audit_net *aunet = net_generic(net, audit_net_id);
-
- mutex_lock(&audit_cmd_mutex);
- mutex_unlock(&audit_cmd_mutex);
-
- /* Ignore failure. It'll only happen if the sender goes away,
- because our timeout is set to infinite. */
- netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
- put_net(net);
- kfree(reply);
- return 0;
-}
/**
* audit_send_reply - send an audit reply message via netlink
- * @portid: netlink port to which to send reply
+ * @request_skb: The request skb (used to calculate where to reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
@@ -570,33 +528,24 @@ static int audit_send_reply_thread(void *arg)
* Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
- int multi, const void *payload, int size)
+int audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
+ int multi, const void *payload, int size)
{
u32 portid = NETLINK_CB(request_skb).portid;
struct net *net = sock_net(NETLINK_CB(request_skb).sk);
+ struct audit_net *aunet = net_generic(net, audit_net_id);
struct sk_buff *skb;
- struct task_struct *tsk;
- struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
- GFP_KERNEL);
-
- if (!reply)
- return;

skb = audit_make_reply(portid, seq, type, done, multi, payload, size);
if (!skb)
- goto out;
-
- reply->net = get_net(net);
- reply->portid = portid;
- reply->skb = skb;
+ return -ENOMEM;

- tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
- if (!IS_ERR(tsk))
- return;
- kfree_skb(skb);
-out:
- kfree(reply);
+ /*
+ * audit_send_reply runs in the context of the requesting process
+ * which means that a blocking send can deadlock. Peform a
+ * non-blocking send, and in general ignore errors.
+ */
+ return netlink_unicast(aunet->nlsk, skb, portid, 1);
}

/*
@@ -907,7 +856,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq, data, nlmsg_len(nlh));
break;
case AUDIT_LIST_RULES:
- err = audit_list_rules_send(skb, seq);
+ audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
diff --git a/kernel/audit.h b/kernel/audit.h
index 8df132214606..c7a594890950 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -240,18 +240,10 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
-extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type,
- int done, int multi,
- const void *payload, int size);
-extern void audit_panic(const char *message);
-
-struct audit_netlink_list {
- __u32 portid;
- struct net *net;
- struct sk_buff_head q;
-};
+extern int audit_send_reply(struct sk_buff *request_skb, int seq, int type,
+ int done, int multi, const void *payload, int size);

-int audit_send_list(void *);
+extern void audit_panic(const char *message);

struct audit_net {
struct sock *nlsk;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index e8d1c7c515d7..5cd05b1f3ec3 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -973,33 +973,39 @@ out:
return ret;
}

-/* List rules using struct audit_rule_data. */
-static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q)
+/**
+ * audit_list_rules_send - list the audit rules using struct audit_rule_data.
+ * @request_skb: The requesting skb (used to calculate where to reply)
+ * @seq: netlink audit message sequence (serial) number
+ */
+void audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
- struct sk_buff *skb;
struct audit_krule *r;
int i;

+ mutex_lock(&audit_filter_mutex);
/* This is a blocking read, so use audit_filter_mutex instead of rcu
* iterator to sync with list writers. */
for (i=0; i<AUDIT_NR_FILTERS; i++) {
list_for_each_entry(r, &audit_rules_list[i], list) {
struct audit_rule_data *data;
+ int err;

data = audit_krule_to_data(r);
if (unlikely(!data))
break;
- skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES,
+ err = audit_send_reply(request_skb, seq, AUDIT_LIST_RULES,
0, 1, data,
sizeof(*data) + data->buflen);
- if (skb)
- skb_queue_tail(q, skb);
kfree(data);
+ if (err < 0)
+ goto done;
}
}
- skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
- if (skb)
- skb_queue_tail(q, skb);
+ audit_send_reply(request_skb, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
+done:
+ mutex_unlock(&audit_filter_mutex);
+ return;
}

/* Log rule additions and removals */
@@ -1065,46 +1071,6 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
return err;
}

-/**
- * audit_list_rules_send - list the audit rules
- * @portid: target portid for netlink audit messages
- * @seq: netlink audit message sequence (serial) number
- */
-int audit_list_rules_send(struct sk_buff *request_skb, int seq)
-{
- u32 portid = NETLINK_CB(request_skb).portid;
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
- struct task_struct *tsk;
- struct audit_netlink_list *dest;
- int err = 0;
-
- /* We can't just spew out the rules here because we might fill
- * the available socket buffer space and deadlock waiting for
- * auditctl to read from it... which isn't ever going to
- * happen if we're actually running in the context of auditctl
- * trying to _send_ the stuff */
-
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
- if (!dest)
- return -ENOMEM;
- dest->net = get_net(net);
- dest->portid = portid;
- skb_queue_head_init(&dest->q);
-
- mutex_lock(&audit_filter_mutex);
- audit_list_rules(portid, seq, &dest->q);
- mutex_unlock(&audit_filter_mutex);
-
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
- if (IS_ERR(tsk)) {
- skb_queue_purge(&dest->q);
- kfree(dest);
- err = PTR_ERR(tsk);
- }
-
- return err;
-}
-
int audit_comparator(u32 left, u32 op, u32 right)
{
switch (op) {
--
1.7.5.4
Andrew Morton
2014-03-04 21:30:04 UTC
Permalink
Post by Eric W. Biederman
Modify audit_send_reply to directly use a non-blocking send and
to return an error on failure (if anyone cares).
Modify audit_list_rules_send to use audit_send_reply and give up
if we can not send a packet.
Merge audit_list_rules into iaudit_list_rules_send as the code
is now sufficiently simple to not justify to callers.
Kill audit_send_list, audit_send_reply_thread because using
a separate thread for replies is not needed when sending
packets syncrhonously.
Nothing much seems to be happening here?

In an earlier email you said "While reading through 3.14-rc1 I found a
pretty siginficant mishandling of network namespaces in the recent
audit changes." Were those recent audit changes a post-3.14 thing? And
what is the user-visible effect of the mishandling?

Thanks.
David Miller
2014-03-04 21:51:22 UTC
Permalink
From: Andrew Morton <***@linux-foundation.org>
Date: Tue, 4 Mar 2014 13:30:04 -0800
Post by Andrew Morton
Post by Eric W. Biederman
Modify audit_send_reply to directly use a non-blocking send and
to return an error on failure (if anyone cares).
Modify audit_list_rules_send to use audit_send_reply and give up
if we can not send a packet.
Merge audit_list_rules into iaudit_list_rules_send as the code
is now sufficiently simple to not justify to callers.
Kill audit_send_list, audit_send_reply_thread because using
a separate thread for replies is not needed when sending
packets syncrhonously.
Nothing much seems to be happening here?
In an earlier email you said "While reading through 3.14-rc1 I found a
pretty siginficant mishandling of network namespaces in the recent
audit changes." Were those recent audit changes a post-3.14 thing? And
what is the user-visible effect of the mishandling?
I took a quick look at this patch yesterday, and my only suspicion is
that threads are created currently by this code purely to cons up a
blockable context for the netlink code.

Perhaps it wants to avoid any netlink packet drops from being possible.

I'm not so sure.

Anyways, some investigation into perhaps figuring out the intentions of
the original implementation would be nice.
Eric W. Biederman
2014-03-04 22:41:16 UTC
Permalink
Post by David Miller
Date: Tue, 4 Mar 2014 13:30:04 -0800
Post by Andrew Morton
Post by Eric W. Biederman
Modify audit_send_reply to directly use a non-blocking send and
to return an error on failure (if anyone cares).
Modify audit_list_rules_send to use audit_send_reply and give up
if we can not send a packet.
Merge audit_list_rules into iaudit_list_rules_send as the code
is now sufficiently simple to not justify to callers.
Kill audit_send_list, audit_send_reply_thread because using
a separate thread for replies is not needed when sending
packets syncrhonously.
Nothing much seems to be happening here?
Well you picked up the patch that fixes the worst of the bugs that I was
complaining about. Beyond that I don't know what makes sense.
Post by David Miller
Post by Andrew Morton
In an earlier email you said "While reading through 3.14-rc1 I found a
pretty siginficant mishandling of network namespaces in the recent
audit changes." Were those recent audit changes a post-3.14 thing? And
what is the user-visible effect of the mishandling?
I took a quick look at this patch yesterday, and my only suspicion is
that threads are created currently by this code purely to cons up a
blockable context for the netlink code.
Perhaps it wants to avoid any netlink packet drops from being possible.
I'm not so sure.
Looking at the history (see below) the stated reason from David
Woodhouse is to prevent the removal of packets from the packet queues
when we have reached our rcvbuf limit.

The reasoning doesn't make a lick of sense to me and I was hoping the
folks dealing with audit would comment.

If we really want the ability to always appened to the queue of skb's
is to just have a version of netlink_send_skb that ignores the queued
limits. Of course an evil program then could force the generation of
enough audit records to DOS the kernel, but we seem to be in that
situation now. Shrug.
Post by David Miller
Anyways, some investigation into perhaps figuring out the intentions of
the original implementation would be nice.
I had looked briefly and missed a few things. Here is the complete
history in all of it's confusion.

In brief. The code came in using non-blocking netlink writes.

David Woodhouse made the writes blocking.

Al Viro, and then Eric Paris patch the code to deal with deadlocks
cause by the blocking writes.


commit 4527a30f157fca45c102ea49a2fb34a4502264eb
Author: akpm <akpm>
Date: Mon Apr 12 20:29:12 2004 +0000

[PATCH] Light-weight Auditing Framework

From: Rik Faith <***@redhat.com>

This patch provides a low-overhead system-call auditing framework for Linux
that is usable by LSM components (e.g., SELinux). This is an update of the
patch discussed in this thread:

http://marc.theaimsgroup.com/?t=107815888100001&r=1&w=2

In brief, it provides for netlink-based logging of audit records that have
been generated in other parts of the kernel (e.g., SELinux) as well as the
ability to audit system calls, either independently (using simple
filtering) or as a compliment to the audit record that another part of the
kernel generated.

The main goals were to provide system call auditing with 1) as low overhead
as possible, and 2) without duplicating functionality that is already
provided by SELinux (and/or other security infrastructures). This
framework will work "stand-alone", but is not designed to provide, e.g.,
CAPP functionality without another security component in place.

This updated patch includes changes from feedback I have received,
including the ability to compile without CONFIG_NET (and better use of
tabs, so use -w if you diff against the older patch).

Please see http://people.redhat.com/faith/audit/ for an early example
user-space client (auditd-0.4.tar.gz) and instructions on how to try it.

My future intentions at the kernel level include improving filtering (e.g.,
syscall personality/exit codes) and syscall support for more architectures.
First, though, I'm going to work on documentation, a (real) audit daemon,
and patches for other user-space tools so that people can play with the
framework and understand how it can be used with and without SELinux.


Update:

Light-weight Auditing Framework receive filter fixes
From: Rik Faith <***@redhat.com>

Since audit_receive_filter() is only called with audit_netlink_sem held, it
cannot race with either audit_del_rule() or audit_add_rule(), so the
list_for_each_entry_rcu()s may be replaced by list_for_each_entry()s, and
the rcu_read_{un,}lock()s removed. A fix for this is part of the attached
patch.

Other features of the attached patch are:

1) generalized the ability to test for inequality

2) added syscall exit status reporting and testing

3) added ability to report and test first 4 syscall arguments (this adds
a large amount of flexibility for little cost; not implemented or tested
on ppc64)

4) added ability to report and test personality

User-space demo program enhanced for new fields and inequality testing:
http://people.redhat.com/faith/audit/auditd-0.5.tar.gz

BKrev: 407afc18IAH0yJVxEhi5ka-sbTOn8g

diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..765822b03b91
--- /dev/null
+++ b/kernel/audit.c
[snip]
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+ void *payload, int size)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ int len = NLMSG_SPACE(size);
+ void *data;
+ int flags = multi ? NLM_F_MULTI : 0;
+ int t = done ? NLMSG_DONE : type;
+
+ skb = alloc_skb(len, GFP_KERNEL);
+ if (!skb)
+ goto nlmsg_failure;
+
+ nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+ nlh->nlmsg_flags = flags;
+ data = NLMSG_DATA(nlh);
+ memcpy(data, payload, size);
+ netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+ return;
+
+nlmsg_failure: /* Used by NLMSG_PUT */
+ if (skb)
+ kfree_skb(skb);
+}


commit b7d1125817c9a46cc46f57db89d9c195e7af22f8
Author: David Woodhouse <***@shinybook.infradead.org>
Date: Thu May 19 10:56:58 2005 +0100

AUDIT: Send netlink messages from a separate kernel thread

netlink_unicast() will attempt to reallocate and will free messages if
the socket's rcvbuf limit is reached unless we give it an infinite
timeout. So do that, from a kernel thread which is dedicated to spewing
stuff up the netlink socket.

Signed-off-by: David Woodhouse <***@infradead.org>
@@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,

skb = alloc_skb(len, GFP_KERNEL);
if (!skb)
- goto nlmsg_failure;
+ return;

- nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+ nlh = NLMSG_PUT(skb, pid, seq, t, size);
nlh->nlmsg_flags = flags;
data = NLMSG_DATA(nlh);
memcpy(data, payload, size);
- netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+
+ /* Ignore failure. It'll only happen if the sender goes away,
+ because our timeout is set to infinite. */
+ netlink_unicast(audit_sock, skb, pid, 0);
return;

nlmsg_failure: /* Used by NLMSG_PUT */


commit 9044e6bca5a4a575d3c068dfccb5651a2d6a13bc
Author: Al Viro <***@zeniv.linux.org.uk>
Date: Mon May 22 01:09:24 2006 -0400

[PATCH] fix deadlocks in AUDIT_LIST/AUDIT_LIST_RULES

We should not send a pile of replies while holding audit_netlink_mutex
since we hold the same mutex when we receive commands. As the result,
we can get blocked while sending and sit there holding the mutex while
auditctl is unable to send the next command and get around to receiving
what we'd sent.

Solution: create skb and put them into a queue instead of sending;
once we are done, send what we've got on the list. The former can
be done synchronously while we are handling AUDIT_LIST or AUDIT_LIST_RULES;
we are holding audit_netlink_mutex at that point. The latter is done
asynchronously and without messing with audit_netlink_mutex.

Signed-off-by: Al Viro <***@zeniv.linux.org.uk>

commit f09ac9db2aafe36fde9ebd63c8c5d776f6e7bd41
Author: Eric Paris <***@redhat.com>
Date: Fri Apr 18 10:11:04 2008 -0400

Audit: stop deadlock from signals under load

A deadlock is possible between kauditd and auditd under load if auditd
receives a signal. When auditd receives a signal it sends a netlink
message to the kernel asking for information about the sender of the
signal. In that same context the audit system will attempt to send a
netlink message back to the userspace auditd. If kauditd has already
filled the socket buffer (see netlink_attachskb()) auditd will now put
itself to sleep waiting for room to send the message. Since auditd is
responsible for draining that socket we have a deadlock. The fix, since
the response from the kernel does not need to be synchronous is to send
the signal information back to auditd in a separate thread. And thus
auditd can continue to drain the audit queue normally.

Signed-off-by: Eric Paris <***@redhat.com>
Signed-off-by: Al Viro <***@zeniv.linux.org.uk>




Eric
Andrew Morton
2014-03-04 22:50:37 UTC
Permalink
Post by Eric W. Biederman
Post by David Miller
Date: Tue, 4 Mar 2014 13:30:04 -0800
Post by Andrew Morton
Post by Eric W. Biederman
Modify audit_send_reply to directly use a non-blocking send and
to return an error on failure (if anyone cares).
Modify audit_list_rules_send to use audit_send_reply and give up
if we can not send a packet.
Merge audit_list_rules into iaudit_list_rules_send as the code
is now sufficiently simple to not justify to callers.
Kill audit_send_list, audit_send_reply_thread because using
a separate thread for replies is not needed when sending
packets syncrhonously.
Nothing much seems to be happening here?
Well you picked up the patch that fixes the worst of the bugs that I was
complaining about. Beyond that I don't know what makes sense.
Oh, so I did. I wasn't planning on merging it myself, hoping that
someone who hasaclue will step in. Help.
Eric W. Biederman
2014-03-10 03:06:43 UTC
Permalink
Linus,

Please pull the for-linus branch from the git tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-linus

HEAD: d211f177b28ec070c25b3d0b960aa55f352f731f audit: Update kdoc for audit_send_reply and audit_list_rules_send

Starting with 3.14-rc1 the audit code is faulty (think oopses and races)
with respect to how it computes the network namespace of which socket to
reply to, and I happened to notice by chance when reading through the
code.

My efforts to get these fixes noticed by people who care about audit
seem to have landed on deaf ears, so since these are namespace related I
have put them in my tree.

My testing and the automated build bots don't find any problems with
these fixes.

Eric W. Biederman (3):
audit: Use struct net not pid_t to remember the network namespce to reply in
audit: Send replies in the proper network namespace.
audit: Update kdoc for audit_send_reply and audit_list_rules_send

include/linux/audit.h | 3 ++-
kernel/audit.c | 31 ++++++++++++++++---------------
kernel/audit.h | 2 +-
kernel/auditfilter.c | 10 +++++++---
4 files changed, 26 insertions(+), 20 deletions(-)

Eric
Eric Paris
2014-03-10 13:59:04 UTC
Permalink
Post by Eric W. Biederman
Linus,
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-linus
HEAD: d211f177b28ec070c25b3d0b960aa55f352f731f audit: Update kdoc for audit_send_reply and audit_list_rules_send
Starting with 3.14-rc1 the audit code is faulty (think oopses and races)
with respect to how it computes the network namespace of which socket to
reply to, and I happened to notice by chance when reading through the
code.
My efforts to get these fixes noticed by people who care about audit
seem to have landed on deaf ears, so since these are namespace related I
have put them in my tree.
This commentary sounds like a pile of crap seeing as how there was a
whole discussion around how to handle this stuff, which you were a part
of. And that the audit tree already picked up these 2 patches.

In any case, since I haven't sent them to Linus and I'm glad that is
done, so feel free to consider this me Acking the pull request.
Eric W. Biederman
2014-03-10 19:56:06 UTC
Permalink
Post by Eric Paris
Post by Eric W. Biederman
My efforts to get these fixes noticed by people who care about audit
seem to have landed on deaf ears, so since these are namespace related I
have put them in my tree.
This commentary sounds like a pile of crap seeing as how there was a
whole discussion around how to handle this stuff, which you were a part
of. And that the audit tree already picked up these 2 patches.
My apologies for offending. Last night when I looked the audit tree had
not been updated in 7 weeks, and Andrew Morton wasn't ready to push
these fixes to Linus and was looking for someone who would. Keeping
important bug fixes in my retransmit queue leaves me very grumpy.
Post by Eric Paris
In any case, since I haven't sent them to Linus and I'm glad that is
done, so feel free to consider this me Acking the pull request.
Eric
Richard Guy Briggs
2014-03-16 18:36:51 UTC
Permalink
Post by Eric W. Biederman
Post by Eric Paris
Post by Eric W. Biederman
My efforts to get these fixes noticed by people who care about audit
seem to have landed on deaf ears, so since these are namespace related I
have put them in my tree.
This commentary sounds like a pile of crap seeing as how there was a
whole discussion around how to handle this stuff, which you were a part
of. And that the audit tree already picked up these 2 patches.
My apologies for offending. Last night when I looked the audit tree had
not been updated in 7 weeks, and Andrew Morton wasn't ready to push
these fixes to Linus and was looking for someone who would. Keeping
important bug fixes in my retransmit queue leaves me very grumpy.
Well, it is offensive. Please stick to the facts. Some of us may not
be as smart as you, but continuing to suggest people don't care is
starting to get abusive.
Post by Eric W. Biederman
Post by Eric Paris
In any case, since I haven't sent them to Linus and I'm glad that is
done, so feel free to consider this me Acking the pull request.
Eric
- RGB

--
Richard Guy Briggs <***@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
David Miller
2014-03-05 00:21:52 UTC
Permalink
From: ***@xmission.com (Eric W. Biederman)
Date: Tue, 04 Mar 2014 14:41:16 -0800
Post by Eric W. Biederman
If we really want the ability to always appened to the queue of skb's
is to just have a version of netlink_send_skb that ignores the queued
limits. Of course an evil program then could force the generation of
enough audit records to DOS the kernel, but we seem to be in that
situation now. Shrug.
There is never a valid reason to bypass the socket limits.

It protects the system from things going out of control.

Netlink packet sends can fail, and audit should cope with that
event instead of trying to bludgeon it into not happening.
Steve Grubb
2014-03-05 16:59:01 UTC
Permalink
Post by David Miller
Date: Tue, 04 Mar 2014 14:41:16 -0800
Post by Eric W. Biederman
If we really want the ability to always appened to the queue of skb's
is to just have a version of netlink_send_skb that ignores the queued
limits. Of course an evil program then could force the generation of
enough audit records to DOS the kernel, but we seem to be in that
situation now. Shrug.
There is never a valid reason to bypass the socket limits.
It protects the system from things going out of control.
Netlink packet sends can fail, and audit should cope with that
event instead of trying to bludgeon it into not happening.
I am not sure what exactly is the problem with the code that was being
patched. The audit code has been stable and working pretty well for everyone
for the last 5-6 years. But lately there has been a lot of kernel code churn
and I am concerned that the changes are being made without a complete
understanding of the design goals.

The audit system has to be very reliable. It can't lose any event or record.
The people that really depend on it would rather have access denied to the
system than lose any event. This is the reason it goes to such lengths.

If I understand the patch, it looks like its affecting code that adds, deletes,
or lists audit rules. This code is quite old and working well. It didn't at
first. We found a lot of problems by writing scripts like:

#!/bin/bash
while [ 1 ] ;
do
echo "Inserting syscalls..."
sys=1
while [ $sys -lt 100 ]
do
sys=`expr $sys + 1`
echo "$sys"
auditctl -a entry,always -S $sys
done

echo "Listing..."
auditctl -l
echo "Deleting..."
auditctl -D
done

with another shell running:

#!/bin/bash
while [ 1 ] ;
do
echo "Listing..."
auditctl -l
done

and another shell running:

#!/bin/bash
while [ 1 ] ;
do
echo "Disabling..."
auditctl -e 0
echo "Enabling..."
auditctl -e 1
done

You can probably imagine more stress tests. But the proposed code should be
well tested similar to this.

Thanks,
-Steve
LC Bruzenak
2014-03-05 17:57:04 UTC
Permalink
Post by Steve Grubb
The audit system has to be very reliable. It can't lose any event or record.
The people that really depend on it would rather have access denied to the
system than lose any event. This is the reason it goes to such lengths.
+1

LCB
--
LC (Lenny) Bruzenak
***@magitekltd.com
Eric W. Biederman
2014-03-05 18:06:23 UTC
Permalink
Post by Steve Grubb
Post by David Miller
Date: Tue, 04 Mar 2014 14:41:16 -0800
Post by Eric W. Biederman
If we really want the ability to always appened to the queue of skb's
is to just have a version of netlink_send_skb that ignores the queued
limits. Of course an evil program then could force the generation of
enough audit records to DOS the kernel, but we seem to be in that
situation now. Shrug.
There is never a valid reason to bypass the socket limits.
It protects the system from things going out of control.
Netlink packet sends can fail, and audit should cope with that
event instead of trying to bludgeon it into not happening.
I am not sure what exactly is the problem with the code that was being
patched. The audit code has been stable and working pretty well for everyone
for the last 5-6 years. But lately there has been a lot of kernel code churn
and I am concerned that the changes are being made without a complete
understanding of the design goals.
The problem is that the audit code in the kernel is not using netlink
correctly and which leads to mistakes when the code is updated because
the code in the kernel is unnecessarily complex, and inefficient.
Post by Steve Grubb
The audit system has to be very reliable. It can't lose any event or record.
The people that really depend on it would rather have access denied to the
system than lose any event. This is the reason it goes to such
lengths.
But the designers of the code didn't really go to any lengths at all.
There are enormous and cumbersome hacks to avoid fixing the kernel
interfaces to do what audit wants to use the kernel interfaces for.
That is the audit code for transmitting replies is stupid and is causing
serious maintenance issues, by growing hacks upon hacks instead of
dealing with the core issues.

The first approximation of what you want should have been be to set the
netlink socket recvbuf to as large as the can be:
setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, INTMAX);

If/when 2GiB is shown to be not enough people should should have worked
on the netlink layer to allow even larger piles of audit messages to be
in flight, assuming you really do want to bring down the system when the
audit client is just too slow.
Post by Steve Grubb
If I understand the patch, it looks like its affecting code that adds, deletes,
or lists audit rules. This code is quite old and working well. It didn't at
Working well does not mean implemented well, and for a security feature
not implemented well means broken. And frankly since I can't see a
single setsockopt to set the received buffer size for audit netlink
sockets I have to stronlgy suspect that people were working around
userpsace bugs with stupid kernel code.
Post by Steve Grubb
#!/bin/bash
while [ 1 ] ;
do
echo "Inserting syscalls..."
sys=1
while [ $sys -lt 100 ]
do
sys=`expr $sys + 1`
echo "$sys"
auditctl -a entry,always -S $sys
done
echo "Listing..."
auditctl -l
echo "Deleting..."
auditctl -D
done
#!/bin/bash
while [ 1 ] ;
do
echo "Listing..."
auditctl -l
done
#!/bin/bash
while [ 1 ] ;
do
echo "Disabling..."
auditctl -e 0
echo "Enabling..."
auditctl -e 1
done
You can probably imagine more stress tests. But the proposed code should be
well tested similar to this.
Honestly since no one cares enough to maintain the kernel code properly
I really think we should just rip audit out instead trying to present
userspace with the delusion that the code works, and will continue to
work properly.

My apologies for being grumpy. I have no intention of breaking
userspace (which is why my last patch was an rfc) but at the same time
I care absolutely nothing about audit. It solves none of my problems
and works at all of the wrong layers to be interesting for me.

I just happened to notice that the code has been broken since 3.14-rc1
and no one understands or cares enough about audit to even accept
trivial kernel patches to fix the bugs. No offense but I am not
interested in testing code I care nothing about, especially when there
is no one to pick up the patches. My RFC patch was supposed to be a
word to the wise.

Eric
Eric Paris
2014-03-07 22:52:02 UTC
Permalink
As usual Eric, your commentary is anything but useful. However your
technical thoughts are not off the mark. Can we stick to those?
Post by David Miller
Date: Tue, 04 Mar 2014 14:41:16 -0800
Post by Eric W. Biederman
If we really want the ability to always appened to the queue of skb's
is to just have a version of netlink_send_skb that ignores the queued
limits. Of course an evil program then could force the generation of
enough audit records to DOS the kernel, but we seem to be in that
situation now. Shrug.
There is never a valid reason to bypass the socket limits.
Audit does have some pretty crazy/wacky/dumb ways of doing things. And
they've worked really well. I'm the first to agree that doesn't make
them right. But also that I don't know how to do them better. I'm
happy to try if I know how. Audit is non-tolerant to failure and loss.
Many users of audit would prefer the system panic() than lose a message.
If someone shows me how to do it better I'll happily admit there are
likely places where what we do is just a 'little' too
strong/foolish/crazy. Note that ALL users of these functions must have
at least 1 capability (CAP_AUDIT_CONTROL). So if there is a malicious
app, it is a root malicious app...

The kernel audit has 3 different 'things' that send skbs to userspace.
All of them work a little crazy, but similar crazy. The current task
calls into the kernel via netlink, and the kernel then builds one or
more skb(s) and passes those skb(s) (via differing mechanisms) to a
kthread which in turn calls netlink_unicast(,,,0) sending the response
back to the current task in 2 of the three cases. In all cases, since
the timeout is infinite, we assume that the only possible reason this
call to netlink_unicast() will fail is because the other end of the
socket went away. Simple drawing of 2 of the 3 cases.
+------------------------------------------------------------------+
| |
| auditctl (audit tool run by root) |
| netlink send netlink receive |
+------------------------------------------------------------------+
+ ^
| |
v +
+----------------------------+ +------------------------+
| kernel audit generate skbs | | send skbs to userspace |
+----------------------------+ +------------------------+
+ ^
| +------------------------+ |
+------->| send skbs to a kthread |+-----+
+------------------------+
The most important of the 3 cases and the one that people care the
absolute most about 'things cannot be lost' is the actual audit
messages. Messages like 'process A just did action B to object C'.
These are handled by means of the current process generating an skb and
passing those to an audit specific queue. This audit internal queue
depth is controllable by userspace. If we overflow this queue we may
call panic() (admin choice, obviously non-default). Again, the kthread
on the other end of that queue assumes that all calls to
netlink_unicast(,,,0) will eventually succeed (unless the receiving task
died). It is actually imperative that the current process be blocked
until the message is on track to userspace. Even Eric isn't trying to
change this one case in his patch. This is the one case where the task
receiving the skb is not (likely) the current task (but could be)

The other two, the ones Eric patched, are much more flexible. In both
cases a userspace task ask the kernel for a specific piece of
information (by sending a netlink message). current is going to be the
task draining the netlink queue. This is the reason the send is being
punted to a kthread. So current can read from the netlink socket. In
one case audit_send_reply_thread() the response is small and can't
really grow without bound. Converting to a nonblocking socket might
well make sense here.

The second user Eric patched, audit_send_list(), can grow without bound.
The number of skb's is going to be the size of the number of audit rules
that root loaded. We run the list of rules, generate an skb per rule,
and add all of them to an skb_buff_head. We then pass the skb_buff_head
to a kthread so that current will be able to read/drain the socket.
There really is no limit to how big the skb_buff_head could possibly
grow. This doesn't necessarily absolutely have to be lossless but it
can actually quite reasonably be a whole lot of data that needs to get
sent. I know of no way to deliver unbounded lengths of data to the
current task via netlink without blocking on more space in the socket.
Even if the socket rmem was MAX_INT, how can we deliver more? The rule
size is unbounded. How do I get an unbounded amount of data onto this
side of the socket when I have to generate it all during the request...

Tell me how to architect it better and I'll look at it.
David Miller
2014-03-08 00:48:01 UTC
Permalink
From: Eric Paris <***@redhat.com>
Date: Fri, 07 Mar 2014 17:52:02 -0500
Post by Eric Paris
Audit is non-tolerant to failure and loss.
Netlink is not a loss-less transport.
Steve Grubb
2014-03-08 03:27:28 UTC
Permalink
Post by David Miller
Date: Fri, 07 Mar 2014 17:52:02 -0500
Post by Eric Paris
Audit is non-tolerant to failure and loss.
Netlink is not a loss-less transport.
Perhaps. But in all our testing over the years its been very good.

-Steve
David Miller
2014-03-08 06:34:51 UTC
Permalink
From: Steve Grubb <***@redhat.com>
Date: Fri, 07 Mar 2014 22:27:28 -0500
Post by Steve Grubb
Post by David Miller
Date: Fri, 07 Mar 2014 17:52:02 -0500
Post by Eric Paris
Audit is non-tolerant to failure and loss.
Netlink is not a loss-less transport.
Perhaps. But in all our testing over the years its been very good.
What I really meant by that was that there is flow control.

You can push as much data reliably over it as you want, but you have
to block when the socket limits are hit.

And I'd say you might as well make the creator of the event do the
blocking rather than making other threads do this.
Eric Paris
2014-03-08 03:56:52 UTC
Permalink
Post by David Miller
Date: Fri, 07 Mar 2014 17:52:02 -0500
Post by Eric Paris
Audit is non-tolerant to failure and loss.
Netlink is not a loss-less transport.
I'm happy to accept that (and know it to be true). How can I better
architect things? It seems Eric is complaining that when we get a
request for info, we queue that info up, and then use a kthread to make
it available when the task next calls recv. By using blocking sockets
in the kthread we have no problem with the size of the socket read buf.
If we switch to non-blocking sockets how can we possibly queue up more
than rmem size of data? (honestly, if userspace used INT_MAX it is
almost certainly overkill for even the largest rulesets, but
theoretically, it's not...)

Is our design somehow wrong? Flawed? Mind you it's pretty dumb that we
do basically the same thing in 3 different audit code path, but the
architecture doesn't seem crazy to me. Then again, I'm not brilliant by
any stretch!

+------------------------------------------------------------------+
| |
| auditctl (audit tool run by root) |
| netlink send netlink receive |
+------------------------------------------------------------------+
+ ^
| |
v +
+----------------------------+ +------------------------+
| kernel audit generate skbs | | send skbs to userspace |
+----------------------------+ +------------------------+
+ ^
| +------------------------+ |
+------->| send skbs to a kthread |+-----+
+------------------------+
David Miller
2014-03-10 19:30:01 UTC
Permalink
From: Eric Paris <***@redhat.com>
Date: Fri, 07 Mar 2014 17:52:02 -0500
Post by Eric Paris
The second user Eric patched, audit_send_list(), can grow without bound.
The number of skb's is going to be the size of the number of audit rules
that root loaded. We run the list of rules, generate an skb per rule,
and add all of them to an skb_buff_head. We then pass the skb_buff_head
to a kthread so that current will be able to read/drain the socket.
There really is no limit to how big the skb_buff_head could possibly
grow. This doesn't necessarily absolutely have to be lossless but it
can actually quite reasonably be a whole lot of data that needs to get
sent. I know of no way to deliver unbounded lengths of data to the
current task via netlink without blocking on more space in the socket.
Even if the socket rmem was MAX_INT, how can we deliver more? The rule
size is unbounded. How do I get an unbounded amount of data onto this
side of the socket when I have to generate it all during the request...
This is what netlink dumps are for. It is how we are able to dump
routing tables with millions of routes to userspace.

By using normal netlink requests and netlink_unicast() for this, you
are ignoring an entire mechanism in netlink designed specifically to
handle this kind of situation.

Netlink dumps track state and build one or more SKBs (as necessary),
one by one, to form the reply. It implements flow control, state
tracking for iteration, optimized SKB sizing and allocation, etc.
Eric Paris
2014-03-10 21:57:46 UTC
Permalink
Post by David Miller
Date: Fri, 07 Mar 2014 17:52:02 -0500
Post by Eric Paris
The second user Eric patched, audit_send_list(), can grow without bound.
The number of skb's is going to be the size of the number of audit rules
that root loaded. We run the list of rules, generate an skb per rule,
and add all of them to an skb_buff_head. We then pass the skb_buff_head
to a kthread so that current will be able to read/drain the socket.
There really is no limit to how big the skb_buff_head could possibly
grow. This doesn't necessarily absolutely have to be lossless but it
can actually quite reasonably be a whole lot of data that needs to get
sent. I know of no way to deliver unbounded lengths of data to the
current task via netlink without blocking on more space in the socket.
Even if the socket rmem was MAX_INT, how can we deliver more? The rule
size is unbounded. How do I get an unbounded amount of data onto this
side of the socket when I have to generate it all during the request...
This is what netlink dumps are for. It is how we are able to dump
routing tables with millions of routes to userspace.
By using normal netlink requests and netlink_unicast() for this, you
are ignoring an entire mechanism in netlink designed specifically to
handle this kind of situation.
Netlink dumps track state and build one or more SKBs (as necessary),
one by one, to form the reply. It implements flow control, state
tracking for iteration, optimized SKB sizing and allocation, etc.
Awesome. I'll see what I can find!
Richard Guy Briggs
2014-03-16 18:19:03 UTC
Permalink
Post by Eric W. Biederman
In perverse cases of file descriptor passing the current network
namespace of a process and the network namespace of a socket used by
that socket may differ. Therefore use the network namespace of the
appropiate socket to ensure replies always go to the appropiate
socket.
---
This is an incremental change on top of my previous patch to guarantee
that replies always happen in the appropriate network namespace.
include/linux/audit.h | 3 ++-
kernel/audit.c | 21 ++++++++++-----------
kernel/auditfilter.c | 7 +++++--
3 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/include/linux/audit.h b/include/linux/audit.h
index aa865a9a4c4f..ec1464df4c60 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -43,6 +43,7 @@ struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
+struct sk_buff;
struct audit_krule {
int vers_ops;
@@ -463,7 +464,7 @@ extern int audit_filter_user(int type);
extern int audit_filter_type(int type);
extern int audit_rule_change(int type, __u32 portid, int seq,
void *data, size_t datasz);
-extern int audit_list_rules_send(__u32 portid, int seq);
+extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
extern u32 audit_enabled;
#else /* CONFIG_AUDIT */
diff --git a/kernel/audit.c b/kernel/audit.c
index 1e5756f16f6f..32086bff5564 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg)
* Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-static void audit_send_reply(__u32 portid, int seq, int type, int done,
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;
- reply->net = get_net(current->nsproxy->net_ns);
+ reply->net = get_net(net);
reply->portid = portid;
reply->skb = skb;
@@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb)
seq = nlmsg_hdr(skb)->nlmsg_seq;
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &af, sizeof(af));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
return 0;
}
@@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.backlog = skb_queue_len(&audit_skb_queue);
s.version = AUDIT_VERSION_LATEST;
s.backlog_wait_time = audit_backlog_wait_time;
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_SET: {
@@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq, data, nlmsg_len(nlh));
break;
- err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
+ err = audit_list_rules_send(skb, seq);
break;
audit_trim_trees();
@@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.log_passwd = tsk->signal->audit_tty_log_passwd;
spin_unlock(&tsk->sighand->siglock);
- audit_send_reply(NETLINK_CB(skb).portid, seq,
- AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a5e3d73d73e4..e8d1c7c515d7 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"
/*
@@ -1069,8 +1070,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
*/
-int audit_list_rules_send(__u32 portid, int seq)
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
int err = 0;
@@ -1084,7 +1087,7 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(current->nsproxy->net_ns);
+ dest->net = get_net(net);
dest->portid = portid;
skb_queue_head_init(&dest->q);
--
1.7.5.4
- RGB

--
Richard Guy Briggs <***@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
Richard Guy Briggs
2014-03-16 19:13:05 UTC
Permalink
Post by Richard Guy Briggs
Post by Eric W. Biederman
In perverse cases of file descriptor passing the current network
namespace of a process and the network namespace of a socket used by
that socket may differ. Therefore use the network namespace of the
appropiate socket to ensure replies always go to the appropiate
socket.
---
This is an incremental change on top of my previous patch to guarantee
that replies always happen in the appropriate network namespace.
include/linux/audit.h | 3 ++-
kernel/audit.c | 21 ++++++++++-----------
kernel/auditfilter.c | 7 +++++--
3 files changed, 17 insertions(+), 14 deletions(-)
diff --git a/include/linux/audit.h b/include/linux/audit.h
index aa865a9a4c4f..ec1464df4c60 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -43,6 +43,7 @@ struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
+struct sk_buff;
struct audit_krule {
int vers_ops;
@@ -463,7 +464,7 @@ extern int audit_filter_user(int type);
extern int audit_filter_type(int type);
extern int audit_rule_change(int type, __u32 portid, int seq,
void *data, size_t datasz);
-extern int audit_list_rules_send(__u32 portid, int seq);
+extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);
extern u32 audit_enabled;
#else /* CONFIG_AUDIT */
diff --git a/kernel/audit.c b/kernel/audit.c
index 1e5756f16f6f..32086bff5564 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg)
* Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-static void audit_send_reply(__u32 portid, int seq, int type, int done,
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;
- reply->net = get_net(current->nsproxy->net_ns);
+ reply->net = get_net(net);
reply->portid = portid;
reply->skb = skb;
@@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb)
seq = nlmsg_hdr(skb)->nlmsg_seq;
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &af, sizeof(af));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
return 0;
}
@@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.backlog = skb_queue_len(&audit_skb_queue);
s.version = AUDIT_VERSION_LATEST;
s.backlog_wait_time = audit_backlog_wait_time;
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_SET: {
@@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq, data, nlmsg_len(nlh));
break;
- err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
+ err = audit_list_rules_send(skb, seq);
break;
audit_trim_trees();
@@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.log_passwd = tsk->signal->audit_tty_log_passwd;
spin_unlock(&tsk->sighand->siglock);
- audit_send_reply(NETLINK_CB(skb).portid, seq,
- AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a5e3d73d73e4..e8d1c7c515d7 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/security.h>
#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"
/*
@@ -1069,8 +1070,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
*/
-int audit_list_rules_send(__u32 portid, int seq)
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
int err = 0;
@@ -1084,7 +1087,7 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(current->nsproxy->net_ns);
+ dest->net = get_net(net);
dest->portid = portid;
skb_queue_head_init(&dest->q);
--
1.7.5.4
- RGB
--
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
- RGB

--
Richard Guy Briggs <***@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
Richard Guy Briggs
2014-03-16 18:15:12 UTC
Permalink
Post by Eric W. Biederman
While reading through 3.14-rc1 I found a pretty siginficant mishandling
of network namespaces in the recent audit changes.
In struct audit_netlink_list and audit_reply add a reference to the
network namespace of the caller and remove the userspace pid of the
caller. This cleanly remembers the callers network namespace, and
removes a huge class of races and nasty failure modes that can occur
when attempting to relook up the callers network namespace from a pid_t
(including the caller's network namespace changing, pid wraparound, and
the pid simply not being present).
---
kernel/audit.c | 10 ++++++----
kernel/audit.h | 2 +-
kernel/auditfilter.c | 3 ++-
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 34c5a2310fbf..1e5756f16f6f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
struct audit_reply {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff *skb;
};
@@ -500,7 +500,7 @@ int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
- struct net *net = get_net_ns_by_pid(dest->pid);
+ struct net *net = dest->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
@@ -510,6 +510,7 @@ int audit_send_list(void *_dest)
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = get_net_ns_by_pid(reply->pid);
+ struct net *net = reply->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
@@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg)
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
@@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;
+ reply->net = get_net(current->nsproxy->net_ns);
reply->portid = portid;
- reply->pid = task_pid_vnr(current);
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
diff --git a/kernel/audit.h b/kernel/audit.h
index 57cc64d67718..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -247,7 +247,7 @@ extern void audit_panic(const char *message);
struct audit_netlink_list {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff_head q;
};
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 14a78cca384e..a5e3d73d73e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
#include "audit.h"
/*
@@ -1083,8 +1084,8 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
+ dest->net = get_net(current->nsproxy->net_ns);
dest->portid = portid;
- dest->pid = task_pid_vnr(current);
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
--
1.7.5.4
- RGB

--
Richard Guy Briggs <***@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
Richard Guy Briggs
2014-03-16 19:12:25 UTC
Permalink
Post by Richard Guy Briggs
Post by Eric W. Biederman
While reading through 3.14-rc1 I found a pretty siginficant mishandling
of network namespaces in the recent audit changes.
In struct audit_netlink_list and audit_reply add a reference to the
network namespace of the caller and remove the userspace pid of the
caller. This cleanly remembers the callers network namespace, and
removes a huge class of races and nasty failure modes that can occur
when attempting to relook up the callers network namespace from a pid_t
(including the caller's network namespace changing, pid wraparound, and
the pid simply not being present).
---
kernel/audit.c | 10 ++++++----
kernel/audit.h | 2 +-
kernel/auditfilter.c | 3 ++-
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c
index 34c5a2310fbf..1e5756f16f6f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
struct audit_reply {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff *skb;
};
@@ -500,7 +500,7 @@ int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
- struct net *net = get_net_ns_by_pid(dest->pid);
+ struct net *net = dest->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
@@ -510,6 +510,7 @@ int audit_send_list(void *_dest)
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = get_net_ns_by_pid(reply->pid);
+ struct net *net = reply->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
@@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg)
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
@@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;
+ reply->net = get_net(current->nsproxy->net_ns);
reply->portid = portid;
- reply->pid = task_pid_vnr(current);
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
diff --git a/kernel/audit.h b/kernel/audit.h
index 57cc64d67718..8df132214606 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -247,7 +247,7 @@ extern void audit_panic(const char *message);
struct audit_netlink_list {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff_head q;
};
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 14a78cca384e..a5e3d73d73e4 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
#include "audit.h"
/*
@@ -1083,8 +1084,8 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
+ dest->net = get_net(current->nsproxy->net_ns);
dest->portid = portid;
- dest->pid = task_pid_vnr(current);
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
--
1.7.5.4
- RGB
--
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
- RGB

--
Richard Guy Briggs <***@redhat.com>
Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545
Loading...